"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "utf8convert.cc" between
xapian-omega-1.4.18.tar.xz and xapian-omega-1.4.19.tar.xz

About: Xapian Omega is an application built on Xapian, consisting of indexers and a CGI search frontend.

utf8convert.cc  (xapian-omega-1.4.18.tar.xz):utf8convert.cc  (xapian-omega-1.4.19.tar.xz)
/** @file /** @file
* @brief convert a string to UTF-8 encoding. * @brief convert a string to UTF-8 encoding.
*/ */
/* Copyright (C) 2006,2007,2008,2010,2013,2017,2019 Olly Betts /* Copyright (C) 2006,2007,2008,2010,2013,2017,2019,2021 Olly Betts
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or * the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version. * (at your option) any later version.
* *
* This program is distributed in the hope that it will be useful, * This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details. * GNU General Public License for more details.
skipping to change at line 38 skipping to change at line 38
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
# include <iconv.h> # include <iconv.h>
#endif #endif
#include <xapian.h> #include <xapian.h>
#include "strcasecmp.h" #include "strcasecmp.h"
#include "stringutils.h" #include "stringutils.h"
using namespace std; using namespace std;
void bool
convert_to_utf8(string & text, const string & charset) convert_to_utf8_(const string& text, const string& charset, string& output)
{ {
// Shortcut if it's already in utf8! // Shortcut if it's already in utf8!
if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0) if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0)
return; return false;
if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0) if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0)
return; return false;
if (charset.size() == 8 && strcasecmp(charset.c_str(), "us-ascii") == 0) if (charset.size() == 8 && strcasecmp(charset.c_str(), "us-ascii") == 0)
return; return false;
// Nobody has told us what charset it's in, so do as little work as // Nobody has told us what charset it's in, so do as little work as
// possible! // possible!
if (charset.empty()) if (charset.empty())
return; return false;
char buf[1024]; char buf[1024];
string tmp; string tmp;
/* Handle iso-8859-1/iso-8859-15/windows-1252/cp-1252, utf-16/ucs-2, /* Handle iso-8859-1/iso-8859-15/windows-1252/cp-1252, utf-16/ucs-2,
* utf-16be/ucs-2be, and utf-16le/ucs-2le. */ * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
const char * p = charset.c_str(); const char * p = charset.c_str();
bool utf16 = false; bool utf16 = false;
if (strncasecmp(p, "utf", 3) == 0) { if (strncasecmp(p, "utf", 3) == 0) {
skipping to change at line 77 skipping to change at line 77
utf16 = true; utf16 = true;
} else if (strncasecmp(p, "ucs", 3) == 0) { } else if (strncasecmp(p, "ucs", 3) == 0) {
p += 3; p += 3;
if (*p == '-' || *p == '_' || *p == ' ') ++p; if (*p == '-' || *p == '_' || *p == ' ') ++p;
if (*p != '2') goto try_iconv; if (*p != '2') goto try_iconv;
++p; ++p;
utf16 = true; utf16 = true;
} }
if (utf16) { if (utf16) {
if (text.size() < 2) return; if (text.size() < 2) return false;
bool big_endian = true; bool big_endian = true;
string::const_iterator i = text.begin(); string::const_iterator i = text.begin();
if (*p == '\0') { if (*p == '\0') {
// GNU iconv doesn't seem to handle BOMs. // GNU iconv doesn't seem to handle BOMs.
if (startswith(text, "\xfe\xff")) { if (startswith(text, "\xfe\xff")) {
i += 2; i += 2;
} else if (startswith(text, "\xff\xfe")) { } else if (startswith(text, "\xff\xfe")) {
big_endian = false; big_endian = false;
i += 2; i += 2;
skipping to change at line 102 skipping to change at line 102
// assume it's UTF-16 mislabelled, which is easy and sane. // assume it's UTF-16 mislabelled, which is easy and sane.
} else if (strcasecmp(p, "LE") == 0) { } else if (strcasecmp(p, "LE") == 0) {
big_endian = false; big_endian = false;
} else if (!(strcasecmp(p, "BE") == 0)) { } else if (!(strcasecmp(p, "BE") == 0)) {
goto try_iconv; goto try_iconv;
} }
tmp.reserve(text.size() / 2); tmp.reserve(text.size() / 2);
size_t start = 0; size_t start = 0;
auto text_end = text.end();
if (text.size() & 1) { if (text.size() & 1) {
// If there's a half-character at the end, nuke it now to make the // If there's a half-character at the end, nuke it now to make the
// conversion loop below simpler. // conversion loop below simpler.
text.resize(text.size() - 1); --text_end;
} }
while (i != text.end()) { while (i != text_end) {
unsigned ch = static_cast<unsigned char>(*i++); unsigned ch = static_cast<unsigned char>(*i++);
unsigned ch2 = static_cast<unsigned char>(*i++); unsigned ch2 = static_cast<unsigned char>(*i++);
if (big_endian) { if (big_endian) {
ch = (ch << 8) | ch2; ch = (ch << 8) | ch2;
} else { } else {
ch = (ch2 << 8) | ch; ch = (ch2 << 8) | ch;
} }
if (ch >> 10 == 0xd800 >> 10) { if (ch >> 10 == 0xd800 >> 10) {
// Surrogate pair. // Surrogate pair.
if (i == text.end()) break; if (i == text_end) break;
unsigned hi = (ch & 0x3ff); unsigned hi = (ch & 0x3ff);
ch = static_cast<unsigned char>(*i++); ch = static_cast<unsigned char>(*i++);
ch2 = static_cast<unsigned char>(*i++); ch2 = static_cast<unsigned char>(*i++);
if (big_endian) { if (big_endian) {
ch = (ch << 8) | ch2; ch = (ch << 8) | ch2;
} else { } else {
ch = (ch2 << 8) | ch; ch = (ch2 << 8) | ch;
} }
if (ch >> 10 == 0xdc00 >> 10) { if (ch >> 10 == 0xdc00 >> 10) {
ch &= 0x3ff; ch &= 0x3ff;
skipping to change at line 202 skipping to change at line 203
} }
} }
if (start) tmp.append(buf, start); if (start) tmp.append(buf, start);
} }
if (false) { if (false) {
try_iconv: try_iconv:
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
iconv_t conv = iconv_open("UTF-8", charset.c_str()); iconv_t conv = iconv_open("UTF-8", charset.c_str());
if (conv == reinterpret_cast<iconv_t>(-1)) if (conv == reinterpret_cast<iconv_t>(-1))
return; return false;
ICONV_CONST char* in = const_cast<char *>(text.c_str()); ICONV_CONST char* in = const_cast<char *>(text.c_str());
size_t in_len = text.size(); size_t in_len = text.size();
while (in_len) { while (in_len) {
char * out = buf; char * out = buf;
size_t out_len = sizeof(buf); size_t out_len = sizeof(buf);
if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) && if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) &&
errno != E2BIG) { errno != E2BIG) {
// FIXME: how to handle this? // FIXME: how to handle this?
break; break;
} }
tmp.append(buf, out - buf); tmp.append(buf, out - buf);
} }
(void)iconv_close(conv); (void)iconv_close(conv);
#else #else
return; return false;
#endif #endif
} }
if (false) { if (false) {
iso8859_15: iso8859_15:
tmp.reserve(text.size()); tmp.reserve(text.size());
size_t start = 0; size_t start = 0;
for (string::const_iterator i = text.begin(); i != text.end(); ++i) { for (string::const_iterator i = text.begin(); i != text.end(); ++i) {
static const unsigned iso8859_15_to_unicode[] = { static const unsigned iso8859_15_to_unicode[] = {
skipping to change at line 248 skipping to change at line 249
ch = iso8859_15_to_unicode[ch - 164]; ch = iso8859_15_to_unicode[ch - 164];
start += Xapian::Unicode::to_utf8(ch, buf + start); start += Xapian::Unicode::to_utf8(ch, buf + start);
if (start >= sizeof(buf) - 4) { if (start >= sizeof(buf) - 4) {
tmp.append(buf, start); tmp.append(buf, start);
start = 0; start = 0;
} }
} }
if (start) tmp.append(buf, start); if (start) tmp.append(buf, start);
} }
swap(text, tmp); // `output` may be a reference to the same string object as `text` so we
// only switch after we've done converting.
output = std::move(tmp);
return true;
} }
 End of changes. 14 change blocks. 
14 lines changed or deleted 18 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)