"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "utf8truncate.cc" between
xapian-omega-1.4.18.tar.xz and xapian-omega-1.4.19.tar.xz

About: Xapian Omega is an application built on Xapian, consisting of indexers and a CGI search frontend.

utf8truncate.cc  (xapian-omega-1.4.18.tar.xz):utf8truncate.cc  (xapian-omega-1.4.19.tar.xz)
/** @file /** @file
* @brief truncate a utf-8 string, ideally without splitting words. * @brief truncate a utf-8 string, ideally without splitting words.
*/ */
/* Copyright (C) 2007 Olly Betts /* Copyright (C) 2007,2021 Olly Betts
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or * the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version. * (at your option) any later version.
* *
* This program is distributed in the hope that it will be useful, * This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details. * GNU General Public License for more details.
skipping to change at line 29 skipping to change at line 29
*/ */
#include <config.h> #include <config.h>
#include "utf8truncate.h" #include "utf8truncate.h"
#include <string> #include <string>
using namespace std; using namespace std;
/** Truncate a UTF-8 string to at most @a maxlen bytes, ideally at a word
 *  boundary.
 *
 *  @param value   String to truncate in place.
 *  @param maxlen  Maximum length in bytes of the result.
 *
 *  @return false if @a value already fits in @a maxlen bytes (in which case
 *          it is left unmodified), true if it was shortened.
 *
 *  Bytes with value <= 32 are treated as whitespace.  If possible the cut is
 *  made at the end of the last word which fits entirely within @a maxlen
 *  bytes, and any whitespace before the cut is removed too.  If no such word
 *  boundary exists, the string is cut at @a maxlen bytes, backing up as
 *  needed so that a multi-byte UTF-8 character is not split.
 */
bool
utf8_truncate(std::string& value, std::string::size_type maxlen)
{
    if (value.size() <= maxlen) return false;

    // value.size() > maxlen here, so index maxlen is valid and maxlen + 1
    // can't overflow.
    //
    // Start one past the length we want so that a word which ends exactly
    // maxlen bytes in is kept rather than discarded: in that case the byte
    // at index maxlen is whitespace and the first loop doesn't move.
    std::string::size_type len = maxlen + 1;

    // Skip back over any (partial) word at the cut point...
    while (len && static_cast<unsigned char>(value[len - 1]) > 32) --len;
    // ...and then over the whitespace which precedes it.
    while (len && static_cast<unsigned char>(value[len - 1]) <= 32) --len;

    // If the first word is too long, truncate it.
    if (!len) {
        len = maxlen;
        // If the bytes of a UTF-8 character span the maxlen position we need
        // to remove some extra bytes to avoid leaving a partial UTF-8
        // character.
        //
        // We start at the byte after the cut point.  If it's a continuation
        // byte (10xxxxxx) we step back until we find the start of the
        // character and truncate right before that.
        while (len &&
               (static_cast<unsigned char>(value[len]) & 0xc0) == 0x80) {
            --len;
        }
    }

    value.resize(len);
    return true;
}
 End of changes. 4 change blocks. 
8 lines changed or deleted 13 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)