"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "core/src/main/java/com/google/zxing/common/StringUtils.java" between
zxing-zxing-3.4.1.tar.gz and zxing-zxing-3.5.0.tar.gz

About: ZXing ("zebra crossing") is a multi-format 1D/2D barcode image processing library implemented in Java, with ports to other languages. Info: Project is in maintenance mode (no active development).

StringUtils.java  (zxing-zxing-3.4.1):StringUtils.java  (zxing-zxing-3.5.0)
skipping to change at line 20 skipping to change at line 20
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package com.google.zxing.common; package com.google.zxing.common;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Map; import java.util.Map;
import com.google.zxing.DecodeHintType; import com.google.zxing.DecodeHintType;
/** /**
* Common string-related functions. * Common string-related functions.
* *
* @author Sean Owen * @author Sean Owen
* @author Alex Dupre * @author Alex Dupre
*/ */
public final class StringUtils { public final class StringUtils {
private static final String PLATFORM_DEFAULT_ENCODING = Charset.defaultCharset().name(); private static final Charset PLATFORM_DEFAULT_ENCODING = Charset.defaultCharset();
public static final Charset SHIFT_JIS_CHARSET = Charset.forName("SJIS");
public static final Charset GB2312_CHARSET = Charset.forName("GB2312");
private static final Charset EUC_JP = Charset.forName("EUC_JP");
private static final boolean ASSUME_SHIFT_JIS =
SHIFT_JIS_CHARSET.equals(PLATFORM_DEFAULT_ENCODING) ||
EUC_JP.equals(PLATFORM_DEFAULT_ENCODING);
// Retained for ABI compatibility with earlier versions
public static final String SHIFT_JIS = "SJIS"; public static final String SHIFT_JIS = "SJIS";
public static final String GB2312 = "GB2312"; public static final String GB2312 = "GB2312";
private static final String EUC_JP = "EUC_JP";
private static final String UTF8 = "UTF8";
private static final String ISO88591 = "ISO8859_1";
private static final boolean ASSUME_SHIFT_JIS =
SHIFT_JIS.equalsIgnoreCase(PLATFORM_DEFAULT_ENCODING) ||
EUC_JP.equalsIgnoreCase(PLATFORM_DEFAULT_ENCODING);
private StringUtils() { } private StringUtils() { }
/** /**
* @param bytes bytes encoding a string, whose encoding should be guessed * @param bytes bytes encoding a string, whose encoding should be guessed
* @param hints decode hints if applicable * @param hints decode hints if applicable
* @return name of guessed encoding; at the moment will only guess one of: * @return name of guessed encoding; at the moment will only guess one of:
* {@link #SHIFT_JIS}, {@link #UTF8}, {@link #ISO88591}, or the platform * "SJIS", "UTF8", "ISO8859_1", or the platform default encoding if none
* default encoding if none of these can possibly be correct * of these can possibly be correct
*/ */
public static String guessEncoding(byte[] bytes, Map<DecodeHintType,?> hints) { public static String guessEncoding(byte[] bytes, Map<DecodeHintType,?> hints) {
Charset c = guessCharset(bytes, hints);
if (c == SHIFT_JIS_CHARSET) {
return "SJIS";
} else if (c == StandardCharsets.UTF_8) {
return "UTF8";
} else if (c == StandardCharsets.ISO_8859_1) {
return "ISO8859_1";
}
return c.name();
}
/**
* @param bytes bytes encoding a string, whose encoding should be guessed
* @param hints decode hints if applicable
* @return Charset of guessed encoding; at the moment will only guess one of:
* {@link #SHIFT_JIS_CHARSET}, {@link StandardCharsets#UTF_8},
* {@link StandardCharsets#ISO_8859_1}, {@link StandardCharsets#UTF_16},
* or the platform default encoding if
* none of these can possibly be correct
*/
public static Charset guessCharset(byte[] bytes, Map<DecodeHintType,?> hints) {
if (hints != null && hints.containsKey(DecodeHintType.CHARACTER_SET)) { if (hints != null && hints.containsKey(DecodeHintType.CHARACTER_SET)) {
return hints.get(DecodeHintType.CHARACTER_SET).toString(); return Charset.forName(hints.get(DecodeHintType.CHARACTER_SET).toString());
} }
// First try UTF-16, assuming anything with its BOM is UTF-16
if (bytes.length > 2 &&
((bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) ||
(bytes[0] == (byte) 0xFF && bytes[1] == (byte) 0xFE))) {
return StandardCharsets.UTF_16;
}
// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS, // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
// which should be by far the most common encodings. // which should be by far the most common encodings.
int length = bytes.length; int length = bytes.length;
boolean canBeISO88591 = true; boolean canBeISO88591 = true;
boolean canBeShiftJIS = true; boolean canBeShiftJIS = true;
boolean canBeUTF8 = true; boolean canBeUTF8 = true;
int utf8BytesLeft = 0; int utf8BytesLeft = 0;
int utf2BytesChars = 0; int utf2BytesChars = 0;
int utf3BytesChars = 0; int utf3BytesChars = 0;
int utf4BytesChars = 0; int utf4BytesChars = 0;
skipping to change at line 167 skipping to change at line 199
if (canBeUTF8 && utf8BytesLeft > 0) { if (canBeUTF8 && utf8BytesLeft > 0) {
canBeUTF8 = false; canBeUTF8 = false;
} }
if (canBeShiftJIS && sjisBytesLeft > 0) { if (canBeShiftJIS && sjisBytesLeft > 0) {
canBeShiftJIS = false; canBeShiftJIS = false;
} }
// Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) { if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) {
return UTF8; return StandardCharsets.UTF_8;
} }
// Easy -- if assuming Shift_JIS or >= 3 valid consecutive not-ascii characters (and no evidence it can't be), done // Easy -- if assuming Shift_JIS or >= 3 valid consecutive not-ascii characters (and no evidence it can't be), done
if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) { if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) {
return SHIFT_JIS; return SHIFT_JIS_CHARSET;
} }
// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is: // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
// - If we saw // - If we saw
// - only two consecutive katakana chars in the whole text, or // - only two consecutive katakana chars in the whole text, or
// - at least 10% of bytes that could be "upper" not-alphanumeric Latin1, // - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
// - then we conclude Shift_JIS, else ISO-8859-1 // - then we conclude Shift_JIS, else ISO-8859-1
if (canBeISO88591 && canBeShiftJIS) { if (canBeISO88591 && canBeShiftJIS) {
return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= length return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= length
? SHIFT_JIS : ISO88591; ? SHIFT_JIS_CHARSET : StandardCharsets.ISO_8859_1;
} }
// Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
if (canBeISO88591) { if (canBeISO88591) {
return ISO88591; return StandardCharsets.ISO_8859_1;
} }
if (canBeShiftJIS) { if (canBeShiftJIS) {
return SHIFT_JIS; return SHIFT_JIS_CHARSET;
} }
if (canBeUTF8) { if (canBeUTF8) {
return UTF8; return StandardCharsets.UTF_8;
} }
// Otherwise, we take a wild guess with platform encoding // Otherwise, we take a wild guess with platform encoding
return PLATFORM_DEFAULT_ENCODING; return PLATFORM_DEFAULT_ENCODING;
} }
} }
 End of changes. 13 change blocks. 
17 lines changed or deleted 50 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)