StringUtils.java (zxing-zxing-3.4.1) | : | StringUtils.java (zxing-zxing-3.5.0) | ||
---|---|---|---|---|
skipping to change at line 20 | skipping to change at line 20 | |||
* Unless required by applicable law or agreed to in writing, software | * Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | * distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | * See the License for the specific language governing permissions and | |||
* limitations under the License. | * limitations under the License. | |||
*/ | */ | |||
package com.google.zxing.common; | package com.google.zxing.common; | |||
import java.nio.charset.Charset; | import java.nio.charset.Charset; | |||
import java.nio.charset.StandardCharsets; | ||||
import java.util.Map; | import java.util.Map; | |||
import com.google.zxing.DecodeHintType; | import com.google.zxing.DecodeHintType; | |||
/** | /** | |||
* Common string-related functions. | * Common string-related functions. | |||
* | * | |||
* @author Sean Owen | * @author Sean Owen | |||
* @author Alex Dupre | * @author Alex Dupre | |||
*/ | */ | |||
public final class StringUtils { | public final class StringUtils { | |||
private static final String PLATFORM_DEFAULT_ENCODING = Charset.defaultCharset | private static final Charset PLATFORM_DEFAULT_ENCODING = Charset.defaultCharse | |||
().name(); | t(); | |||
public static final Charset SHIFT_JIS_CHARSET = Charset.forName("SJIS"); | ||||
public static final Charset GB2312_CHARSET = Charset.forName("GB2312"); | ||||
private static final Charset EUC_JP = Charset.forName("EUC_JP"); | ||||
private static final boolean ASSUME_SHIFT_JIS = | ||||
SHIFT_JIS_CHARSET.equals(PLATFORM_DEFAULT_ENCODING) || | ||||
EUC_JP.equals(PLATFORM_DEFAULT_ENCODING); | ||||
// Retained for ABI compatibility with earlier versions | ||||
public static final String SHIFT_JIS = "SJIS"; | public static final String SHIFT_JIS = "SJIS"; | |||
public static final String GB2312 = "GB2312"; | public static final String GB2312 = "GB2312"; | |||
private static final String EUC_JP = "EUC_JP"; | ||||
private static final String UTF8 = "UTF8"; | ||||
private static final String ISO88591 = "ISO8859_1"; | ||||
private static final boolean ASSUME_SHIFT_JIS = | ||||
SHIFT_JIS.equalsIgnoreCase(PLATFORM_DEFAULT_ENCODING) || | ||||
EUC_JP.equalsIgnoreCase(PLATFORM_DEFAULT_ENCODING); | ||||
private StringUtils() { } | private StringUtils() { } | |||
/** | /** | |||
* @param bytes bytes encoding a string, whose encoding should be guessed | * @param bytes bytes encoding a string, whose encoding should be guessed | |||
* @param hints decode hints if applicable | * @param hints decode hints if applicable | |||
* @return name of guessed encoding; at the moment will only guess one of: | * @return name of guessed encoding; at the moment will only guess one of: | |||
* {@link #SHIFT_JIS}, {@link #UTF8}, {@link #ISO88591}, or the platform | * "SJIS", "UTF8", "ISO8859_1", or the platform default encoding if none | |||
* default encoding if none of these can possibly be correct | * of these can possibly be correct | |||
*/ | */ | |||
public static String guessEncoding(byte[] bytes, Map<DecodeHintType,?> hints) { | public static String guessEncoding(byte[] bytes, Map<DecodeHintType,?> hints) { | |||
Charset c = guessCharset(bytes, hints); | ||||
if (c == SHIFT_JIS_CHARSET) { | ||||
return "SJIS"; | ||||
} else if (c == StandardCharsets.UTF_8) { | ||||
return "UTF8"; | ||||
} else if (c == StandardCharsets.ISO_8859_1) { | ||||
return "ISO8859_1"; | ||||
} | ||||
return c.name(); | ||||
} | ||||
/** | ||||
* @param bytes bytes encoding a string, whose encoding should be guessed | ||||
* @param hints decode hints if applicable | ||||
* @return Charset of guessed encoding; at the moment will only guess one of: | ||||
* {@link #SHIFT_JIS_CHARSET}, {@link StandardCharsets#UTF_8}, | ||||
* {@link StandardCharsets#ISO_8859_1}, {@link StandardCharsets#UTF_16}, | ||||
* or the platform default encoding if | ||||
* none of these can possibly be correct | ||||
*/ | ||||
public static Charset guessCharset(byte[] bytes, Map<DecodeHintType,?> hints) | ||||
{ | ||||
if (hints != null && hints.containsKey(DecodeHintType.CHARACTER_SET)) { | if (hints != null && hints.containsKey(DecodeHintType.CHARACTER_SET)) { | |||
return hints.get(DecodeHintType.CHARACTER_SET).toString(); | return Charset.forName(hints.get(DecodeHintType.CHARACTER_SET).toString()) ; | |||
} | } | |||
// First try UTF-16, assuming anything with its BOM is UTF-16 | ||||
if (bytes.length > 2 && | ||||
((bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) || | ||||
(bytes[0] == (byte) 0xFF && bytes[1] == (byte) 0xFE))) { | ||||
return StandardCharsets.UTF_16; | ||||
} | ||||
// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS, | // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS, | |||
// which should be by far the most common encodings. | // which should be by far the most common encodings. | |||
int length = bytes.length; | int length = bytes.length; | |||
boolean canBeISO88591 = true; | boolean canBeISO88591 = true; | |||
boolean canBeShiftJIS = true; | boolean canBeShiftJIS = true; | |||
boolean canBeUTF8 = true; | boolean canBeUTF8 = true; | |||
int utf8BytesLeft = 0; | int utf8BytesLeft = 0; | |||
int utf2BytesChars = 0; | int utf2BytesChars = 0; | |||
int utf3BytesChars = 0; | int utf3BytesChars = 0; | |||
int utf4BytesChars = 0; | int utf4BytesChars = 0; | |||
skipping to change at line 167 | skipping to change at line 199 | |||
if (canBeUTF8 && utf8BytesLeft > 0) { | if (canBeUTF8 && utf8BytesLeft > 0) { | |||
canBeUTF8 = false; | canBeUTF8 = false; | |||
} | } | |||
if (canBeShiftJIS && sjisBytesLeft > 0) { | if (canBeShiftJIS && sjisBytesLeft > 0) { | |||
canBeShiftJIS = false; | canBeShiftJIS = false; | |||
} | } | |||
// Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done | // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done | |||
if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChar s > 0)) { | if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChar s > 0)) { | |||
return UTF8; | return StandardCharsets.UTF_8; | |||
} | } | |||
// Easy -- if assuming Shift_JIS or >= 3 valid consecutive not-ascii characters (and no evidence it can't be), done | // Easy -- if assuming Shift_JIS or >= 3 valid consecutive not-ascii characters (and no evidence it can't be), done | |||
if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) { | if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) { | |||
return SHIFT_JIS; | return SHIFT_JIS_CHARSET; | |||
} | } | |||
// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is: | // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is: | |||
// - If we saw | // - If we saw | |||
// - only two consecutive katakana chars in the whole text, or | // - only two consecutive katakana chars in the whole text, or | |||
// - at least 10% of bytes that could be "upper" not-alphanumeric Latin1, | // - at least 10% of bytes that could be "upper" not-alphanumeric Latin1, | |||
// - then we conclude Shift_JIS, else ISO-8859-1 | // - then we conclude Shift_JIS, else ISO-8859-1 | |||
if (canBeISO88591 && canBeShiftJIS) { | if (canBeISO88591 && canBeShiftJIS) { | |||
return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHi ghOther * 10 >= length | return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHi ghOther * 10 >= length | |||
? SHIFT_JIS : ISO88591; | ? SHIFT_JIS_CHARSET : StandardCharsets.ISO_8859_1; | |||
} | } | |||
// Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding | // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding | |||
if (canBeISO88591) { | if (canBeISO88591) { | |||
return ISO88591; | return StandardCharsets.ISO_8859_1; | |||
} | } | |||
if (canBeShiftJIS) { | if (canBeShiftJIS) { | |||
return SHIFT_JIS; | return SHIFT_JIS_CHARSET; | |||
} | } | |||
if (canBeUTF8) { | if (canBeUTF8) { | |||
return UTF8; | return StandardCharsets.UTF_8; | |||
} | } | |||
// Otherwise, we take a wild guess with platform encoding | // Otherwise, we take a wild guess with platform encoding | |||
return PLATFORM_DEFAULT_ENCODING; | return PLATFORM_DEFAULT_ENCODING; | |||
} | } | |||
} | } | |||
End of changes. 13 change blocks. | ||||
17 lines changed or deleted | 50 lines changed or added |