"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java" between
lucene-7.6.0-src.tgz and lucene-7.7.0-src.tgz

About: Lucene is a Java full-text search engine (not a complete application, but rather a code library and API; java source code).

UAX29URLEmailTokenizer.java  (lucene-7.6.0-src.tgz):UAX29URLEmailTokenizer.java  (lucene-7.7.0-src.tgz)
skipping to change at line 34 skipping to change at line 34
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory; import org.apache.lucene.util.AttributeFactory;
/** /**
* This class implements Word Break rules from the Unicode Text Segmentation * This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in * algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>. * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
* URLs and email addresses are also tokenized according to the relevant RFCs. * URLs and email addresses are also tokenized according to the relevant RFCs.
* <p>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;URL&gt;: A URL</li>
* <li>&lt;EMAIL&gt;: An email address</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* </ul>
*/ */
public final class UAX29URLEmailTokenizer extends Tokenizer { public final class UAX29URLEmailTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */ /** A private instance of the JFlex-constructed scanner */
private final UAX29URLEmailTokenizerImpl scanner; private final UAX29URLEmailTokenizerImpl scanner;
public static final int ALPHANUM = 0; /** Alpha/numeric token type */
public static final int NUM = 1; public static final int ALPHANUM = 0;
public static final int SOUTHEAST_ASIAN = 2; /** Numeric token type */
public static final int IDEOGRAPHIC = 3; public static final int NUM = 1;
public static final int HIRAGANA = 4; /** Southeast Asian token type */
public static final int KATAKANA = 5; public static final int SOUTHEAST_ASIAN = 2;
public static final int HANGUL = 6; /** Ideographic token type */
public static final int URL = 7; public static final int IDEOGRAPHIC = 3;
public static final int EMAIL = 8; /** Hiragana token type */
public static final int HIRAGANA = 4;
/** Katakana token type */
public static final int KATAKANA = 5;
/** Hangul token type */
public static final int HANGUL = 6;
/** URL token type */
public static final int URL = 7;
/** Email token type */
public static final int EMAIL = 8;
/** Emoji token type. */
public static final int EMOJI = 9;
/** String token types that correspond to token type int constants */ /** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] { public static final String [] TOKEN_TYPES = new String [] {
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM],
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM],
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN],
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC],
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA],
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA],
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL], StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
"<URL>", "<URL>",
"<EMAIL>", "<EMAIL>",
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]
}; };
/** Absolute maximum sized token */ /** Absolute maximum sized token */
public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024; public static final int MAX_TOKEN_LENGTH_LIMIT = 1024 * 1024;
private int skippedPositions; private int skippedPositions;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/** /**
 End of changes. 3 change blocks. 
22 lines changed or deleted 21 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)