"Fossies" - the Fresh Open Source Software Archive

Member "chandler-1.0.3/external/PyLucene/PyLucene-2.3.1-3-418/lucene-java-2.3.1/src/java/org/apache/lucene/analysis/Token.java" (6 Dec 2007, 13044 Bytes) of archive /windows/misc/Chandler_src_1.0.3.tar.gz:


package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions;

/** A Token is an occurrence of a term from the text of a field.  It consists of
  a term's text, the start and end offset of the term in the text of the field,
  and a type string.
  <p>
  The start and end offsets permit applications to re-associate a token with
  its source text, e.g., to display highlighted query terms in a document
  browser, or to show matching text fragments in a KWIC (KeyWord In Context)
  display, etc.
  <p>
  The type is an interned string, assigned by a lexical analyzer
  (a.k.a. tokenizer), naming the lexical or syntactic class that the token
  belongs to.  For example, an end-of-sentence marker token might be implemented
  with type "eos".  The default token type is "word".
  <p>
  A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
  length byte array. Use {@link TermPositions#getPayloadLength()} and
  {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.

  <br><br>
  <p><font color="#FF0000">
  WARNING: The status of the <b>Payloads</b> feature is experimental.
  The APIs introduced here might change in the future and will no longer be
  supported in that case.</font>
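
  <p>
  A minimal, hypothetical sketch of attaching metadata to a token (illustration
  only, not part of this class; the method name and the byte[] <code>data</code>
  prepared by the caller are assumptions):
  <pre>
    // hypothetical helper in a custom analysis component
    void addPayload(Token t, byte[] data) {
      t.setPayload(new Payload(data));   // the bytes travel with this token's position
    }
  </pre>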

  <br><br>

  <p><b>NOTE:</b> As of 2.3, Token stores the term text
  internally as a malleable char[] termBuffer instead of
  String termText.  The indexing code and core tokenizers
  have been changed to re-use a single Token instance, changing
  its buffer and other fields in-place as the Token is
  processed.  This provides substantially better indexing
  performance as it saves the GC cost of new'ing a Token and
  String for every term.  The APIs that accept String
  termText are still available but a warning about the
  associated performance cost has been added (below).  The
  {@link #termText()} method has been deprecated.</p>

  <p>Tokenizers and filters should try to re-use a Token
  instance when possible for best performance, by
  implementing the {@link TokenStream#next(Token)} API.
  Failing that, to create a new Token you should first use
  one of the constructors that start with null text.  Then
  you should call either {@link #termBuffer()} or {@link
  #resizeTermBuffer(int)} to retrieve the Token's
  termBuffer.  Fill in the characters of your term into this
  buffer, and finally call {@link #setTermLength(int)} to
  set the length of the term text.  See <a target="_top"
  href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
  for details.</p>
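
  <p>A minimal sketch of that pattern (hypothetical helper code, not part of
  this class; <code>term</code>, <code>start</code>, and <code>end</code> are
  assumed to come from the caller's own tokenization):
  <pre>
    // hypothetical helper: build a Token from the characters in 'term'
    Token makeToken(char[] term, int start, int end) {
      Token t = new Token(start, end);                 // starts with null text
      char[] buf = t.resizeTermBuffer(term.length);    // grow the buffer if needed
      System.arraycopy(term, 0, buf, 0, term.length);  // copy the term's characters in
      t.setTermLength(term.length);                    // record how many chars are valid
      return t;
    }
  </pre>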

  @see org.apache.lucene.index.Payload
*/
public class Token implements Cloneable {

  public static final String DEFAULT_TYPE = "word";
  private static int MIN_BUFFER_SIZE = 10;

  /** @deprecated: we will remove this when we remove the
   * deprecated APIs */
  private String termText;

  char[] termBuffer;                              // characters for the term text
  int termLength;                                 // length of term text in buffer

  int startOffset;                // start in source text
  int endOffset;                  // end in source text
  String type = DEFAULT_TYPE;                     // lexical type

  Payload payload;

  int positionIncrement = 1;

  /** Constructs a Token with null text. */
  public Token() {
  }

  /** Constructs a Token with null text and start & end
   *  offsets.
   *  @param start start offset
   *  @param end end offset */
  public Token(int start, int end) {
    startOffset = start;
    endOffset = end;
  }

  /** Constructs a Token with null text and start & end
   *  offsets plus the Token type.
   *  @param start start offset
   *  @param end end offset
   *  @param typ token type */
  public Token(int start, int end, String typ) {
    startOffset = start;
    endOffset = end;
    type = typ;
  }

  /** Constructs a Token with the given term text, and start
   *  & end offsets.  The type defaults to "word."
   *  <b>NOTE:</b> for better indexing speed you should
   *  instead use the char[] termBuffer methods to set the
   *  term text.
   *  @param text term text
   *  @param start start offset
   *  @param end end offset */
  public Token(String text, int start, int end) {
    termText = text;
    startOffset = start;
    endOffset = end;
  }

  /** Constructs a Token with the given text, start and end
   *  offsets, & type.  <b>NOTE:</b> for better indexing
   *  speed you should instead use the char[] termBuffer
   *  methods to set the term text.
   *  @param text term text
   *  @param start start offset
   *  @param end end offset
   *  @param typ token type */
  public Token(String text, int start, int end, String typ) {
    termText = text;
    startOffset = start;
    endOffset = end;
    type = typ;
  }

  /** Set the position increment.  This determines the position of this token
   * relative to the previous Token in a {@link TokenStream}, used in phrase
   * searching.
   *
   * <p>The default value is one.
   *
   * <p>Some common uses for this are:<ul>
   *
   * <li>Set it to zero to put multiple terms in the same position.  This is
   * useful if, e.g., a word has multiple stems.  Searches for phrases
   * including either stem will match.  In this case, all but the first stem's
   * increment should be set to zero: the increment of the first instance
   * should be one.  Repeating a token with an increment of zero can also be
   * used to boost the scores of matches on that token.  (A sketch of such a
   * filter follows this list.)
   *
   * <li>Set it to values greater than one to inhibit exact phrase matches.
   * If, for example, one does not want phrases to match across removed stop
   * words, then one could build a stop word filter that removes stop words and
   * also sets the increment to the number of stop words removed before each
   * non-stop word.  Then exact phrase queries will only match when the terms
   * occur with no intervening stop words.
   *
   * </ul>
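   *
   * <p>A rough, hypothetical sketch of the first case (not part of Lucene;
   * <code>input</code> is the upstream TokenStream of a custom filter, and
   * <code>pending</code> and <code>stemmer.alternate(...)</code> are assumed
   * helpers, the latter returning an alternate stem or null):
   * <pre>
   *   public Token next() throws IOException {
   *     if (pending != null) {                 // emit a buffered alternate stem
   *       Token t = pending;
   *       pending = null;
   *       t.setPositionIncrement(0);           // same position as the original token
   *       return t;
   *     }
   *     Token t = input.next();
   *     if (t == null) return null;
   *     String alt = stemmer.alternate(new String(t.termBuffer(), 0, t.termLength()));
   *     if (alt != null)
   *       pending = new Token(alt, t.startOffset(), t.endOffset());
   *     return t;                              // the original keeps increment 1
   *   }
   * </pre>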
   * @see org.apache.lucene.index.TermPositions
   */
  public void setPositionIncrement(int positionIncrement) {
    if (positionIncrement < 0)
      throw new IllegalArgumentException
        ("Increment must be zero or greater: " + positionIncrement);
    this.positionIncrement = positionIncrement;
  }

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
   */
  public int getPositionIncrement() {
    return positionIncrement;
  }

  /** Sets the Token's term text.  <b>NOTE:</b> for better
   *  indexing speed you should instead use the char[]
   *  termBuffer methods to set the term text. */
  public void setTermText(String text) {
    termText = text;
    termBuffer = null;
  }

  /** Returns the Token's term text.
   *
   * @deprecated Use {@link #termBuffer()} and {@link
   * #termLength()} instead. */
  public final String termText() {
    if (termText == null && termBuffer != null)
      termText = new String(termBuffer, 0, termLength);
    return termText;
  }

  /** Copies the contents of buffer, starting at offset for
   *  length characters, into the termBuffer
   *  array. <b>NOTE:</b> for better indexing speed you
   *  should instead retrieve the termBuffer, using {@link
   *  #termBuffer()} or {@link #resizeTermBuffer(int)}, and
   *  fill it in directly to set the term text.  This saves
   *  an extra copy. */
  public final void setTermBuffer(char[] buffer, int offset, int length) {
    resizeTermBuffer(length);
    System.arraycopy(buffer, offset, termBuffer, 0, length);
    termLength = length;
  }

  /** Returns the internal termBuffer character array which
   *  you can then directly alter.  If the array is too
   *  small for your token, use {@link
   *  #resizeTermBuffer(int)} to increase it.  After
   *  altering the buffer be sure to call {@link
   *  #setTermLength} to record the number of valid
   *  characters that were placed into the termBuffer. */
  public final char[] termBuffer() {
    initTermBuffer();
    return termBuffer;
  }

  /** Grows the termBuffer to at least size newSize.
   *  @param newSize minimum size of the new termBuffer
   *  @return newly created termBuffer with length >= newSize
   */
  public char[] resizeTermBuffer(int newSize) {
    initTermBuffer();
    if (newSize > termBuffer.length) {
      int size = termBuffer.length;
      while(size < newSize)
        size *= 2;
      char[] newBuffer = new char[size];
      System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
      termBuffer = newBuffer;
    }
    return termBuffer;
  }

  // TODO: once we remove the deprecated termText() method
  // and switch entirely to char[] termBuffer we don't need
  // to use this method anymore
  private void initTermBuffer() {
    if (termBuffer == null) {
      if (termText == null) {
        termBuffer = new char[MIN_BUFFER_SIZE];
        termLength = 0;
      } else {
        int length = termText.length();
        if (length < MIN_BUFFER_SIZE) length = MIN_BUFFER_SIZE;
        termBuffer = new char[length];
        termLength = termText.length();
        termText.getChars(0, termText.length(), termBuffer, 0);
        termText = null;
      }
    } else if (termText != null)
      termText = null;
  }

  /** Return number of valid characters (length of the term)
   *  in the termBuffer array. */
  public final int termLength() {
    initTermBuffer();
    return termLength;
  }

  /** Set number of valid characters (length of the term) in
   *  the termBuffer array. */
  public final void setTermLength(int length) {
    initTermBuffer();
    termLength = length;
  }

  /** Returns this Token's starting offset, the position of the first character
    corresponding to this token in the source text.

    Note that the difference between endOffset() and startOffset() may not be
    equal to termText.length(), as the term text may have been altered by a
    stemmer or some other filter. */
  public final int startOffset() {
    return startOffset;
  }

  /** Set the starting offset.
      @see #startOffset() */
  public void setStartOffset(int offset) {
    this.startOffset = offset;
  }

  /** Returns this Token's ending offset, one greater than the position of the
    last character corresponding to this token in the source text. */
  public final int endOffset() {
    return endOffset;
  }

  /** Set the ending offset.
      @see #endOffset() */
  public void setEndOffset(int offset) {
    this.endOffset = offset;
  }

  /** Returns this Token's lexical type.  Defaults to "word". */
  public final String type() {
    return type;
  }

  /** Set the lexical type.
      @see #type() */
  public final void setType(String type) {
    this.type = type;
  }

  /**
   * Returns this Token's payload.
   */
  public Payload getPayload() {
    return this.payload;
  }

  /**
   * Sets this Token's payload.
   */
  public void setPayload(Payload payload) {
    this.payload = payload;
  }

  public String toString() {
    StringBuffer sb = new StringBuffer();
    sb.append('(');
    initTermBuffer();
    if (termBuffer == null)
      sb.append("null");
    else
      sb.append(termBuffer, 0, termLength);
    sb.append(',').append(startOffset).append(',').append(endOffset);
    if (!type.equals("word"))
      sb.append(",type=").append(type);
    if (positionIncrement != 1)
      sb.append(",posIncr=").append(positionIncrement);
    sb.append(')');
    return sb.toString();
  }

  /** Resets the term text, payload, and positionIncrement to default.
   * Other fields such as startOffset, endOffset and the token type are
   * not reset since they are normally overwritten by the tokenizer. */
  public void clear() {
    payload = null;
    // Leave termBuffer to allow re-use
    termLength = 0;
    termText = null;
    positionIncrement = 1;
    // startOffset = endOffset = 0;
    // type = DEFAULT_TYPE;
  }

  public Object clone() {
    try {
      Token t = (Token)super.clone();
      if (termBuffer != null) {
        t.termBuffer = null;
        t.setTermBuffer(termBuffer, 0, termLength);
      }
      if (payload != null) {
        t.setPayload((Payload) payload.clone());
      }
      return t;
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);  // shouldn't happen
    }
  }
}