"Fossies" - the Fresh Open Source Software Archive

Member "speech_tools/include/EST_Token.h" (4 Sep 2017, 15172 Bytes) of package /linux/misc/speech_tools-2.5.0-release.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "EST_Token.h", see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 2.4-release_vs_2.5.0-release.

    1 /*************************************************************************/
    2 /*                                                                       */
    3 /*                Centre for Speech Technology Research                  */
    4 /*                     University of Edinburgh, UK                       */
    5 /*                         Copyright (c) 1996                            */
    6 /*                        All Rights Reserved.                           */
    7 /*                                                                       */
    8 /*  Permission is hereby granted, free of charge, to use and distribute  */
    9 /*  this software and its documentation without restriction, including   */
   10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
   11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
   12 /*  permit persons to whom this work is furnished to do so, subject to   */
   13 /*  the following conditions:                                            */
   14 /*   1. The code must retain the above copyright notice, this list of    */
   15 /*      conditions and the following disclaimer.                         */
   16 /*   2. Any modifications must be clearly marked as such.                */
   17 /*   3. Original authors' names are not deleted.                         */
   18 /*   4. The authors' names are not used to endorse or promote products   */
   19 /*      derived from this software without specific prior written        */
   20 /*      permission.                                                      */
   21 /*                                                                       */
   22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
   23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
   24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
   25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
   26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
   27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
   28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
   29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
   30 /*  THIS SOFTWARE.                                                       */
   31 /*                                                                       */
   32 /*************************************************************************/
   33 /*                     Author :  Alan W Black                            */
   34 /*                     Date   :  April 1996                              */
   35 /*-----------------------------------------------------------------------*/
   36 /*                    Token/Tokenizer class                              */
   37 /*                                                                       */
   38 /*=======================================================================*/
   39 
   40 #ifndef __EST_TOKEN_H__
   41 #define __EST_TOKEN_H__
   42 
   43 #include <cstdio>
   44 
   45 using namespace std;
   46 
   47 #include "EST_String.h"
   48 #include "EST_common.h"
   49 
   50 // The default character sets are declared here so they are easy to find; they are only declared (extern) in this header.
   51 /// The default whitespace characters
   52 extern const EST_String EST_Token_Default_WhiteSpaceChars;
   53 /// The default single character symbols (presumably the usual argument to EST_TokenStream::set_SingleCharSymbols -- confirm against the definition)
   54 extern const EST_String EST_Token_Default_SingleCharSymbols;
   55 /// The default (post) punctuation symbols (presumably the usual argument to EST_TokenStream::set_PunctuationSymbols -- confirm against the definition)
   56 extern const EST_String EST_Token_Default_PunctuationSymbols;
   57 /// The default pre-punctuation symbols (presumably the usual argument to EST_TokenStream::set_PrePunctuationSymbols -- confirm against the definition)
   58 extern const EST_String EST_Token_Default_PrePunctuationSymbols;
   59 
   60 /** This class is similar to \Ref{EST_String} but also maintains 
   61     the original punctuation and whitespace found around the 
   62     token.  
   63 
   64     \Ref{EST_Token}'s primary use is with \Ref{EST_TokenStream} class 
   65     which allows easy tokenizing of ascii files.  
   66 
   67     A token consists of four parts, any of which may be empty:
   68     preceding whitespace, preceding punctuation, the name (the
   69     actual token) and succeeding punctuation.  
   70 
   71     @author Alan W Black (awb@cstr.ed.ac.uk): April 1996
   72 */
   73 class EST_Token {
   74   private:
   75     EST_String space;
   76     EST_String prepunc;
   77     EST_String pname;
   78     EST_String punc;
   79     int linenum;
   80     int linepos;
   81     int p_filepos;
   82     int p_quoted;
   83 
   84   public:
   85     ///
   86     EST_Token() {init();}
   87     ///
   88     EST_Token(const EST_String p) {init(); pname = p; }
   89     ///
   90     void init() {p_quoted=linenum=linepos=p_filepos=0;}
   91     
   92     /**@name Basic access to fields */
   93     //@{
   94     /// set token from a string
   95     void set_token(const EST_String &p) { pname = p; }
   96     ///
   97     void set_token(const char *p) { pname = p; }
   98     /// set whitespace of token.
   99     void set_whitespace(const EST_String &p) { space = p; }
  100     ///
  101     void set_whitespace(const char *p) { space = p; }
  102     /// set (post) punctuation of token.
  103     void set_punctuation(const EST_String &p) { punc = p; }
  104     /// 
  105     void set_punctuation(const char *p) { punc = p; }
  106     /// set prepunction
  107     void set_prepunctuation(const EST_String &p) { prepunc = p; }
  108     ///
  109     void set_prepunctuation(const char *p) { prepunc = p; }
  110     ///
  111     const EST_String &whitespace() { return space; }
  112     ///
  113     const EST_String &punctuation() { return punc; }
  114     ///
  115     const EST_String &prepunctuation() { return prepunc; }
  116 
  117     /**@name Access token as a string */
  118     //@{
  119     const EST_String &string() const { return String(); }
  120     /// Access token as a string
  121     const EST_String &S() const { return String(); }
  122     /// Access token as a string
  123     const EST_String &String() const { return pname; }
  124     /// For automatic coercion to \Ref{EST_String}
  125     operator EST_String() const { return String(); }
  126     //@}
  127 
  128     /**@name Access token as a int */
  129     //@{
  130     int Int(bool &valid) const { return String().Int(valid); }
  131     int Int() const { return String().Int(); }
  132     int I(bool &valid) const { return Int(valid); }
  133     int I() const { return Int(); }
  134     operator int() const { return Int(); }
  135     //@}
  136 
  137     /**@name Access token as a long */
  138     //@{
  139     long Long(bool &valid) const { return String().Long(valid); }
  140     long Long() const { return String().Long(); }
  141     long L(bool &valid) const { return Long(valid); }
  142     long L() const { return Long(); }
  143     operator long() const { return Long(); }
  144     //@}
  145 
  146     /**@name Access token as a float */
  147     //@{
  148     float Float(bool &valid) const { return String().Float(valid); }
  149     float Float() const { return String().Float(); }
  150     float F(bool &valid) const { return Float(valid); }
  151     float F() const { return Float(); }
  152     operator float() const { return Float(); }
  153     //@}
  154 
  155     /**@name Access token as a double */
  156     //@{
  157     double Double(bool &valid) const { return String().Double(valid); }
  158     double Double() const { return String().Double(); }
  159     double D(bool &valid) const { return Double(valid); }
  160     double D() const { return Double(); }
  161     operator double() const { return Double(); }
  162     //@}
  163 
  164     //@}
  165     //@{
  166     /// Note that this token was quoted (or not)
  167     void set_quoted(int q) { p_quoted = q; }
  168     /// TRUE is token was quoted
  169     int quoted() const { return p_quoted; }
  170     //@}
  171     ///
  172     void set_row(int r) { linenum = r; }
  173     ///
  174     void set_col(int c) { linepos = c; }
  175     /// Set file position in original \Ref{EST_TokenStream}
  176     void set_filepos(int c) { p_filepos = c; }
  177     /// Return lower case version of token name
  178     EST_String lstring() { return downcase(pname); }
  179     /// Return upper case version of token name
  180     EST_String ustring() { return upcase(pname); }
  181     /// Line number in original \Ref{EST_TokenStream}.
  182     int row(void) const { return linenum; }
  183     /// Line position in original \Ref{EST_TokenStream}.
  184     int col(void) const { return linepos; }
  185     /// file position in original \Ref{EST_TokenStream}.
  186     int filepos(void) const { return p_filepos; }
  187 
  188     /// A string describing current position, suitable for error messages
  189     const EST_String pos_description() const;
  190 
  191     ///
  192     friend ostream& operator << (ostream& s, const EST_Token &p);
  193     
  194     ///
  195     EST_Token & operator = (const EST_Token &a);
  196     ///
  197     EST_Token & operator = (const EST_String &a);
  198     ///
  199     int operator == (const EST_String &a) { return (pname == a); }
  200     ///
  201     int operator != (const EST_String &a) { return (pname != a); }
  202     ///
  203     int operator == (const char *a) { return (strcmp(pname,a)==0); }
  204     ///
  205     int operator != (const char *a) { return (strcmp(pname,a)!=0); }
  206 };
  207 
  208 enum EST_tokenstream_type {tst_none, tst_file, tst_pipe, tst_string, tst_istream}; 
  209 
  210 /** A class that allows the reading of \Ref{EST_Token}s from a file
  211     stream, pipe or string.  It automatically tokenizes a file based on
  212     user definable whitespace and punctuation.
  213 
  214     The definitions of whitespace and punctuation are user definable.
  215     Also support for single character symbols is included.  Single
  216     character symbols {\em always} are treated as individual tokens
  217     irrespective of their white space context.  Also a quote
  218     mode can be used to read quoted tokens.
  219 
  220     The setting of whitespace, pre and post punctuation, single character
  221     symbols and quote mode must be done (immediately) after opening
  222     the stream.
  223 
  224     There is no unget but peek provides look ahead of one token.
  225     
  226     Note there is an interesting issue about what to do about
  227     the last whitespace in the file.  Should it be ignored or should
  228     it be attached to a token with a name string of length zero.
  229     In unquoted mode the eof() will return TRUE if the next token name
  230     is empty (the mythical last token).  In quoted mode the last must
  231     be returned so eof will not be raised.
  232 
  233     @author Alan W Black (awb@cstr.ed.ac.uk): April 1996
  234 */
  235 class EST_TokenStream{
  236  private:
  237     EST_tokenstream_type type;
  238     EST_String WhiteSpaceChars;
  239     EST_String SingleCharSymbols;
  240     EST_String PunctuationSymbols;
  241     EST_String PrePunctuationSymbols;
  242     EST_String Origin;
  243     FILE *fp;
  244     istream *is;
  245     int fd;
  246     char *buffer;
  247     int buffer_length;
  248     int pos;
  249     int linepos;
  250     int p_filepos;
  251     int getch(void);
  252     EST_TokenStream &getch(char &C);
  253     int peeked_charp;
  254     int peeked_char;       // ungot character 
  255     int peekch(void);
  256     int peeked_tokp;
  257     int eof_flag;
  258     int quotes;
  259     char quote;
  260     char escape;
  261     EST_Token current_tok;
  262     void default_values(void);
  263     /* local buffers to save reallocating */
  264     int tok_wspacelen;
  265     char *tok_wspace;
  266     int tok_stufflen;
  267     char *tok_stuff;
  268     int tok_prepuncslen;
  269     char *tok_prepuncs;
  270     int close_at_end;
  271 
  272     /* character class map */
  273     char p_table[256];
  274     bool p_table_wrong;
  275 
  276     /** This function is deliberately private so that you'll get a compilation
  277         error if you assign a token stream or pass it as an (non-reference)
  278         argument.  The problem with copying is that you need to copy the
  279         filedescriptiors too (which can't be done for pipes).  You probably
  280         don't really want a copy anyway and meant to pass it as a reference.
  281         If you really need this (some sort of clever look ahead) I am not
  282         sure what he consequences really are (or how portable they are).
  283         Pass the \Ref{EST_TokenStream} by reference instead.
  284     */
  285     EST_TokenStream(EST_TokenStream &s);
  286 
  287     void build_table();
  288 
  289     inline int getch_internal();
  290     inline int peekch_internal();
  291     inline int getpeeked_internal();
  292   public:
  293     ///
  294     EST_TokenStream();
  295     /// will close file if appropriate for type
  296     ~EST_TokenStream();
  297     //@{
  298     /// open a \Ref{EST_TokenStream} for a file.
  299     int open(const EST_String &filename);
  300     /// open a \Ref{EST_TokenStream} for an already opened file
  301     int open(FILE *ofp, int close_when_finished);
  302     /// open a \Ref{EST_TokenStream} for an already open istream
  303     int open(istream &newis);
  304     /// open a \Ref{EST_TokenStream} for string rather than a file
  305     int open_string(const EST_String &newbuffer);
  306     /// Close stream.
  307     void close(void);
  308     //@}
  309     /**@name stream access functions */
  310     //@{
  311     /// get next token in stream
  312     EST_TokenStream &get(EST_Token &t);
  313     /// get next token in stream
  314     EST_Token &get();
  315     /**@name  get the next token which must be the argument. */
  316     //@{
  317     EST_Token &must_get(EST_String expected, bool *ok);
  318     EST_Token &must_get(EST_String expected, bool &ok) 
  319     { return must_get(expected, &ok); }
  320     EST_Token &must_get(EST_String expected) 
  321     { return must_get(expected, (bool *)NULL); }
  322     //@}
  323     /// get up to {\tt s} in stream as a single token.
  324     EST_Token get_upto(const EST_String &s);
  325     /// get up to {\tt s} in end of line as a single token.
  326     EST_Token get_upto_eoln(void);
  327     /// peek at next token
  328     EST_Token &peek(void);
  329     /// Reading binary data, (don't use peek() immediately beforehand)
  330     int fread(void *buff,int size,int nitems);
  331     //@}
  332     /**@name stream initialization functions */
  333     //@{
  334     /// set which characters are to be treated as whitespace
  335     void set_WhiteSpaceChars(const EST_String &ws) 
  336         { WhiteSpaceChars = ws; p_table_wrong=1;}
  337     /// set which characters are to be treated as single character symbols
  338     void set_SingleCharSymbols(const EST_String &sc) 
  339         { SingleCharSymbols = sc; p_table_wrong=1;}
  340     /// set which characters are to be treated as (post) punctuation
  341     void set_PunctuationSymbols(const EST_String &ps) 
  342         { PunctuationSymbols = ps; p_table_wrong=1;}
  343     /// set which characters are to be treated as (post) punctuation
  344     void set_PrePunctuationSymbols(const EST_String &ps) 
  345         { PrePunctuationSymbols = ps; p_table_wrong=1;}
  346     /// set characters to be used as quotes and escape, and set quote mode
  347     void set_quotes(char q, char e) { quotes = TRUE; quote = q; escape = e; p_table_wrong=1;}
  348     /// query quote mode
  349     int quoted_mode(void) { return quotes; }
  350     //@}
  351     /**@name miscellaneous */
  352     //@{
  353     /// returns line number of \Ref{EST_TokenStream}
  354     int linenum(void) const {return linepos;}
  355     /// end of file
  356     int eof()
  357        { return (eof_flag || ((!quotes) && (peek() == ""))); }
  358     /// end of line
  359     int eoln();
  360     /// current file position in \Ref{EST_TokenStream}
  361     int filepos(void) const { return (type == tst_string) ? pos : p_filepos; }
  362     /// tell, synonym for filepos
  363     int tell(void) const { return filepos(); }
  364     /// seek, reposition file pointer
  365     int seek(int position);
  366     int seek_end();
  367     /// Reset to start of file/string 
  368     int restart(void);
  369     /// A string describing current position, suitable for error messages
  370     const EST_String pos_description();
  371     /// The originating filename (if there is one)
  372     const EST_String filename() const { return Origin; }
  373     /// For the people who *need* the actual description (if possible)
  374     FILE *filedescriptor() { return (type == tst_file) ? fp : 0; }
  375     ///
  376     EST_TokenStream & operator >>(EST_Token &p);
  377     ///
  378     EST_TokenStream & operator >>(EST_String &p);
  379     ///
  380     friend ostream& operator <<(ostream& s, EST_TokenStream &p);
  381     //@}
  382 };
  383 
  384 /** Quote a string with given quotes and escape character.

      Only the declaration is visible in this header, so the notes below
      describe the signature rather than the implementation.

      @param s      the string to be quoted
      @param quote  the quote string; defaults to a double quote character
      @param escape the escape string; defaults to a backslash
      @param force  defaults to 0.  NOTE(review): presumably non-zero forces
                    quoting even when it is not required -- confirm against
                    the definition.
      @return the resulting string, by value
  385 */
  386 EST_String quote_string(const EST_String &s,
  387             const EST_String &quote = "\"", 
  388             const EST_String &escape = "\\", 
  389             int force=0);
  390 
  391 #endif // __EST_TOKEN_H__