"Fossies" - the Fresh Open Source Software Archive

Member "utrac-0.3.2/src/ut_text.h" (4 Jan 2009, 7168 Bytes) of package /linux/privat/old/utrac-0.3.2.tgz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "ut_text.h", see the Fossies "Dox" file reference documentation.

    1 /***************************************************************************
    2  *            ut_text.h
    3  *
    4  *  Tue Oct  5 11:28:11 2004
    5  *  Copyright  2004  Alliance MCA
    6  *  Written by : Antoine Calando (antoine@alliancemca.net)
    7  ****************************************************************************/
    8 
    9 /*
   10  *  This program is free software; you can redistribute it and/or modify
   11  *  it under the terms of the GNU General Public License as published by
   12  *  the Free Software Foundation; either version 2 of the License, or
   13  *  (at your option) any later version.
   14  *
   15  *  This program is distributed in the hope that it will be useful,
   16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
   17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18  *  GNU Library General Public License for more details.
   19  *
   20  *  You should have received a copy of the GNU General Public License
   21  *  along with this program; if not, write to the Free Software
   22  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   23  */
   24  
   25 /*!
   26  * \file ut_text.h
   27  * \author Antoine Calando (antoine@alliancemca.net)
   28  */
   29 
   30 #ifndef _UT_TEXT_H_
   31 #define _UT_TEXT_H_
   32 
   33 
   34 
   35 /***************************************************************************/
   36 /*!
   37  * \brief Flags that control the recognition of a text.
   38  * 
   39  * They are set by the user to tune the way the text will be analysed
   40  * (during function ut_recognize() ).
   41  *  Some of them are unimplemented (UT_F_REFERENCE_EXT_CHAR, always true).
   42  */
   43 
   44 typedef enum UtTextFlags {
   45     UT_F_UNSET               = 0,
   46     UT_F_FORCE_BINARY        = 1<<0, //!< Force processing of the file even if it is detected as binary data.
   47     UT_F_IDENTIFY_EOL        = 1<<1, 
   48     UT_F_TRANSFORM_EOL       = 1<<2, //!< Replace EOL by null character to simplify the processing.
   49     UT_F_REMOVE_ILLEGAL_CHAR = 1<<3, //!< Remove control characters (except CR, LF and TAB).
   50     UT_F_ADD_FINAL_EOL       = 1<<4, //!< Add a final EOL to the text if the last line is not empty.
   51     UT_F_IDENTIFY_CHARSET    = 1<<5,
   52     UT_F_REFERENCE_EXT_CHAR  = 1<<6, //!< Register the lines that contains extended characters (unimplemented, always true).
   53 
   54     UT_F_DEFAULT = UT_F_REMOVE_ILLEGAL_CHAR | UT_F_IDENTIFY_CHARSET
   55 } UtTextFlags;
   56 
   57 /***************************************************************************/
   58 /*!
   59  * \brief Flags that describe each step in the processing of a text.
   60  * 
   61  * They are set by the user or by utrac to select which pass will be done,
   62  * in ordrer to compute the % of the process done for the 'progress bar'
   63  * callback.
   64  *
   65  */
   66 
   67 typedef enum UtPassFlags {
   68     UT_PF_UNSET         = 0,
   69     UT_PF_NONE          = 1<<0,
   70     UT_PF_LOAD          = 1<<1,
   71     UT_PF_RECOGNIZE     = 1<<2,
   72     UT_PF_DISTRIB_PASS  = 1<<3,
   73     UT_PF_EOL_PASS      = 1<<4,
   74     UT_PF_XASCII_PASS   = 1<<5,
   75     UT_PF_CONVERT       = 1<<6,
   76 
   77     UT_PF_MAX           = 1<<6
   78     //UT_PF_ALL         = UT_PF_LOAD | UT_PF_RECOGNIZE | UT_PF_CONVERT
   79 } UtPassFlags;
   80 
   81     
   82 
   83 
   84 
   85 /***************************************************************************/
   86 /*!
   87  * \brief Contains evaluation of a charset.
   88  *
   89  * An array of this structure is instanciated in UtText and holds the result of
   90  * the evaluation of each charset. The charset which get the best rating will
   91  * be choosed for the conversion.
   92  */
   93 
   94 typedef struct UtCharsetEval {
   95     long rating;    //!< Mark attributed to the charset depending on the text
   96     ulong checksum; //!< Checksum of each extended character in the text. Used to find equivalent charsets.
   97 } UtCharsetEval;
   98 
   99 
  100 /***************************************************************************/
  101 /*!
  102  * \brief Refers to a line with extended characters.
  103  *
  104  * This structure refers to a line with extended characters.
  105  * The list of lines with extended characters is filtered to exclude lines
  106  * with same characters and is stocked in a linked list accessible from UtText.
  107  */
  108 
  109 typedef struct UtExtCharLine {
  110     char * line_p;      //!< Pointer to the beginning line.
  111     ulong line_i;       //!< Number of the line.
  112     ulong nb_ext_chars; //!< Number of extended characters in the line.
  113     struct UtExtCharLine * next;    //!< Pointer to the next struture. NULL if last.
  114 } UtExtCharLine;
  115 
  116 
  117 /***************************************************************************/
  118 /*!
  119  * \brief Types of End-of-line characters.
  120  *
  121  * Different types are CRLF (DOS/Windows), LF (Unix), CR (Mac). The types CRLF_CR and
  122  * CRLF_LF exists in some CSV databases : entries are ended with CRLF, but some fields
  123  * may contains LF or CR alone to indicate a "carriage return" in the field.
  124  * CR is the character 0xD, LF is 0xA.
  125  *
  126  * \note EC le cas du LFCR n'est pas pris en compte (cela n'existe pas ?)
  127  *       AC Si! je ne l'ai pas rencontré, mais il faudrait le rajouter...
  128  *       (en fait il faudrait même modifier pas mal de trucs dans la reconnaissance
  129  *       de fins de ligne)
  130  */
  131 
  132 typedef enum UtEolType {
  133     UT_EOL_UNSET=-1, 
  134     UT_EOL_CR,
  135     UT_EOL_LF,
  136     UT_EOL_CRLF,
  137     UT_EOL_LFCR,
  138     UT_EOL_MIX,     //!< Detection only
  139     UT_EOL_BSN,     //!< \n, conversion only
  140     UT_EOL_NUL,     //!< ASCII NUL character
  141 //  UT_EOL_SPACE,
  142 //  UT_EOL_TAB,
  143 //  UT_EOL_NOCHANGE,    //!< Conversion only
  144     UT_EOL_NONE     //always the last
  145 } UtEolType;
  146 
  147 extern const char * UT_EOL_NAME [];
  148 
  149 typedef short UtCharsetIndex;
  150 
  151 /***************************************************************************/
  152 /*!
  153  * \brief Contains all the information about a text and its processing.
  154  *
  155  * This structure is created by ut_init_text() and destroyed by ut_free_text(). It is used
  156  * to pass different arguments to ut_process_text(), and to stock information about the
  157  * text all along its processing.
  158  */
  159 
  160 typedef struct UtText {
  161     char * data;                    //!< Pointer to the beginning of the text. It is finished by a null character. Set by user or Utrac.
  162     ulong size;                     //!< Size of the text, without the terminating null character. Set by user or Utrac.
  163 
  164     UtEolType eol;                  //!< EOL type recognized by Utrac.
  165     UtEolType eol_alt;              //!< EOL type recognized by Utrac.
  166     UtCharsetIndex charset;         //!< Charset recognized by Utrac.
  167 
  168     ulong nb_lines;                 //!< Number of lines in the text. Set by Utrac.
  169     ulong nb_lines_alt;             //!< Number of alt lines in the text. Set by Utrac.
  170     ulong * distribution;           //!< Frequency distribution of the text. Set by Utrac.
  171     UtExtCharLine * ext_char;       //!< Linked list of lines containing extended characters. Set by Utrac.
  172     UtCharsetEval * evaluation;     //!< Array containg evaluation of each charset. Set by Utrac.
  173 
  174     UtTextFlags flags;              //!< Flags that control the processing of the text. Set by user.
  175     UtPassFlags pass_flags;
  176     char skip_char;                 //!< Character to skip during conversion. A variable is used rather than the constant
  177                                     //!< UT_SKIP_CHAR, since the text can already already contains UT_SKIP_CHAR value if
  178                                     //!< UT_F_REMOVE_ILLEGAL_CHAR is not set. Set by user.
  179     float progress_done;            //!< Part of the process already done. Value included between 0.0 and 1.0. Set by Utrac.
  180     int progress_todo;              //!< Number of passes to do before end of the process. Set by Utrac.
  181     UtPassFlags current_pass;       //!< Type of the pass in progress (used in the 'progress bar' callback)
  182     
  183     void * user;                    //!< Structure for user data. Never touched by utrac, except during initalisation.
  184 } UtText;
  185 
  186 #endif //_UT_TEXT_H_