"Fossies" - the Fresh Open Source Software Archive 
Member "utrac-0.3.2/src/ut_text.h" (4 Jan 2009, 7168 Bytes) of package /linux/privat/old/utrac-0.3.2.tgz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "ut_text.h" see the
Fossies "Dox" file reference documentation.
1 /***************************************************************************
2 * ut_text.h
3 *
4 * Tue Oct 5 11:28:11 2004
5 * Copyright 2004 Alliance MCA
6 * Written by : Antoine Calando (antoine@alliancemca.net)
7 ****************************************************************************/
8
9 /*
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Library General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23 */
24
25 /*!
26 * \file ut_text.h
27 * \author Antoine Calando (antoine@alliancemca.net)
28 */
29
30 #ifndef _UT_TEXT_H_
31 #define _UT_TEXT_H_
32
33
34
/***************************************************************************/
/*!
 * \brief Bit flags that tune how a text is recognized.
 *
 * The user sets these to control the way the text will be analysed
 * (during function ut_recognize() ). Every flag has its own bit, so
 * several flags can be combined with a bitwise OR (as UT_F_DEFAULT does).
 * Some of them are unimplemented (UT_F_REFERENCE_EXT_CHAR, always true).
 */

typedef enum UtTextFlags {
    //! Zero value: no flag bit is set.
    UT_F_UNSET               = 0x00,
    //! Force processing of the file even if it is detected as binary data.
    UT_F_FORCE_BINARY        = 0x01,
    //! NOTE(review): not documented in the original; presumably requests detection of the EOL type -- confirm.
    UT_F_IDENTIFY_EOL        = 0x02,
    //! Replace EOL by null character to simplify the processing.
    UT_F_TRANSFORM_EOL       = 0x04,
    //! Remove control characters (except CR, LF and TAB).
    UT_F_REMOVE_ILLEGAL_CHAR = 0x08,
    //! Add a final EOL to the text if the last line is not empty.
    UT_F_ADD_FINAL_EOL       = 0x10,
    //! NOTE(review): not documented in the original; presumably requests detection of the charset -- confirm.
    UT_F_IDENTIFY_CHARSET    = 0x20,
    //! Register the lines that contain extended characters (unimplemented, always true).
    UT_F_REFERENCE_EXT_CHAR  = 0x40,

    //! Default combination: illegal-character removal plus charset identification.
    UT_F_DEFAULT = UT_F_REMOVE_ILLEGAL_CHAR | UT_F_IDENTIFY_CHARSET
} UtTextFlags;
56
/***************************************************************************/
/*!
 * \brief Bit flags naming each step in the processing of a text.
 *
 * They are set by the user or by utrac to select which passes will be run,
 * in order to compute the percentage of the process already done for the
 * 'progress bar' callback.
 */

typedef enum UtPassFlags {
    UT_PF_UNSET        = 0x00,
    UT_PF_NONE         = 0x01,
    UT_PF_LOAD         = 0x02,
    UT_PF_RECOGNIZE    = 0x04,
    UT_PF_DISTRIB_PASS = 0x08,
    UT_PF_EOL_PASS     = 0x10,
    UT_PF_XASCII_PASS  = 0x20,
    UT_PF_CONVERT      = 0x40,

    //! Highest pass bit; has the same value as UT_PF_CONVERT.
    UT_PF_MAX = UT_PF_CONVERT
    //UT_PF_ALL = UT_PF_LOAD | UT_PF_RECOGNIZE | UT_PF_CONVERT
} UtPassFlags;
80
81
82
83
84
85 /***************************************************************************/
86 /*!
87 * \brief Contains evaluation of a charset.
88 *
89 * An array of this structure is instanciated in UtText and holds the result of
90 * the evaluation of each charset. The charset which get the best rating will
91 * be choosed for the conversion.
92 */
93
94 typedef struct UtCharsetEval {
95 long rating; //!< Mark attributed to the charset depending on the text
96 ulong checksum; //!< Checksum of each extended character in the text. Used to find equivalent charsets.
97 } UtCharsetEval;
98
99
100 /***************************************************************************/
101 /*!
102 * \brief Refers to a line with extended characters.
103 *
104 * This structure refers to a line with extended characters.
105 * The list of lines with extended characters is filtered to exclude lines
106 * with same characters and is stocked in a linked list accessible from UtText.
107 */
108
109 typedef struct UtExtCharLine {
110 char * line_p; //!< Pointer to the beginning line.
111 ulong line_i; //!< Number of the line.
112 ulong nb_ext_chars; //!< Number of extended characters in the line.
113 struct UtExtCharLine * next; //!< Pointer to the next struture. NULL if last.
114 } UtExtCharLine;
115
116
/***************************************************************************/
/*!
 * \brief Types of End-of-line characters.
 *
 * Different types are CRLF (DOS/Windows), LF (Unix), CR (Mac). The types
 * CRLF_CR and CRLF_LF exist in some CSV databases: entries are ended with
 * CRLF, but some fields may contain LF or CR alone to indicate a
 * "carriage return" inside the field. No enumerators with those two names
 * are declared here.
 * CR is the character 0xD, LF is 0xA.
 *
 * \note EC: the LFCR case is not taken into account (does it not exist?)
 *       AC: It does! I have not come across it, but it should be added...
 *       (in fact quite a lot would have to be changed in the end-of-line
 *       recognition).
 */

typedef enum UtEolType {
    UT_EOL_UNSET = -1,
    UT_EOL_CR    = 0,
    UT_EOL_LF    = 1,
    UT_EOL_CRLF  = 2,
    UT_EOL_LFCR  = 3,
    //! Detection only.
    UT_EOL_MIX   = 4,
    //! "\n", conversion only. NOTE(review): presumably the two-character backslash-n text -- confirm.
    UT_EOL_BSN   = 5,
    //! ASCII NUL character.
    UT_EOL_NUL   = 6,
    // UT_EOL_SPACE,
    // UT_EOL_TAB,
    // UT_EOL_NOCHANGE, //!< Conversion only
    //! Always the last enumerator.
    UT_EOL_NONE  = 7
} UtEolType;
146
//! Array of constant strings, declared here and defined in another translation unit.
//! NOTE(review): presumably holds one name per UtEolType value -- confirm against the definition.
extern const char *UT_EOL_NAME[];

//! Integer type used to designate a charset (see UtText::charset).
typedef short UtCharsetIndex;
150
151 /***************************************************************************/
152 /*!
153 * \brief Contains all the information about a text and its processing.
154 *
155 * This structure is created by ut_init_text() and destroyed by ut_free_text(). It is used
156 * to pass different arguments to ut_process_text(), and to stock information about the
157 * text all along its processing.
158 */
159
160 typedef struct UtText {
161 char * data; //!< Pointer to the beginning of the text. It is finished by a null character. Set by user or Utrac.
162 ulong size; //!< Size of the text, without the terminating null character. Set by user or Utrac.
163
164 UtEolType eol; //!< EOL type recognized by Utrac.
165 UtEolType eol_alt; //!< EOL type recognized by Utrac.
166 UtCharsetIndex charset; //!< Charset recognized by Utrac.
167
168 ulong nb_lines; //!< Number of lines in the text. Set by Utrac.
169 ulong nb_lines_alt; //!< Number of alt lines in the text. Set by Utrac.
170 ulong * distribution; //!< Frequency distribution of the text. Set by Utrac.
171 UtExtCharLine * ext_char; //!< Linked list of lines containing extended characters. Set by Utrac.
172 UtCharsetEval * evaluation; //!< Array containg evaluation of each charset. Set by Utrac.
173
174 UtTextFlags flags; //!< Flags that control the processing of the text. Set by user.
175 UtPassFlags pass_flags;
176 char skip_char; //!< Character to skip during conversion. A variable is used rather than the constant
177 //!< UT_SKIP_CHAR, since the text can already already contains UT_SKIP_CHAR value if
178 //!< UT_F_REMOVE_ILLEGAL_CHAR is not set. Set by user.
179 float progress_done; //!< Part of the process already done. Value included between 0.0 and 1.0. Set by Utrac.
180 int progress_todo; //!< Number of passes to do before end of the process. Set by Utrac.
181 UtPassFlags current_pass; //!< Type of the pass in progress (used in the 'progress bar' callback)
182
183 void * user; //!< Structure for user data. Never touched by utrac, except during initalisation.
184 } UtText;
185
186 #endif //_UT_TEXT_H_