"Fossies" - the Fresh Open Source Software Archive 
Member "utrac-0.3.2/src/utrac.c" (4 Jan 2009, 19053 Bytes) of package /linux/privat/old/utrac-0.3.2.tgz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "utrac.c" see the
Fossies "Dox" file reference documentation.
1 /***************************************************************************
2 * utrac.c
3 *
4 * Tue Oct 5 11:29:59 2004
5 * Copyright 2004 Alliance MCA
6 * Written by : Antoine Calando (antoine@alliancemca.net)
7 ****************************************************************************/
8
9 /*
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Library General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23 */
24
25 /*!
26 * \file utrac.c
27 * \author Antoine Calando (antoine@alliancemca.net)
28 * \brief Public API for using Utrac.
29 */
30
31 #define _UTRAC_C_
32
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include "utrac.h"
36
37 #undef UT_DEBUG
38 #define UT_DEBUG 3
39 #include "debug.h"
40
41
42 /***************************************************************************/
43 /*!
44 * \brief Initialize the Utrac library.
45 *
46 * This function must be called before any other Utrac function. It allocates an UtSession struture
47 * that is accessible by the ut_session pointer, initalizes it, loads charsets data, and sets
48 * default language, charset and end of line type. The memory used is about 630kb for 47 charsets
49 * loaded.
50 *
51 * \note On Unix systems, LANG, LC_ALL and LC_TYPE are parsed to find default language and
52 * charsets (ISO-8859-1 is used else), and eol type is set to LF.
53 *
54 * \return UT_OK on success, an error code on failure.
55 */
56 UtCode ut_init () {
57
58 if (ut_session) return UT_ALREADY_INITIALISED_ERROR;
59
60 ut_session = (UtSession*) malloc (sizeof(UtSession));
61 if (!ut_session) return UT_MALLOC_ERROR;
62
63 return ut_init_noalloc();
64 }
65
66 /*!
67 * \brief Initialize the Utrac library, without allocating memory for UtSession. Used internally
68 */
69 UtCode ut_init_noalloc () {
70 //ut_session->flags = UT_F_UNSET; //flags_in
71 ut_session->charset = NULL;
72 ut_session->nb_charsets = 0;
73 ut_session->language.name = NULL;
74 ut_session->language.code = NULL;
75 ut_session->language.n = 0;
76 ut_session->language.n_max = 0;
77 ut_session->system.name = NULL;
78 ut_session->system.code = NULL;
79 ut_session->system.n = 0;
80 ut_session->system.n_max = 0;
81 //ut_session->charset_default = UT_UNSET;
82 ut_session->eol_default = UT_EOL_UNSET;
83 ut_session->eol_alt_default = UT_EOL_UNSET;
84
85 ut_session->nomapping_char = '_';
86 ut_session->progress_function = NULL;
87 ut_session->error_string = NULL;
88 //load charsets data
89 UT_TRY (ut_load_charsets ())
90
91 //find default language, charset, eol type on the system
92 #ifdef linux
93 //should we use nl_langinfo()? (discovered later...) ->yes!
94 int i;
95 ut_session->language_default = 0; //language_default_in
96 ut_session->system_default = 3; //3 (to check in file charsets.dat)
97 ut_session->eol_default = UT_EOL_LF;
98 ut_session->eol_alt_default = UT_EOL_LF;
99 ut_session->charset_default = ut_find_charset("ISO-8859-1");
100
101 char * def_enc = getenv ("LC_CTYPE");
102 if (!def_enc) def_enc = getenv ("LC_ALL");
103 if (!def_enc) def_enc = getenv ("LANG");
104 if (def_enc) {
105 if (def_enc[2]=='_' || def_enc[2]=='.' || def_enc[2]==0) {
106 for (i=0; i<ut_session->language.n; i++) {
107 if (def_enc[0]-'a'+'A'== ut_session->language.code[i*2+0]
108 && def_enc[1]-'a'+'A'== ut_session->language.code[i*2+1] ) {
109 ut_session->language_default = i;
110 break;
111 }
112 } //for
113 }
114 if (def_enc[2]=='.') def_enc +=3;
115 if (def_enc[2]=='_' && def_enc[5]=='.') def_enc +=6;
116 for (i=0; i<ut_session->nb_charsets; i++)
117 if (ut_str_fuzzy_cmp (def_enc, ut_session->charset[i].name,'@')) break;
118 if (i!=ut_session->nb_charsets) ut_session->charset_default = i;
119 }
120
121 if (ut_session->charset_default == UT_UNSET) {
122 for (i=0; i<ut_session->nb_charsets; i++)
123 if (ut_str_fuzzy_cmp (UT_DEFAULT_ENCODING_UNIX, ut_session->charset[i].name,0)) break;
124 if (i==ut_session->nb_charsets) {
125 DBG1 ("*** No default charset ***")
126 }
127 else ut_session->charset_default = i;
128 }
129 #else
130 ERROR ("pas unix!")
131 #endif
132
133 #if UT_DEBUG == 2
134 if (ut_session->language_default != UT_UNSET)
135 DBG2 ("lang: %s" , ut_session->language.name[ut_session->language_default])
136 if (ut_session->charset_default != UT_UNSET)
137 DBG2 ("charset: %s", ut_session->charset[ut_session->charset_default].name)
138 if (ut_session->eol_default != UT_EOL_UNSET)
139 DBG2 ("eol: %s", UT_EOL_NAME [ut_session->eol_default])
140 #endif
141
142 return UT_OK;
143 }
144
145 /*!
146 * \brief Free ressources allocated during initialization of Utrac.
147 *
148 * This function frees the structure allocated in ut_session
149 * by ut_init(). It must be the last Utrac function called.
150 *
151 * \return UT_OK on success, an error code on failure.
152 */
153
154 void ut_finish () {
155
156 ut_finish_nofree ();
157 free(ut_session);
158 ut_session = NULL;
159
160 return;
161 }
162
163 /*!
164 * \brief Free ressources allocated during initialization of Utrac, without freeing UtSession. Used internally.
165 */
166 void ut_finish_nofree () {
167
168 if (!ut_session) return;
169
170 int i; for(i=0; i<ut_session->nb_charsets; i++) {
171 free(ut_session->charset[i].name);
172 free(ut_session->charset[i].alias);
173 free(ut_session->charset[i].common_name);
174 free(ut_session->charset[i].comment);
175 free(ut_session->charset[i].unicode);
176 free(ut_session->charset[i].char_type);
177 free(ut_session->charset[i].language);
178 free(ut_session->charset[i].system);
179 }
180 free (ut_session->charset);
181
182 for (i=0; i<ut_session->language.n; i++)
183 free (ut_session->language.name[i]);
184 free (ut_session->language.name);
185 free (ut_session->language.code);
186
187 for (i=0; i<ut_session->system.n; i++)
188 free (ut_session->system.name[i]);
189 free (ut_session->system.name);
190 free (ut_session->system.code);
191
192 free (ut_session->error_string);
193 return;
194 };
195
196
197
198
199 /***************************************************************************/
200 /*!
201 * \brief Allocates and initalizes an UtText structure.
202 *
203 * \return A pointer to the allocated structure, or NULL if the allocation failed.
204 */
205
206 UtText * ut_init_text_heap () {
207 ASSERT (ut_session)
208 UtText* new_text = (UtText*) malloc (sizeof(UtText));
209 if (!new_text) return NULL;
210
211 ut_init_text (new_text);
212
213 return new_text;
214 }
215
216 /*!
217 * \brief Initalizes an UtText structure.
218 * \param new_text A pointer on the structure to initialize
219 */
220
221 void ut_init_text (UtText * new_text) {
222
223 new_text->data = NULL;
224 new_text->size = 0;
225
226 new_text->eol = UT_EOL_UNSET;
227 new_text->eol_alt = UT_EOL_UNSET;
228 new_text->charset = UT_UNSET;
229
230 new_text->nb_lines = UT_UNSET;
231 new_text->nb_lines_alt = UT_UNSET;
232 new_text->distribution = NULL;
233 //int i; for (i=0; i<0x100; i++) new_text->distribution [i] = 0;
234 new_text->ext_char = NULL;
235 new_text->evaluation = NULL;
236
237 new_text->flags = UT_F_DEFAULT;
238 new_text->pass_flags = UT_PF_UNSET;
239 new_text->skip_char = UT_SKIP_CHAR;
240
241 new_text->progress_done = 0.0;
242 new_text->progress_todo = 0;
243 new_text->current_pass = UT_PF_UNSET;
244
245 new_text->user = NULL;
246 }
247
248 /*!
249 * \brief Free an UtText structure.
250 * \param text pointer to the structure to free.
251 */
252
253 void ut_free_text_heap (UtText *text) {
254
255 ut_free_text (text);
256 free(text);
257
258 }
259
260 /*!
261 * \brief Free the contents of an UtText structure, without freeing the structure itself.
262 * \param text pointer to the structure to free.
263 */
264
265 void ut_free_text (UtText *text) {
266 //free(text->filename);
267 //filename is not freed because it is set by user.
268 free(text->data); text->data = NULL;
269 free(text->distribution); text->distribution = NULL;
270 while (text->ext_char) {
271 UtExtCharLine * tmp = text->ext_char;
272 text->ext_char = text->ext_char->next;
273 free (tmp);
274 } text->ext_char = NULL;
275
276 free(text->evaluation); text->evaluation = NULL;
277 //text->user should be free by the user.
278 }
279
280
281 /*!
282 * \brief Initialize an UtText structure before using the 'progress bar' callback feature
283 *
284 * Can be used internaly or by the user. The UtText must have member UtText::pass_flag set, or
285 * at least UtText::flags (if UtText::pass_flags is unset, it will be set for just a recognition
286 * pass and subpasses will be selected upon the value of UtText::flags).
287 */
288
289 UtCode ut_init_progress (UtText *text) {
290
291 ASSERT (text);
292
293 text->progress_done = 0.0;
294 text->progress_todo = 0;
295 if (text->pass_flags == UT_PF_UNSET) text->pass_flags = UT_PF_RECOGNIZE;
296
297 if (text->pass_flags & UT_PF_LOAD ) text->progress_todo++;
298
299 if (text->pass_flags & UT_PF_RECOGNIZE ) {
300 if ((text->flags & UT_F_IDENTIFY_CHARSET) || (text->pass_flags & UT_PF_CONVERT ) )
301 text->pass_flags |= UT_PF_DISTRIB_PASS;
302 else text->pass_flags &= ~UT_PF_DISTRIB_PASS;
303 if (text->flags & (UT_F_TRANSFORM_EOL | UT_F_REMOVE_ILLEGAL_CHAR | UT_F_ADD_FINAL_EOL | UT_F_IDENTIFY_EOL ) )
304 text->pass_flags |= UT_PF_EOL_PASS;
305 else text->pass_flags &= ~UT_PF_EOL_PASS;
306
307 if (text->flags & (UT_F_IDENTIFY_CHARSET | UT_F_REFERENCE_EXT_CHAR ) )
308 text->pass_flags |= UT_PF_XASCII_PASS;
309 else text->pass_flags &= ~UT_PF_XASCII_PASS;
310
311 if (text->pass_flags & UT_PF_DISTRIB_PASS) text->progress_todo++;
312 if (text->pass_flags & UT_PF_EOL_PASS) text->progress_todo++;
313 if (text->pass_flags & UT_PF_XASCII_PASS) text->progress_todo++;
314 } else {
315 text->pass_flags &= ~(UT_PF_DISTRIB_PASS | UT_PF_EOL_PASS | UT_PF_XASCII_PASS);
316 }
317
318 if (text->pass_flags & UT_PF_CONVERT ) text->progress_todo++;
319
320 return UT_OK;
321 }
322
323 /*! \brief Load a file in an UtText structure
324 *
325 * If filename is null, it will read stdin. text->data and text->size will be set.
326 * If ut_session->progress_function is set, it will be called during loading and members of
327 * text dealing with this feature will be updated.
328 */
329
330 UtCode ut_load (UtText *text, const char * filename) {
331
332 ASSERT (text);
333
334 if (text->pass_flags==UT_PF_UNSET) {
335 text->pass_flags |= UT_PF_LOAD | UT_PF_RECOGNIZE;
336 ut_init_progress(text);
337 }
338
339 if (ut_session->progress_function && text->progress_done == 0.0) ut_update_progress (text, 0, true);
340
341 text->current_pass = UT_PF_LOAD;
342
343 if (filename) {
344 UT_TRY ( ut_load_file_pass (text, filename) )
345 } else {
346 UT_TRY ( ut_load_stdin_pass (text) )
347 }
348
349 text->current_pass = UT_PF_NONE;
350
351 if (ut_session->progress_function) {
352 text->progress_done+= (1-text->progress_done)/text->progress_todo;
353 text->progress_todo--;
354 }
355
356 //if (ut_session->progress_function && text->progress_done == 0.0) ut_update_progress (text, 0, true);
357 if (ut_session->progress_function && text->progress_todo == 0) ut_update_progress (text, 0, true);
358
359 return UT_OK;
360 }
361
362
363 /*! \brief Recognize charset and EOL of a text.
364 *
365 * text->data must be set. If text->size is null, recognition will stop at the first
366 * null character. text->flags must also be set to select processes to do (see UtTextFlags).
367 *
368 * If ut_session->progress_function is set, it will be called during loading and members of
369 * text dealing with this feature will be updated.
370 *
371 * If UT_F_FORCE_BINARY is set, texts with caracters between 0 and 0x19 (space is 0x20, and TAB, CR,
372 * LF are excluded of this range) won't produce error.
373 *
374 *
375 * If UT_F_IDENTIFY_EOL is set, text->eol, text->eol_alt, text->nb_lines, text->nb_lines_alt will be
376 * updated. If convertion of EOL is planned, UT_F_TRANSFORM_EOL must be set.
377 *
378 * If UT_F_IDENTIFY_CHARSET is set, text->charset will be updated. text->evaluation also if charset
379 * is 8bits and ASCII -erivated.
380
381 * text->distribution will always be set, text->ext_char also (but this is a bug!)
382 *
383 * If ut_session->progress_function is set, it will be called during loading and members of
384 * text dealing with this feature will be updated.
385 */
386
387 UtCode ut_recognize (UtText *text) {
388
389 if (!text || !text->data) return UT_BAD_PARAMETER_ERROR;
390
391 if (text->pass_flags==UT_PF_UNSET) ut_init_progress(text);
392
393 if (ut_session->progress_function && text->progress_done == 0.0) ut_update_progress (text, 0, true);
394
395 //FIRST PASS
396 if (text->pass_flags & UT_PF_DISTRIB_PASS) {
397 text->current_pass = UT_PF_DISTRIB_PASS | UT_PF_RECOGNIZE;
398 int rcode = ut_distrib_utf_pass (text);
399 text->current_pass = UT_PF_NONE;
400
401 if (rcode == UT_BINARY_DATA_ERROR) {
402 if ( !(text->flags & UT_F_FORCE_BINARY)) return rcode;
403 } else if ( rcode != UT_OK) return rcode;
404
405 if (text->charset != UT_UNSET && text->pass_flags & UT_PF_XASCII_PASS) {
406 text->pass_flags &= ~UT_PF_XASCII_PASS | UT_PF_RECOGNIZE;
407 text->progress_todo--;
408 }
409
410 if (ut_session->progress_function) {
411 text->progress_done+= (1-text->progress_done)/text->progress_todo;
412 text->progress_todo--;
413 }
414 }
415
416 // set text->skip_char
417 if (text->flags & UT_F_REMOVE_ILLEGAL_CHAR ) {
418 text->skip_char = UT_SKIP_CHAR;
419 } else {
420 //if control code accepted in file, try to find one not used
421 int i; for (i=1; i<0x20; i++) {
422 if (i==UT_EOL_ALT_CHAR || i== 0x9|| i==0xA || i==0xD) continue; //UT_EOL_CHAR and UT_EOF_CHAR = 0
423 if (!text->distribution[i]) break;
424 }
425 if (i!=0x20) text->skip_char = i;
426 else text->skip_char = UT_SKIP_CHAR; //all control code used, nevermind, we use UT_SKIP_CHAR
427 }
428
429 //ASSERT (text->flags & UT_F_TRANSFORM_EOL)
430
431 //SECOND PASS
432 if (text->pass_flags & UT_PF_EOL_PASS) {
433 text->current_pass = UT_PF_EOL_PASS | UT_PF_RECOGNIZE;
434 UT_TRY ( ut_eol_pass (text) )
435 text->current_pass = UT_PF_NONE;
436 if (ut_session->progress_function) {
437 text->progress_done+= (1-text->progress_done)/text->progress_todo;
438 text->progress_todo--;
439 }
440 }
441
442 //THIRD PASS
443 if ( text->pass_flags & UT_PF_XASCII_PASS ) {
444 text->current_pass = UT_PF_XASCII_PASS | UT_PF_RECOGNIZE;
445 UT_TRY ( ut_xascii_pass (text) )
446 text->current_pass = UT_PF_NONE;
447 if (ut_session->progress_function) {
448 text->progress_done+= (1-text->progress_done)/text->progress_todo;
449 text->progress_todo--;
450 }
451 }
452
453 if (ut_session->progress_function && text->progress_todo == 0) ut_update_progress (text, 0, true);
454
455 return UT_OK;
456 }
457
458
459 /*!
460 * \brief Convert a text.
461 *
462 * \param src_text source text, with input eol and charset set.
463 * \param dst_text destination text, with output eol and charset set. If it is null, src_text will be replaced
464 * by the destination text, and output eol and charset will be selectionned from ut_session.
465 *
466 * If ut_session->progress_function is set, it will be called during loading and members of
467 * text dealing with this feature will be updated.
468 */
469
470 UtCode ut_convert (UtText *src_text, UtText *dst_text) {
471
472 if (!src_text || !src_text->data) return UT_BAD_PARAMETER_ERROR;
473
474 ASSERT (src_text->eol != UT_EOL_UNSET)
475 ASSERT (src_text->charset != UT_UNSET)
476 ASSERT (src_text->distribution)
477
478 bool same_text = false;
479 if (!dst_text) {
480 same_text = true;
481 dst_text = ut_init_text_heap ();
482 if (!dst_text) return UT_MALLOC_ERROR;
483 }
484
485 ASSERT (dst_text)
486
487 if (src_text->pass_flags==UT_PF_UNSET) {
488 src_text->pass_flags |= UT_PF_CONVERT;
489 ut_init_progress(src_text);
490 }
491
492
493 if (ut_session->progress_function && src_text->progress_done == 0.0) ut_update_progress (src_text, 0, true);
494
495 if (dst_text->eol == UT_EOL_UNSET) dst_text->eol = ut_session->eol_default;
496 if (dst_text->eol_alt == UT_EOL_UNSET) dst_text->eol_alt = ut_session->eol_alt_default;
497 if (dst_text->charset == UT_UNSET) dst_text->charset = ut_session->charset_default;
498
499 src_text->current_pass = UT_PF_CONVERT;
500 UT_TRY ( ut_conversion_pass (src_text, dst_text) )
501 src_text->current_pass = UT_PF_NONE;
502
503 if (ut_session->progress_function) {
504 src_text->progress_done+= (1-src_text->progress_done)/src_text->progress_todo;
505 src_text->progress_todo--;
506 }
507
508 if (ut_session->progress_function && src_text->progress_todo == 0) ut_update_progress (src_text, 0, true);
509
510 if (same_text) {
511 free (src_text->data);
512 src_text->data = dst_text->data;
513 dst_text->data = NULL;
514 src_text->size = dst_text->size ;
515 src_text->eol = dst_text->eol ;
516 src_text->eol_alt = dst_text->eol_alt ;
517 src_text->charset = dst_text->charset ;
518 free (src_text->distribution);
519 src_text->distribution = NULL;
520 while (src_text->ext_char) {
521 UtExtCharLine * tmp = src_text->ext_char;
522 src_text->ext_char = src_text->ext_char->next;
523 free (tmp);
524 } src_text->ext_char = NULL;
525 free(src_text->evaluation);
526 src_text->evaluation = NULL;
527 ut_free_text_heap (dst_text);
528 }
529
530 return UT_OK;
531 }
532
533
534
535
536 /***************************************************************************/
537 /* OLD DOC!!!!
538 * \brief Recognize charset and EOL type of a text, and eventually convert it.
539 *
540 * This function takes an UtText structure as a parameter and does several tasks:
541 * -# it loads the file (or read the standard input),
542 * -# it calculate the frequency distribution of each byte in the file
543 * (UtText::distribution), checks if the file is binary data or text,
544 * checks if it is ASCII or UTF-8,
545 * -# it recognizes the EOL type, and replaces each EOL by a null character to make
546 * further processing of the file easier (this feature can be disabled).
547 * -# if the charset has not been determined earlier as ASCII or UTF-8, it tries
548 * to detect which known charset fit the best to the text.
549 * -# it optionally converts the text, replacing EOL and extended characters by
550 * those corresponding to the selection of the user and/or the result of the recognition.
551 *
552 * \param text Text to recognize and eventually convert. Some members must be set
553 * before calling this function, but some other are optionnal. Members that
554 * select the input text are:
555 * - UtText::data: Pointer to the text to process (which must be null terminated).
556 * If NULL, UtText::filename is used.
557 * - UtText::filename: Path to the file containing the text to process, which will
558 * be loaded if UtText::data is NULL. If NULL, standard input is read.
559 * - UtText::size: If UtText::data is set, this member can also be set to indicate
560 * the size of the text, if null, the first null character will determine the
561 * end of the text.
562 *
563 * Members that modifies the recognition or the conversion are:
564 * - UtText::flags: Flags to customize the processing and the modification of the text.
565 * Set intially to UT_F_DEFAULT.
566 * - UtText::src_eol and UtText::src_charset: EOL type and charset of the text used as
567 * source for the conversion. If unset, the values taken are those recognized automatically.
568 * - UtText::dst_eol and UtText::dst_charset: EOL type and charset of the text resulting of
569 * the conversion. If unset, the values taken are those by default found by ut_init().
570 * - UtText::nomapping_char: Character inserted during the conversion each time an error occurs.
571 *
572 * Misc member:
573 * - UtText::progress_function: Custom function provided by the user to refresh a progress bar.
574 *
575 * \param convert If true, conversion is performed after recognition.
576 *
577 * \return UT_OK on success, error code on failure (see UtCode).
578 */