"Fossies" - the Fresh Open Source Software Archive 
Member "utrac-0.3.2/src/ut_recognition1.c" (4 Jan 2009, 15194 Bytes) of package /linux/privat/old/utrac-0.3.2.tgz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "ut_recognition1.c" see the
Fossies "Dox" file reference documentation.
1 /***************************************************************************
2 * ut_recognition1.c
3 *
4 * Tue Oct 5 11:29:40 2004
5 * Copyright 2004 Alliance MCA
6 * Written by : Antoine Calando (antoine@alliancemca.net)
7 ****************************************************************************/
8
9 /*
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Library General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23 */
24
25
26 /*!
27 * \file ut_recognition1.c
28 * \brief Distrib/utf8 pass and EOL pass
29 * \author Antoine Calando (antoine@alliancemca.net)
30 */
31
32 #include <stdlib.h>
33 #include <stdio.h>
34 #include <string.h>
35 #include "utrac.h"
36
37 #undef UT_DEBUG
38 #define UT_DEBUG 1
39 #include "debug.h"
40
41 /***************************************************************************/
/*!
 * \brief Tell whether a decoded code point is NOT a valid Unicode scalar value.
 *
 * A value is reported as invalid when it is one of:
 *  - a surrogate code point (U+D800..U+DFFF), which is never a scalar value
 *    and must not appear in UTF-8;
 *  - a noncharacter of the U+FDD0..U+FDEF range;
 *  - a noncharacter ending in FFFE or FFFF in planes 0 to 15;
 *  - U+10FFFE, U+10FFFF, or anything beyond the Unicode code space.
 *
 * \param unicode code point to test.
 * \return true if the value is invalid, false if it is acceptable.
 */
/* Plain (non-inline) declaration: with C99/C11 inline semantics it makes the
 * definition below an external definition, so a call that the compiler chooses
 * not to inline still links. It is harmless with the older GNU89 semantics. */
bool ut_unicode_invalid (ulong unicode);

bool inline ut_unicode_invalid (ulong unicode) {
	return (   ( 0x0000D800 <= unicode && unicode <= 0x0000DFFF )	//surrogates
		|| ( 0x0000FDD0 <= unicode && unicode <= 0x0000FDEF )	//noncharacter block
		|| ( 0x0010FFFE <= unicode )				//last two code points and beyond
		|| ( ( 0xFFF0FFFE & unicode ) == 0x0000FFFE ));		//xFFFE / xFFFF, planes 0-15
}
50
51 /***************************************************************************/
52 /*!
53 * \brief Scan the text to calculate frequency distribution and UTF-8 correctness.
54 *
55 * This function calculate the frequency distribution, i.e. for i between 0 and 255,
56 * text->distribution [i] is equal to the number of bytes "i" in the text. This
57 * distribution is used to determinate if the file is binary or ASCII. The text is also
58 * simultaneously scanned to check for UTF-8 errors.
59 *
60 * \return UT_OK on success, UT_BINARY_DATA_ERROR if file is binary, error code otherwise.
61 */
62 UtCode ut_distrib_utf_pass (UtText * text) {
63
64 char * scan = text->data;
65 char * scan_end;
66
67 ASSERT(text);
68 ASSERT(text->data);
69
70 //bug! (see assert l85)
71 if (text->size) scan_end = scan + text->size;
72 else scan_end = NULL;
73
74 ulong unicode = 0;
75 ushort multibyte = 0;
76 ulong error_utf8 = 0;
77 int cumul = 1;
78 if (!text->distribution) text->distribution = (ulong*) malloc (sizeof(ulong)*256);
79 int i; for (i=0; i<0x100; i++) text->distribution[i] = 0;
80
81 scan--; //incrementation at the beginning of the loop is faster
82 for (;;) {
83 scan++;
84 //EC: double test de !*scan !! AC ok
85 switch (*scan) {
86 case 0:
87 if (scan>=scan_end) {
88 ASSERT (!scan_end || scan==scan_end)
89 goto out_for;
90 } else if (!scan_end) goto out_for;
91 case 0xA:
92 case 0xD:
93 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
94 if (!ut_update_progress (text, scan - text->data, false)) goto out_for;
95 cumul++;
96 }
97 }
98
99 text->distribution [(u_char) *scan]++;
100 if (multibyte) {
101 if ((*scan & 0xC0) == 0x80) { //==10xx xxxx
102 unicode <<= 6;
103 unicode |= *scan & 0x3F;
104 if(!--multibyte) { //last multybyte byte? then test if noncharacter (66 cases)
105 if (ut_unicode_invalid (unicode)) error_utf8++;
106 }
107 } else {
108 multibyte = 0;
109 error_utf8++;
110 }
111 } else if (*scan & 0x80) { //1xxx xxx
112 if ((*scan & 0xE0) == 0xC0) { //110x xxxx
113 multibyte = 1;
114 unicode = *scan & 0x1F;
115 } else if ((*scan & 0xF0) == 0xE0) { //1110 xxxx
116 multibyte = 2;
117 unicode = *scan & 0x0F;
118 } else if ((*scan & 0xF8) == 0xF0) { //1111 0xxx
119 multibyte = 3;
120 unicode = *scan & 0x07;
121 } else { //error
122 error_utf8++;
123 }
124 }
125 } //for (;;)
126 out_for:
127 //interrupted?
128 //EC: ou il y a déja un 0 dans le texte ! AC test déjà fait
129 if (scan<scan_end) {
130 return UT_INTERRUPTED_BY_USER;
131 }
132
133 if (multibyte) error_utf8++;
134
135 DBG2 ("Distribution and UTF-8 pass done! (%lu B)", text->size)
136
137 if (!text->size) text->size = scan - text->data; //terminating 0 not counted
138 if (!text->size) return UT_EMPTY_DATA_ERROR;
139
140 ulong nb_ctrl_chars = 0;
141 // count the number of control chars
142 for (i=0; i<0x20; i++) {
143 if (i==0x9 || i==0xA || i==0xD) continue;
144 nb_ctrl_chars += text->distribution[i];
145 }
146 nb_ctrl_chars += text->distribution[0x7F];
147
148 //test if text is actually binary data
149 if (text->size * UT_THRESHOLD_CONTROL_CHAR < nb_ctrl_chars) {
150 //to do: detect if UTF16!?!?
151 DBG3 ("Binary file detected! (%lu cc)", nb_ctrl_chars)
152 return UT_BINARY_DATA_ERROR;
153 }
154
155 //count the number of extended char
156 ulong nb_ext_chars = 0;
157 for (i=0x80; i<0x100; i++) {
158 nb_ext_chars += text->distribution[i];
159 }
160 DBG3 ("UTF-8 error : %lu, ext char number : %lu", error_utf8, nb_ext_chars)
161
162 if (text->flags & UT_F_IDENTIFY_CHARSET) {
163 if (!nb_ext_chars) {
164 //text is ASCII
165 for (i=0; i<ut_session->nb_charsets; i++)
166 if (ut_session->charset[i].type == UT_CST_ASCII) break;
167 ASSERT_MSG (i!=ut_session->nb_charsets, "ASCII not defined")
168 text->charset = i;
169 DBG3 ("ASCII Encoding detected!")
170 } else if (nb_ext_chars * UT_THRESHOLD_UTF8 > error_utf8) {
171 //text is UTF-8
172
173 for (i=0; i<ut_session->nb_charsets; i++)
174 if (ut_session->charset[i].type == UT_CST_UTF_8) break;
175 ASSERT_MSG (i!=ut_session->nb_charsets, "UTF-8 not defined")
176 text->charset = i;
177 DBG3 ("UTF-8 Encoding detected!")
178 } else {
179 text->charset = UT_UNSET;
180 }
181 }
182
183 return UT_OK;
184 }
185
186
187 /***************************************************************************/
188 /*!
189 * \brief Change all UT_EOL_CHAR to UT_EOL_ALT_CHAR, from beg to end-1.
190 *
191 * \note EC pourquoi revenir en arrière ?
192 * AC Si on s'est trompé de type d'eol (un LF a été scanné avant un CRLF par ex)
193 */
194 void ut_change_EOL1toEOL2 (char * beg, char * end) {
195 ASSERT (beg<end)
196 ASSERT (*end==UT_EOL_CHAR)
197 char * scan = beg;
198 for(;;) {
199 if (*scan==UT_EOL_CHAR) {
200 if (scan==end) return;
201 *scan=UT_EOL_ALT_CHAR;
202 }
203 scan++;
204 }
205 }
206
207 /***************************************************************************/
208 /*
209 * \brief Change all UT_EOL_ALT_CHAR to UT_EOL_CHAR, from beg to end-1.
210 *
211 * \note pour faire de vraie optimisation, on utilise strchr() à la place de
212 * for(;;) {... scan++ }, strchr() est une macro assembleur.
213 */
214 /*
215 void ut_change_lff2eoe (char * beg, char * end) {
216 ASSERT (beg<end)
217 ASSERT (*end==UT_EOL_ALT_CHAR)
218 char * scan = beg;
219 for(;;) {
220 if (*scan==UT_EOL_ALT_CHAR) {
221 if (scan==end) return;
222 *scan=UT_EOL_CHAR;
223 }
224 scan++;
225 }
226 }
227 */
228 // \brief exemple de fonction de remplacement pour ut_change_lff_eoe()
229 /*
230 void ut_change_lff_eoe_maybe (char * beg, char * end)
231 {
232 char * scan; //les variables locales en début de bloc, sinon c'est du C++
233
234 ASSERT (beg!=NULL) //important à tester en debug
235 ASSERT (end!=NULL) //important à tester en debug
236 ASSERT (beg<end)
237 ASSERT (*end==UT_EOL_ALT_CHAR) //c'est sur que cela doit être en ASSERT ?
238
239 *end = UT_EOL_CHAR; //c'est bien le 0 final ? non ?
240
241
242 donc ici pas d'appel de fonction ! c'est une directive __asm {}
243
244 il vaut mieux cependant utiliser memchr, c'est plus sûr (puisque
245 l'on spécifie la taille du buffer), et plus rapide car il utilise
246 REPNE SCASB
247
248 movb AL,octet à rechercher
249 movl EDX,adresse du buffer
250 movl ECX,taille du buffer -1
251 repne scasb
252 je ...
253 EDX contient l'adresse de l'octet trouvé
254
255 for(scan=beg;
256 (scan=strchr(scan,UT_EOL_ALT_CHAR));
257 *scan=UT_EOL_CHAR)
258 ;
259
260 S'il peut y avoir des 0 dans le texte avant l'appel de cette fonction, il faut faire
261 une double boucle pour avancer d'un octet si scan!=end alors que strchr renvoie NULL
262 }
263 */
264
265
266 /***************************************************************************/
267 /*!
268 * \brief Scan the text to detect EOL type and replace EOL by UT_EOL_CHAR or UT_EOL_ALT_CHAR.
269 *
270 * EOL are recognized and replaced by UT_EOL_CHAR (null char), and eventually UT_EOL_ALT_CHAR
271 * if EOL type is UT_EOL_CRLF_CR or UT_EOL_CRLF_LF (see UtEolType).
272 * ut_session->progress_function() is called only if ( text->flags & UT_F_TRANSFORM_EOL )
273 *
274 * \return UT_OK on success, error code otherwise.
275 */
276
277 UtCode ut_eol_pass (UtText * text) {
278
279 char * scan = text->data;
280 char * scan_end = text->data+text->size;
281 ASSERT ( *scan_end == 0 )
282 //ASSERT ( text->flags & UT_F_TRANSFORM_EOL )
283 text->nb_lines = 0;
284 text->nb_lines_alt = 0;
285 ulong cumul=1;
286
287 //while (scan < scan_end) {
288
289 UtEolType eol1 = UT_EOL_NONE;
290 UtEolType eol2 = UT_EOL_NONE;
291
292
293 for (;;) {
294 DBG3_S ("<%d>", *scan);
295
296 if ((u_char)*scan<0x20) { //======== control code =============
297 if (!*scan) { //--------null char
298 if (scan>=scan_end) {
299 ASSERT (scan==scan_end)
300 break;
301 } else if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
302 if (!ut_update_progress (text, scan - text->data, false)) break;
303 cumul++;
304 }
305 }
306 if (*scan == 0xA) { //-------- LF (+CR?) -------------
307 DBG3_S ("*");
308 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
309 ut_update_progress (text, scan - text->data, false);
310 cumul++;
311 }
312
313 if (*(scan+1) == 0xD) { //LFCR
314 switch (eol1) {
315 case UT_EOL_LFCR:
316 case UT_EOL_MIX:
317 if (*(scan+2) == 0xA) goto LF_only;
318 break;
319 case UT_EOL_CRLF:
320 if (*(scan+2) == 0xA) goto LF_only;
321 eol1 = UT_EOL_MIX;
322 if (eol2 != UT_EOL_NONE) {
323 ERROR ("EOL2 todo...")
324 }
325 break;
326 case UT_EOL_CR:
327 case UT_EOL_LF:
328 if (*(scan+2) == 0xA) goto LF_only;
329 ASSERT (eol2 == UT_EOL_NONE)
330 eol2 = eol1;
331 text->nb_lines_alt = text->nb_lines;
332 text->nb_lines = 0;
333 *scan = UT_EOL_CHAR;
334 ut_change_EOL1toEOL2 (text->data, scan);
335 case UT_EOL_NONE:
336 eol1 = UT_EOL_LFCR;
337 break;
338 default:
339 ERROR ("Forbiden case!?!")
340 }
341 *scan++ = UT_EOL_CHAR;
342 *scan++ = text->skip_char;
343 text->nb_lines++;
344 } else { //LF only
345 LF_only:
346 switch (eol1) {
347 case UT_EOL_NONE:
348 eol1 = UT_EOL_LF;
349 case UT_EOL_LF:
350 case UT_EOL_MIX:
351 *scan++ = UT_EOL_CHAR;
352 text->nb_lines++;
353 break;
354 case UT_EOL_CR:
355 eol1 = UT_EOL_MIX;
356 *scan++ = UT_EOL_CHAR;
357 text->nb_lines++;
358 break;
359 case UT_EOL_CRLF:
360 case UT_EOL_LFCR:
361 switch (eol2) {
362 case UT_EOL_NONE:
363 eol2 = UT_EOL_LF;
364 break;
365 case UT_EOL_CR:
366 eol2 = UT_EOL_MIX;
367 case UT_EOL_LF:
368 case UT_EOL_MIX:
369 break;
370 default:
371 ERROR ("Forbiden case!?!")
372 }
373 *scan++ = UT_EOL_ALT_CHAR;
374 text->nb_lines_alt++;
375 break;
376 default:
377 ERROR ("Forbiden case!?!")
378 } //switch
379 } // else LF
380 } else if (*scan == 0xD) { //--------- CR (LF?) ------------
381 DBG3_S ("*");
382 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
383 ut_update_progress (text, scan - text->data, false);
384 cumul++;
385 }
386
387 if (*(scan+1) == 0xA) { //CRLF
388 switch (eol1) {
389 case UT_EOL_CRLF:
390 case UT_EOL_MIX:
391 break;
392 case UT_EOL_LFCR:
393 eol1 = UT_EOL_MIX;
394 if (eol2 != UT_EOL_NONE) {
395 ERROR ("EOL2 todo...")
396 }
397 break;
398 case UT_EOL_CR:
399 case UT_EOL_LF:
400 ASSERT (eol2 == UT_EOL_NONE)
401 eol2 = eol1;
402 text->nb_lines_alt = text->nb_lines;
403 text->nb_lines = 0;
404 *scan = UT_EOL_CHAR;
405 ut_change_EOL1toEOL2 (text->data, scan);
406 case UT_EOL_NONE:
407 eol1 = UT_EOL_CRLF;
408 break;
409 default:
410 ERROR ("Forbiden case!?!")
411 }
412 *scan++ = UT_EOL_CHAR;
413 *scan++ = text->skip_char;
414 text->nb_lines++;
415 } else { //CR only
416 switch (eol1) {
417 case UT_EOL_NONE:
418 eol1 = UT_EOL_CR;
419 case UT_EOL_CR:
420 case UT_EOL_MIX:
421 *scan++ = UT_EOL_CHAR;
422 text->nb_lines++;
423 break;
424 case UT_EOL_LF:
425 eol1 = UT_EOL_MIX;
426 *scan++ = UT_EOL_CHAR;
427 text->nb_lines++;
428 break;
429 case UT_EOL_CRLF:
430 case UT_EOL_LFCR:
431 switch (eol2) {
432 case UT_EOL_CR:
433 case UT_EOL_MIX:
434 break;
435 case UT_EOL_NONE:
436 eol2 = UT_EOL_CR;
437 break;
438 case UT_EOL_LF:
439 eol2 = UT_EOL_MIX;
440 break;
441 default:
442 ERROR ("Forbiden case!?!")
443 }
444 *scan++ = UT_EOL_ALT_CHAR;
445 text->nb_lines_alt++;
446 break;
447 default:
448 ERROR ("Forbiden case!?!")
449 } //switch
450 } // else CR
451 } else if (*scan == 0x9 ) { //------------- tab ----------
452 scan++;
453 } else if (text->flags & UT_F_REMOVE_ILLEGAL_CHAR) {
454 *scan++ = text->skip_char;
455 } //else
456
457 } else { //======== non control code =============
458 if (*scan == 0x7F && (text->flags & UT_F_REMOVE_ILLEGAL_CHAR) ) { //control char del
459 *scan++ = text->skip_char;
460 } else {
461 scan++;
462 } //else
463 } //else
464 } //while
465
466 //interrupted?
467 if (scan<scan_end) {
468 return UT_INTERRUPTED_BY_USER;
469 }
470
471 if (text->flags & UT_F_ADD_FINAL_EOL) {
472 //add EOE if missinG
473 if ( (*(scan-2) != UT_EOL_CHAR || *(scan-1) != text->skip_char)
474 && *(scan-1) != UT_EOL_CHAR ) {
475 if (text->flags & UT_F_TRANSFORM_EOL) {
476 *scan = UT_EOL_CHAR;
477 text->size++;
478 } /* text->flags & UT_F_TRANSFORM_EOL should be true
479 else { switch (text->eol) {
480 case UT_EOL_CR:
481 *scan = 0xD;
482 text->size++;
483 break;
484 case UT_EOL_LF:
485 *scan = 0xA;
486 text->size++;
487 case UT_EOL_LF:
488 *scan++ = 0xD;
489 *scan = 0xA;
490 text->size+=2;
491 } } //else switch
492 */
493 text->nb_lines++;
494 } // if *scan
495 } //if text->flags
496
497 if (text->eol == UT_EOL_UNSET) {
498 text->eol = eol1;
499 text->eol_alt = eol2;
500 } else {
501 text->nb_lines = UT_UNSET;
502 text->nb_lines_alt = UT_UNSET;
503 }
504
505 //verify EOF
506 ASSERT (*scan == UT_EOF_CHAR)
507
508 DBG2 ("End Of Line pass done! (%lu B)", text->size)
509
510 return UT_OK;
511 }
512
513 // ************* Check for UTF16 - big endian & little endian *********
514 /*
515 {
516 ulong error_utf16 = 0 ; //, error_utf16be = 0, error_utf16le = 0;
517 ushort * scanw;
518 ushort * scanw_end;
519
520 if ( ifd->data_size%2) {
521 error_utf16 = -1U;
522 } else {
523 scanw = (ushort *) ifd->data;
524 scanw_end = scanw+ifd->data_size/2;
525 for (;;) {
526 if (!*scanw && scanw==scanw_end) break;
527 if (0xD800 <=*scanw && *scanw < 0xDC00) { //surrogate?
528 unicode = (*scanw & 0x3FF) + 0x400;
529 scanw++;
530 if (!(0xDC00 <= *scanw && *scanw < 0xE000 )) {
531 error_utf16++;
532 if (scanw==scanw_end) break;
533 }
534 unicode <<= 10;
535 unicode |= *scanw & 0x3FF;
536 } else {
537 unicode = *scanw;
538 }
539 if ( ( 0xFDD0 <= unicode && unicode <= 0xFDEF )
540 || ( (unicode & 0xFFF0FFFE) == 0x0000FFFE)
541 || ( unicode >= 0x0010FFFE)
542 || ( 0xD800 <=unicode && unicode < 0xE000) ) {
543 error_utf16++;
544 }
545 scanw++;
546 } //for (;;)
547 printf ("UTF16 : %lu errors\n", error_utf16);
548 } //else
549
550
551 //ulong error_utf32be = 0, error_utf32le = 0;
552 } */