"Fossies" - the Fresh Open Source Software Archive 
Member "utrac-0.3.2/src/ut_recognition2.c" (4 Jan 2009, 13419 Bytes) of package /linux/privat/old/utrac-0.3.2.tgz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "ut_recognition2.c" see the
Fossies "Dox" file reference documentation.
1 /***************************************************************************
2 * ut_recognition2.c
3 *
4 * Tue Oct 5 11:29:47 2004
5 * Copyright 2004 Alliance MCA
6 * Written by : Antoine Calando (antoine@alliancemca.net)
7 ****************************************************************************/
8
9 /*
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Library General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23 */
24
25 /*!
26 * \file ut_recognition2.c
27 * \author Antoine Calando (antoine@alliancemca.net)
28 * \brief Extended ASCII charset pass.
29 */
30
31 #include <stdlib.h>
32 #include <stdio.h>
33 #include "ut_text.h"
34 #include "ut_charset.h"
35 #include "utrac.h"
36
37 //#undef UT_DEBUG
38 //#define UT_DEBUG 3
39 #include "debug.h"
40
41 /***************************************************************************/
42 //! \brief Move the scan_pre pointer to the previous character and return it.
43 char inline ut_get_pre_char (char **scan_pre, UtText * text) {
44 do {
45 if (*scan_pre == text->data) return 0;
46 --(*scan_pre);
47 } while (**scan_pre == text->skip_char);
48 return **scan_pre;
49 }
50
51 /***************************************************************************/
52 //! \brief Move the scan_post pointer to the next character and return it.
53 char inline ut_get_post_char (char **scan_post, UtText * text, char *scan_end) {
54 do {
55 if (*scan_post == scan_end) return 0;
56 ++(*scan_post);
57 } while (**scan_post == text->skip_char);
58 return **scan_post;
59 }
60
61
62 /***************************************************************************/
63 /*!
64 * \brief Rate each charset relatively yo the text and register lines with extended characters.
65 *
66 * - Rate single byte extended ascii charsets: the function scan the whole text. Each time an
67 * extended character is found, and for each charset, it is encoded in this charset, compared to
68 * the previous and following character(s), and depending on the result, some points are added to
69 * charset rating. For instance, "café" (Latin1) will get more points than "cafÈ" (MacRoman).
70 * The checksum of all the extended characters in each charset is also calculated, to determine
71 * which charsets will have the same result (see UtCharsetEval).
72 * - Register lines with extended chars: each time an extended character is found, and if that
73 * character was not already found, the line is registered in a linked list (see UtExtCharLine).
74 * After the whole text is scanned, the line linked list is filtered and sorted to keep only
75 * the most revelant lines.
76 *
77 * \todo check if charmap exists!
78 *
79 * \return UT_OK on success, error code otherwise.
80 */
81
82 UtCode ut_xascii_pass (UtText * text) {
83
84
85 int i,j;
86 char * scan = text->data;
87 char * scan_end = text->data + text->size;
88
89 char * line_beg = scan;
90 ulong line_i = 0;
91 ulong nb_ext_chars = 0; //number of ext char in current line
92 bool ext_char[0x80]; for (i=0x0; i<0x80; i++) ext_char[i] = false; //bit for each of the 128 ext char in current line
93 bool ext_char_diff = false; //ext char not previously found in current line?
94
95 UtExtCharLine * scan_exl, * pre_exl, * new_exl;
96 ulong ponct_init[UT_CTG_PONCT_IF_N]; for (i=0; i<UT_CTG_PONCT_IF_N; i++) ponct_init[i] = 0;
97
98
99 if (text->charset == UT_UNSET) {
100 if (!text->evaluation)
101 text->evaluation = (UtCharsetEval*) malloc ( sizeof (UtCharsetEval) * ut_session->nb_charsets);
102
103 for (i=0; i<ut_session->nb_charsets; i++) {
104 text->evaluation [i].rating = 0;
105 text->evaluation [i].checksum = 0;
106 }
107 }
108
109 int cumul = 1;
110 scan--;
111 for (;;) {
112 scan++;
113 if (!*scan) { //eol!!!
114 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
115 if (!ut_update_progress (text, scan - text->data, false)) break;
116 cumul++;
117 }
118 if (scan >= scan_end) {
119 ASSERT (scan==scan_end)
120 break; //last line?
121 }
122 if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
123 if (ext_char_diff) { //extended char in this line?
124 //create new struct
125 new_exl = (UtExtCharLine*) malloc (sizeof(UtExtCharLine));
126 new_exl->line_p = line_beg;
127 new_exl->line_i = line_i;
128 new_exl->nb_ext_chars = nb_ext_chars;
129
130 //the link is inserted in the list which is sorted by
131 //line with biggest number of extended char first
132 if (!text->ext_char //insert struct at first pos?
133 || text->ext_char->nb_ext_chars <= nb_ext_chars ) {
134 new_exl->next = text->ext_char;
135 text->ext_char = new_exl;
136 } else {
137 pre_exl = scan_exl = text->ext_char;
138 while (scan_exl && scan_exl->nb_ext_chars > nb_ext_chars) {
139 pre_exl = scan_exl;
140 scan_exl = scan_exl->next;
141 }
142 pre_exl->next = new_exl;
143 new_exl->next = scan_exl;
144 }
145 ext_char_diff = false;
146 } //if
147 nb_ext_chars = 0;
148 line_beg = scan+1;
149 line_i++;
150 }
151
152 } else if ((u_char)*scan>0x7F) { //char extended found
153 if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
154 nb_ext_chars++;
155 if (!ext_char[(u_char)*scan-0x80]) { //already found?
156 ext_char[(u_char)*scan-0x80] = true;
157 ext_char_diff = true;
158 }
159 }
160
161 if (text->charset == UT_UNSET) {
162
163 UtCharsetEval * cs_eval = &(text->evaluation[0]);
164
165 //rate each charset for this extended char
166 for (i=0; i<ut_session->nb_charsets; i++, cs_eval++) {
167 UtCharset * cs = &(ut_session->charset[i]);
168 if (cs->type != UT_CST_ASCII_EXTENSION) continue;
169
170 char tmp;
171 UtCateg pre1_ctg, pre2_ctg, scan_ctg, post1_ctg, post2_ctg, post3_ctg;
172 UtScript pre1_scr, scan_scr, post1_scr;
173 char * scan_pre = scan, * scan_post = scan;
174
175 //get category and alphabet type of chars at pos scan-1, scan and scan+1
176 scan_ctg = (cs->char_type[(u_char) *scan].categorie);
177 scan_scr = (cs->char_type[(u_char) *scan].script);
178 tmp = ut_get_pre_char (&scan_pre, text);
179 pre1_ctg = (cs->char_type[(u_char) tmp].categorie);
180 pre1_scr = (cs->char_type[(u_char) tmp].script);
181 tmp = ut_get_post_char (&scan_post, text, scan_end);
182 post1_ctg = (cs->char_type[(u_char) tmp].categorie);
183 post1_scr = (cs->char_type[(u_char) tmp].script);
184
185 //compare to previous and following char(s)
186 switch (scan_ctg) {
187 case UT_CTG_UPPERCASE:
188 if ( pre1_ctg==UT_CTG_DELIMITER &&
189 (post1_ctg==UT_CTG_LOWERCASE || post1_ctg==UT_CTG_UPPERCASE)) cs_eval->rating++;
190 else
191 if ( pre1_ctg==UT_CTG_UPPERCASE) cs_eval->rating++;
192 else {
193 post2_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
194 if (post1_ctg==UT_CTG_UPPERCASE && post2_ctg!=UT_CTG_LOWERCASE) cs_eval->rating++;
195 else {
196 pre2_ctg = (cs->char_type [(u_char) ut_get_pre_char (&scan_pre, text)].categorie);
197 if ( pre1_ctg==UT_CTG_DELIMITER && post1_ctg==UT_CTG_DELIMITER &&
198 (( pre2_ctg==UT_CTG_UPPERCASE && post2_ctg==UT_CTG_UPPERCASE) ||
199 (pre2_ctg==UT_CTG_NUMBER && post2_ctg==UT_CTG_NUMBER))) cs_eval->rating++;
200 }
201 } break;
202
203 case UT_CTG_LOWERCASE:
204 if ( pre1_ctg==UT_CTG_LOWERCASE) cs_eval->rating++;
205 else
206 if (post1_ctg==UT_CTG_LOWERCASE) cs_eval->rating++;
207 else
208 if ( pre1_ctg==UT_CTG_UPPERCASE && post1_ctg!=UT_CTG_UPPERCASE) cs_eval->rating++;
209 else {
210 pre2_ctg = (cs->char_type [(u_char) ut_get_pre_char (&scan_pre , text)].categorie);
211 post2_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
212 post3_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
213 if ( pre1_ctg==UT_CTG_DELIMITER && post1_ctg==UT_CTG_DELIMITER &&
214 (( pre2_ctg==UT_CTG_LOWERCASE && (post2_ctg==UT_CTG_LOWERCASE || (post2_ctg==UT_CTG_UPPERCASE && post3_ctg==UT_CTG_LOWERCASE))
215 ) || (pre2_ctg==UT_CTG_NUMBER && post2_ctg==UT_CTG_NUMBER))) cs_eval->rating++;
216 } break;
217 case UT_CTG_OTHER_LETTER:
218 if (pre1_ctg==UT_CTG_OTHER_LETTER) cs_eval->rating++;
219 if (post1_ctg==UT_CTG_OTHER_LETTER) cs_eval->rating++;
220 break;
221
222 case UT_CTG_MARK:
223 if (pre1_ctg>=UT_CTG_UPPERCASE && pre1_ctg<=UT_CTG_OTHER_LETTER) cs_eval->rating++;
224 if (post1_ctg>=UT_CTG_UPPERCASE && post1_ctg<=UT_CTG_OTHER_LETTER) cs_eval->rating++;
225 break;
226
227 case UT_CTG_CONTROL:
228 case UT_CTG_UNSET:
229 cs_eval->rating-=2;
230 break;
231
232 case UT_CTG_CURRENCY:
233 if (pre1_ctg==UT_CTG_NUMBER || post1_ctg==UT_CTG_NUMBER) cs_eval->rating++;
234 else if (pre1_ctg==UT_CTG_DELIMITER) {
235 pre2_ctg = (cs->char_type [(u_char) ut_get_pre_char (&scan_pre , text)].categorie);
236 if (pre2_ctg==UT_CTG_NUMBER ) cs_eval->rating++;
237 }
238 break;
239
240 case UT_CTG_SYMBOL:
241 switch (cs->unicode[(u_char)*scan]) {
242 case 0x00B0: /* ° */
243 pre2_ctg = (cs->char_type [(u_char) ut_get_pre_char (&scan_pre, text)].categorie);
244 if (pre2_ctg>UT_CTG_OTHER_LETTER && (*(scan-1)=='N' || *(scan-1)=='n')
245 && post1_ctg>UT_CTG_OTHER_LETTER) cs_eval->rating+=3;
246 } break;
247 case UT_CTG_DELIMITER:
248 if (pre1_ctg==post1_ctg || *scan==*(scan-1) || *scan==*(scan+1)) cs_eval->rating++;
249 break;
250 case UT_CTG_NUMBER:
251 case UT_CTG_PONCTUATION:
252 case UT_CTG_OTHER: break;
253 default:
254 for (j=0; j<UT_CTG_PONCT_IF_N; j++) {
255 if (scan_ctg==UT_CTG_PONCT_INIT_0+j) ponct_init[j]++;
256 else if (scan_ctg==UT_CTG_PONCT_FINAL_0+j && ponct_init[j]) {
257 ponct_init[j]--;
258 cs_eval->rating+=2;
259 }
260 } //for
261 } //switch
262
263 //rate according to the script
264 if (scan_scr==1) {
265 if (scan_scr== pre1_scr)
266 cs_eval->rating++;
267 if (scan_scr == post1_scr)
268 cs_eval->rating++;
269 } else if (scan_scr>1) {
270 if (scan_scr== pre1_scr)
271 cs_eval->rating+=2;
272 if (scan_scr == post1_scr)
273 cs_eval->rating+=2;
274 }
275
276 } //for nb_charsets
277
278 } //if (text->charset == UT_UNSET)
279
280 } //if (*scan>0x7F)
281
282 } //for (;;)
283
284
285 //interrupted?
286 if (scan<scan_end) {
287 return UT_INTERRUPTED_BY_USER;
288 }
289
290 if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
291 //filter the extended line linked list
292 for (i=0x0; i<0x80; i++) ext_char[i] = false;
293 pre_exl = scan_exl = text->ext_char;
294
295 while (scan_exl) { //scan each struct
296 ext_char_diff = false;
297 scan = scan_exl->line_p;
298 while (*scan) { //scan each char
299 if ((u_char)*scan>0x7F) { //char extended found
300 if (!ext_char[(u_char)*scan-0x80]) { //already found?
301 ext_char[(u_char)*scan-0x80] = true;
302 ext_char_diff = true;
303 }
304 }
305 scan++;
306 }//while
307
308 if (!ext_char_diff) { //remove the struct ext_char_line?
309 pre_exl->next = scan_exl->next; //(first struct is never removed, so this code is ok)
310 free (scan_exl);
311 scan_exl = pre_exl->next;
312 } else {
313 pre_exl = scan_exl;
314 scan_exl = scan_exl->next;
315 }
316 } //while
317
318 //sort the extended line linked list with an insertion sort
319 UtExtCharLine * src_exl, *pre_src_exl;
320 UtExtCharLine * dst_exl, *pre_dst_exl;
321
322 src_exl = pre_src_exl = text->ext_char;
323 while (src_exl) {
324
325 pre_dst_exl = dst_exl = text->ext_char;
326 new_exl = src_exl->next;
327
328 while (src_exl!=dst_exl) {
329 if (src_exl->line_i < dst_exl->line_i) {
330 //insert src before dst postion
331 pre_src_exl->next = src_exl->next;
332 src_exl->next = dst_exl;
333
334 if (dst_exl == text->ext_char) text->ext_char = src_exl; //fisrt pos?
335 else pre_dst_exl->next = src_exl; //second pos or after
336 src_exl = pre_src_exl;
337 break;
338 } //if
339 pre_dst_exl = dst_exl;
340 dst_exl = dst_exl->next;
341 } //while
342 pre_src_exl = src_exl;
343 src_exl = new_exl;
344 } //while
345 }
346
347 if (text->charset == UT_UNSET) {
348 //calculate checksum for each charset
349 for (i=0; i<ut_session->nb_charsets; i++) {
350 if (ut_session->charset[i].type != UT_CST_ASCII_EXTENSION) continue;
351 for (j=0x80; j<0x100; j++) {
352 if ( text->distribution[j]) text->evaluation[i].checksum
353 = ut_crc32 (ut_session->charset[i].unicode[(u_char)j], text->evaluation[i].checksum);
354 }
355 }
356
357 //choose the best charmap depending on the results of the estimation
358 //and on the selected language
359 double max_value = -1; //long could also be used
360 short max_index = -1;
361 double tmp;
362
363 for (i=0; i<ut_session->nb_charsets; i++) {
364 tmp = text->evaluation[i].rating;
365 tmp *= ut_get_charset_coef (i);
366
367 if (tmp > max_value) {
368 max_value = tmp;
369 max_index = i;
370 }
371 }
372 text->charset = max_index;
373
374 if (max_index<0) {
375 DBG1 ("*** NO CHARSET SELECTED !!! ***")
376 //return UT_CHARSET_NOT_RECOGNIZED_ERROR;
377 } else {
378 DBG2 ("%s selected", ut_session->charset[max_index].name)
379 }
380 }
381 DBG2 ("Extended Ascii charset pass done! (%lu B)", text->size)
382
383 return UT_OK;
384 }