"Fossies" - the Fresh Open Source Software Archive

Member "utrac-0.3.2/src/ut_recognition2.c" (4 Jan 2009, 13419 Bytes) of package /linux/privat/old/utrac-0.3.2.tgz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "ut_recognition2.c", see the Fossies "Dox" file reference documentation.

    1 /***************************************************************************
    2  *            ut_recognition2.c
    3  *
    4  *  Tue Oct  5 11:29:47 2004
    5  *  Copyright  2004  Alliance MCA
    6  *  Written by : Antoine Calando (antoine@alliancemca.net)
    7  ****************************************************************************/
    8 
    9 /*
   10  *  This program is free software; you can redistribute it and/or modify
   11  *  it under the terms of the GNU General Public License as published by
   12  *  the Free Software Foundation; either version 2 of the License, or
   13  *  (at your option) any later version.
   14  *
   15  *  This program is distributed in the hope that it will be useful,
   16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
   17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18  *  GNU Library General Public License for more details.
   19  *
   20  *  You should have received a copy of the GNU General Public License
   21  *  along with this program; if not, write to the Free Software
   22  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   23  */
   24  
   25 /*!
   26  * \file ut_recognition2.c
   27  * \author Antoine Calando (antoine@alliancemca.net)
   28  * \brief Extended ASCII charset pass.
   29  */
   30 
   31 #include <stdlib.h>
   32 #include <stdio.h>
   33 #include "ut_text.h"
   34 #include "ut_charset.h"
   35 #include "utrac.h"
   36 
   37 //#undef UT_DEBUG
   38 //#define UT_DEBUG 3
   39 #include "debug.h"
   40 
   41 /***************************************************************************/
   42 //! \brief Move the scan_pre pointer to the previous character and return it. 
   43 char inline ut_get_pre_char (char **scan_pre, UtText * text) {
   44     do { 
   45         if (*scan_pre == text->data) return 0;
   46         --(*scan_pre);  
   47     } while (**scan_pre == text->skip_char);
   48     return **scan_pre;
   49 }
   50 
   51 /***************************************************************************/
   52 //! \brief Move the scan_post pointer to the next character and return it. 
   53 char inline ut_get_post_char (char **scan_post, UtText * text, char *scan_end) {
   54     do { 
   55         if (*scan_post == scan_end) return 0; 
   56         ++(*scan_post);
   57     } while (**scan_post == text->skip_char);
   58     return **scan_post;
   59 }
   60 
   61 
   62 /***************************************************************************/
   63 /*!
   64  * \brief Rate each charset relatively yo the text and register lines with extended characters.
   65  *
   66  * - Rate single byte extended ascii charsets: the function scan the whole text. Each time an
   67  *   extended character is found, and for each charset, it is encoded in this charset, compared to
   68  *   the previous and following character(s), and depending on the result, some points are added to
   69  *   charset rating. For instance, "café" (Latin1) will get more points than "cafÈ" (MacRoman).
   70  *   The checksum of all the extended characters in each charset is also calculated, to determine
   71  *   which charsets will have the same result (see UtCharsetEval).
   72  * - Register lines with extended chars: each time an extended character is found, and if that
   73  *   character was not already found, the line is registered in a linked list (see UtExtCharLine).
   74  *   After the whole text is scanned, the line linked list is filtered and sorted to keep only
   75  *   the most revelant lines.
   76  *
   77  * \todo check if charmap exists!
   78  *
   79  * \return UT_OK on success, error code otherwise.
   80  */ 
   81 
   82 UtCode ut_xascii_pass (UtText * text) {
   83     
   84 
   85     int i,j;
   86     char * scan = text->data;
   87     char * scan_end = text->data + text->size;
   88     
   89     char * line_beg = scan;
   90     ulong line_i = 0;
   91     ulong nb_ext_chars = 0; //number of ext char in current line
   92     bool ext_char[0x80]; for (i=0x0; i<0x80; i++) ext_char[i] = false;  //bit for each of the 128 ext char in current line
   93     bool ext_char_diff = false;     //ext char not previously found in current line?
   94     
   95     UtExtCharLine * scan_exl, * pre_exl, * new_exl;
   96     ulong  ponct_init[UT_CTG_PONCT_IF_N]; for (i=0; i<UT_CTG_PONCT_IF_N; i++) ponct_init[i] = 0;
   97     
   98     
   99     if (text->charset == UT_UNSET) {
  100         if (!text->evaluation) 
  101             text->evaluation = (UtCharsetEval*) malloc ( sizeof (UtCharsetEval) * ut_session->nb_charsets);
  102         
  103         for (i=0; i<ut_session->nb_charsets; i++) {
  104             text->evaluation [i].rating = 0;
  105             text->evaluation [i].checksum = 0;
  106         }
  107     }
  108 
  109     int cumul = 1;
  110     scan--;
  111     for (;;) {
  112         scan++;
  113         if (!*scan) { //eol!!!
  114             if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
  115                 if (!ut_update_progress (text, scan - text->data, false)) break;
  116                 cumul++;
  117             }
  118             if (scan >= scan_end) {
  119                 ASSERT (scan==scan_end)
  120                 break; //last line?
  121             }
  122             if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
  123                 if (ext_char_diff) { //extended char in this line?
  124                     //create new struct
  125                     new_exl = (UtExtCharLine*) malloc (sizeof(UtExtCharLine));
  126                     new_exl->line_p = line_beg;
  127                     new_exl->line_i = line_i;
  128                     new_exl->nb_ext_chars = nb_ext_chars;
  129                     
  130                     //the link is inserted in the list which is sorted by
  131                     //line with biggest number of extended char first
  132                     if (!text->ext_char         //insert struct at first pos?
  133                         || text->ext_char->nb_ext_chars <= nb_ext_chars ) {
  134                         new_exl->next = text->ext_char;
  135                         text->ext_char = new_exl;
  136                     } else {
  137                         pre_exl = scan_exl = text->ext_char;
  138                         while (scan_exl && scan_exl->nb_ext_chars > nb_ext_chars) {
  139                             pre_exl = scan_exl;
  140                             scan_exl = scan_exl->next;
  141                         }
  142                         pre_exl->next = new_exl;
  143                         new_exl->next = scan_exl;
  144                     }
  145                     ext_char_diff = false;
  146                 } //if
  147                 nb_ext_chars = 0;
  148                 line_beg = scan+1;
  149                 line_i++;
  150             }
  151             
  152         } else if ((u_char)*scan>0x7F) { //char extended found
  153             if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
  154                 nb_ext_chars++;
  155                 if (!ext_char[(u_char)*scan-0x80]) { //already found?
  156                     ext_char[(u_char)*scan-0x80] = true;
  157                     ext_char_diff = true;
  158                 }
  159             }
  160 
  161             if (text->charset == UT_UNSET) {    
  162             
  163                 UtCharsetEval * cs_eval = &(text->evaluation[0]);
  164         
  165                 //rate each charset for this extended char
  166                 for (i=0; i<ut_session->nb_charsets; i++, cs_eval++) {
  167                     UtCharset * cs = &(ut_session->charset[i]);
  168                     if (cs->type != UT_CST_ASCII_EXTENSION) continue;
  169     
  170                     char tmp;
  171                     UtCateg pre1_ctg, pre2_ctg, scan_ctg, post1_ctg, post2_ctg, post3_ctg;
  172                     UtScript pre1_scr, scan_scr, post1_scr;
  173                     char * scan_pre = scan, * scan_post = scan;
  174                     
  175                     //get category and alphabet type of chars at pos scan-1, scan and scan+1
  176                     scan_ctg  = (cs->char_type[(u_char) *scan].categorie);
  177                     scan_scr  = (cs->char_type[(u_char) *scan].script);
  178                     tmp = ut_get_pre_char  (&scan_pre, text);
  179                     pre1_ctg  = (cs->char_type[(u_char) tmp].categorie);
  180                     pre1_scr  = (cs->char_type[(u_char) tmp].script);
  181                     tmp = ut_get_post_char (&scan_post, text, scan_end);
  182                     post1_ctg = (cs->char_type[(u_char) tmp].categorie);
  183                     post1_scr  = (cs->char_type[(u_char) tmp].script);
  184                     
  185                     //compare to previous and following char(s)
  186                     switch (scan_ctg) {
  187                       case UT_CTG_UPPERCASE:
  188                         if     ( pre1_ctg==UT_CTG_DELIMITER && 
  189                                 (post1_ctg==UT_CTG_LOWERCASE || post1_ctg==UT_CTG_UPPERCASE))       cs_eval->rating++;
  190                         else
  191                             if ( pre1_ctg==UT_CTG_UPPERCASE)                                cs_eval->rating++;
  192                         else {
  193                             post2_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
  194                             if (post1_ctg==UT_CTG_UPPERCASE && post2_ctg!=UT_CTG_LOWERCASE)         cs_eval->rating++;
  195                             else {
  196                                 pre2_ctg  = (cs->char_type [(u_char) ut_get_pre_char  (&scan_pre, text)].categorie);
  197                                 if ( pre1_ctg==UT_CTG_DELIMITER && post1_ctg==UT_CTG_DELIMITER &&
  198                                   (( pre2_ctg==UT_CTG_UPPERCASE && post2_ctg==UT_CTG_UPPERCASE) ||
  199                                     (pre2_ctg==UT_CTG_NUMBER && post2_ctg==UT_CTG_NUMBER)))             cs_eval->rating++;
  200                             } 
  201                         } break;
  202                         
  203                       case UT_CTG_LOWERCASE:
  204                         if     ( pre1_ctg==UT_CTG_LOWERCASE)                                cs_eval->rating++;
  205                         else 
  206                             if (post1_ctg==UT_CTG_LOWERCASE)                                cs_eval->rating++;
  207                         else
  208                             if ( pre1_ctg==UT_CTG_UPPERCASE && post1_ctg!=UT_CTG_UPPERCASE)         cs_eval->rating++;
  209                         else {
  210                             pre2_ctg  = (cs->char_type [(u_char) ut_get_pre_char  (&scan_pre , text)].categorie);
  211                             post2_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
  212                             post3_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
  213                             if ( pre1_ctg==UT_CTG_DELIMITER && post1_ctg==UT_CTG_DELIMITER &&
  214                               (( pre2_ctg==UT_CTG_LOWERCASE && (post2_ctg==UT_CTG_LOWERCASE || (post2_ctg==UT_CTG_UPPERCASE && post3_ctg==UT_CTG_LOWERCASE)) 
  215                               ) || (pre2_ctg==UT_CTG_NUMBER && post2_ctg==UT_CTG_NUMBER)))      cs_eval->rating++;
  216                         } break;
  217                       case UT_CTG_OTHER_LETTER:
  218                             if (pre1_ctg==UT_CTG_OTHER_LETTER)                          cs_eval->rating++;
  219                             if (post1_ctg==UT_CTG_OTHER_LETTER)                             cs_eval->rating++;
  220                         break;
  221     
  222                       case UT_CTG_MARK:
  223                             if (pre1_ctg>=UT_CTG_UPPERCASE && pre1_ctg<=UT_CTG_OTHER_LETTER)        cs_eval->rating++;
  224                             if (post1_ctg>=UT_CTG_UPPERCASE && post1_ctg<=UT_CTG_OTHER_LETTER)  cs_eval->rating++;
  225                         break;
  226     
  227                       case UT_CTG_CONTROL:
  228                       case UT_CTG_UNSET:
  229                         cs_eval->rating-=2;
  230                         break;
  231     
  232                       case UT_CTG_CURRENCY:
  233                             if (pre1_ctg==UT_CTG_NUMBER || post1_ctg==UT_CTG_NUMBER) cs_eval->rating++;
  234                             else if (pre1_ctg==UT_CTG_DELIMITER) {
  235                                 pre2_ctg  = (cs->char_type [(u_char) ut_get_pre_char  (&scan_pre , text)].categorie);
  236                                 if (pre2_ctg==UT_CTG_NUMBER ) cs_eval->rating++;
  237                             }
  238                         break;
  239     
  240                       case UT_CTG_SYMBOL:
  241                         switch (cs->unicode[(u_char)*scan]) {
  242                           case 0x00B0: /* ° */
  243                             pre2_ctg  = (cs->char_type [(u_char) ut_get_pre_char  (&scan_pre, text)].categorie);  
  244                             if (pre2_ctg>UT_CTG_OTHER_LETTER && (*(scan-1)=='N' || *(scan-1)=='n') 
  245                                 && post1_ctg>UT_CTG_OTHER_LETTER) cs_eval->rating+=3;
  246                         } break;
  247                       case UT_CTG_DELIMITER:
  248                         if (pre1_ctg==post1_ctg || *scan==*(scan-1) || *scan==*(scan+1)) cs_eval->rating++;
  249                         break;
  250                       case UT_CTG_NUMBER:
  251                       case UT_CTG_PONCTUATION:
  252                       case UT_CTG_OTHER:  break;
  253                       default: 
  254                         for (j=0; j<UT_CTG_PONCT_IF_N; j++) {
  255                             if (scan_ctg==UT_CTG_PONCT_INIT_0+j) ponct_init[j]++;
  256                             else if (scan_ctg==UT_CTG_PONCT_FINAL_0+j && ponct_init[j]) {
  257                                 ponct_init[j]--;
  258                                 cs_eval->rating+=2;
  259                             }
  260                         } //for
  261                     } //switch
  262                     
  263                     //rate according to the script
  264                     if (scan_scr==1) {
  265                         if (scan_scr== pre1_scr)
  266                             cs_eval->rating++;
  267                         if (scan_scr == post1_scr)
  268                             cs_eval->rating++;
  269                     } else if (scan_scr>1) {
  270                         if (scan_scr== pre1_scr)
  271                             cs_eval->rating+=2;
  272                         if (scan_scr == post1_scr)
  273                             cs_eval->rating+=2;
  274                     }
  275 
  276                 } //for nb_charsets
  277 
  278             } //if (text->charset == UT_UNSET)
  279         
  280         } //if (*scan>0x7F)
  281 
  282     } //for (;;)
  283     
  284     
  285     //interrupted?
  286     if (scan<scan_end) {
  287         return UT_INTERRUPTED_BY_USER;
  288     }
  289     
  290     if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
  291         //filter the extended line linked list
  292         for (i=0x0; i<0x80; i++) ext_char[i] = false;
  293         pre_exl = scan_exl = text->ext_char;
  294         
  295         while (scan_exl) {  //scan each struct
  296             ext_char_diff = false;
  297             scan = scan_exl->line_p;
  298             while (*scan) { //scan each char
  299                 if ((u_char)*scan>0x7F) { //char extended found
  300                     if (!ext_char[(u_char)*scan-0x80]) { //already found?
  301                         ext_char[(u_char)*scan-0x80] = true;
  302                         ext_char_diff = true;
  303                     }
  304                 }
  305                 scan++;
  306             }//while
  307             
  308             if (!ext_char_diff) { //remove the struct ext_char_line?
  309                 pre_exl->next = scan_exl->next; //(first struct is never removed, so this code is ok)
  310                 free (scan_exl);
  311                 scan_exl = pre_exl->next;
  312             } else {
  313                 pre_exl = scan_exl;
  314                 scan_exl = scan_exl->next;
  315             }
  316         } //while
  317 
  318         //sort the extended line linked list with an insertion sort
  319         UtExtCharLine * src_exl, *pre_src_exl;
  320         UtExtCharLine * dst_exl, *pre_dst_exl;
  321         
  322         src_exl = pre_src_exl = text->ext_char;
  323         while (src_exl) {
  324             
  325             pre_dst_exl = dst_exl = text->ext_char;
  326             new_exl = src_exl->next;
  327             
  328             while (src_exl!=dst_exl) {
  329                 if (src_exl->line_i < dst_exl->line_i) {
  330                     //insert src before dst postion
  331                     pre_src_exl->next = src_exl->next;
  332                     src_exl->next = dst_exl;
  333     
  334                     if (dst_exl == text->ext_char)  text->ext_char = src_exl; //fisrt pos?
  335                     else pre_dst_exl->next = src_exl;   //second pos or after
  336                     src_exl = pre_src_exl;
  337                     break;
  338                 } //if
  339                 pre_dst_exl = dst_exl;
  340                 dst_exl = dst_exl->next;
  341             } //while
  342             pre_src_exl = src_exl;
  343             src_exl = new_exl;
  344         } //while
  345     }
  346     
  347     if (text->charset == UT_UNSET) {    
  348         //calculate checksum for each charset
  349         for (i=0; i<ut_session->nb_charsets; i++) {
  350             if (ut_session->charset[i].type != UT_CST_ASCII_EXTENSION) continue;
  351             for (j=0x80; j<0x100; j++) {
  352                 if ( text->distribution[j]) text->evaluation[i].checksum 
  353                         = ut_crc32 (ut_session->charset[i].unicode[(u_char)j], text->evaluation[i].checksum);
  354             }
  355         }
  356 
  357         //choose the best charmap depending on the results of the estimation
  358         //and on the selected language
  359         double max_value = -1; //long could also be used
  360         short max_index = -1;
  361         double tmp;
  362         
  363         for (i=0; i<ut_session->nb_charsets; i++) {
  364             tmp = text->evaluation[i].rating;
  365             tmp *= ut_get_charset_coef (i);
  366 
  367             if (tmp > max_value) {
  368                 max_value = tmp;
  369                 max_index = i;
  370             }
  371         }
  372         text->charset = max_index;
  373     
  374         if (max_index<0) {
  375             DBG1 ("*** NO CHARSET SELECTED !!! ***")
  376             //return UT_CHARSET_NOT_RECOGNIZED_ERROR;
  377         } else {
  378             DBG2 ("%s selected", ut_session->charset[max_index].name)
  379         }
  380     }
  381     DBG2 ("Extended Ascii charset pass done! (%lu B)", text->size)
  382         
  383     return UT_OK;
  384 }