"Fossies" - the Fresh Open Source Software Archive

Member "utrac-0.3.2/src/ut_recognition1.c" (4 Jan 2009, 15194 Bytes) of package /linux/privat/old/utrac-0.3.2.tgz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "ut_recognition1.c", see the Fossies "Dox" file reference documentation.

    1 /***************************************************************************
    2  *            ut_recognition1.c
    3  *
    4  *  Tue Oct  5 11:29:40 2004
    5  *  Copyright  2004  Alliance MCA
    6  *  Written by : Antoine Calando (antoine@alliancemca.net)
    7  ****************************************************************************/
    8 
    9 /*
   10  *  This program is free software; you can redistribute it and/or modify
   11  *  it under the terms of the GNU General Public License as published by
   12  *  the Free Software Foundation; either version 2 of the License, or
   13  *  (at your option) any later version.
   14  *
   15  *  This program is distributed in the hope that it will be useful,
   16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
   17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18  *  GNU Library General Public License for more details.
   19  *
   20  *  You should have received a copy of the GNU General Public License
   21  *  along with this program; if not, write to the Free Software
   22  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   23  */
   24  
   25 
   26 /*!
   27  * \file ut_recognition1.c
   28  * \brief Distrib/utf8 pass and EOL pass
   29  * \author Antoine Calando (antoine@alliancemca.net)
   30  */
   31 
   32 #include <stdlib.h>
   33 #include <stdio.h>
   34 #include <string.h>
   35 #include "utrac.h"
   36 
   37 #undef UT_DEBUG
   38 #define UT_DEBUG 1
   39 #include "debug.h"
   40 
   41 /***************************************************************************/
   42 /*!
   43  * \brief Return false if unicode scalar value is invalid
   44  */
   45 bool inline ut_unicode_invalid (ulong unicode) {
   46     return ((   0x0000FDD0 <= unicode && unicode <= 0x0000FDEF  )
   47          || (   0x0010FFFE <= unicode                           )
   48          || ( ( 0xFFF0FFFE  & unicode ) == 0x0000FFFE           ));
   49 }
   50 
   51 /***************************************************************************/
   52 /*!
   53  * \brief Scan the text to calculate frequency distribution and UTF-8 correctness.
   54  *
   55  * This function calculate the frequency distribution, i.e. for i between 0 and 255,
   56  * text->distribution [i] is equal to the number of bytes "i" in the text. This
   57  * distribution is used to determinate if the file is binary or ASCII. The text is also
   58  * simultaneously scanned to check for UTF-8 errors.
   59  *
   60  * \return UT_OK on success, UT_BINARY_DATA_ERROR if file is binary, error code otherwise.
   61  */
   62 UtCode ut_distrib_utf_pass (UtText * text) {
   63     
   64     char * scan = text->data;
   65     char * scan_end;
   66     
   67     ASSERT(text);
   68     ASSERT(text->data);
   69     
   70     //bug! (see assert l85)
   71     if (text->size) scan_end = scan + text->size;
   72     else scan_end = NULL;
   73     
   74     ulong unicode = 0;
   75     ushort multibyte = 0;
   76     ulong error_utf8 = 0;
   77     int cumul = 1;
   78     if (!text->distribution) text->distribution = (ulong*) malloc (sizeof(ulong)*256);
   79     int i; for (i=0; i<0x100; i++) text->distribution[i] = 0;
   80 
   81     scan--; //incrementation at the beginning of the loop is faster
   82     for (;;) {
   83         scan++;
   84         //EC: double test de !*scan !! AC ok
   85         switch (*scan) {
   86           case 0:
   87             if (scan>=scan_end) {
   88                 ASSERT (!scan_end || scan==scan_end)
   89                 goto out_for;
   90             } else if (!scan_end) goto out_for;
   91           case 0xA:
   92           case 0xD:
   93             if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
   94                 if (!ut_update_progress (text, scan - text->data, false)) goto out_for;
   95                 cumul++;
   96             }
   97         }
   98         
   99         text->distribution [(u_char) *scan]++;
  100         if (multibyte) {
  101             if ((*scan & 0xC0) == 0x80) {   //==10xx xxxx
  102                 unicode <<= 6;
  103                 unicode |= *scan & 0x3F;
  104                 if(!--multibyte) { //last multybyte byte? then test if noncharacter (66 cases)
  105                     if (ut_unicode_invalid (unicode)) error_utf8++;
  106                 }
  107             } else {
  108                 multibyte = 0;
  109                 error_utf8++;
  110             }
  111         } else if (*scan & 0x80) { //1xxx xxx
  112             if ((*scan & 0xE0) == 0xC0) { //110x xxxx
  113                 multibyte = 1;
  114                 unicode = *scan & 0x1F;
  115             } else if ((*scan & 0xF0) == 0xE0) { //1110 xxxx
  116                 multibyte = 2;
  117                 unicode = *scan & 0x0F;
  118             } else if ((*scan & 0xF8) == 0xF0) { //1111 0xxx
  119                 multibyte = 3;
  120                 unicode = *scan & 0x07;
  121             } else { //error
  122                 error_utf8++;
  123             }
  124         }
  125     } //for (;;)
  126     out_for:
  127     //interrupted?
  128     //EC: ou il y a déja un 0 dans le texte ! AC test déjà fait
  129     if (scan<scan_end) {
  130         return UT_INTERRUPTED_BY_USER;
  131     }
  132     
  133     if (multibyte) error_utf8++;
  134     
  135     DBG2 ("Distribution and UTF-8 pass done! (%lu B)", text->size)
  136     
  137     if (!text->size) text->size = scan - text->data; //terminating 0 not counted
  138     if (!text->size) return UT_EMPTY_DATA_ERROR;
  139     
  140     ulong nb_ctrl_chars = 0;
  141     // count the number of control chars
  142     for (i=0; i<0x20; i++) {
  143         if (i==0x9 || i==0xA || i==0xD) continue;
  144         nb_ctrl_chars += text->distribution[i];
  145     }
  146     nb_ctrl_chars += text->distribution[0x7F];
  147     
  148     //test if text is actually binary data
  149     if (text->size * UT_THRESHOLD_CONTROL_CHAR < nb_ctrl_chars) {
  150         //to do: detect if UTF16!?!?
  151         DBG3 ("Binary file detected! (%lu cc)", nb_ctrl_chars)
  152         return UT_BINARY_DATA_ERROR;
  153     }
  154     
  155     //count the number of extended char
  156     ulong nb_ext_chars = 0;
  157     for (i=0x80; i<0x100; i++) {
  158         nb_ext_chars += text->distribution[i];
  159     }
  160     DBG3 ("UTF-8 error : %lu, ext char number : %lu", error_utf8, nb_ext_chars)
  161 
  162     if (text->flags & UT_F_IDENTIFY_CHARSET) {
  163         if (!nb_ext_chars) {
  164             //text is ASCII
  165             for (i=0; i<ut_session->nb_charsets; i++) 
  166                 if (ut_session->charset[i].type == UT_CST_ASCII) break;
  167             ASSERT_MSG (i!=ut_session->nb_charsets, "ASCII not defined")
  168             text->charset = i;
  169             DBG3 ("ASCII Encoding detected!")
  170         } else if (nb_ext_chars * UT_THRESHOLD_UTF8 > error_utf8) {
  171             //text is UTF-8
  172         
  173             for (i=0; i<ut_session->nb_charsets; i++) 
  174                 if (ut_session->charset[i].type == UT_CST_UTF_8) break;
  175             ASSERT_MSG (i!=ut_session->nb_charsets, "UTF-8 not defined")
  176             text->charset = i;
  177             DBG3 ("UTF-8 Encoding detected!")
  178         } else {
  179             text->charset = UT_UNSET;
  180         }
  181     }
  182 
  183     return UT_OK;
  184 }
  185 
  186 
  187 /***************************************************************************/
  188 /*!
  189  * \brief Change all UT_EOL_CHAR to UT_EOL_ALT_CHAR, from beg to end-1.
  190  *
  191  * \note EC pourquoi revenir en arrière ?
  192  *       AC Si on s'est trompé de type d'eol (un LF a été scanné avant un CRLF par ex)
  193  */
  194 void ut_change_EOL1toEOL2 (char * beg, char * end) {
  195     ASSERT (beg<end)
  196     ASSERT (*end==UT_EOL_CHAR)
  197     char * scan = beg;
  198     for(;;) {
  199         if (*scan==UT_EOL_CHAR) {
  200             if (scan==end) return;
  201             *scan=UT_EOL_ALT_CHAR;
  202         }
  203         scan++;
  204     }       
  205 }
  206 
  207 /***************************************************************************/
  208 /*
  209  * \brief Change all UT_EOL_ALT_CHAR to UT_EOL_CHAR, from beg to end-1.
  210  *
  211  * \note for a real optimization, use strchr() instead of
  212  *       for(;;) {... scan++ }; strchr() is an assembler macro.
  213  */
  214 /*
  215 void ut_change_lff2eoe (char * beg, char * end) {
  216     ASSERT (beg<end)
  217     ASSERT (*end==UT_EOL_ALT_CHAR)
  218     char * scan = beg;
  219     for(;;) {
  220         if (*scan==UT_EOL_ALT_CHAR) {
  221             if (scan==end) return;
  222             *scan=UT_EOL_CHAR;
  223         }
  224         scan++;
  225     }       
  226 }
  227 */
  228 // \brief example of a replacement function for ut_change_lff_eoe()
  229 /*
  230 void ut_change_lff_eoe_maybe (char * beg, char * end) 
  231     {
  232     char * scan; //local variables go at the start of the block, otherwise it is C++
  233 
  234     ASSERT (beg!=NULL) //important to check in debug builds
  235     ASSERT (end!=NULL) //important to check in debug builds
  236     ASSERT (beg<end)  
  237     ASSERT (*end==UT_EOL_ALT_CHAR) //is it certain that this should be an ASSERT?
  238 
  239     *end = UT_EOL_CHAR; //this is the final 0, isn't it?
  240 
  241     
  242      so there is no function call here! it is an __asm {} directive
  243 
  244      it is however better to use memchr: it is safer (since
  245      the buffer size is specified) and faster because it uses
  246      REPNE SCASB
  247 
  248    movb AL,byte to search for
  249      movl EDX,buffer address
  250      movl ECX,buffer size -1
  251      rpne scasb
  252      je ...
  253      EDX holds the address of the byte found
  254      
  255     for(scan=beg;
  256             (scan=strchr(scan,UT_EOL_ALT_CHAR));
  257             *scan=UT_EOL_CHAR)
  258             ;
  259     
  260     If the text may contain 0 bytes before this function is called, a double loop
  261     is needed to advance by one byte when scan!=end although strchr returned NULL
  262     }
  263     */
  264 
  265 
  266 /***************************************************************************/
  267 /*!
  268  * \brief Scan the text to detect EOL type and replace EOL by UT_EOL_CHAR or UT_EOL_ALT_CHAR.
  269  *  
  270  * EOL are recognized and replaced by UT_EOL_CHAR (null char), and eventually UT_EOL_ALT_CHAR 
  271  * if EOL type is UT_EOL_CRLF_CR or UT_EOL_CRLF_LF (see UtEolType).
  272  * ut_session->progress_function() is called only if ( text->flags & UT_F_TRANSFORM_EOL )
  273  *
  274  * \return UT_OK on success, error code otherwise.
  275  */
  276 
  277 UtCode ut_eol_pass (UtText * text) {
  278 
  279     char * scan = text->data;
  280     char * scan_end = text->data+text->size;
  281     ASSERT ( *scan_end == 0 )
  282     //ASSERT ( text->flags & UT_F_TRANSFORM_EOL )
  283     text->nb_lines = 0;
  284     text->nb_lines_alt = 0;
  285     ulong cumul=1;
  286     
  287     //while (scan < scan_end) {
  288     
  289     UtEolType eol1 = UT_EOL_NONE;
  290     UtEolType eol2 = UT_EOL_NONE;
  291     
  292 
  293     for (;;) {
  294         DBG3_S ("<%d>", *scan);
  295         
  296         if ((u_char)*scan<0x20) {           //======== control code =============
  297             if (!*scan) {                       //--------null char
  298                 if (scan>=scan_end) {   
  299                     ASSERT (scan==scan_end)
  300                     break;
  301                 } else if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
  302                     if (!ut_update_progress (text, scan - text->data, false)) break;
  303                     cumul++;
  304                 }               
  305             }
  306             if (*scan == 0xA) {                 //-------- LF (+CR?)    -------------
  307                 DBG3_S ("*");
  308                 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
  309                     ut_update_progress (text, scan - text->data, false);
  310                     cumul++;
  311                 }
  312 
  313                 if (*(scan+1) == 0xD) { //LFCR
  314                     switch (eol1) {
  315                       case UT_EOL_LFCR:
  316                       case UT_EOL_MIX:
  317                         if (*(scan+2) == 0xA) goto LF_only;
  318                         break;
  319                       case UT_EOL_CRLF:
  320                         if (*(scan+2) == 0xA) goto LF_only;
  321                         eol1 = UT_EOL_MIX;
  322                         if (eol2 != UT_EOL_NONE) {
  323                             ERROR ("EOL2 todo...")
  324                         }
  325                         break;
  326                       case UT_EOL_CR:
  327                       case UT_EOL_LF:
  328                         if (*(scan+2) == 0xA) goto LF_only;
  329                         ASSERT (eol2 == UT_EOL_NONE)
  330                         eol2 = eol1;
  331                         text->nb_lines_alt = text->nb_lines;
  332                         text->nb_lines = 0;
  333                         *scan = UT_EOL_CHAR;
  334                         ut_change_EOL1toEOL2 (text->data, scan);
  335                       case UT_EOL_NONE:
  336                         eol1 = UT_EOL_LFCR;
  337                         break;
  338                       default:
  339                         ERROR ("Forbiden case!?!")
  340                     }
  341                     *scan++ = UT_EOL_CHAR;
  342                     *scan++ = text->skip_char;
  343                     text->nb_lines++;
  344                 } else {     //LF only
  345                     LF_only:
  346                     switch (eol1) {
  347                       case UT_EOL_NONE:
  348                         eol1 = UT_EOL_LF;
  349                       case UT_EOL_LF:
  350                       case UT_EOL_MIX:
  351                         *scan++ = UT_EOL_CHAR;
  352                         text->nb_lines++;
  353                         break;
  354                       case UT_EOL_CR:
  355                         eol1 = UT_EOL_MIX;
  356                         *scan++ = UT_EOL_CHAR;
  357                         text->nb_lines++;
  358                         break;
  359                       case UT_EOL_CRLF:
  360                       case UT_EOL_LFCR:
  361                         switch (eol2) {
  362                           case UT_EOL_NONE: 
  363                             eol2 = UT_EOL_LF;
  364                             break;
  365                           case UT_EOL_CR:
  366                             eol2 = UT_EOL_MIX;
  367                           case UT_EOL_LF:
  368                           case UT_EOL_MIX:
  369                             break;  
  370                           default:
  371                             ERROR ("Forbiden case!?!")
  372                         }
  373                         *scan++ = UT_EOL_ALT_CHAR;
  374                         text->nb_lines_alt++;
  375                         break;
  376                       default:
  377                             ERROR ("Forbiden case!?!")
  378                     } //switch
  379                 } // else LF
  380             } else if (*scan == 0xD) {      //--------- CR (LF?)      ------------
  381                 DBG3_S ("*");
  382                 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
  383                     ut_update_progress (text, scan - text->data, false);
  384                     cumul++;
  385                 }
  386 
  387                 if (*(scan+1) == 0xA) { //CRLF
  388                     switch (eol1) {
  389                       case UT_EOL_CRLF:
  390                       case UT_EOL_MIX:
  391                         break;
  392                       case UT_EOL_LFCR:
  393                         eol1 = UT_EOL_MIX;
  394                         if (eol2 != UT_EOL_NONE) {
  395                             ERROR ("EOL2 todo...")
  396                         }
  397                         break;
  398                       case UT_EOL_CR:
  399                       case UT_EOL_LF:
  400                         ASSERT (eol2 == UT_EOL_NONE)
  401                         eol2 = eol1;
  402                         text->nb_lines_alt = text->nb_lines;
  403                         text->nb_lines = 0;
  404                         *scan = UT_EOL_CHAR;
  405                         ut_change_EOL1toEOL2 (text->data, scan);
  406                       case UT_EOL_NONE:
  407                         eol1 = UT_EOL_CRLF;
  408                         break;
  409                       default:
  410                         ERROR ("Forbiden case!?!")
  411                     }
  412                     *scan++ = UT_EOL_CHAR;
  413                     *scan++ = text->skip_char;
  414                     text->nb_lines++;
  415                 } else {     //CR only
  416                     switch (eol1) {
  417                       case UT_EOL_NONE:
  418                         eol1 = UT_EOL_CR;
  419                       case UT_EOL_CR:
  420                       case UT_EOL_MIX:
  421                         *scan++ = UT_EOL_CHAR;
  422                         text->nb_lines++;
  423                         break;
  424                       case UT_EOL_LF:
  425                         eol1 = UT_EOL_MIX;
  426                         *scan++ = UT_EOL_CHAR;
  427                         text->nb_lines++;
  428                         break;
  429                       case UT_EOL_CRLF:
  430                       case UT_EOL_LFCR:
  431                         switch (eol2) {
  432                           case UT_EOL_CR:
  433                           case UT_EOL_MIX:
  434                             break;  
  435                           case UT_EOL_NONE: 
  436                             eol2 = UT_EOL_CR;
  437                             break;
  438                           case UT_EOL_LF:
  439                             eol2 = UT_EOL_MIX;
  440                             break;
  441                           default:
  442                             ERROR ("Forbiden case!?!")
  443                         }
  444                         *scan++ = UT_EOL_ALT_CHAR;
  445                         text->nb_lines_alt++;
  446                         break;
  447                       default:
  448                             ERROR ("Forbiden case!?!")
  449                     } //switch
  450                 } // else CR
  451             } else if (*scan == 0x9 ) {         //------------- tab ----------
  452                 scan++;
  453             } else if (text->flags & UT_F_REMOVE_ILLEGAL_CHAR) {
  454                 *scan++ = text->skip_char;
  455             } //else
  456             
  457         } else {                    //======== non control code =============
  458             if (*scan == 0x7F && (text->flags & UT_F_REMOVE_ILLEGAL_CHAR) ) {  //control char del
  459                 *scan++ = text->skip_char;
  460             } else {
  461                 scan++;
  462             } //else
  463         } //else
  464     } //while
  465 
  466     //interrupted?
  467     if (scan<scan_end) {
  468         return UT_INTERRUPTED_BY_USER;
  469     }
  470     
  471     if (text->flags & UT_F_ADD_FINAL_EOL) {
  472         //add EOE if missinG
  473         if (   (*(scan-2) != UT_EOL_CHAR || *(scan-1) != text->skip_char)
  474             &&  *(scan-1) != UT_EOL_CHAR ) {
  475             if (text->flags & UT_F_TRANSFORM_EOL) {
  476                 *scan = UT_EOL_CHAR;
  477                 text->size++;
  478             } /* text->flags & UT_F_TRANSFORM_EOL should be true
  479             else { switch (text->eol) {
  480               case UT_EOL_CR:
  481                 *scan = 0xD;
  482                 text->size++;
  483                 break;
  484               case UT_EOL_LF:
  485                 *scan = 0xA;
  486                 text->size++;
  487               case UT_EOL_LF:
  488                 *scan++ = 0xD;
  489                 *scan   = 0xA;
  490                 text->size+=2;
  491             } } //else switch
  492             */
  493             text->nb_lines++;
  494         } // if *scan
  495     } //if text->flags
  496     
  497     if (text->eol == UT_EOL_UNSET) {
  498         text->eol = eol1;
  499         text->eol_alt = eol2;
  500     } else {
  501         text->nb_lines = UT_UNSET;
  502         text->nb_lines_alt = UT_UNSET;
  503     }
  504     
  505     //verify EOF
  506     ASSERT (*scan == UT_EOF_CHAR)
  507 
  508     DBG2 ("End Of Line pass done! (%lu B)", text->size)
  509 
  510     return UT_OK;
  511 }
  512 
  513 // ************* Check for UTF16 - big endian & little endian *********
  514 /*
  515 {
  516     ulong error_utf16 = 0 ; //, error_utf16be = 0, error_utf16le = 0;
  517     ushort * scanw;
  518     ushort * scanw_end;
  519     
  520     if ( ifd->data_size%2) {
  521         error_utf16 = -1U;
  522     } else {
  523         scanw = (ushort *) ifd->data;
  524         scanw_end = scanw+ifd->data_size/2;
  525         for (;;) {
  526             if (!*scanw && scanw==scanw_end) break;
  527             if (0xD800 <=*scanw && *scanw < 0xDC00) { //surrogate?
  528                 unicode = (*scanw & 0x3FF) + 0x400;
  529                 scanw++;
  530                 if (!(0xDC00 <= *scanw && *scanw < 0xE000 )) {
  531                     error_utf16++;
  532                     if (scanw==scanw_end) break;
  533                 }
  534                 unicode <<= 10;
  535                 unicode |= *scanw & 0x3FF;
  536             } else {
  537                 unicode = *scanw;   
  538             }
  539             if (   ( 0xFDD0 <= unicode && unicode <= 0xFDEF )
  540                 || ( (unicode & 0xFFF0FFFE) == 0x0000FFFE)
  541                 || ( unicode >= 0x0010FFFE)
  542                 || ( 0xD800 <=unicode && unicode < 0xE000) ) {
  543                 error_utf16++;
  544             }
  545             scanw++;
  546         } //for (;;)
  547         printf ("UTF16 : %lu errors\n", error_utf16);
  548     } //else
  549     
  550 
  551     //ulong error_utf32be = 0, error_utf32le = 0;
  552 } */