"Fossies" - the Fresh Open Source Software Archive

Member "utrac-0.3.2/src/ut_conversion.c" (4 Jan 2009, 20503 Bytes) of package /linux/privat/old/utrac-0.3.2.tgz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source-code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "ut_conversion.c", see the Fossies "Dox" file reference documentation.

    1 /***************************************************************************
    2  *            ut_conversion.c
    3  *
    4  *  Wed May 26 11:57:43 2004
    5  *  Copyright  2004 Alliance MCA
    6  *  Author Antoine Calando (antoine@alliancemca.net)
    7  ****************************************************************************/
    8 /*
    9  *  This program is free software; you can redistribute it and/or modify
   10  *  it under the terms of the GNU General Public License as published by
   11  *  the Free Software Foundation; either version 2 of the License, or
   12  *  (at your option) any later version.
   13  *
   14  *  This program is distributed in the hope that it will be useful,
   15  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
   16  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   17  *  GNU Library General Public License for more details.
   18  *
   19  *  You should have received a copy of the GNU General Public License
   20  *  along with this program; if not, write to the Free Software
   21  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   22  */
   23 
/*! \file
 *  \brief Conversion functions from one charset to another.
 *
 *  \todo EC Inline functions only work with gcc! A UT_INLINE macro should be
 *        defined in a header.
 *        AC But "inline" is POSIX, isn't it?
 *  \todo ASSERT on all parameters.
 *
 */
   33  
   34 #include <stdlib.h>
   35 #include <stdio.h>
   36 #include "utrac.h"
   37 
   38 //#undef UT_DEBUG
   39 //#define UT_DEBUG 3
   40 #include "debug.h"
   41 
   42 //! \brief Test if c is an extended character.
   43 static inline bool is_ext (char c) {
   44     return (u_char) c >= 0x80;  
   45 }
   46 
   47 /**************************************************************************/
   48 /*!
   49  * \brief Return the size in byte of an Unicode character in UTF-8.
   50  *
   51  * - UTF-8 on 1 byte:  0000 0000  0xxx xxxx => 0xxx xxxx
   52  * - UTF-8 on 2 bytes: 0000 0yyy  yyxx xxxx => 110y yyyy  10xx xxxx
   53  * - UTF-8 on 3 bytes: zzzz yyyy  yyxx xxxx => 1110 zzzz  10yy yyyy  10xx xxxx
   54  * - UTF-8 on 4 bytes: 00uu uuzz  zzzz yyyy  yyxx xxxx => 1111 0uuu 10zz zzzz  10yy yyyy  10xx xxxx
   55  *
   56  * \param unicode The scalar value of the Unicode character.
   57  *
   58  * \return Size in byte.
   59  *
   60  * \bug EC Il n'y a pas de gestion de l'ordre des octets dans le long, ceci ne fonctionne donc
   61  *      pas sous mac (à moins que cela soit fait en amont ?).
   62  *      AC On teste ici la valeur scalaire, c'est à dire un long et rien d'autre, donc pas de prb d'endian.
   63  *
   64  * \bug EC Que ce passe-t-il si unicode>0x10FFFF ? La fonction renvoi 0... si la fonction
   65  *      appelant ne le test pas, on obtiendra une boucle sans fin. Il vaut mieux un programme
   66  *      qui plante qu'un programme qui freeze, la valeur 0 n'est donc pas indiquée.
   67  *      AC Pas de freeze possible! Par contre les carctères illégaux peuvent être remplacé
   68  *      par un carctère au choix de l'utilisateur, il faudrait donc renvoyer la taille de
   69  *      ce caractère d'erreur, afin qu'un éventuel malloc ne soit pas trop court. Mais il
   70  *      faudrait peut être plutôt gérer ça dans les autres fonctions pour ne pas avoir à
   71  *      rajouter une structure UtText en argument.
   72  *
   73  * \note EC Ce code étant extraimement simple et concis, il serait surement interressant de
   74  *       le mettre en inline. De plus si ce code est beaucoup utilisé, on peut faire
   75  *       quelques optimisations (test < == > sur 0xFFFF si le premier test échoue permettrait
   76  *       1 à 2 tests à la place de 1 à 4 tests par exemple).
   77  *       AC Pour l'inline, oui, mais la fonction n'est pas appelé si souvent que ça.
   78  *       Pour l'organisation des tests, les cas les plus fréquents (pour du latin) sont
   79  *       le cas 1, puis le 2, puis le 3 etc... donc tests bien ordonnés.
   80  */
   81 
   82 int ut_size_unicode (ulong unicode) {
   83     if ( !(unicode & ~0x7F)) {
   84         //UTF-8 on 1 byte: 0000 0000  0xxx xxxx => 0xxx xxxx
   85         return 1;
   86     } else if ( !(unicode & ~0x7FF)) {
   87         //UTF-8 on 2 bytes: 0000 0yyy  yyxx xxxx => 110y yyyy  10xx xxxx
   88         return 2;
   89     } else if ( !(unicode & ~0xFFFF)) {
   90         //UTF-8 on 3 bytes: zzzz yyyy  yyxx xxxx => 1110 zzzz  10yy yyyy  10xx xxxx
   91         return 3;
   92     } else if (unicode <= 0x10FFFF ) {
   93         //UTF-8 on 4 bytes: 00uu uuzz  zzzz yyyy  yyxx xxxx => 1111 0uuu 10zz zzzz  10yy yyyy  10xx xxxx
   94         return 4;
   95     } else {
   96         return 0;
   97     }
   98 }
   99 
  100 /*!
  101  * \brief Convert an UTF-8 character to Unicode scalar value.
  102  *
  103  * \param src_p address of the pointer on the beginning of the character. This pointer
  104  *        is incremented to the beginning of the following character after conversion.
  105  *
  106  * \return Unicode scalar value of the converted character.
  107  *
  108  * \bug EC Dans le cas d'un caractère invalide, si c'est le premier il est comptabilisé, mais
  109  *      si ce n'est pas le premier, il ne l'est pas. C'est vraiment ce que l'on veut ?
  110  *      AC ???  Si il est compatibilisé! (voir "while (size--) {...}")
  111 
  112  */
  113 
  114 ulong ut_utf8c_to_unicode (char ** src_p) {
  115     
  116     ulong unicode;
  117     int size;
  118 
  119     if (! (**src_p&0x80)) { //==0xxx xxxx (d=done,x=don't care)
  120         unicode = **src_p; (*src_p)++;
  121         return unicode;
  122     } else if (! (**src_p&0x40)) { //==d0xx xxxx error!
  123         (*src_p)++;
  124         return UT_UNICODE_NONCHAR;
  125     } else if (! (**src_p&0x20)) { //==dd0x xxxx
  126         size = 1;
  127         unicode = **src_p & 0x1F;
  128     } else if (! (**src_p&0x10)) { //==ddd0 xxxx
  129         size = 2;
  130         unicode = **src_p & 0x0F;
  131     } else if (! (**src_p&0x08)) { //==dddd 0xxx
  132         size = 3;
  133         unicode = **src_p & 0x07;
  134     } else {
  135         #if UT_DEBUG > 1
  136         printf("<%X:%x:%x:", **src_p & 0xFF, (**src_p|0x20), (**src_p|0x10));
  137         ut_print_binary (**src_p & 0xFF);
  138         putchar('>');
  139         #endif
  140         (*src_p)++;
  141         return UT_UNICODE_NONCHAR;
  142     }
  143     (*src_p)++;
  144     
  145     while (size--) {
  146         if ((**src_p&0xC0) != 0x80) return UT_UNICODE_NONCHAR; //!=10xx xxxx
  147         unicode<<=6;
  148         unicode |= **src_p & 0x3F;
  149         (*src_p)++;
  150     }
  151     return unicode;
  152 }
  153 
  154 /*!
  155  * \brief Convert an Unicode scalar value to UTF-8 character.
  156  * 
  157  * \param dst_p address of the pointer on the buffer where the character is going to
  158  *        be written. This pointer is incremented to the end of the character + 1
  159  *        after conversion.
  160  * \param unicode Unicode scalar value of the character to convert.
  161  *
  162  * \bug EC il y a des cas d'erreur, donc il faut un retour, si l'on tombe dans le cas d'erreur
  163  *      vu qu'il n'y a plus d'incrémentation de dst_p on risque une boucle sans fin.
  164  *      AC Pas de boucle sans fin, et erreur peu grave.
  165  *
  166  * \note EC une fonction inline serait peut être la bien venue.
  167  *       AC vrai, il faudrait mettre la fonction dans un include...
  168  */
  169 
  170 void ut_unicode_to_utf8c (ulong unicode, char ** dst_p) {
  171     
  172     if ( !(unicode & ~0x7F)) {
  173         //UTF-8 on 1 byte: 0000 0000  0xxx xxxx => 0xxx xxxx
  174         *(*dst_p)++ = (char) unicode;
  175     } else if ( !(unicode & ~0x7FF)) {
  176         //UTF-8 on 2 bytes: 0000 0yyy  yyxx xxxx => 110y yyyy  10xx xxxx
  177         *(*dst_p)++ = ((char) (unicode>>6) & 0x1F) | 0xC0; //=> 110y yyyy
  178         *(*dst_p)++ = ((char) unicode & 0x3F) | 0x80; //=> 10xx xxxx
  179     } else if ( !(unicode & ~0xFFFF)) {
  180         //UTF-8 on 3 bytes: zzzz yyyy  yyxx xxxx => 1110 zzzz  10yy yyyy  10xx xxxx
  181         *(*dst_p)++ = ((char) (unicode>>12) & 0x0F) | 0xE0; //=> 1110 zzzz
  182         *(*dst_p)++ = ((char) (unicode>>6) & 0x3F) | 0x80; //=> 10yy yyyy
  183         *(*dst_p)++ = ((char) unicode & 0x3F) | 0x80; //=> 10xx xxxx
  184     } else {
  185         ERROR ("*** UTF8 CHAR ON 4 BYTES!!!***");
  186     }
  187 }
  188 
  189 /*!
  190  * \brief Return size in byte of a character after conversion.
  191  *
  192  * \param text UtText structure containing the source charset and the destination charset.
  193  * \param src_p Address of the pointer on the beginning of the character encoded with the
  194  *        source charset . This pointer will be incremented to the beginning of the
  195  *        following character.
  196  *
  197  * \return The size of the character when it will be encoded with the destination charset.
  198  *
  199  * \warning Il n'y a pas de return final !
  200  * \bug AC voir ut_size_unicode() quand retour = 0
  201  */
  202 
  203 int ut_size_char (char **src_p, UtCharsetIndex src_charset, UtCharsetIndex dst_charset) {
  204 
  205     ASSERT (*src_p)
  206     ASSERT (src_charset != UT_UNSET)
  207     if (dst_charset == UT_UNSET) dst_charset = ut_session->charset_default;
  208             
  209     ulong unicode;
  210     UtCharset * src_cs = &(ut_session->charset [src_charset]);
  211     UtCharset * dst_cs = &(ut_session->charset [dst_charset]);
  212     
  213     if (src_cs->type == UT_CST_ASCII || dst_cs->type == UT_CST_ASCII) {
  214         if (src_cs->type == UT_CST_UTF_8) ut_utf8c_to_unicode(src_p);
  215         else (*src_p)++;
  216         if (ut_session->nomapping_char<0x80) return 1;
  217         else return 0;
  218     } else if (src_cs->type == UT_CST_ASCII_EXTENSION) {
  219         unicode = src_cs->unicode [(u_char) **src_p];
  220         (*src_p)++;
  221     } else if (src_cs->type == UT_CST_UTF_8) { 
  222         unicode = ut_utf8c_to_unicode (src_p); 
  223     } else { 
  224         ERROR ("charset type not managed : %d", src_cs->type)
  225     }
  226 
  227     if (unicode==UT_UNICODE_NONCHAR) unicode = ut_session->nomapping_char;
  228     
  229     if (dst_cs->type == UT_CST_UTF_8) {
  230         return ut_size_unicode (unicode);
  231     } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
  232         return 1; //often 1 and seldom 0... so let's answer quickly
  233         /*if (unicode<0x80) return 1;
  234         else {
  235             int i; for (i=0x80; i<0x100; i++)   if (unicode==dst_cs->unicode[i]) break;
  236             if(i!=0x100) return 1
  237                 && ut_session->nomapping_char >= 0x100) return 0;
  238             else return 1;
  239         }*/
  240     } else { ERROR ("charset type not managed : %d", src_cs->type) }
  241     
  242 }
  243 
  244 /*!
  245  * \brief Convert a character.
  246  *
  247  * \param text UtText structure containing the source charset and the destination charset.
  248  * \param src_p Address of the pointer on the beginning of the character encoded with the
  249  *        source charset . This pointer will be incremented to the beginning of the
  250  *        following character.
  251  * \param dst_p address of the pointer on the buffer where the converted character will be
  252  *        written. This pointer is incremented to the end of the character + 1 after conversion.
  253  *
  254  * \todo EC Il y a des cas d'erreur (ERROR), on doit donc pourvoir retourner l'erreur.
  255  *       AC Les erreurs sont gérées : elles sont indiquées par un 'ut_session->nomapping_char' dans le
  256  *       texte.
  257  */
  258 
  259 void ut_conv_char (char ** src_p, char ** dst_p, UtCharsetIndex src_charset, UtCharsetIndex dst_charset) {
  260     ASSERT (*src_p)     
  261     ASSERT (*dst_p)     
  262     ASSERT (src_charset != UT_UNSET)
  263     if (dst_charset == UT_UNSET) dst_charset = ut_session->charset_default;
  264 
  265     ulong unicode;
  266     UtCharset * src_cs = &(ut_session->charset [src_charset]);
  267     UtCharset * dst_cs = &(ut_session->charset [dst_charset]);
  268     
  269     if (src_cs->type == UT_CST_ASCII || dst_cs->type == UT_CST_ASCII) {
  270         if (src_cs->type == UT_CST_UTF_8) ut_utf8c_to_unicode(src_p);
  271         else (*src_p)++;
  272         if (ut_session->nomapping_char<0x80) *(*dst_p)++ = (char) ut_session->nomapping_char;
  273         return;
  274     } else if (src_cs->type == UT_CST_ASCII_EXTENSION) {
  275         unicode = src_cs->unicode [(u_char) **src_p];
  276         (*src_p)++;
  277     } else if (src_cs->type == UT_CST_UTF_8) {
  278         unicode = ut_utf8c_to_unicode (src_p);
  279     } else {ERROR ("charset type not managed : %d", src_cs->type) }
  280 
  281     if (unicode!=UT_UNICODE_NONCHAR) {
  282         if (dst_cs->type == UT_CST_UTF_8) {
  283             if (unicode==UT_UNICODE_NONCHAR) unicode = ut_session->nomapping_char;
  284             ut_unicode_to_utf8c (unicode, dst_p);
  285         } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
  286             if (unicode<0x80) {
  287                 *(*dst_p)++ = (char) unicode;
  288             } else {
  289                 int i; for (i=0x80; i<0x100; i++)   if (unicode==dst_cs->unicode[i]) break;
  290                 if(i<0x100) {
  291                     *(*dst_p)++ = (char) i;
  292                 } else {
  293                     if (ut_session->nomapping_char < 0x100) *(*dst_p)++ = (char) ut_session->nomapping_char;
  294                 }
  295             }
  296         } else {
  297             ERROR ("charset type not managed : %d", src_cs->type)   
  298         }
  299     } else {
  300         if (ut_session->nomapping_char < 0x80) *(*dst_p)++ = (char) ut_session->nomapping_char;
  301         else ERROR ("nomapping char must be < 0x80") ;
  302     }
  303 
  304 }
  305 
  306 
  307 void ut_insert_eol (char ** dst_p, UtEolType dst_eol) {
  308 
  309     switch (dst_eol) {
  310       case UT_EOL_CRLF:
  311     DBG3_S ("+CR");
  312         *(*dst_p)++ = 0xD; 
  313       case UT_EOL_LF:
  314     DBG3_S ("+LF");
  315         *(*dst_p)++ = 0xA; break;
  316       case UT_EOL_LFCR:
  317     DBG3_S ("+LF");
  318         *(*dst_p)++ = 0xA; 
  319       case UT_EOL_CR:
  320     DBG3_S ("+CR");
  321         *(*dst_p)++ = 0xD; break;
  322       case UT_EOL_BSN:
  323     DBG3_S ("+BSN");
  324         *(*dst_p)++ = '\\'; *(*dst_p)++ = 'n'; break;
  325       case UT_EOL_NUL:
  326     DBG3_S ("+NUL");
  327         *(*dst_p)++ = 0; break;
  328       default:
  329         ERROR ("EOL not accepted for conversion : %d", dst_eol)
  330       }
  331 }
  332 
  333 /*!
  334  * \brief Count the number of extended character in a text.
  335  */
  336 
  337 uint ut_count_ext_char (UtText * text) {
  338     uint count = 0, i;  
  339     for (i=0x80; i<0x100; i++)
  340         count += text->distribution[i];
  341     return count;
  342 }
  343 
  344 
  345 
  346 /*!
  347  * \brief Return the difference between the size of a text and its size after conversion.
  348  *
  349  * \param text UtText structure containing the text, the source and the destination charsets
  350  *
  351  * \return The size difference. If value is negative, the text will be smaller, if positive,
  352  *         the text will be bigger.
  353  *
  354  * \todo EC Cette fonction ne retourne pas de code d'erreur alors qu'il y a des ERROR() et que
  355  *       de mauvais paramètres doivent pouvoir la faire pantée. Il faut donc mettre en 
  356  *       parametre un pointeur sur la variable à fixer (ou l'intégrer dans UtText) et mettre
  357  *       le type de retour à UtCode.
  358  *       AC Effectivement, bien qu'il s'agisse alors d'erreurs dues à une mauvaise utilisation
  359  *       de l'API.
  360  * \bug AC voir ut_size_unicode() quand retour = 0
  361  */
  362 
  363 int ut_size_difference (UtText * src_text, UtText * dst_text) {
  364     
  365     ASSERT (src_text->charset != UT_UNSET)  
  366     ASSERT (dst_text->charset != UT_UNSET)
  367     ASSERT (src_text->eol != UT_EOL_UNSET)  
  368     ASSERT (dst_text->eol != UT_EOL_UNSET)
  369     ASSERT (src_text->eol_alt != UT_EOL_UNSET)  
  370     ASSERT (dst_text->eol_alt != UT_EOL_UNSET)
  371     
  372     long size;
  373     
  374     DBG3("*********** size diff********")
  375 
  376     UtCharset * src_cs = &(ut_session->charset [src_text->charset]);
  377     UtCharset * dst_cs = &(ut_session->charset [dst_text->charset]);
  378     
  379     if (src_cs->type == UT_CST_ASCII ) {
  380         if (dst_cs->type == UT_CST_ASCII) {
  381             if (ut_session->nomapping_char && ut_session->nomapping_char <0x80) size = 0;
  382             else size = - ut_count_ext_char (src_text);
  383         } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
  384             if (ut_session->nomapping_char <0x100) size = 0;
  385             else size = - ut_count_ext_char (src_text);
  386         } else if (dst_cs->type == UT_CST_UTF_8) {
  387             if (ut_session->nomapping_char != UT_UNICODE_NONCHAR) 
  388                 size = (ut_size_unicode (ut_session->nomapping_char)-1) * ut_count_ext_char (src_text); 
  389             else size = - ut_count_ext_char (src_text); 
  390         } else {
  391             ERROR ("charset type not managed : %d", dst_cs->type)
  392         }
  393 
  394     } else if (src_cs->type == UT_CST_ASCII_EXTENSION) {
  395         if (dst_cs->type == UT_CST_ASCII) {
  396             if (ut_session->nomapping_char <0x80) size = 0;
  397             else size = - ut_count_ext_char (src_text);
  398 
  399         } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
  400             int count = 0;
  401             if (ut_session->nomapping_char>=0x100) {
  402                 int i; for (i=0x80; i<0x100; i++) {
  403                     if (src_text->distribution[i]) {
  404                         ulong unicode = src_cs->unicode[i];
  405                         int j; for (j=0x80; j<0x100; j++) if (unicode==dst_cs->unicode[j]) break;
  406                         if (i==0x100) count -= src_text->distribution[i];
  407                     }
  408                 }
  409             }
  410             size = count;
  411                 
  412         } else if (dst_cs->type == UT_CST_UTF_8) {
  413             int count = 0;
  414             int i; for (i=0x80; i<0x100; i++) {
  415                 if (src_text->distribution[i]) {
  416                     ulong unicode = src_cs->unicode[i];
  417                     if (unicode != UT_UNICODE_NONCHAR) 
  418                         count += (ut_size_unicode (unicode) - 1)*src_text->distribution[i];
  419                     else if (ut_session->nomapping_char!=UT_UNICODE_NONCHAR) 
  420                         count += (ut_size_unicode (ut_session->nomapping_char) - 1)*src_text->distribution[i];
  421                     else count -= src_text->distribution[i];
  422                 }
  423             }
  424             size = count;
  425         } else {
  426             ERROR ("charset type not managed : %d", dst_cs->type)
  427         }
  428     } else if (src_cs->type == UT_CST_UTF_8 ) {
  429         if (dst_cs->type == UT_CST_ASCII) {
  430             if (ut_session->nomapping_char <0x80) size = 0;
  431             else size = - ut_count_ext_char (src_text);
  432 
  433         } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
  434             size = 0;
  435 
  436         } else if (dst_cs->type == UT_CST_UTF_8) {
  437             if (ut_session->nomapping_char == UT_UNICODE_NONCHAR) size = 0;
  438             else size = - (ut_size_unicode (ut_session->nomapping_char) - 1) * ut_count_ext_char (src_text);
  439 
  440         } else {
  441             ERROR ("charset type not managed : %d", dst_cs->type)
  442         }
  443     } else {
  444         ERROR ("charset type not managed : %d", dst_cs->type)
  445     }
  446     
  447     DBG3( "** size diff chars : % ld", size);
  448     
  449     switch (src_text->eol) {
  450       case UT_EOL_NONE:
  451         break;      
  452       case UT_EOL_CRLF:
  453       case UT_EOL_LFCR:
  454         switch (dst_text->eol) { 
  455           case UT_EOL_CRLF:
  456           case UT_EOL_LFCR:
  457           case UT_EOL_BSN:
  458           //case UT_EOL_NOCHANGE:     
  459               break;
  460           case UT_EOL_CR:
  461           case UT_EOL_LF:
  462             size -= src_text->nb_lines; break;
  463           case UT_EOL_NONE:
  464             size -= 2*src_text->nb_lines; break;
  465           default: //+UT_EOL_NON_STD:
  466             ERROR ("dst EOL type unsupported")
  467         } break;
  468         
  469       case UT_EOL_NUL:
  470       case UT_EOL_CR:
  471       case UT_EOL_LF:
  472       case UT_EOL_MIX:      //1 or 2 bytes, we consider 1 for secureness
  473         switch (dst_text->eol) { 
  474           case UT_EOL_CR:
  475           case UT_EOL_LF:
  476           case UT_EOL_NUL:    
  477           //case UT_EOL_NOCHANGE:     
  478             break;
  479           case UT_EOL_CRLF:
  480           case UT_EOL_LFCR:
  481           case UT_EOL_BSN:
  482             size += src_text->nb_lines; break;
  483           case UT_EOL_NONE:
  484             size -= src_text->nb_lines; break;
  485           default: //+UT_EOL_NON_STD:
  486             ERROR ("dst EOL type unsupported")
  487         } break;
  488       default:
  489         ERROR ("src EOL type unsupported")
  490     }
  491 
  492     DBG3( "** size diff chars+eol : % ld", size);
  493     
  494     switch (src_text->eol_alt) {
  495       case UT_EOL_NONE:
  496         break;      
  497       case UT_EOL_NUL:
  498       case UT_EOL_CR:
  499       case UT_EOL_LF:
  500       case UT_EOL_MIX:      //1 or 2 bytes, we consider 1 for secureness
  501         switch (dst_text->eol_alt) { 
  502           case UT_EOL_CR:
  503           case UT_EOL_LF:
  504           case UT_EOL_NUL:    
  505           //case UT_EOL_NOCHANGE:     
  506             break;
  507           case UT_EOL_CRLF:
  508           case UT_EOL_LFCR:
  509           case UT_EOL_BSN:
  510             size += src_text->nb_lines_alt; break;
  511           case UT_EOL_NONE:
  512             size -= src_text->nb_lines_alt; break;
  513           default: //+UT_EOL_NON_STD:
  514             ERROR ("dst EOL type unsupported")
  515         } break;
  516       default:
  517         ERROR ("src EOL type unsupported")
  518     }
  519 
  520     DBG3( "** size diff chars+eol+alt : % ld", size);
  521     
  522     return size;    
  523 }
  524 
  525 
  526 /*!
  527  * \brief Convert extended characters and EOL.
  528  * 
  529  * The conversion consists to :
  530  * - remove skip characters,
  531  * - change null characters to EOL tpye specified in UtText::dst_eol,
  532  * - change extended characters encoded with UtText::src_charset to UtText::dst_charset encoding.
  533  *
  534  * \param text UtText to convert. Updates UtText::data and UtText::size.
  535  *
  536  * \return UT_OK on success, error code otherwise.
  537 */
  538 
  539 UtCode ut_conversion_pass (UtText * src_text, UtText * dst_text) {
  540     
  541     ASSERT (src_text)
  542     ASSERT (dst_text)
  543 
  544     ASSERT (dst_text->data == NULL)
  545     
  546     //TODO? create dst_text?
  547     if (dst_text->eol==UT_EOL_UNSET) dst_text->eol = src_text->eol;
  548     if (dst_text->eol_alt==UT_EOL_UNSET) dst_text->eol_alt = src_text->eol_alt;
  549     free (dst_text->data);
  550     dst_text->data = NULL;
  551     
  552     long newsize = ut_size_difference (src_text, dst_text);
  553     
  554     DBG3 ("size diff : %ld   ext char : %d", newsize, ut_count_ext_char (src_text) )
  555     newsize += src_text->size;
  556     DBG3 ("old size: %lu  new size: %lu", src_text->size, newsize)
  557 
  558     //Allocate new buffer for dst
  559     char *dst_beg = (char*) malloc (newsize+1); //+1 for UT_EOE_CHAR
  560     if (!dst_beg) return UT_MALLOC_ERROR;
  561 
  562     char *src = src_text->data;
  563     char *src_end = src_text->data + src_text->size;
  564     char *dst = dst_beg;
  565     int cumul=1;
  566     
  567     for (;;) {
  568         DBG3_S ("<%d>", *src);
  569         if (!is_ext (*src)) {
  570             if (*src) {
  571                 if (*src==src_text->skip_char) {
  572                     src++;
  573                 } else if (*src==UT_EOL_ALT_CHAR) {
  574                     ut_insert_eol (&dst, dst_text->eol_alt);
  575                     src++;
  576                 } else {
  577                     *dst++ = *src++;
  578                 }
  579             } else { //UT_EOL_CHAR
  580                 if (src - src_text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
  581                     if (!ut_update_progress (src_text, src - src_text->data, false)) break;
  582                     cumul++;
  583                 }
  584                 if (src >= src_end) {
  585                     ASSERT (src==src_end)
  586                     *dst = 0;
  587                     break; //last line?
  588                 }
  589                 ut_insert_eol (&dst, dst_text->eol);
  590                 src++;
  591                 DBG3_S ("!")
  592             }
  593         } else { //ext_char
  594             ut_conv_char (&src, &dst, src_text->charset, dst_text->charset);
  595         }
  596     } //for (;;)
  597 
  598     if (src < src_end) {
  599         //CLEAN HERE!
  600         DBG3 ( "interrupted! : src:%d   srcend: %d dst:%d", src - src_text->data, src_end - src_text->data, dst - dst_beg)
  601         free (dst_beg);
  602         return UT_INTERRUPTED_BY_USER;
  603     }
  604 
  605 
  606     ASSERT ( dst - dst_beg <= newsize )
  607     DBG3 ( "precalculated size: %ld   actual size: %d", newsize, dst - dst_beg)
  608     
  609     //free (src_text->data);
  610     dst_text->data = dst_beg;
  611     dst_text->size = dst - dst_beg;
  612     
  613     DBG2 ("Conversion done!")
  614     return UT_OK;
  615 }