"Fossies" - the Fresh Open Source Software Archive 
Member "utrac-0.3.2/src/ut_conversion.c" (4 Jan 2009, 20503 Bytes) of package /linux/privat/old/utrac-0.3.2.tgz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "ut_conversion.c" see the
Fossies "Dox" file reference documentation.
1 /***************************************************************************
2 * ut_conversion.c
3 *
4 * Wed May 26 11:57:43 2004
5 * Copyright 2004 Alliance MCA
6 * Author Antoine Calando (antoine@alliancemca.net)
7 ****************************************************************************/
8 /*
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Library General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 */
23
24 /*! \file
25 * \brief Conversion functions from one charset to another.
26 *
27 * \todo EC les fonctions inline ne fonctionnent qu'avec gcc !! il faudrait mettre
28 * une macro UT_INLINE dans un header.
29 * AC "inline" est pourtant posix, non?
30 * \todo ASSERT sur tous les parametres.
31 *
32 */
33
34 #include <stdlib.h>
35 #include <stdio.h>
36 #include "utrac.h"
37
38 //#undef UT_DEBUG
39 //#define UT_DEBUG 3
40 #include "debug.h"
41
//! \brief Tell whether \a c is an extended (non 7-bit) character, i.e. has its high bit set.
static inline bool is_ext (char c) {
	return ((unsigned char) c & 0x80) != 0;
}
46
47 /**************************************************************************/
/*!
 * \brief Return the size in bytes of a Unicode character once encoded in UTF-8.
 *
 * - UTF-8 on 1 byte:  0000 0000 0xxx xxxx => 0xxx xxxx
 * - UTF-8 on 2 bytes: 0000 0yyy yyxx xxxx => 110y yyyy 10xx xxxx
 * - UTF-8 on 3 bytes: zzzz yyyy yyxx xxxx => 1110 zzzz 10yy yyyy 10xx xxxx
 * - UTF-8 on 4 bytes: 00uu uuzz zzzz yyyy yyxx xxxx => 1111 0uuu 10zz zzzz 10yy yyyy 10xx xxxx
 *
 * \param unicode The scalar value of the Unicode character.
 *
 * \return Size in bytes (1 to 4), or 0 when \a unicode is greater than 0x10FFFF.
 *
 * \note Only the scalar value is tested, so byte order is not a concern here.
 *
 * \bug A value above 0x10FFFF yields 0. Callers that substitute a replacement
 *      character for illegal values have to account for the size of that
 *      replacement themselves (e.g. before sizing a buffer).
 *
 * \note The ranges are tested from the smallest to the largest because, for
 *       latin text, short encodings are by far the most frequent.
 */

int ut_size_unicode (ulong unicode) {
	if (unicode < 0x80)     return 1;	//fits in 7 bits
	if (unicode < 0x800)    return 2;	//fits in 11 bits
	if (unicode < 0x10000)  return 3;	//fits in 16 bits
	if (unicode < 0x110000) return 4;	//up to the last Unicode code point
	return 0;				//not encodable
}
99
100 /*!
101 * \brief Convert an UTF-8 character to Unicode scalar value.
102 *
103 * \param src_p address of the pointer on the beginning of the character. This pointer
104 * is incremented to the beginning of the following character after conversion.
105 *
106 * \return Unicode scalar value of the converted character.
107 *
108 * \bug EC Dans le cas d'un caractère invalide, si c'est le premier il est comptabilisé, mais
109 * si ce n'est pas le premier, il ne l'est pas. C'est vraiment ce que l'on veut ?
110 * AC ??? Si il est compatibilisé! (voir "while (size--) {...}")
111
112 */
113
114 ulong ut_utf8c_to_unicode (char ** src_p) {
115
116 ulong unicode;
117 int size;
118
119 if (! (**src_p&0x80)) { //==0xxx xxxx (d=done,x=don't care)
120 unicode = **src_p; (*src_p)++;
121 return unicode;
122 } else if (! (**src_p&0x40)) { //==d0xx xxxx error!
123 (*src_p)++;
124 return UT_UNICODE_NONCHAR;
125 } else if (! (**src_p&0x20)) { //==dd0x xxxx
126 size = 1;
127 unicode = **src_p & 0x1F;
128 } else if (! (**src_p&0x10)) { //==ddd0 xxxx
129 size = 2;
130 unicode = **src_p & 0x0F;
131 } else if (! (**src_p&0x08)) { //==dddd 0xxx
132 size = 3;
133 unicode = **src_p & 0x07;
134 } else {
135 #if UT_DEBUG > 1
136 printf("<%X:%x:%x:", **src_p & 0xFF, (**src_p|0x20), (**src_p|0x10));
137 ut_print_binary (**src_p & 0xFF);
138 putchar('>');
139 #endif
140 (*src_p)++;
141 return UT_UNICODE_NONCHAR;
142 }
143 (*src_p)++;
144
145 while (size--) {
146 if ((**src_p&0xC0) != 0x80) return UT_UNICODE_NONCHAR; //!=10xx xxxx
147 unicode<<=6;
148 unicode |= **src_p & 0x3F;
149 (*src_p)++;
150 }
151 return unicode;
152 }
153
154 /*!
155 * \brief Convert an Unicode scalar value to UTF-8 character.
156 *
157 * \param dst_p address of the pointer on the buffer where the character is going to
158 * be written. This pointer is incremented to the end of the character + 1
159 * after conversion.
160 * \param unicode Unicode scalar value of the character to convert.
161 *
162 * \bug EC il y a des cas d'erreur, donc il faut un retour, si l'on tombe dans le cas d'erreur
163 * vu qu'il n'y a plus d'incrémentation de dst_p on risque une boucle sans fin.
164 * AC Pas de boucle sans fin, et erreur peu grave.
165 *
166 * \note EC une fonction inline serait peut être la bien venue.
167 * AC vrai, il faudrait mettre la fonction dans un include...
168 */
169
170 void ut_unicode_to_utf8c (ulong unicode, char ** dst_p) {
171
172 if ( !(unicode & ~0x7F)) {
173 //UTF-8 on 1 byte: 0000 0000 0xxx xxxx => 0xxx xxxx
174 *(*dst_p)++ = (char) unicode;
175 } else if ( !(unicode & ~0x7FF)) {
176 //UTF-8 on 2 bytes: 0000 0yyy yyxx xxxx => 110y yyyy 10xx xxxx
177 *(*dst_p)++ = ((char) (unicode>>6) & 0x1F) | 0xC0; //=> 110y yyyy
178 *(*dst_p)++ = ((char) unicode & 0x3F) | 0x80; //=> 10xx xxxx
179 } else if ( !(unicode & ~0xFFFF)) {
180 //UTF-8 on 3 bytes: zzzz yyyy yyxx xxxx => 1110 zzzz 10yy yyyy 10xx xxxx
181 *(*dst_p)++ = ((char) (unicode>>12) & 0x0F) | 0xE0; //=> 1110 zzzz
182 *(*dst_p)++ = ((char) (unicode>>6) & 0x3F) | 0x80; //=> 10yy yyyy
183 *(*dst_p)++ = ((char) unicode & 0x3F) | 0x80; //=> 10xx xxxx
184 } else {
185 ERROR ("*** UTF8 CHAR ON 4 BYTES!!!***");
186 }
187 }
188
189 /*!
190 * \brief Return size in byte of a character after conversion.
191 *
192 * \param text UtText structure containing the source charset and the destination charset.
193 * \param src_p Address of the pointer on the beginning of the character encoded with the
194 * source charset . This pointer will be incremented to the beginning of the
195 * following character.
196 *
197 * \return The size of the character when it will be encoded with the destination charset.
198 *
199 * \warning Il n'y a pas de return final !
200 * \bug AC voir ut_size_unicode() quand retour = 0
201 */
202
203 int ut_size_char (char **src_p, UtCharsetIndex src_charset, UtCharsetIndex dst_charset) {
204
205 ASSERT (*src_p)
206 ASSERT (src_charset != UT_UNSET)
207 if (dst_charset == UT_UNSET) dst_charset = ut_session->charset_default;
208
209 ulong unicode;
210 UtCharset * src_cs = &(ut_session->charset [src_charset]);
211 UtCharset * dst_cs = &(ut_session->charset [dst_charset]);
212
213 if (src_cs->type == UT_CST_ASCII || dst_cs->type == UT_CST_ASCII) {
214 if (src_cs->type == UT_CST_UTF_8) ut_utf8c_to_unicode(src_p);
215 else (*src_p)++;
216 if (ut_session->nomapping_char<0x80) return 1;
217 else return 0;
218 } else if (src_cs->type == UT_CST_ASCII_EXTENSION) {
219 unicode = src_cs->unicode [(u_char) **src_p];
220 (*src_p)++;
221 } else if (src_cs->type == UT_CST_UTF_8) {
222 unicode = ut_utf8c_to_unicode (src_p);
223 } else {
224 ERROR ("charset type not managed : %d", src_cs->type)
225 }
226
227 if (unicode==UT_UNICODE_NONCHAR) unicode = ut_session->nomapping_char;
228
229 if (dst_cs->type == UT_CST_UTF_8) {
230 return ut_size_unicode (unicode);
231 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
232 return 1; //often 1 and seldom 0... so let's answer quickly
233 /*if (unicode<0x80) return 1;
234 else {
235 int i; for (i=0x80; i<0x100; i++) if (unicode==dst_cs->unicode[i]) break;
236 if(i!=0x100) return 1
237 && ut_session->nomapping_char >= 0x100) return 0;
238 else return 1;
239 }*/
240 } else { ERROR ("charset type not managed : %d", src_cs->type) }
241
242 }
243
244 /*!
245 * \brief Convert a character.
246 *
247 * \param text UtText structure containing the source charset and the destination charset.
248 * \param src_p Address of the pointer on the beginning of the character encoded with the
249 * source charset . This pointer will be incremented to the beginning of the
250 * following character.
251 * \param dst_p address of the pointer on the buffer where the converted character will be
252 * written. This pointer is incremented to the end of the character + 1 after conversion.
253 *
254 * \todo EC Il y a des cas d'erreur (ERROR), on doit donc pourvoir retourner l'erreur.
255 * AC Les erreurs sont gérées : elles sont indiquées par un 'ut_session->nomapping_char' dans le
256 * texte.
257 */
258
259 void ut_conv_char (char ** src_p, char ** dst_p, UtCharsetIndex src_charset, UtCharsetIndex dst_charset) {
260 ASSERT (*src_p)
261 ASSERT (*dst_p)
262 ASSERT (src_charset != UT_UNSET)
263 if (dst_charset == UT_UNSET) dst_charset = ut_session->charset_default;
264
265 ulong unicode;
266 UtCharset * src_cs = &(ut_session->charset [src_charset]);
267 UtCharset * dst_cs = &(ut_session->charset [dst_charset]);
268
269 if (src_cs->type == UT_CST_ASCII || dst_cs->type == UT_CST_ASCII) {
270 if (src_cs->type == UT_CST_UTF_8) ut_utf8c_to_unicode(src_p);
271 else (*src_p)++;
272 if (ut_session->nomapping_char<0x80) *(*dst_p)++ = (char) ut_session->nomapping_char;
273 return;
274 } else if (src_cs->type == UT_CST_ASCII_EXTENSION) {
275 unicode = src_cs->unicode [(u_char) **src_p];
276 (*src_p)++;
277 } else if (src_cs->type == UT_CST_UTF_8) {
278 unicode = ut_utf8c_to_unicode (src_p);
279 } else {ERROR ("charset type not managed : %d", src_cs->type) }
280
281 if (unicode!=UT_UNICODE_NONCHAR) {
282 if (dst_cs->type == UT_CST_UTF_8) {
283 if (unicode==UT_UNICODE_NONCHAR) unicode = ut_session->nomapping_char;
284 ut_unicode_to_utf8c (unicode, dst_p);
285 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
286 if (unicode<0x80) {
287 *(*dst_p)++ = (char) unicode;
288 } else {
289 int i; for (i=0x80; i<0x100; i++) if (unicode==dst_cs->unicode[i]) break;
290 if(i<0x100) {
291 *(*dst_p)++ = (char) i;
292 } else {
293 if (ut_session->nomapping_char < 0x100) *(*dst_p)++ = (char) ut_session->nomapping_char;
294 }
295 }
296 } else {
297 ERROR ("charset type not managed : %d", src_cs->type)
298 }
299 } else {
300 if (ut_session->nomapping_char < 0x80) *(*dst_p)++ = (char) ut_session->nomapping_char;
301 else ERROR ("nomapping char must be < 0x80") ;
302 }
303
304 }
305
306
307 void ut_insert_eol (char ** dst_p, UtEolType dst_eol) {
308
309 switch (dst_eol) {
310 case UT_EOL_CRLF:
311 DBG3_S ("+CR");
312 *(*dst_p)++ = 0xD;
313 case UT_EOL_LF:
314 DBG3_S ("+LF");
315 *(*dst_p)++ = 0xA; break;
316 case UT_EOL_LFCR:
317 DBG3_S ("+LF");
318 *(*dst_p)++ = 0xA;
319 case UT_EOL_CR:
320 DBG3_S ("+CR");
321 *(*dst_p)++ = 0xD; break;
322 case UT_EOL_BSN:
323 DBG3_S ("+BSN");
324 *(*dst_p)++ = '\\'; *(*dst_p)++ = 'n'; break;
325 case UT_EOL_NUL:
326 DBG3_S ("+NUL");
327 *(*dst_p)++ = 0; break;
328 default:
329 ERROR ("EOL not accepted for conversion : %d", dst_eol)
330 }
331 }
332
333 /*!
334 * \brief Count the number of extended character in a text.
335 */
336
337 uint ut_count_ext_char (UtText * text) {
338 uint count = 0, i;
339 for (i=0x80; i<0x100; i++)
340 count += text->distribution[i];
341 return count;
342 }
343
344
345
346 /*!
347 * \brief Return the difference between the size of a text and its size after conversion.
348 *
349 * \param text UtText structure containing the text, the source and the destination charsets
350 *
351 * \return The size difference. If value is negative, the text will be smaller, if positive,
352 * the text will be bigger.
353 *
354 * \todo EC Cette fonction ne retourne pas de code d'erreur alors qu'il y a des ERROR() et que
355 * de mauvais paramètres doivent pouvoir la faire pantée. Il faut donc mettre en
356 * parametre un pointeur sur la variable à fixer (ou l'intégrer dans UtText) et mettre
357 * le type de retour à UtCode.
358 * AC Effectivement, bien qu'il s'agisse alors d'erreurs dues à une mauvaise utilisation
359 * de l'API.
360 * \bug AC voir ut_size_unicode() quand retour = 0
361 */
362
363 int ut_size_difference (UtText * src_text, UtText * dst_text) {
364
365 ASSERT (src_text->charset != UT_UNSET)
366 ASSERT (dst_text->charset != UT_UNSET)
367 ASSERT (src_text->eol != UT_EOL_UNSET)
368 ASSERT (dst_text->eol != UT_EOL_UNSET)
369 ASSERT (src_text->eol_alt != UT_EOL_UNSET)
370 ASSERT (dst_text->eol_alt != UT_EOL_UNSET)
371
372 long size;
373
374 DBG3("*********** size diff********")
375
376 UtCharset * src_cs = &(ut_session->charset [src_text->charset]);
377 UtCharset * dst_cs = &(ut_session->charset [dst_text->charset]);
378
379 if (src_cs->type == UT_CST_ASCII ) {
380 if (dst_cs->type == UT_CST_ASCII) {
381 if (ut_session->nomapping_char && ut_session->nomapping_char <0x80) size = 0;
382 else size = - ut_count_ext_char (src_text);
383 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
384 if (ut_session->nomapping_char <0x100) size = 0;
385 else size = - ut_count_ext_char (src_text);
386 } else if (dst_cs->type == UT_CST_UTF_8) {
387 if (ut_session->nomapping_char != UT_UNICODE_NONCHAR)
388 size = (ut_size_unicode (ut_session->nomapping_char)-1) * ut_count_ext_char (src_text);
389 else size = - ut_count_ext_char (src_text);
390 } else {
391 ERROR ("charset type not managed : %d", dst_cs->type)
392 }
393
394 } else if (src_cs->type == UT_CST_ASCII_EXTENSION) {
395 if (dst_cs->type == UT_CST_ASCII) {
396 if (ut_session->nomapping_char <0x80) size = 0;
397 else size = - ut_count_ext_char (src_text);
398
399 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
400 int count = 0;
401 if (ut_session->nomapping_char>=0x100) {
402 int i; for (i=0x80; i<0x100; i++) {
403 if (src_text->distribution[i]) {
404 ulong unicode = src_cs->unicode[i];
405 int j; for (j=0x80; j<0x100; j++) if (unicode==dst_cs->unicode[j]) break;
406 if (i==0x100) count -= src_text->distribution[i];
407 }
408 }
409 }
410 size = count;
411
412 } else if (dst_cs->type == UT_CST_UTF_8) {
413 int count = 0;
414 int i; for (i=0x80; i<0x100; i++) {
415 if (src_text->distribution[i]) {
416 ulong unicode = src_cs->unicode[i];
417 if (unicode != UT_UNICODE_NONCHAR)
418 count += (ut_size_unicode (unicode) - 1)*src_text->distribution[i];
419 else if (ut_session->nomapping_char!=UT_UNICODE_NONCHAR)
420 count += (ut_size_unicode (ut_session->nomapping_char) - 1)*src_text->distribution[i];
421 else count -= src_text->distribution[i];
422 }
423 }
424 size = count;
425 } else {
426 ERROR ("charset type not managed : %d", dst_cs->type)
427 }
428 } else if (src_cs->type == UT_CST_UTF_8 ) {
429 if (dst_cs->type == UT_CST_ASCII) {
430 if (ut_session->nomapping_char <0x80) size = 0;
431 else size = - ut_count_ext_char (src_text);
432
433 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
434 size = 0;
435
436 } else if (dst_cs->type == UT_CST_UTF_8) {
437 if (ut_session->nomapping_char == UT_UNICODE_NONCHAR) size = 0;
438 else size = - (ut_size_unicode (ut_session->nomapping_char) - 1) * ut_count_ext_char (src_text);
439
440 } else {
441 ERROR ("charset type not managed : %d", dst_cs->type)
442 }
443 } else {
444 ERROR ("charset type not managed : %d", dst_cs->type)
445 }
446
447 DBG3( "** size diff chars : % ld", size);
448
449 switch (src_text->eol) {
450 case UT_EOL_NONE:
451 break;
452 case UT_EOL_CRLF:
453 case UT_EOL_LFCR:
454 switch (dst_text->eol) {
455 case UT_EOL_CRLF:
456 case UT_EOL_LFCR:
457 case UT_EOL_BSN:
458 //case UT_EOL_NOCHANGE:
459 break;
460 case UT_EOL_CR:
461 case UT_EOL_LF:
462 size -= src_text->nb_lines; break;
463 case UT_EOL_NONE:
464 size -= 2*src_text->nb_lines; break;
465 default: //+UT_EOL_NON_STD:
466 ERROR ("dst EOL type unsupported")
467 } break;
468
469 case UT_EOL_NUL:
470 case UT_EOL_CR:
471 case UT_EOL_LF:
472 case UT_EOL_MIX: //1 or 2 bytes, we consider 1 for secureness
473 switch (dst_text->eol) {
474 case UT_EOL_CR:
475 case UT_EOL_LF:
476 case UT_EOL_NUL:
477 //case UT_EOL_NOCHANGE:
478 break;
479 case UT_EOL_CRLF:
480 case UT_EOL_LFCR:
481 case UT_EOL_BSN:
482 size += src_text->nb_lines; break;
483 case UT_EOL_NONE:
484 size -= src_text->nb_lines; break;
485 default: //+UT_EOL_NON_STD:
486 ERROR ("dst EOL type unsupported")
487 } break;
488 default:
489 ERROR ("src EOL type unsupported")
490 }
491
492 DBG3( "** size diff chars+eol : % ld", size);
493
494 switch (src_text->eol_alt) {
495 case UT_EOL_NONE:
496 break;
497 case UT_EOL_NUL:
498 case UT_EOL_CR:
499 case UT_EOL_LF:
500 case UT_EOL_MIX: //1 or 2 bytes, we consider 1 for secureness
501 switch (dst_text->eol_alt) {
502 case UT_EOL_CR:
503 case UT_EOL_LF:
504 case UT_EOL_NUL:
505 //case UT_EOL_NOCHANGE:
506 break;
507 case UT_EOL_CRLF:
508 case UT_EOL_LFCR:
509 case UT_EOL_BSN:
510 size += src_text->nb_lines_alt; break;
511 case UT_EOL_NONE:
512 size -= src_text->nb_lines_alt; break;
513 default: //+UT_EOL_NON_STD:
514 ERROR ("dst EOL type unsupported")
515 } break;
516 default:
517 ERROR ("src EOL type unsupported")
518 }
519
520 DBG3( "** size diff chars+eol+alt : % ld", size);
521
522 return size;
523 }
524
525
526 /*!
527 * \brief Convert extended characters and EOL.
528 *
529 * The conversion consists to :
530 * - remove skip characters,
531 * - change null characters to EOL tpye specified in UtText::dst_eol,
532 * - change extended characters encoded with UtText::src_charset to UtText::dst_charset encoding.
533 *
534 * \param text UtText to convert. Updates UtText::data and UtText::size.
535 *
536 * \return UT_OK on success, error code otherwise.
537 */
538
539 UtCode ut_conversion_pass (UtText * src_text, UtText * dst_text) {
540
541 ASSERT (src_text)
542 ASSERT (dst_text)
543
544 ASSERT (dst_text->data == NULL)
545
546 //TODO? create dst_text?
547 if (dst_text->eol==UT_EOL_UNSET) dst_text->eol = src_text->eol;
548 if (dst_text->eol_alt==UT_EOL_UNSET) dst_text->eol_alt = src_text->eol_alt;
549 free (dst_text->data);
550 dst_text->data = NULL;
551
552 long newsize = ut_size_difference (src_text, dst_text);
553
554 DBG3 ("size diff : %ld ext char : %d", newsize, ut_count_ext_char (src_text) )
555 newsize += src_text->size;
556 DBG3 ("old size: %lu new size: %lu", src_text->size, newsize)
557
558 //Allocate new buffer for dst
559 char *dst_beg = (char*) malloc (newsize+1); //+1 for UT_EOE_CHAR
560 if (!dst_beg) return UT_MALLOC_ERROR;
561
562 char *src = src_text->data;
563 char *src_end = src_text->data + src_text->size;
564 char *dst = dst_beg;
565 int cumul=1;
566
567 for (;;) {
568 DBG3_S ("<%d>", *src);
569 if (!is_ext (*src)) {
570 if (*src) {
571 if (*src==src_text->skip_char) {
572 src++;
573 } else if (*src==UT_EOL_ALT_CHAR) {
574 ut_insert_eol (&dst, dst_text->eol_alt);
575 src++;
576 } else {
577 *dst++ = *src++;
578 }
579 } else { //UT_EOL_CHAR
580 if (src - src_text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
581 if (!ut_update_progress (src_text, src - src_text->data, false)) break;
582 cumul++;
583 }
584 if (src >= src_end) {
585 ASSERT (src==src_end)
586 *dst = 0;
587 break; //last line?
588 }
589 ut_insert_eol (&dst, dst_text->eol);
590 src++;
591 DBG3_S ("!")
592 }
593 } else { //ext_char
594 ut_conv_char (&src, &dst, src_text->charset, dst_text->charset);
595 }
596 } //for (;;)
597
598 if (src < src_end) {
599 //CLEAN HERE!
600 DBG3 ( "interrupted! : src:%d srcend: %d dst:%d", src - src_text->data, src_end - src_text->data, dst - dst_beg)
601 free (dst_beg);
602 return UT_INTERRUPTED_BY_USER;
603 }
604
605
606 ASSERT ( dst - dst_beg <= newsize )
607 DBG3 ( "precalculated size: %ld actual size: %d", newsize, dst - dst_beg)
608
609 //free (src_text->data);
610 dst_text->data = dst_beg;
611 dst_text->size = dst - dst_beg;
612
613 DBG2 ("Conversion done!")
614 return UT_OK;
615 }