unrarsrc  6.1.7
About: unrar extracts, views and tests the contents of archives created with the RAR archiver.
  Fossies Dox: unrarsrc-6.1.7.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

unicode.cpp
Go to the documentation of this file.
1#include "rar.hpp"
2#define MBFUNCTIONS
3
#if defined(_UNIX) && defined(MBFUNCTIONS)

// First code of the 0xE000 - 0xE0FF private use Unicode area. In Unix we
// place here high ASCII characters which cannot be converted to Unicode.
static const uint MapAreaStart=0xE000;

// Code marking a string with mapped characters. 0xFFFF was used initially,
// but MSVC2008 swprintf treats 0xFFFF as the error marker and fails.
// A workaround is possible, but another character is the safer choice.
static const uint MappedStringMark=0xFFFE;

// Conversion helpers aware of mapped characters.
static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);

#endif
19
20bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
21{
22 bool RetCode=true;
23 *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
24
25#ifdef _WIN_ALL
26 if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
27 RetCode=false;
28
29// wcstombs is broken in Android NDK r9.
30#elif defined(_APPLE)
31 WideToUtf(Src,Dest,DestSize);
32
33#elif defined(MBFUNCTIONS)
34 if (!WideToCharMap(Src,Dest,DestSize,RetCode))
35 {
36 mbstate_t ps; // Use thread safe external state based functions.
37 memset (&ps, 0, sizeof(ps));
38 const wchar *SrcParam=Src; // wcsrtombs can change the pointer.
39
40 // Some implementations of wcsrtombs can cause memory analyzing tools
41 // like valgrind to report uninitialized data access. It happens because
42 // internally these implementations call SSE4 based wcslen function,
43 // which reads 16 bytes at once including those beyond of trailing 0.
44 size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
45
46 if (ResultingSize==(size_t)-1 && errno==EILSEQ)
47 {
48 // Aborted on inconvertible character not zero terminating the result.
49 // EILSEQ helps to distinguish it from small output buffer abort.
50 // We want to convert as much as we can, so we clean the output buffer
51 // and repeat conversion.
52 memset (&ps, 0, sizeof(ps));
53 SrcParam=Src; // wcsrtombs can change the pointer.
54 memset(Dest,0,DestSize);
55 ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
56 }
57
58 if (ResultingSize==(size_t)-1)
59 RetCode=false;
60 if (ResultingSize==0 && *Src!=0)
61 RetCode=false;
62 }
63#else
64 for (int I=0;I<DestSize;I++)
65 {
66 Dest[I]=(char)Src[I];
67 if (Src[I]==0)
68 break;
69 }
70#endif
71 if (DestSize>0)
72 Dest[DestSize-1]=0;
73
74 // We tried to return the empty string if conversion is failed,
75 // but it does not work well. WideCharToMultiByte returns 'failed' code
76 // and partially converted string even if we wanted to convert only a part
77 // of string and passed DestSize smaller than required for fully converted
78 // string. Such call is the valid behavior in RAR code and we do not expect
79 // the empty string in this case.
80
81 return RetCode;
82}
83
84
85bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
86{
87 bool RetCode=true;
88 *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
89
90#ifdef _WIN_ALL
91 if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
92 RetCode=false;
93
94// mbstowcs is broken in Android NDK r9.
95#elif defined(_APPLE)
96 UtfToWide(Src,Dest,DestSize);
97
98#elif defined(MBFUNCTIONS)
99 mbstate_t ps;
100 memset (&ps, 0, sizeof(ps));
101 const char *SrcParam=Src; // mbsrtowcs can change the pointer.
102 size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
103 if (ResultingSize==(size_t)-1)
104 RetCode=false;
105 if (ResultingSize==0 && *Src!=0)
106 RetCode=false;
107
108 if (RetCode==false && DestSize>1)
109 CharToWideMap(Src,Dest,DestSize,RetCode);
110#else
111 for (int I=0;I<DestSize;I++)
112 {
113 Dest[I]=(wchar_t)Src[I];
114 if (Src[I]==0)
115 break;
116 }
117#endif
118 if (DestSize>0)
119 Dest[DestSize-1]=0;
120
121 // We tried to return the empty string if conversion is failed,
122 // but it does not work well. MultiByteToWideChar returns 'failed' code
123 // even if we wanted to convert only a part of string and passed DestSize
124 // smaller than required for fully converted string. Such call is the valid
125 // behavior in RAR code and we do not expect the empty string in this case.
126
127 return RetCode;
128}
129
130
#if defined(_UNIX) && defined(MBFUNCTIONS)
// Convert the wide string to multibyte and restore inconvertible characters
// mapped to the private use Unicode area. We use it for extended ASCII names
// in Unix.
// Returns false without touching 'Dest' and 'Success' if 'Src' does not
// contain the mapped string mark. Otherwise performs the conversion and
// returns true. In this case 'Success' is set to false if at least one
// character could not be converted and was replaced with '_'.
bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
{
  // String with inconvertible characters mapped to private use Unicode area
  // must have the mark code somewhere.
  if (wcschr(Src,(wchar)MappedStringMark)==NULL)
    return false;

  // Seems to be that wcrtomb in some memory analyzing libraries
  // can produce uninitialized output while reporting success on garbage input.
  // So we clean the destination to calm analyzers.
  memset(Dest,0,DestSize);

  // Maximum size of a single multibyte character in the current locale.
  const size_t MaxCharSize=MB_CUR_MAX;

  Success=true;
  uint SrcPos=0,DestPos=0;

  // Always keep the space for one more multibyte character. We compare
  // the sum instead of 'DestSize-MaxCharSize', because the unsigned
  // subtraction wraps around for 'DestSize' smaller than 'MaxCharSize'
  // and then the loop would write beyond the 'Dest' end.
  while (Src[SrcPos]!=0 && DestPos+MaxCharSize<DestSize)
  {
    if (uint(Src[SrcPos])==MappedStringMark)
    {
      // The mark itself is not stored in the resulting string.
      SrcPos++;
      continue;
    }
    // For security reasons do not restore low ASCII codes, so mapping cannot
    // be used to hide control codes like path separators.
    if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
      Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
    else
    {
      mbstate_t ps;
      memset(&ps,0,sizeof(ps));
      if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
      {
        Dest[DestPos]='_';
        Success=false;
      }
      SrcPos++;

      // Measure the just stored multibyte character to advance 'DestPos'.
      // Advance at least by one char if mbrlen reports 0 or an error.
      memset(&ps,0,sizeof(ps));
      int Length=mbrlen(Dest+DestPos,MaxCharSize,&ps);
      DestPos+=Max(Length,1);
    }
  }
  // Nothing can be written to the empty buffer, even the trailing zero.
  if (DestSize>0)
    Dest[Min(DestPos,DestSize-1)]=0;
  return true;
}
#endif
178
179
#if defined(_UNIX) && defined(MBFUNCTIONS)
// Convert the multibyte string to wide, mapping inconvertible characters
// with codes 0x80 and above to the private use Unicode area starting from
// 0xE000. The string with mapped characters is marked by the special
// non-character code placed before the first mapped character.
// We use it for extended ASCII names in Unix.
// 'Success' is set to true only if the end of 'Src' has been reached.
void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
{
  Success=false;
  bool HaveMark=false;
  uint In=0,Out=0;
  for (;;)
  {
    if (Out>=DestSize)
      break;
    if (Src[In]==0)
    {
      Success=true;
      break;
    }

    mbstate_t State;
    memset(&State,0,sizeof(State));
    size_t Converted=mbrtowc(Dest+Out,Src+In,MB_CUR_MAX,&State);
    bool Failed=Converted==(size_t)-1 || Converted==(size_t)-2;

    if (!Failed)
    {
      // Usual character. Find its length to skip it in the source string.
      memset(&State,0,sizeof(State));
      int CharSize=mbrlen(Src+In,MB_CUR_MAX,&State);
      In+=Max(CharSize,1);
      Out++;
      continue;
    }

    // For security reasons we do not want to map low ASCII characters,
    // so we do not have additional .. and path separator codes.
    if (byte(Src[In])<0x80)
      break;

    if (!HaveMark)
    {
      Dest[Out++]=MappedStringMark;
      HaveMark=true;
      if (Out>=DestSize)
        break;
    }
    Dest[Out++]=byte(Src[In++])+MapAreaStart;
  }
  Dest[Min(Out,DestSize-1)]=0;
}
#endif
230
231
232// SrcSize is in wide characters, not in bytes.
233byte* WideToRaw(const wchar *Src,byte *Dest,size_t SrcSize)
234{
235 for (size_t I=0;I<SrcSize;I++,Src++)
236 {
237 Dest[I*2]=(byte)*Src;
238 Dest[I*2+1]=(byte)(*Src>>8);
239 if (*Src==0)
240 break;
241 }
242 return Dest;
243}
244
245
246wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
247{
248 for (size_t I=0;I<DestSize;I++)
249 if ((Dest[I]=Src[I*2]+(Src[I*2+1]<<8))==0)
250 break;
251 return Dest;
252}
253
254
255void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
256{
257 long dsize=(long)DestSize;
258 dsize--;
259 while (*Src!=0 && --dsize>=0)
260 {
261 uint c=*(Src++);
262 if (c<0x80)
263 *(Dest++)=c;
264 else
265 if (c<0x800 && --dsize>=0)
266 {
267 *(Dest++)=(0xc0|(c>>6));
268 *(Dest++)=(0x80|(c&0x3f));
269 }
270 else
271 {
272 if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
273 {
274 c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
275 Src++;
276 }
277 if (c<0x10000 && (dsize-=2)>=0)
278 {
279 *(Dest++)=(0xe0|(c>>12));
280 *(Dest++)=(0x80|((c>>6)&0x3f));
281 *(Dest++)=(0x80|(c&0x3f));
282 }
283 else
284 if (c < 0x200000 && (dsize-=3)>=0)
285 {
286 *(Dest++)=(0xf0|(c>>18));
287 *(Dest++)=(0x80|((c>>12)&0x3f));
288 *(Dest++)=(0x80|((c>>6)&0x3f));
289 *(Dest++)=(0x80|(c&0x3f));
290 }
291 }
292 }
293 *Dest=0;
294}
295
296
297size_t WideToUtfSize(const wchar *Src)
298{
299 size_t Size=0;
300 for (;*Src!=0;Src++)
301 if (*Src<0x80)
302 Size++;
303 else
304 if (*Src<0x800)
305 Size+=2;
306 else
307 if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
308 {
309 if (Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff)
310 {
311 Size+=4; // 4 output bytes for Unicode surrogate pair.
312 Src++;
313 }
314 else
315 Size+=3;
316 }
317 else
318 if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
319 Size+=4;
320 return Size+1; // Include terminating zero.
321}
322
323
324bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
325{
326 bool Success=true;
327 long dsize=(long)DestSize;
328 dsize--;
329 while (*Src!=0)
330 {
331 uint c=byte(*(Src++)),d;
332 if (c<0x80)
333 d=c;
334 else
335 if ((c>>5)==6)
336 {
337 if ((*Src&0xc0)!=0x80)
338 {
339 Success=false;
340 break;
341 }
342 d=((c&0x1f)<<6)|(*Src&0x3f);
343 Src++;
344 }
345 else
346 if ((c>>4)==14)
347 {
348 if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
349 {
350 Success=false;
351 break;
352 }
353 d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
354 Src+=2;
355 }
356 else
357 if ((c>>3)==30)
358 {
359 if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
360 {
361 Success=false;
362 break;
363 }
364 d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
365 Src+=3;
366 }
367 else
368 {
369 Success=false;
370 break;
371 }
372 if (--dsize<0)
373 break;
374 if (d>0xffff)
375 {
376 if (--dsize<0)
377 break;
378 if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
379 {
380 Success=false;
381 continue;
382 }
383 if (sizeof(*Dest)==2) // Use the surrogate pair.
384 {
385 *(Dest++)=((d-0x10000)>>10)+0xd800;
386 *(Dest++)=(d&0x3ff)+0xdc00;
387 }
388 else
389 *(Dest++)=d;
390 }
391 else
392 *(Dest++)=d;
393 }
394 *Dest=0;
395 return Success;
396}
397
398
399// For zero terminated strings.
400bool IsTextUtf8(const byte *Src)
401{
402 return IsTextUtf8(Src,strlen((const char *)Src));
403}
404
405
406// Source data can be both with and without UTF-8 BOM.
407bool IsTextUtf8(const byte *Src,size_t SrcSize)
408{
409 while (SrcSize-- > 0)
410 {
411 byte C=*(Src++);
412 int HighOne=0; // Number of leftmost '1' bits.
413 for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
414 HighOne++;
415 if (HighOne==1 || HighOne>6)
416 return false;
417 while (--HighOne > 0)
418 if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80)
419 return false;
420 }
421 return true;
422}
423
424
425int wcsicomp(const wchar *s1,const wchar *s2)
426{
427#ifdef _WIN_ALL
428 return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
429#else
430 while (true)
431 {
432 wchar u1 = towupper(*s1);
433 wchar u2 = towupper(*s2);
434 if (u1 != u2)
435 return u1 < u2 ? -1 : 1;
436 if (*s1==0)
437 break;
438 s1++;
439 s2++;
440 }
441 return 0;
442#endif
443}
444
445
446int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
447{
448#ifdef _WIN_ALL
449 // If we specify 'n' exceeding the actual string length, CompareString goes
450 // beyond the trailing zero and compares garbage. So we need to limit 'n'
451 // to real string length.
452 size_t l1=Min(wcslen(s1)+1,n);
453 size_t l2=Min(wcslen(s2)+1,n);
454 return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
455#else
456 if (n==0)
457 return 0;
458 while (true)
459 {
460 wchar u1 = towupper(*s1);
461 wchar u2 = towupper(*s2);
462 if (u1 != u2)
463 return u1 < u2 ? -1 : 1;
464 if (*s1==0 || --n==0)
465 break;
466 s1++;
467 s2++;
468 }
469 return 0;
470#endif
471}
472
473
474// Case insensitive wcsstr().
475const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
476{
477 for (size_t i=0;str[i]!=0;i++)
478 for (size_t j=0;;j++)
479 {
480 if (search[j]==0)
481 return str+i;
482 if (tolowerw(str[i+j])!=tolowerw(search[j]))
483 break;
484 }
485 return NULL;
486}
487
488
489#ifndef SFX_MODULE
491{
492#ifdef _WIN_ALL
493 // _wcslwr requires setlocale and we do not want to depend on setlocale
494 // in Windows. Also CharLower involves less overhead.
495 CharLower(s);
496#else
497 for (wchar *c=s;*c!=0;c++)
498 *c=towlower(*c);
499#endif
500 return s;
501}
502#endif
503
504
505#ifndef SFX_MODULE
507{
508#ifdef _WIN_ALL
509 // _wcsupr requires setlocale and we do not want to depend on setlocale
510 // in Windows. Also CharUpper involves less overhead.
511 CharUpper(s);
512#else
513 for (wchar *c=s;*c!=0;c++)
514 *c=towupper(*c);
515#endif
516 return s;
517}
518#endif
519
520
521
522
// Convert the single wide character code to upper case.
int toupperw(int ch)
{
#if !defined(_WIN_ALL)
  return towupper(ch);
#else
  // CharUpper is more reliable than towupper in Windows, which seems to be
  // C locale dependent even in Unicode version. For example, towupper failed
  // to convert lowercase Russian characters. Use 0xffff mask to prevent crash
  // if value larger than 0xffff is passed to this function.
  wchar *Masked=(wchar *)(INT_PTR)(ch&0xffff);
  return (int)(INT_PTR)CharUpper(Masked);
#endif
}
535
536
// Convert the single wide character code to lower case.
int tolowerw(int ch)
{
#if !defined(_WIN_ALL)
  return towlower(ch);
#else
  // CharLower is more reliable than towlower in Windows, which seems to be
  // C locale dependent even in Unicode version. Use 0xffff mask to prevent
  // crash if value larger than 0xffff is passed to this function.
  wchar *Masked=(wchar *)(INT_PTR)(ch&0xffff);
  return (int)(INT_PTR)CharLower(Masked);
#endif
}
548
549
550int atoiw(const wchar *s)
551{
552 return (int)atoilw(s);
553}
554
555
557{
558 bool sign=false;
559 if (*s=='-') // We do use signed integers here, for example, in GUI SFX.
560 {
561 s++;
562 sign=true;
563 }
564 // Use unsigned type here, since long string can overflow the variable
565 // and signed integer overflow is undefined behavior in C++.
566 uint64 n=0;
567 while (*s>='0' && *s<='9')
568 {
569 n=n*10+(*s-'0');
570 s++;
571 }
572 // Check int64(n)>=0 to avoid the signed overflow with undefined behavior
573 // when negating 0x8000000000000000.
574 return sign && int64(n)>=0 ? -int64(n) : int64(n);
575}
576
577
#ifdef DBCS_SUPPORTED
SupportDBCS gdbcs;

// Fill the code page dependent data when the object is created.
SupportDBCS::SupportDBCS()
{
  Init();
}


// Detect if the ANSI code page can use more than one byte per character
// and build the table of lead bytes.
void SupportDBCS::Init()
{
  CPINFO CPInfo;
  // CPInfo contents cannot be trusted if GetCPInfo failed, so in this case
  // we do not read MaxCharSize and assume the single byte code page.
  DBCSMode=GetCPInfo(CP_ACP,&CPInfo) && CPInfo.MaxCharSize > 1;
  for (uint I=0;I<ASIZE(IsLeadByte);I++)
    IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
}


// Return the pointer to the character following the one at 's'.
char* SupportDBCS::charnext(const char *s)
{
  // Zero cannot be the trail byte. So if next byte after the lead byte
  // is 0, the string is corrupt and we'll better return the pointer to 0,
  // to break string processing loops.
  return (char *)(IsLeadByte[(byte)*s] && s[1]!=0 ? s+2:s+1);
}
#endif
605
606
#define Min(x, y)
Definition: rardefs.hpp:4
#define Max(x, y)
Definition: rardefs.hpp:5
#define ASIZE(x)
Definition: rardefs.hpp:10
wchar_t wchar
Definition: rartypes.hpp:13
int64_t int64
Definition: rartypes.hpp:12
unsigned int uint
Definition: rartypes.hpp:8
uint8_t byte
Definition: rartypes.hpp:6
uint64_t uint64
Definition: rartypes.hpp:11
static const uint MappedStringMark
Definition: unicode.cpp:16
bool WideToChar(const wchar *Src, char *Dest, size_t DestSize)
Definition: unicode.cpp:20
static const uint MapAreaStart
Definition: unicode.cpp:11
int wcsicomp(const wchar *s1, const wchar *s2)
Definition: unicode.cpp:425
wchar * wcslower(wchar *s)
Definition: unicode.cpp:490
int64 atoilw(const wchar *s)
Definition: unicode.cpp:556
int atoiw(const wchar *s)
Definition: unicode.cpp:550
static void CharToWideMap(const char *Src, wchar *Dest, size_t DestSize, bool &Success)
Definition: unicode.cpp:183
byte * WideToRaw(const wchar *Src, byte *Dest, size_t SrcSize)
Definition: unicode.cpp:233
const wchar_t * wcscasestr(const wchar_t *str, const wchar_t *search)
Definition: unicode.cpp:475
bool CharToWide(const char *Src, wchar *Dest, size_t DestSize)
Definition: unicode.cpp:85
static bool WideToCharMap(const wchar *Src, char *Dest, size_t DestSize, bool &Success)
Definition: unicode.cpp:134
void WideToUtf(const wchar *Src, char *Dest, size_t DestSize)
Definition: unicode.cpp:255
int toupperw(int ch)
Definition: unicode.cpp:523
wchar * RawToWide(const byte *Src, wchar *Dest, size_t DestSize)
Definition: unicode.cpp:246
int wcsnicomp(const wchar *s1, const wchar *s2, size_t n)
Definition: unicode.cpp:446
int tolowerw(int ch)
Definition: unicode.cpp:537
bool UtfToWide(const char *Src, wchar *Dest, size_t DestSize)
Definition: unicode.cpp:324
size_t WideToUtfSize(const wchar *Src)
Definition: unicode.cpp:297
bool IsTextUtf8(const byte *Src)
Definition: unicode.cpp:400
wchar * wcsupper(wchar *s)
Definition: unicode.cpp:506
#define charnext(s)
Definition: unicode.hpp:47