"Fossies" - the Fresh Open Source Software Archive

Member "tcpflow-1.6.1/src/be13_api/unicode_escape.cpp" (19 Feb 2021, 10793 Bytes) of package /linux/misc/tcpflow-1.6.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "unicode_escape.cpp", see the Fossies "Dox" file reference documentation.

    1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
    2 /**
    3  * unicode_escape.cpp:
    4  * Escape unicode that is not valid.
    5  * 
    6  * References:
    7  * http://www.ietf.org/rfc/rfc3987.txt
    8  * http://en.wikipedia.org/wiki/UTF-8
    9  *
   10  * @author Simson Garfinkel
   11  *
   12  *
   13  * The software provided here is released by the Naval Postgraduate
   14  * School, an agency of the U.S. Department of Navy.  The software
   15  * bears no warranty, either expressed or implied. NPS does not assume
   16  * legal liability nor responsibility for a User's use of the software
   17  * or the results of such use.
   18  *
   19  * Please note that within the United States, copyright protection,
   20  * under Section 105 of the United States Code, Title 17, is not
   21  * available for any work of the United States Government and/or for
   22  * any works created by United States Government employees. User
   23  * acknowledges that this software contains work which was created by
   24  * NPS government employees and is therefore in the public domain and
   25  * not subject to copyright.
   26  */
   27 
   28 #ifndef PACKAGE_NAME
   29 #include "config.h"
   30 #endif
   31 
   32 #include "unicode_escape.h"
   33 
   34 #include <stdio.h>
   35 #include <assert.h>
   36 #include <iostream>
   37 #include <fstream>
   38 
   39 #ifndef __STDC_FORMAT_MACROS
   40 #define __STDC_FORMAT_MACROS
   41 #endif
   42 
   43 #ifdef HAVE_STDINT_H
   44 #include <stdint.h>
   45 #endif
   46 
   47 #define IS_IN_RANGE(c, f, l)    (((c) >= (f)) && ((c) <= (l)))
   48 
   49 #include "utf8.h"
   50 
   51 //extern int debug;
   52 
   /**
    * Render one byte as the four-character escape \xHH, where HH is the
    * byte's value written as two upper-case hexadecimal digits.
    */
   std::string hexesc(unsigned char ch)
   {
       static const char hex_digits[] = "0123456789ABCDEF";
       std::string escaped("\\x");
       escaped += hex_digits[(ch >> 4) & 0x0F];    // high nibble
       escaped += hex_digits[ch & 0x0F];           // low nibble
       return escaped;
   }
   59 
   /**
    * Returns true if ch is a UTF-8 continuation byte, i.e. its two
    * top bits are 1 and 0 (bit pattern 10xxxxxx).
    */
   bool utf8cont(unsigned char ch)
   {
       return (ch & 0xC0) == 0x80;
   }
   65 
   /**
    * After a UTF-8 sequence has been decoded, this function is called
    * to determine whether the resulting code point is acceptable. The
    * UTF-8 spec says that if a decoding produces an invalid character,
    * or a surrogate, the sequence is not valid. (There were some nasty
    * security vulnerabilities that were exploited before this came out.)
    * So we do a lot of checks here.
    *
    * @param unichar the decoded code point
    * @return false for U+FFFE, U+FFFF, the surrogates U+D800..U+DFFF,
    *         the rejected ranges of planes 1 and 2 listed below, all of
    *         planes 3--13 (U+30000..U+DFFFF), and anything above
    *         U+10FFFF; true otherwise.
    */
   bool valid_utf8codepoint(uint32_t unichar)
   {
       // Invalid characters in the BMP
       if(unichar == 0xfffe) return false;          // reversed BOM
       if(unichar == 0xffff) return false;
       if(unichar >= 0xd800 && unichar <= 0xdfff) return false; // high and low surrogates
       if(unichar < 0x10000) return true;           // everything else in the BMP is accepted

       // Rejected regions outside the BMP

       // Plane 1:
       if(unichar > 0x13fff && unichar < 0x16000) return false;
       if(unichar > 0x16fff && unichar < 0x1b000) return false;
       if(unichar > 0x1bfff && unichar < 0x1d000) return false;

       // Plane 2
       if(unichar > 0x2bfff && unichar < 0x2f000) return false;

       // Planes 3--13 are rejected in full. The upper bound is inclusive:
       // 0xdffff is the last code point of plane 13 and must be rejected
       // along with the rest of the plane.
       if(unichar >= 0x30000 && unichar <= 0xdffff) return false;

       // Above Plane 16 is invalid
       if(unichar > 0x10FFFF) return false;

       return true;                                 // must be valid
   }
  104 
  105 /**
  106  * validateOrEscapeUTF8
  107  * Input: UTF8 string (possibly corrupt)
  108  * Input: do_escape, indicating whether invalid encodings shall be escaped.
  109  * Note:
  110  *    - if not escaping but an invalid encoding is present and DEBUG_PEDANTIC is set, then assert() is called.
  111  *    - DO NOT USE wchar_t because it is 16-bits on Windows and 32-bits on Unix.
  112  * Output: 
  113  *   - UTF8 string.  If do_escape is set, then corruptions are escaped in \xFF notation where FF is a hex character.
  114  */
  115 
  116 //int count=0;
  117 bool validateOrEscapeUTF8_validate=false;
  118 std::string validateOrEscapeUTF8(const std::string &input, bool escape_bad_utf8,bool escape_backslash)
  119 {
  120     // 
  121     // skip the validation if not escaping and not DEBUG_PEDANTIC
  122     if (escape_bad_utf8==false && escape_backslash==false && !validateOrEscapeUTF8_validate){
  123         return input;
  124     }
  125         
  126     // validate or escape input
  127     std::string output;
  128     for(std::string::size_type i =0; i< input.length(); ) {
  129         uint8_t ch = (uint8_t)input.at(i);
  130         
  131         // utf8 1 byte prefix (0xxx xxxx)
  132         if((ch & 0x80)==0x00){          // 00 .. 0x7f
  133             if(ch=='\\' && escape_backslash){   // escape the escape character as \x92
  134                 output += hexesc(ch);
  135                 i++;
  136                 continue;
  137             }
  138 
  139             if( ch < ' '){              // not printable are escaped
  140                 output += hexesc(ch);
  141                 i++;
  142                 continue;
  143             }
  144             output += ch;               // printable is not escaped
  145             i++;
  146             continue;
  147         }
  148 
  149         // utf8 2 bytes  (110x xxxx) prefix
  150         if(((ch & 0xe0)==0xc0)  // 2-byte prefix
  151            && (i+1 < input.length())
  152            && utf8cont((uint8_t)input.at(i+1))){
  153             uint32_t unichar = (((uint8_t)input.at(i) & 0x1f) << 6) | (((uint8_t)input.at(i+1) & 0x3f));
  154 
  155             // check for valid 2-byte encoding
  156             if(valid_utf8codepoint(unichar)
  157                && ((uint8_t)input.at(i)!=0xc0)
  158                && (unichar >= 0x80)){ 
  159                 output += (uint8_t)input.at(i++);       // byte1
  160                 output += (uint8_t)input.at(i++);       // byte2
  161                 continue;
  162             }
  163         }
  164                 
  165         // utf8 3 bytes (1110 xxxx prefix)
  166         if(((ch & 0xf0) == 0xe0)
  167            && (i+2 < input.length())
  168            && utf8cont((uint8_t)input.at(i+1))
  169            && utf8cont((uint8_t)input.at(i+2))){
  170             uint32_t unichar = (((uint8_t)input.at(i) & 0x0f) << 12)
  171                 | (((uint8_t)input.at(i+1) & 0x3f) << 6)
  172                 | (((uint8_t)input.at(i+2) & 0x3f));
  173             
  174             // check for a valid 3-byte code point
  175             if(valid_utf8codepoint(unichar)
  176                && unichar>=0x800){                     
  177                 output += (uint8_t)input.at(i++);       // byte1
  178                 output += (uint8_t)input.at(i++);       // byte2
  179                 output += (uint8_t)input.at(i++);       // byte3
  180                 continue;
  181             }
  182         }
  183             
  184         // utf8 4 bytes (1111 0xxx prefix)
  185         if((( ch & 0xf8) == 0xf0)
  186            && (i+3 < input.length())
  187            && utf8cont((uint8_t)input.at(i+1))
  188            && utf8cont((uint8_t)input.at(i+2))
  189            && utf8cont((uint8_t)input.at(i+3))){
  190             uint32_t unichar =( (((uint8_t)input.at(i) & 0x07) << 18)
  191                                 |(((uint8_t)input.at(i+1) & 0x3f) << 12)
  192                                 |(((uint8_t)input.at(i+2) & 0x3f) <<  6)
  193                                 |(((uint8_t)input.at(i+3) & 0x3f)));
  194 
  195             if(valid_utf8codepoint(unichar) && unichar>=0x1000000){
  196                 output += (uint8_t)input.at(i++);       // byte1
  197                 output += (uint8_t)input.at(i++);       // byte2
  198                 output += (uint8_t)input.at(i++);       // byte3
  199                 output += (uint8_t)input.at(i++);       // byte4
  200                 continue;
  201             }
  202         }
  203 
  204         if (escape_bad_utf8) {
  205             // Just escape the next byte and carry on
  206             output += hexesc((uint8_t)input.at(i++));
  207         } else {
  208             // fatal if we are debug pedantic, otherwise just ignore
  209             // note: we shouldn't be here anyway, since if we are not escaping and we are not
  210             // pedantic we should have returned above
  211             if(validateOrEscapeUTF8_validate){
  212                 std::ofstream os("bad_unicode.txt");
  213                 os << input << "\n";
  214                 os.close();
  215                 std::cerr << "INTERNAL ERROR: bad unicode stored in bad_unicode.txt\n";
  216                 assert(0);
  217             }
  218         }
  219     }
  220     return output;
  221 }
  222 
  223 #ifdef STANDALONE
  224 
  /** Print every byte of ugly to stdout as two upper-case hex digits followed by a space. */
  void show(const std::string &ugly)
  {
      for(std::string::const_iterator it = ugly.begin(); it != ugly.end(); ++it){
          const unsigned char byte = static_cast<unsigned char>(*it);
          printf("%02X ",byte);
      }
  }
  231 
  232 void check(const std::string &ugly,bool verbose)
  233 {
  234     std::string res = validateOrEscapeUTF8(ugly,true);
  235     std::wstring utf16;
  236     /* Now check to make sure it is valid UTF8 */
  237     try {
  238         utf8::utf8to16(res.begin(),res.end(),std::back_inserter(utf16));
  239         if(verbose){
  240             show(ugly);
  241             printf(" successfully encodes as ");
  242             show(res);
  243             printf(" (\"%s\")\n",res.c_str());
  244         }
  245     } catch(utf8::exception){
  246         printf("utf8 error hex sequence: ");
  247         show(ugly);
  248         printf(" encoded as: ");
  249         show(res);
  250         printf("\n");
  251     } catch(std::exception){
  252         std::cout << "other exception \n";
  253     }
  254 }
  255 
  256 void testfile(const char *fn)
  257 {
  258     validateOrEscapeUTF8_validate = true;
  259 
  260     std::cout << "testing file " << fn << "\n";
  261     ifstream i(fn);
  262     if(i.is_open()){
  263         string line;
  264         getline(i,line);
  265         std::cout << "line length: " << line.size() << "\n";
  266         std::cout << "calling ValidateOrEscapeUTF8 to escape...\n";
  267         string l2 = validateOrEscapeUTF8(line,true);
  268         std::cout << "     length l2: " << l2.size() << "\n";
  269         std::cout << "calling ValidateOrEscapeUTF8 to validate...\n";
  270         validateOrEscapeUTF8(l2,false);
  271         std::cout << "calling check...\n";
  272         check(l2,false);
  273     }
  274     std::cout << "done\n";
  275     exit(0);
  276 }
  277 
  278 int main(int argc,char **argv)
  279 {
  280     std::cout << "Unicode Escape Regression Tester\n";
  281     int ch;
  282     while ((ch = getopt(argc,argv,"r:h")) != -1){
  283         switch(ch) {
  284         case 'r':
  285             testfile(optarg);
  286             break;
  287         }
  288     }
  289 
  290 
  291     const char buf[] = {0xef, 0xbe, 0xad, 0x5c};
  292     check(std::string(buf,1),true);
  293     check(std::string(buf,2),true);
  294     check(std::string(buf,3),true);
  295     check(std::string(buf,4),true);
  296 
  297     /* Runs 16 copies simultaneously... */
  298     uint32_t max=0xFFFFFFFF;            // 2^32-1
  299     for(uint64_t prefix=0;prefix<max;prefix+=0x10000000){
  300         pid_t child = fork();
  301         if(child==0){
  302             /* Try all 4-byte sequences in the prefix range...*/
  303             for(uint32_t k=0;k<=0x0FFFFFFF;k++){
  304                 uint32_t i=prefix+k;
  305                 std::string ugly((char *)&i,4);
  306                 check(ugly,false);
  307                 if((i & 0x00FFFFFF)==0x00FFFFFF){
  308                     printf("pid=%d prefix=%x i=%x\n",getpid(),(uint32_t)prefix,(uint32_t)i);
  309                     fflush(stdout);
  310                 }
  311             }
  312             exit(0);
  313         }
  314         printf("Launched PID %d\n",child);
  315         fflush(stdout);
  316     }
  317     for(int i=0;i<16;i++){
  318         int s=0;
  319         pid_t p = wait(&s);
  320         printf("pid %d finished with exit code %d\n",p,s);
  321     }
  322     std::cout << "done\n";
  323     exit(1);
  324 
  325     /* Generic fuzzing. Try random attempts */
  326     std::string line;
  327     while(getline(std::cin,line)){
  328         std::cout << validateOrEscapeUTF8(line,true) << "\n";
  329     }
  330         
  331 }
  332 #endif