tcpflow  1.6.1
About: tcpflow is a TCP/IP packet demultiplexer that captures data transmitted as part of TCP connections (flows), and stores the data in a way that is convenient for protocol analysis and debugging.
  Fossies Dox: tcpflow-1.6.1.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

unicode_escape.cpp
Go to the documentation of this file.
1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /**
3  * unicode_escape.cpp:
4  * Escape unicode that is not valid.
5  *
6  * References:
7  * http://www.ietf.org/rfc/rfc3987.txt
8  * http://en.wikipedia.org/wiki/UTF-8
9  *
10  * @author Simson Garfinkel
11  *
12  *
13  * The software provided here is released by the Naval Postgraduate
14  * School, an agency of the U.S. Department of Navy. The software
15  * bears no warranty, either expressed or implied. NPS does not assume
16  * legal liability nor responsibility for a User's use of the software
17  * or the results of such use.
18  *
19  * Please note that within the United States, copyright protection,
20  * under Section 105 of the United States Code, Title 17, is not
21  * available for any work of the United States Government and/or for
22  * any works created by United States Government employees. User
23  * acknowledges that this software contains work which was created by
24  * NPS government employees and is therefore in the public domain and
25  * not subject to copyright.
26  */
27 
28 #ifndef PACKAGE_NAME
29 #include "config.h"
30 #endif
31 
32 #include "unicode_escape.h"
33 
34 #include <stdio.h>
35 #include <assert.h>
36 #include <iostream>
37 #include <fstream>
38 
39 #ifndef __STDC_FORMAT_MACROS
40 #define __STDC_FORMAT_MACROS
41 #endif
42 
43 #ifdef HAVE_STDINT_H
44 #include <stdint.h>
45 #endif
46 
47 #define IS_IN_RANGE(c, f, l) (((c) >= (f)) && ((c) <= (l)))
48 
49 #include "utf8.h"
50 
51 //extern int debug;
52 
/**
 * Format one byte as a four-character escape of the form \xHH, where HH is
 * the byte's value in two upper-case hexadecimal digits.
 *
 * @param ch the byte to format
 * @return the escape text, e.g. "\x0A" for a line feed
 */
std::string hexesc(unsigned char ch)
{
    char buf[10];   // "\xHH" plus the terminating NUL needs only 5 bytes
    snprintf(buf, sizeof(buf), "\\x%02X", ch);
    return std::string(buf);
}
59 
/**
 * Returns true if ch is a UTF-8 continuation byte, i.e. its two most
 * significant bits are 1 and 0 (values 0x80 through 0xBF).
 */
bool utf8cont(unsigned char ch)
{
    return (ch & 0xC0) == 0x80;
}
65 
/**
 * After a UTF-8 sequence has been decoded, this function is called
 * to determine if the resulting code point is acceptable. The UTF-8 spec
 * now says that if a UTF-8 decoding produces an invalid character, or
 * a surrogate, it is not valid. (There were some nasty security
 * vulnerabilities that were exploited before this came out.)
 * So we do a lot of checks here.
 *
 * @param unichar the decoded code point
 * @return false for U+FFFE/U+FFFF, surrogates, the listed ranges in
 *         planes 1 and 2, everything in planes 3 through 13, and anything
 *         above U+10FFFF; true otherwise.
 *
 * NOTE(review): the rejected ranges in planes 1-3 are a hard-coded snapshot
 * of "unassigned" areas; later Unicode versions assign characters in some
 * of them -- confirm whether the table should be refreshed.
 */
bool valid_utf8codepoint(uint32_t unichar)
{
    // Invalid characters at the end of the BMP (0xfffe is the reversed BOM)
    if (unichar == 0xfffe || unichar == 0xffff) return false;

    // High and low surrogates are never valid on their own
    if (unichar >= 0xd800 && unichar <= 0xdfff) return false;

    // Everything else in the BMP is accepted
    if (unichar < 0x10000) return true;

    // Rejected regions outside the BMP

    // Plane 1
    if (unichar > 0x13fff && unichar < 0x16000) return false;
    if (unichar > 0x16fff && unichar < 0x1b000) return false;
    if (unichar > 0x1bfff && unichar < 0x1d000) return false;

    // Plane 2
    if (unichar > 0x2bfff && unichar < 0x2f000) return false;

    // Planes 3--13 are rejected in full. The upper bound is inclusive so
    // that U+DFFFF, the last code point of plane 13, is rejected as well.
    if (unichar >= 0x30000 && unichar <= 0xdffff) return false;

    // Above plane 16 is invalid
    if (unichar > 0x10FFFF) return false;

    return true;    // must be valid
}
104 
105 /**
106  * validateOrEscapeUTF8
107  * Input: UTF8 string (possibly corrupt)
108  * Input: do_escape, indicating whether invalid encodings shall be escaped.
109  * Note:
110  * - if not escaping but an invalid encoding is present and DEBUG_PEDANTIC is set, then assert() is called.
111  * - DO NOT USE wchar_t because it is 16-bits on Windows and 32-bits on Unix.
112  * Output:
113  * - UTF8 string. If do_escape is set, then corruptions are escaped in \xFF notation where FF is a hex character.
114  */
115 
116 //int count=0;
118 std::string validateOrEscapeUTF8(const std::string &input, bool escape_bad_utf8,bool escape_backslash)
119 {
120  //
121  // skip the validation if not escaping and not DEBUG_PEDANTIC
122  if (escape_bad_utf8==false && escape_backslash==false && !validateOrEscapeUTF8_validate){
123  return input;
124  }
125 
126  // validate or escape input
127  std::string output;
128  for(std::string::size_type i =0; i< input.length(); ) {
129  uint8_t ch = (uint8_t)input.at(i);
130 
131  // utf8 1 byte prefix (0xxx xxxx)
132  if((ch & 0x80)==0x00){ // 00 .. 0x7f
133  if(ch=='\\' && escape_backslash){ // escape the escape character as \x92
134  output += hexesc(ch);
135  i++;
136  continue;
137  }
138 
139  if( ch < ' '){ // not printable are escaped
140  output += hexesc(ch);
141  i++;
142  continue;
143  }
144  output += ch; // printable is not escaped
145  i++;
146  continue;
147  }
148 
149  // utf8 2 bytes (110x xxxx) prefix
150  if(((ch & 0xe0)==0xc0) // 2-byte prefix
151  && (i+1 < input.length())
152  && utf8cont((uint8_t)input.at(i+1))){
153  uint32_t unichar = (((uint8_t)input.at(i) & 0x1f) << 6) | (((uint8_t)input.at(i+1) & 0x3f));
154 
155  // check for valid 2-byte encoding
156  if(valid_utf8codepoint(unichar)
157  && ((uint8_t)input.at(i)!=0xc0)
158  && (unichar >= 0x80)){
159  output += (uint8_t)input.at(i++); // byte1
160  output += (uint8_t)input.at(i++); // byte2
161  continue;
162  }
163  }
164 
165  // utf8 3 bytes (1110 xxxx prefix)
166  if(((ch & 0xf0) == 0xe0)
167  && (i+2 < input.length())
168  && utf8cont((uint8_t)input.at(i+1))
169  && utf8cont((uint8_t)input.at(i+2))){
170  uint32_t unichar = (((uint8_t)input.at(i) & 0x0f) << 12)
171  | (((uint8_t)input.at(i+1) & 0x3f) << 6)
172  | (((uint8_t)input.at(i+2) & 0x3f));
173 
174  // check for a valid 3-byte code point
175  if(valid_utf8codepoint(unichar)
176  && unichar>=0x800){
177  output += (uint8_t)input.at(i++); // byte1
178  output += (uint8_t)input.at(i++); // byte2
179  output += (uint8_t)input.at(i++); // byte3
180  continue;
181  }
182  }
183 
184  // utf8 4 bytes (1111 0xxx prefix)
185  if((( ch & 0xf8) == 0xf0)
186  && (i+3 < input.length())
187  && utf8cont((uint8_t)input.at(i+1))
188  && utf8cont((uint8_t)input.at(i+2))
189  && utf8cont((uint8_t)input.at(i+3))){
190  uint32_t unichar =( (((uint8_t)input.at(i) & 0x07) << 18)
191  |(((uint8_t)input.at(i+1) & 0x3f) << 12)
192  |(((uint8_t)input.at(i+2) & 0x3f) << 6)
193  |(((uint8_t)input.at(i+3) & 0x3f)));
194 
195  if(valid_utf8codepoint(unichar) && unichar>=0x1000000){
196  output += (uint8_t)input.at(i++); // byte1
197  output += (uint8_t)input.at(i++); // byte2
198  output += (uint8_t)input.at(i++); // byte3
199  output += (uint8_t)input.at(i++); // byte4
200  continue;
201  }
202  }
203 
204  if (escape_bad_utf8) {
205  // Just escape the next byte and carry on
206  output += hexesc((uint8_t)input.at(i++));
207  } else {
208  // fatal if we are debug pedantic, otherwise just ignore
209  // note: we shouldn't be here anyway, since if we are not escaping and we are not
210  // pedantic we should have returned above
212  std::ofstream os("bad_unicode.txt");
213  os << input << "\n";
214  os.close();
215  std::cerr << "INTERNAL ERROR: bad unicode stored in bad_unicode.txt\n";
216  assert(0);
217  }
218  }
219  }
220  return output;
221 }
222 
223 #ifdef STANDALONE
224 
/**
 * Print every byte of ugly to stdout as two upper-case hex digits followed
 * by a space. No newline is written.
 */
void show(const std::string &ugly)
{
    for (std::string::size_type pos = 0; pos < ugly.size(); ++pos) {
        printf("%02X ", static_cast<unsigned char>(ugly[pos]));
    }
}
231 
232 void check(const std::string &ugly,bool verbose)
233 {
234  std::string res = validateOrEscapeUTF8(ugly,true);
235  std::wstring utf16;
236  /* Now check to make sure it is valid UTF8 */
237  try {
238  utf8::utf8to16(res.begin(),res.end(),std::back_inserter(utf16));
239  if(verbose){
240  show(ugly);
241  printf(" successfully encodes as ");
242  show(res);
243  printf(" (\"%s\")\n",res.c_str());
244  }
245  } catch(utf8::exception){
246  printf("utf8 error hex sequence: ");
247  show(ugly);
248  printf(" encoded as: ");
249  show(res);
250  printf("\n");
251  } catch(std::exception){
252  std::cout << "other exception \n";
253  }
254 }
255 
256 void testfile(const char *fn)
257 {
259 
260  std::cout << "testing file " << fn << "\n";
261  ifstream i(fn);
262  if(i.is_open()){
263  string line;
264  getline(i,line);
265  std::cout << "line length: " << line.size() << "\n";
266  std::cout << "calling ValidateOrEscapeUTF8 to escape...\n";
267  string l2 = validateOrEscapeUTF8(line,true);
268  std::cout << " length l2: " << l2.size() << "\n";
269  std::cout << "calling ValidateOrEscapeUTF8 to validate...\n";
270  validateOrEscapeUTF8(l2,false);
271  std::cout << "calling check...\n";
272  check(l2,false);
273  }
274  std::cout << "done\n";
275  exit(0);
276 }
277 
278 int main(int argc,char **argv)
279 {
280  std::cout << "Unicode Escape Regression Tester\n";
281  int ch;
282  while ((ch = getopt(argc,argv,"r:h")) != -1){
283  switch(ch) {
284  case 'r':
285  testfile(optarg);
286  break;
287  }
288  }
289 
290 
291  const char buf[] = {0xef, 0xbe, 0xad, 0x5c};
292  check(std::string(buf,1),true);
293  check(std::string(buf,2),true);
294  check(std::string(buf,3),true);
295  check(std::string(buf,4),true);
296 
297  /* Runs 16 copies simultaneously... */
298  uint32_t max=0xFFFFFFFF; // 2^32-1
299  for(uint64_t prefix=0;prefix<max;prefix+=0x10000000){
300  pid_t child = fork();
301  if(child==0){
302  /* Try all 4-byte sequences in the prefix range...*/
303  for(uint32_t k=0;k<=0x0FFFFFFF;k++){
304  uint32_t i=prefix+k;
305  std::string ugly((char *)&i,4);
306  check(ugly,false);
307  if((i & 0x00FFFFFF)==0x00FFFFFF){
308  printf("pid=%d prefix=%x i=%x\n",getpid(),(uint32_t)prefix,(uint32_t)i);
309  fflush(stdout);
310  }
311  }
312  exit(0);
313  }
314  printf("Launched PID %d\n",child);
315  fflush(stdout);
316  }
317  for(int i=0;i<16;i++){
318  int s=0;
319  pid_t p = wait(&s);
320  printf("pid %d finished with exit code %d\n",p,s);
321  }
322  std::cout << "done\n";
323  exit(1);
324 
325  /* Generic fuzzing. Try random attempts */
326  std::string line;
327  while(getline(std::cin,line)){
328  std::cout << validateOrEscapeUTF8(line,true) << "\n";
329  }
330 
331 }
332 #endif
bool validateOrEscapeUTF8_validate
std::string validateOrEscapeUTF8(const std::string &input, bool escape_bad_utf8, bool escape_backslash)
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
Definition: checked.h:234
unsigned int uint32_t
Definition: core.h:40
int main(int argc, char *argv[])
Definition: tcpflow.cpp:565
bool valid_utf8codepoint(uint32_t unichar)
std::string hexesc(unsigned char ch)
bool utf8cont(unsigned char ch)
unsigned char uint8_t
Definition: util.h:6