tcpflow  1.6.1
About: tcpflow is a TCP/IP packet demultiplexer that captures data transmitted as part of TCP connections (flows), and stores the data in a way that is convenient for protocol analysis and debugging.
  Fossies Dox: tcpflow-1.6.1.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

histogram.cpp
Go to the documentation of this file.
1 /**
2  * histogram.cpp:
3  * Maintain a histogram for Unicode strings provided as UTF-8 and UTF-16 encodings.
4  * Track number of each coding provided.
5  *
6  * TK: Reimplement top-n with a priority queue.
7  * http://www.cplusplus.com/reference/queue/priority_queue/
8  */
9 
10 #include "config.h"
11 #include "bulk_extractor_i.h"
12 #include "unicode_escape.h"
13 #include "histogram.h"
14 #include "utf8.h"
15 
16 using namespace std;
17 
18 ostream & operator << (ostream &os, const HistogramMaker::FrequencyReportVector &rep){
19  for(HistogramMaker::FrequencyReportVector::const_iterator i = rep.begin(); i!=rep.end();i++){
20  const HistogramMaker::ReportElement &r = *(*i);
21  os << "n=" << r.tally.count << "\t" << validateOrEscapeUTF8(r.value, true, true);
22  if(r.tally.count16>0) os << "\t(utf16=" << r.tally.count16<<")";
23  os << "\n";
24  }
25  return os;
26 }
27 
29 {
31  for(HistogramMap::const_iterator it = h.begin(); it != h.end(); it++){
32  rep->push_back(new ReportElement(it->first,it->second));
33  }
34  sort(rep->begin(),rep->end(),ReportElement::compare);
35  return rep;
36 }
37 
38 /* This would be better done with a priority queue */
40 {
41  HistogramMaker::FrequencyReportVector *r2 = makeReport(); // gets a new report
42  HistogramMaker::FrequencyReportVector::iterator i = r2->begin();
43  while(topN>0 && i!=r2->end()){ // iterate through the first set
44  i++;
45  topN--;
46  }
47 
48  /* Delete the elements we won't use */
49  for(HistogramMaker::FrequencyReportVector::iterator j=i;j!=r2->end();j++){
50  delete (*j);
51  }
52  r2->erase(i,r2->end());
53  return r2;
54 }
55 
56 /* static */
57 bool HistogramMaker::looks_like_utf16(const std::string &str,bool &little_endian)
58 {
59  if((uint8_t)str[0]==0xff && (uint8_t)str[1]==0xfe){
60  little_endian = true;
61  return true; // begins with FFFE
62  }
63  if((uint8_t)str[0]==0xfe && (uint8_t)str[1]==0xff){
64  little_endian = false;
65  return true; // begins with FFFE
66  }
67  /* If none of the even characters are NULL and some of the odd characters are NULL, it's UTF-16 */
68  uint32_t even_null_count = 0;
69  uint32_t odd_null_count = 0;
70  for(size_t i=0;i+1<str.size();i+=2){
71  if(str[i]==0) even_null_count++;
72  if(str[i+1]==0) odd_null_count++;
73  }
74  if(even_null_count==0 && odd_null_count>1){
75  little_endian = true;
76  return true;
77  }
78  if(odd_null_count==0 && even_null_count>1){
79  little_endian = false;
80  return true;
81  }
82  return false;
83 }
84 
85 /**
86  * Converts a utf16 with a byte order to utf8, returning an ALLOCATED STRING if conversion is
87  * successful, and returning 0 if it is not.
88  */
89 /* static */
90 std::string *HistogramMaker::convert_utf16_to_utf8(const std::string &key,bool little_endian)
91 {
92  /* re-image this string as UTF16*/
93  std::wstring utf16;
94  for(size_t i=0;i<key.size();i+=2){
95  if(little_endian) utf16.push_back(key[i] | (key[i+1]<<8));
96  else utf16.push_back(key[i]<<8 | (key[i+1]));
97  }
98  /* Now convert it to a UTF-8;
99  * set tempKey to be the utf-8 string that will be erased.
100  */
101  std::string *tempKey = new std::string;
102  try {
103  utf8::utf16to8(utf16.begin(),utf16.end(),std::back_inserter(*tempKey));
104  /* Erase any nulls if present */
105  while(tempKey->size()>0) {
106  size_t nullpos = tempKey->find('\000');
107  if(nullpos==string::npos) break;
108  tempKey->erase(nullpos,1);
109  }
110  } catch(utf8::invalid_utf16){
111  /* Exception; bad UTF16 encoding */
112  delete tempKey;
113  tempKey = 0; // give up on temp key; otherwise its invalidated below
114  return 0;
115  }
116  return tempKey;
117 }
118 
119 std::string *HistogramMaker::convert_utf16_to_utf8(const std::string &key)
120 {
121  bool little_endian=false;
122  if(looks_like_utf16(key,little_endian)){
123  return convert_utf16_to_utf8(key,little_endian);
124  }
125  return 0;
126 }
127 
128 std::string *HistogramMaker::make_utf8(const std::string &key)
129 {
130  std::string *utf8 = convert_utf16_to_utf8(key);
131  if(utf8==0) utf8 = new std::string(key);
132  return utf8;
133 }
134 
135 /**
136  * Takes a string (the key) and adds it to the histogram.
137  * automatically determines if the key is UTF-16 and converts
138  * it to UTF8 if so.
139  */
140 
/*
 * Adds one observation of key to the histogram map h.
 *
 * Steps, in order:
 *   - an empty key is ignored;
 *   - if looks_like_utf16() accepts the key and conversion succeeds, the
 *     UTF-8 conversion is used as the key and found_utf16 is set;
 *   - FLAG_LOWERCASE lowercases the key via towlower on a wide-string copy;
 *   - FLAG_NUMERIC reduces the key to the characters accepted by iswdigit
 *     plus the plus sign;
 *   - the count for the resulting key is incremented, and count16 as well
 *     when the key came from a UTF-16 conversion;
 *   - any temporary string held in tempKey is deleted.
 *
 * NOTE(review): this listing skips source lines 169, 228 and 229 inside the
 * body, so the function as shown here is not complete; see the notes at
 * those positions.
 */
142 void HistogramMaker::add(const std::string &key)
143 {
144  if(key.size()==0) return; // don't deal with zero-length keys
145 
146  /**
147  * "key" passed in is a const reference.
148  * But we might want to change it. So keyToAdd points to what will be added.
149  * If we need to change key, we allocate more memory, and make keyToAdd
150  * point to the memory that was allocated. This way we only make a copy
151  * if we need to make a copy.
152  */
153 
154  const std::string *keyToAdd = &key; // should be a reference, but that doesn't work
155  std::string *tempKey = 0; // place to hold UTF8 key
156  bool found_utf16 = false;
157  bool little_endian=false;
158  if(looks_like_utf16(*keyToAdd,little_endian)){
159  tempKey = convert_utf16_to_utf8(*keyToAdd,little_endian);
160  if(tempKey){
161  keyToAdd = tempKey;
162  found_utf16 = true;
163  }
164  }
165 
166  /* If any conversion is necessary AND we have not converted key from UTF-16 to UTF-8,
167  * then the original key is still in 'key'. Allocate tempKey and copy key to tempKey.
168  */
/* NOTE(review): source line 169 is missing from this listing. The extra
 * closing brace after the if below implies an enclosing condition opened on
 * that line, presumably a test of flags against FLAG_LOWERCASE and
 * FLAG_NUMERIC; confirm against the upstream file.
 */
170  if(tempKey==0){
171  tempKey = new std::string(key);
172  keyToAdd = tempKey;
173  }
174  }
175 
176 
177  /* Apply the flags */
178  // See: http://stackoverflow.com/questions/1081456/wchar-t-vs-wint-t
/* NOTE(review): both flag blocks below dereference tempKey, so they rely on
 * the copy step above having run whenever either flag is set.
 * NOTE(review): in the lowercase block tempKey is cleared before utf16to8
 * runs, so if that second conversion throws, tempKey is left cleared or
 * partially rewritten rather than unchanged as the catch comment states.
 */
179  if(flags & FLAG_LOWERCASE){
180  /* keyToAdd is UTF-8; convert to UTF-16, downcase, and convert back to UTF-8 */
181  try{
182  std::wstring utf16key;
183  utf8::utf8to16(tempKey->begin(),tempKey->end(),std::back_inserter(utf16key));
184  for(std::wstring::iterator it = utf16key.begin();it!=utf16key.end();it++){
185  *it = towlower(*it);
186  }
187  /* erase tempKey and copy the utf16 back into tempKey as utf8 */
188  tempKey->clear(); // erase the characters
189  utf8::utf16to8(utf16key.begin(),utf16key.end(),std::back_inserter(*tempKey));
190  } catch(utf8::exception){
191  /* Exception thrown during utf8 or 16 conversions.
192  * So the string we thought was valid utf8 wasn't valid utf8 afterall.
193  * tempKey will remain unchanged.
194  */
195  }
196  }
/* NOTE(review): the numeric block keeps a copy of the key in originalTempKey
 * so the fallback in the catch handler can rebuild tempKey from it. The main
 * path keeps the plus sign as well as digits, while the fallback keeps only
 * characters accepted by isdigit, so the two paths can produce different keys.
 * The fallback also passes a plain char to isdigit.
 */
197  if(flags & FLAG_NUMERIC){
198  /* keyToAdd is UTF-8; convert to UTF-16, extract digits, and convert back to UTF-8 */
199  std::string originalTempKey(*tempKey);
200  try{
201  std::wstring utf16key;
202  std::wstring utf16digits;
203  utf8::utf8to16(tempKey->begin(),tempKey->end(),std::back_inserter(utf16key));
204  for(std::wstring::iterator it = utf16key.begin();it!=utf16key.end();it++){
205  if(iswdigit(*it) || *it==static_cast<uint16_t>('+')){
206  utf16digits.push_back(*it);
207  }
208  }
209  /* convert it back */
210  tempKey->clear(); // erase the characters
211  utf8::utf16to8(utf16digits.begin(),utf16digits.end(),std::back_inserter(*tempKey));
212  } catch(utf8::exception){
213  /* Exception during utf8 or 16 conversions*.
214  * So the string wasn't utf8. Fall back to just extracting the digits
215  */
216  tempKey->clear();
217  for(std::string::iterator it = originalTempKey.begin(); it!=originalTempKey.end(); it++){
218  if(isdigit(*it)){
219  tempKey->push_back(*it);
220  }
221  }
222  }
223  }
224 
225  /* For debugging low-memory handling logic,
226  * specify DEBUG_MALLOC_FAIL to make malloc occasionally fail
227  */
/* NOTE(review): source lines 228 and 229 are missing from this listing. The
 * two closing braces after the throw imply two nested conditions opened on
 * those lines; what they test is not visible here. Presumably they involve
 * debug_histogram_malloc_fail_frequency; confirm against the upstream file.
 * NOTE(review): when this throw fires, tempKey has not yet been deleted.
 */
230  throw bad_alloc();
231  }
232  }
233 
/* Record the observation. Indexing h with the key creates the entry if it
 * does not exist yet.
 */
234  h[*keyToAdd].count++;
235  if(found_utf16) h[*keyToAdd].count16++; // track how many UTF16s were converted
236  if(tempKey){ // if we allocated tempKey, free it
237  delete tempKey;
238  }
239 }
240 
std::string validateOrEscapeUTF8(const std::string &input, bool escape_bad_utf8, bool escape_backslash)
FrequencyReportVector * makeReport() const
Definition: histogram.cpp:28
static bool looks_like_utf16(const std::string &str, bool &little_endian)
Definition: histogram.cpp:57
const std::string value
Definition: histogram.h:84
static const int FLAG_LOWERCASE
Definition: histogram.h:64
static std::string * make_utf8(const std::string &key)
Definition: histogram.cpp:128
static uint32_t debug_histogram_malloc_fail_frequency
Definition: histogram.h:66
static const int FLAG_NUMERIC
Definition: histogram.h:65
void add(const std::string &key)
Definition: histogram.cpp:142
HistogramMap h
Definition: histogram.h:103
static bool compare(const ReportElement *e1, const ReportElement *e2)
Definition: histogram.h:91
ostream & operator<<(ostream &os, const HistogramMaker::FrequencyReportVector &rep)
Definition: histogram.cpp:18
std::vector< ReportElement * > FrequencyReportVector
Definition: histogram.h:124
static std::string * convert_utf16_to_utf8(const std::string &str)
Definition: histogram.cpp:119
flags
Definition: http_parser.h:216
Definition: checked.h:35
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
Definition: checked.h:234
unsigned int uint32_t
Definition: core.h:40
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
Definition: checked.h:207
unsigned short uint16_t
Definition: util.h:7
unsigned char uint8_t
Definition: util.h:6