"Fossies" - the Fresh Open Source Software Archive

Member "tcpflow-1.6.1/src/be13_api/histogram.cpp" (19 Feb 2021, 7967 Bytes) of package /linux/misc/tcpflow-1.6.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "histogram.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.4.4_vs_1.4.5.

    1 /**
    2  * histogram.cpp:
    3  * Maintain a histogram for Unicode strings provided as UTF-8 and UTF-16 encodings.
    4  * Track number of each coding provided.
    5  * 
    6  * TK: Reimplement top-n with a priority queue.
    7  *  http://www.cplusplus.com/reference/queue/priority_queue/
    8  */
    9 
   10 #include "config.h"
   11 #include "bulk_extractor_i.h"
   12 #include "unicode_escape.h"
   13 #include "histogram.h"
   14 #include "utf8.h"
   15 
   16 using namespace std;
   17 
   18 ostream & operator << (ostream &os, const HistogramMaker::FrequencyReportVector &rep){
   19     for(HistogramMaker::FrequencyReportVector::const_iterator i = rep.begin(); i!=rep.end();i++){
   20         const HistogramMaker::ReportElement &r = *(*i);
   21     os << "n=" << r.tally.count << "\t" << validateOrEscapeUTF8(r.value, true, true);
   22     if(r.tally.count16>0) os << "\t(utf16=" << r.tally.count16<<")";
   23     os << "\n";
   24     }
   25     return os;
   26 }
   27 
   28 HistogramMaker::FrequencyReportVector *HistogramMaker::makeReport()  const
   29 {
   30     FrequencyReportVector *rep = new FrequencyReportVector();
   31     for(HistogramMap::const_iterator it = h.begin(); it != h.end(); it++){
   32     rep->push_back(new ReportElement(it->first,it->second));
   33     }
   34     sort(rep->begin(),rep->end(),ReportElement::compare);
   35     return rep;
   36 }
   37 
   38 /* This would be better done with a priority queue */
   39 HistogramMaker::FrequencyReportVector *HistogramMaker::makeReport(int topN) const
   40 {
   41     HistogramMaker::FrequencyReportVector         *r2 = makeReport();   // gets a new report
   42     HistogramMaker::FrequencyReportVector::iterator i = r2->begin();
   43     while(topN>0 && i!=r2->end()){  // iterate through the first set
   44     i++;
   45     topN--;
   46     }
   47 
   48     /* Delete the elements we won't use */
   49     for(HistogramMaker::FrequencyReportVector::iterator j=i;j!=r2->end();j++){
   50         delete (*j);
   51     }
   52     r2->erase(i,r2->end());
   53     return r2;
   54 }
   55 
   56 /* static */
   57 bool HistogramMaker::looks_like_utf16(const std::string &str,bool &little_endian)
   58 {
   59     if((uint8_t)str[0]==0xff && (uint8_t)str[1]==0xfe){
   60     little_endian = true;
   61     return true; // begins with FFFE
   62     }
   63     if((uint8_t)str[0]==0xfe && (uint8_t)str[1]==0xff){
   64     little_endian = false;
   65     return true; // begins with FFFE
   66     }
   67     /* If none of the even characters are NULL and some of the odd characters are NULL, it's UTF-16 */
   68     uint32_t even_null_count = 0;
   69     uint32_t odd_null_count = 0;
   70     for(size_t i=0;i+1<str.size();i+=2){
   71     if(str[i]==0) even_null_count++;
   72     if(str[i+1]==0) odd_null_count++;
   73     }
   74     if(even_null_count==0 && odd_null_count>1){
   75     little_endian = true;
   76     return true;
   77     }
   78     if(odd_null_count==0 && even_null_count>1){
   79     little_endian = false;
   80     return true;
   81     }
   82     return false;
   83 }
   84 
   85 /**
   86  * Converts a utf16 with a byte order to utf8, returning an ALLOCATED STRING if conversion is
   87  * successful, and returning 0 if it is not.
   88  */
   89 /* static */
   90 std::string *HistogramMaker::convert_utf16_to_utf8(const std::string &key,bool little_endian)
   91 {
   92     /* re-image this string as UTF16*/
   93     std::wstring utf16;
   94     for(size_t i=0;i<key.size();i+=2){
   95         if(little_endian) utf16.push_back(key[i] | (key[i+1]<<8));
   96         else utf16.push_back(key[i]<<8 | (key[i+1]));
   97     }
   98     /* Now convert it to a UTF-8;
   99      * set tempKey to be the utf-8 string that will be erased.
  100      */
  101     std::string *tempKey = new std::string;
  102     try {
  103         utf8::utf16to8(utf16.begin(),utf16.end(),std::back_inserter(*tempKey));
  104         /* Erase any nulls if present */
  105         while(tempKey->size()>0) {
  106             size_t nullpos = tempKey->find('\000');
  107             if(nullpos==string::npos) break;
  108             tempKey->erase(nullpos,1);
  109         }
  110     } catch(utf8::invalid_utf16){
  111         /* Exception; bad UTF16 encoding */
  112         delete tempKey;
  113         tempKey = 0;        // give up on temp key; otherwise its invalidated below
  114         return 0;
  115     }
  116     return tempKey;
  117 }
  118 
  119 std::string *HistogramMaker::convert_utf16_to_utf8(const std::string &key)
  120 {
  121     bool little_endian=false;
  122     if(looks_like_utf16(key,little_endian)){
  123         return convert_utf16_to_utf8(key,little_endian);
  124     }
  125     return 0;
  126 }
  127 
  128 std::string *HistogramMaker::make_utf8(const std::string &key)
  129 {
  130     std::string *utf8 = convert_utf16_to_utf8(key);
  131     if(utf8==0) utf8 = new std::string(key);
  132     return utf8;
  133 }
  134 
  135 /**
  136  * Takes a string (the key) and adds it to the histogram.
  137  * automatically determines if the key is UTF-16 and converts
  138  * it to UTF8 if so.
  139  */
  140 
  141 uint32_t HistogramMaker::debug_histogram_malloc_fail_frequency = 0;
  142 void HistogramMaker::add(const std::string &key)
  143 {
  144     if(key.size()==0) return;       // don't deal with zero-length keys
  145 
  146     /**
  147      * "key" passed in is a const reference.
  148      * But we might want to change it. So keyToAdd points to what will be added.
  149      * If we need to change key, we allocate more memory, and make keyToAdd
  150      * point to the memory that was allocated. This way we only make a copy
  151      * if we need to make a copy.
  152      */
  153 
  154     const std::string *keyToAdd = &key; // should be a reference, but that doesn't work
  155     std::string *tempKey = 0;       // place to hold UTF8 key
  156     bool found_utf16 = false;
  157     bool little_endian=false;
  158     if(looks_like_utf16(*keyToAdd,little_endian)){
  159         tempKey = convert_utf16_to_utf8(*keyToAdd,little_endian);
  160         if(tempKey){
  161             keyToAdd = tempKey;
  162             found_utf16 = true;
  163         }
  164     }
  165     
  166     /* If any conversion is necessary AND we have not converted key from UTF-16 to UTF-8,
  167      * then the original key is still in 'key'. Allocate tempKey and copy key to tempKey.
  168      */
  169     if(flags & (FLAG_LOWERCASE |FLAG_NUMERIC)){
  170     if(tempKey==0){
  171         tempKey = new std::string(key);
  172         keyToAdd = tempKey;
  173     }
  174     }
  175 
  176 
  177     /* Apply the flags */
  178     // See: http://stackoverflow.com/questions/1081456/wchar-t-vs-wint-t
  179     if(flags & FLAG_LOWERCASE){
  180     /* keyToAdd is UTF-8; convert to UTF-16, downcase, and convert back to UTF-8 */
  181     try{
  182         std::wstring utf16key;
  183         utf8::utf8to16(tempKey->begin(),tempKey->end(),std::back_inserter(utf16key));
  184         for(std::wstring::iterator it = utf16key.begin();it!=utf16key.end();it++){
  185         *it = towlower(*it);
  186         }
  187         /* erase tempKey and copy the utf16 back into tempKey as utf8 */
  188         tempKey->clear();       // erase the characters
  189         utf8::utf16to8(utf16key.begin(),utf16key.end(),std::back_inserter(*tempKey));
  190     } catch(utf8::exception){
  191         /* Exception thrown during utf8 or 16 conversions.
  192          * So the string we thought was valid utf8 wasn't valid utf8 afterall.
  193          * tempKey will remain unchanged.
  194          */
  195     }
  196     }
  197     if(flags & FLAG_NUMERIC){
  198     /* keyToAdd is UTF-8; convert to UTF-16, extract digits, and convert back to UTF-8 */
  199     std::string originalTempKey(*tempKey);
  200     try{
  201         std::wstring utf16key;
  202         std::wstring utf16digits;
  203         utf8::utf8to16(tempKey->begin(),tempKey->end(),std::back_inserter(utf16key));
  204         for(std::wstring::iterator it = utf16key.begin();it!=utf16key.end();it++){
  205         if(iswdigit(*it) || *it==static_cast<uint16_t>('+')){
  206             utf16digits.push_back(*it);
  207         }
  208         }
  209         /* convert it back */
  210         tempKey->clear();       // erase the characters
  211         utf8::utf16to8(utf16digits.begin(),utf16digits.end(),std::back_inserter(*tempKey));
  212     } catch(utf8::exception){
  213         /* Exception during utf8 or 16 conversions*.
  214          * So the string wasn't utf8.  Fall back to just extracting the digits
  215          */
  216         tempKey->clear();
  217         for(std::string::iterator it = originalTempKey.begin(); it!=originalTempKey.end(); it++){
  218         if(isdigit(*it)){
  219             tempKey->push_back(*it);
  220         }
  221         }
  222     }
  223     }
  224 
  225     /* For debugging low-memory handling logic,
  226      * specify DEBUG_MALLOC_FAIL to make malloc occasionally fail
  227      */
  228     if(debug_histogram_malloc_fail_frequency){
  229     if((h.size() % debug_histogram_malloc_fail_frequency)==(debug_histogram_malloc_fail_frequency-1)){
  230         throw bad_alloc();
  231     }
  232     }
  233 
  234     h[*keyToAdd].count++;
  235     if(found_utf16) h[*keyToAdd].count16++;  // track how many UTF16s were converted
  236     if(tempKey){                 // if we allocated tempKey, free it
  237     delete tempKey;
  238     }
  239 }
  240