"Fossies" - the Fresh Open Source Software Archive

Member "tcpflow-1.6.1/src/be13_api/feature_recorder.cpp" (19 Feb 2021, 32417 Bytes) of package /linux/misc/tcpflow-1.6.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "feature_recorder.cpp" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.4.5_vs_1.5.0.

    1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
    2 
    3 #include "config.h"
    4 #include "bulk_extractor_i.h"
    5 #include "unicode_escape.h"
    6 #include "histogram.h"
    7 
    8 #include <unistd.h>
    9 #include <fcntl.h>
   10 #include <sys/stat.h>
   11 
   12 #ifdef HAVE_STDARG_H
   13 #include <stdarg.h>
   14 #endif
   15 
   16 #ifndef MAXPATHLEN
   17 #define MAXPATHLEN 65536
   18 #endif
   19 
   20 #ifndef O_BINARY
   21 #define O_BINARY 0
   22 #endif
   23 
   24 #ifndef DEBUG_PEDANTIC
   25 #define DEBUG_PEDANTIC    0x0001// check values more rigorously
   26 #endif
   27 
   28 #ifndef WIN32
   29 pthread_t feature_recorder::main_threadid = 0;
   30 #endif
   31 size_t  feature_recorder::context_window_default=16;                    /* number of bytes of context */
   32 int64_t feature_recorder::offset_add   = 0;
   33 std::string  feature_recorder::banner_file;
   34 uint32_t feature_recorder::opt_max_context_size=1024*1024;
   35 uint32_t feature_recorder::opt_max_feature_size=1024*1024;
   36 uint32_t feature_recorder::debug=0;
   37 
   38 
   39 /**
   40  * Create a feature recorder object. Each recorder records a certain
   41  * kind of feature.  Features are stored in a file. The filename is
   42  * permutated based on the total number of threads and the current
   43  * thread that's recording. Each thread records to a different file,
   44  * and thus a different feature recorder, to avoid locking
   45  * problems. 
   46  *
   47  * @param feature_recorder_set &fs - common information for all of the feature recorders
   48  * @param name         - the name of the feature being recorded.
   49  */
   50 
   51 feature_recorder::feature_recorder(class feature_recorder_set &fs_,
   52                                    const std::string &name_):
   53     flags(0),
   54     name(name_),ignore_encoding(),ios(),bs(),
   55     histogram_defs(),
   56     fs(fs_),
   57     count_(0),context_window_before(context_window_default),context_window_after(context_window_default),
   58     Mf(),Mr(),mhistograms(),mhistogram_limit(),
   59     stop_list_recorder(0),
   60     file_number_(0),carve_cache(),carve_mode(CARVE_ENCODED)
   61 {
   62     //std::cerr << "feature_recorder(" << name << ") created\n";
   63     open();                         // open if we are created
   64 }
   65 
   66 /* Don't have to delete the stop_list_recorder because it is in the
   67  * feature_recorder_set and will be separately deleted.
   68  */
   69 feature_recorder::~feature_recorder()
   70 {
   71     if(ios.is_open()){
   72         ios.close();
   73     }
   74 }
   75 
   76 void feature_recorder::banner_stamp(std::ostream &os,const std::string &header) const
   77 {
   78     int banner_lines = 0;
   79     if(banner_file.size()>0){
   80         std::ifstream i(banner_file.c_str());
   81         if(i.is_open()){
   82             std::string line;
   83             while(getline(i,line)){
   84                 if(line.size()>0 && ((*line.end()=='\r') || (*line.end()=='\n'))){
   85                     line.erase(line.end()); /* remove the last character while it is a \n or \r */
   86                 }
   87                 os << "# " << line << "\n";
   88                 banner_lines++;
   89             }
   90             i.close();
   91         }
   92     }
   93     if(banner_lines==0){
   94         os << "# BANNER FILE NOT PROVIDED (-b option)\n";
   95     }
   96     
   97     os << bulk_extractor_version_header;
   98     os << "# Feature-Recorder: " << name << "\n";
   99     if(fs.get_input_fname().size()) os << "# Filename: " << fs.get_input_fname() << "\n";
  100     if(debug!=0){
  101         os << "# DEBUG: " << debug << " (";
  102         if(debug & DEBUG_PEDANTIC) os << " DEBUG_PEDANTIC ";
  103         os << ")\n";
  104     }
  105     os << header;
  106 }
  107 
  108 
  109 
  110 /**
  111  * Return the filename with a counter
  112  */
  113 std::string feature_recorder::fname_counter(std::string suffix) const
  114 {
  115     return fs.get_outdir() + "/" + this->name + (suffix.size()>0 ? (std::string("_") + suffix) : "") + ".txt";
  116 }
  117 
  118 
  119 const std::string &feature_recorder::get_outdir() const 
  120 {
  121     return fs.get_outdir();
  122 }
  123 
  124 /**
  125  * open a feature recorder file in the specified output directory.
  126  * Called by create_name(). Not clear why it isn't called when created.
  127  */
  128 
  129 void feature_recorder::open()
  130 { 
  131     if (fs.flag_set(feature_recorder_set::SET_DISABLED)) return;        // feature recorder set is disabled
  132 
  133     /* write to a database? Create tables if necessary and create a prepared statement */
  134     if (fs.flag_set(feature_recorder_set::ENABLE_SQLITE3_RECORDERS)) {  
  135         char buf[1024];
  136         fs.db_create_table(name);
  137         snprintf(buf,sizeof(buf),db_insert_stmt,name.c_str());
  138         bs = new besql_stmt(fs.db3,buf);
  139     }
  140 
  141     /* Write to a file? Open the file and seek to the last line if it exist, otherwise just open database */
  142     if (fs.flag_notset(feature_recorder_set::DISABLE_FILE_RECORDERS)){
  143         /* Open the file recorder */
  144         std::string fname = fname_counter("");
  145         ios.open(fname.c_str(),std::ios_base::in|std::ios_base::out|std::ios_base::ate);
  146         if(ios.is_open()){                  // opened existing stream
  147             ios.seekg(0L,std::ios_base::end);
  148             while(ios.is_open()){
  149                 /* Get current position */
  150                 if(int(ios.tellg())==0){            // at beginning of file; stamp and return
  151                     ios.seekp(0L,std::ios_base::beg);    // be sure we are at the beginning of the file
  152                     return;
  153                 }
  154                 ios.seekg(-1,std::ios_base::cur); // backup to once less than the end of the file
  155                 if (ios.peek()=='\n'){           // we are finally on the \n
  156                     ios.seekg(1L,std::ios_base::cur); // move the getting one forward
  157                     ios.seekp(ios.tellg(),std::ios_base::beg); // put the putter at the getter location
  158                     count_ = 1;                            // greater than zero
  159                     return;
  160                 }
  161             }
  162         }
  163         // Just open the stream for output
  164         ios.open(fname.c_str(),std::ios_base::out);
  165         if(!ios.is_open()){
  166             std::cerr << "*** feature_recorder::open CANNOT OPEN FEATURE FILE FOR WRITING "
  167                       << fname << ":" << strerror(errno) << "\n";
  168             exit(1);
  169         }
  170     }
  171 }
  172 
  173 void feature_recorder::close()
  174 {
  175     if(ios.is_open()){
  176         ios.close();
  177     }
  178 }
  179 
  180 void feature_recorder::flush()
  181 {
  182     cppmutex::lock lock(Mf);            // get the lock; released when object is deallocated.
  183     ios.flush();
  184 }
  185 
  186 
  187 static inline bool isodigit(char c)
  188 {
  189     return c>='0' && c<='7';
  190 }
  191 
  192 /* statics */
  193 const std::string feature_recorder::feature_file_header("# Feature-File-Version: 1.1\n");
  194 const std::string feature_recorder::histogram_file_header("# Histogram-File-Version: 1.1\n");
  195 const std::string feature_recorder::bulk_extractor_version_header("# " PACKAGE_NAME "-Version: " PACKAGE_VERSION " ($Rev: 10844 $)\n");
  196 
  197 static inline int hexval(char ch)
  198 {
  199     switch (ch) {
  200     case '0': return 0;
  201     case '1': return 1;
  202     case '2': return 2;
  203     case '3': return 3;
  204     case '4': return 4;
  205     case '5': return 5;
  206     case '6': return 6;
  207     case '7': return 7;
  208     case '8': return 8;
  209     case '9': return 9;
  210     case 'a': case 'A': return 10;
  211     case 'b': case 'B': return 11;
  212     case 'c': case 'C': return 12;
  213     case 'd': case 'D': return 13;
  214     case 'e': case 'E': return 14;
  215     case 'f': case 'F': return 15;
  216     }
  217     return 0;
  218 }
  219 
  220 /**
  221  * Unquote Python or octal-style quoting of a string
  222  */
  223 std::string feature_recorder::unquote_string(const std::string &s)
  224 {
  225     size_t len = s.size();
  226     if(len<4) return s;                 // too small for a quote
  227 
  228     std::string out;
  229     for(size_t i=0;i<len;i++){
  230         /* Look for octal coding */
  231         if(i+3<len && s[i]=='\\' && isodigit(s[i+1]) && isodigit(s[i+2]) && isodigit(s[i+3])){
  232             uint8_t code = (s[i+1]-'0') * 64 + (s[i+2]-'0') * 8 + (s[i+3]-'0');
  233             out.push_back(code);
  234             i += 3;                     // skip over the digits
  235             continue;
  236         }
  237         /* Look for hex coding */
  238         if(i+3<len && s[i]=='\\' && s[i+1]=='x' && isxdigit(s[i+2]) && isxdigit(s[i+3])){
  239             uint8_t code = (hexval(s[i+2])*16) | hexval(s[i+3]);
  240             out.push_back(code);
  241             i += 3;                     // skip over the digits
  242             continue;
  243         }
  244         out.push_back(s[i]);
  245     }
  246     return out;
  247 }
  248 
  249 /**
  250  * Get the feature which is defined as being between a \t and [\t\n]
  251  */
  252 
  253 /*static*/ std::string feature_recorder::extract_feature(const std::string &line)
  254 {
  255     size_t tab1 = line.find('\t');
  256     if(tab1==std::string::npos) return "";   // no feature
  257     size_t feature_start = tab1+1;
  258     size_t tab2 = line.find('\t',feature_start);
  259     if(tab2!=std::string::npos) return line.substr(feature_start,tab2-feature_start);
  260     return line.substr(feature_start);  // no context to remove
  261 }
  262 
  263 void feature_recorder::set_flag(uint32_t flags_)
  264 {
  265     MAINTHREAD();
  266     flags|=flags_;
  267 }
  268 
  269 void feature_recorder::unset_flag(uint32_t flags_)
  270 {
  271     MAINTHREAD();
  272     flags &= (~flags_);
  273 }
  274 
  275 void feature_recorder::set_memhist_limit(int64_t limit_)
  276 {
  277     MAINTHREAD();
  278     mhistogram_limit = limit_;
  279 }
  280 
  281 
  282 // add a memory histogram; assume the position in the mhistograms is stable
  283 void feature_recorder::enable_memory_histograms()
  284 {
  285     for(histogram_defs_t::const_iterator it=histogram_defs.begin();it!=histogram_defs.end();it++){
  286         mhistograms[*it] = new mhistogram_t(); 
  287     }
  288 }
  289 
  290 
  291 /**
  292  *  Create a histogram for this feature recorder and an extraction pattern.
  293  */
  294 
  295 /* dump_callback_test is a simple callback that just prints to stderr. It's for testing */
  296 int feature_recorder::dump_callback_test(void *user,const feature_recorder &fr,
  297                                           const std::string &str,const uint64_t &count)
  298 {
  299     (void)user;
  300     std::cerr << "dump_cb: user=" << user << " " << str << ": " << count << "\n";
  301     return 0;
  302 }
  303 
  304 /* Make a histogram. If a callback is provided, send the output there. */
  305 class mhistogram_callback {
  306     mhistogram_callback(const mhistogram_callback&);
  307     mhistogram_callback &operator=(const mhistogram_callback &);
  308 public:
  309     mhistogram_callback(void *user_,
  310                         feature_recorder::dump_callback_t *cb_,
  311                         const histogram_def &def_,
  312                         const feature_recorder &fr_,
  313                         uint64_t limit_):user(user_),cb(cb_),def(def_),fr(fr_),callback_count(0),limit(limit_){}
  314     void *user;
  315     feature_recorder::dump_callback_t *cb;
  316     const histogram_def &def;
  317     const feature_recorder &fr;
  318     uint64_t callback_count;
  319     uint64_t limit;
  320     int do_callback(const std::string &str,const uint64_t &tally){
  321         (*cb)(user,fr,def,str,tally);
  322         if(limit && ++callback_count >= limit) return -1;
  323         return 0;
  324     }
  325     static int callback(void *ptr,const std::string &str,const uint64_t &tally) {
  326         return ((mhistogram_callback *)(ptr))->do_callback(str,tally);
  327     }
  328 };
  329 
  330 /****************************************************************
  331  *** PHASE HISTOGRAM (formerly phase 3): Create the histograms
  332  ****************************************************************/
  333 
  334 /**
  335  * We now have three kinds of histograms:
  336  * 1 - Traditional post-processing histograms specified by the histogram library
  337  *   1a - feature-file based traditional ones
  338  *   1b - SQL-based traditional ones.
  339  * 2 - In-memory histograms (used primarily by beapi)
  340  */
  341 
  342 
  343 /** Dump a specific histogram */
  344 void feature_recorder::dump_histogram_file(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const
  345 {
  346     /* This is a file based histogram. We will be reading from one file and writing to another */
  347     std::string ifname = fname_counter("");  // source of features
  348     std::ifstream f(ifname.c_str());
  349     if(!f.is_open()){
  350         std::cerr << "Cannot open histogram input file: " << ifname << "\n";
  351         return;
  352     }
  353 
  354     /* Read each line of the feature file and add it to the histogram.
  355      * If we run out of memory, dump that histogram to a file and start
  356      * on the next histogram.
  357      */
  358     for(int histogram_counter = 0;histogram_counter<max_histogram_files;histogram_counter++){
  359 
  360         HistogramMaker h(def.flags);            /* of seen features, created in pass two */
  361         try {
  362             std::string line;
  363             while(getline(f,line)){
  364                 if(line.size()==0) continue; // empty line
  365                 if(line[0]=='#') continue;   // comment line
  366                 truncate_at(line,'\r');      // truncate at a \r if there is one.
  367 
  368                 /** If there is a string required in the line and it isn't present, don't use this line */
  369                 if(def.require.size()){
  370                     if(line.find_first_of(def.require)==std::string::npos){
  371                         continue;
  372                     }
  373                 }
  374 
  375                 std::string feature = extract_feature(line);
  376                 if(feature.find('\\')!=std::string::npos){
  377                     feature = unquote_string(feature);  // reverse \xxx encoding
  378                 }
  379                 /** If there is a pattern to use to prune down the feature, use it */
  380                 if(def.pattern.size()){
  381                     std::string new_feature = feature;
  382                     if(!def.reg.search(feature,&new_feature,0,0)){
  383                         // no search match; avoid this feature
  384                         continue;               
  385                     }
  386                     feature = new_feature;
  387                 }
  388         
  389                 /* Remove what follows after \t if this is a context file */
  390                 size_t tab=feature.find('\t');
  391                 if(tab!=std::string::npos) feature.erase(tab); // erase from tab to end
  392                 h.add(feature);
  393             }
  394             f.close();
  395         }
  396         catch (const std::exception &e) {
  397             std::cerr << "ERROR: " << e.what() << " generating histogram "
  398                       << name << "\n";
  399         }
  400             
  401         /* Output what we have to a new file ofname */
  402         std::stringstream real_suffix;
  403 
  404         real_suffix << def.suffix;
  405         if(histogram_counter>0) real_suffix << histogram_counter;
  406         std::string ofname = fname_counter(real_suffix.str()); // histogram name
  407         std::ofstream o;
  408         o.open(ofname.c_str());         // open the file
  409         if(!o.is_open()){
  410             std::cerr << "Cannot open histogram output file: " << ofname << "\n";
  411             return;
  412         }
  413 
  414         HistogramMaker::FrequencyReportVector *fr = h.makeReport();
  415         if(fr->size()>0){
  416             banner_stamp(o,histogram_file_header);
  417             o << *fr;                   // sends the entire histogram
  418         }
  419 
  420         for(size_t i = 0;i<fr->size();i++){
  421             delete fr->at(i);
  422         }
  423         delete fr;
  424         o.close();
  425 
  426         if(f.is_open()==false){
  427             return;     // input file was closed
  428         }
  429     }
  430     std::cerr << "Looped " << max_histogram_files
  431               << " times on histogram; something seems wrong\n";
  432 }
  433 
  434 
  435 void feature_recorder::dump_histogram(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const
  436 {
  437     /* Inform that we are dumping this histogram */
  438     if(cb) cb(user,*this,def,"",0); 
  439 
  440     /* If this is a memory histogram, dump it and return */
  441     mhistograms_t::const_iterator it = mhistograms.find(def);
  442     if(it!=mhistograms.end()){
  443         assert(cb!=0);
  444         mhistogram_callback mcbo(user,cb,def,*this,mhistogram_limit);
  445         it->second->dump_sorted(static_cast<void *>(&mcbo),mhistogram_callback::callback);
  446         return;
  447     }
  448 
  449     if (fs.flag_set(feature_recorder_set::ENABLE_SQLITE3_RECORDERS)) {
  450         dump_histogram_db(def,user,cb);
  451     }
  452     
  453 
  454     if (fs.flag_notset(feature_recorder_set::DISABLE_FILE_RECORDERS)) {
  455         dump_histogram_file(def,user,cb);
  456     }
  457 }
  458 
  459 
  460 /* Dump all of this feature recorders histograms */
  461 
  462 
  463 void feature_recorder::dump_histograms(void *user,feature_recorder::dump_callback_t cb,
  464                                            feature_recorder_set::xml_notifier_t xml_error_notifier) const
  465 {
  466     /* If we are recording features to SQL and we have a histogram defintion
  467      * for this feature recorder, we need to create a base histogram first,
  468      * then we can create the extracted histograms if they are presented.
  469      */
  470 
  471 
  472     /* Loop through all the histograms and dump each one.
  473      * This now works for both memory histograms and non-memory histograms.
  474      */
  475     for(histogram_defs_t::const_iterator it = histogram_defs.begin();it!=histogram_defs.end();it++){
  476         try {
  477             dump_histogram((*it),user,cb);
  478         }
  479         catch (const std::exception &e) {
  480             std::cerr << "ERROR: histogram " << name << ": " << e.what() << "\n";
  481             if(xml_error_notifier){
  482                 std::string error = std::string("<error function='phase3' histogram='")
  483                     + name + std::string("</error>");
  484                 (*xml_error_notifier)(error);
  485             }
  486         }
  487     }
  488 }
  489 
  490 
  491 void feature_recorder::add_histogram(const histogram_def &def)
  492 {
  493     histogram_defs.insert(def);
  494 }
  495 
  496 
  497 
  498 /****************************************************************
  499  *** WRITING SUPPORT
  500  ****************************************************************/
  501 
  502 /* Write to the file.
  503  * This is the only place where writing happens.
  504  * So it's an easy place to do UTF-8 validation in debug mode.
  505  */
  506 void feature_recorder::write(const std::string &str)
  507 {
  508     if(debug & DEBUG_PEDANTIC){
  509         if(utf8::find_invalid(str.begin(),str.end()) != str.end()){
  510             std::cerr << "******************************************\n";
  511             std::cerr << "feature recorder: " << name << "\n";
  512             std::cerr << "invalid UTF-8 in write: " << str << "\n";
  513             assert(0);
  514         }
  515     }
  516 
  517     /* This is where the writing happens. Lock the output and write */
  518     if (fs.flag_set(feature_recorder_set::DISABLE_FILE_RECORDERS)) {
  519         return;
  520     }
  521 
  522     cppmutex::lock lock(Mf);
  523     if(ios.is_open()){
  524         if(count_==0){
  525             banner_stamp(ios,feature_file_header);
  526         }
  527 
  528         ios << str << '\n';
  529         if(ios.fail()){
  530             std::cerr << "DISK FULL\n";
  531             ios.close();
  532         }
  533         count_++;
  534     }
  535 }
  536 
  537 void feature_recorder::printf(const char *fmt, ...)
  538 {
  539     const int maxsize = 65536;
  540     managed_malloc<char>p(maxsize);
  541     
  542     if(p.buf==0) return;
  543 
  544     va_list ap;
  545     va_start(ap,fmt);
  546     vsnprintf(p.buf,maxsize,fmt,ap);
  547     va_end(ap);
  548     this->write(p.buf);
  549 }
  550 
  551 
  552 /**
  553  * Combine the pos0, feature and context into a single line and write it to the feature file.
  554  *
  555  * @param feature - The feature, which is valid UTF8 (but may not be exactly the bytes on the disk)
  556  * @param context - The context, which is valid UTF8 (but may not be exactly the bytes on the disk)
  557  *
  558  * Interlocking is done in write().
  559  */
  560 
  561 void feature_recorder::write0(const pos0_t &pos0,const std::string &feature,const std::string &context)
  562 {
  563     if ( fs.flag_set(feature_recorder_set::ENABLE_SQLITE3_RECORDERS ) &&
  564          this->flag_notset(feature_recorder::FLAG_NO_FEATURES_SQL) ) {
  565         db_write0( pos0, feature, context);
  566     }
  567     if ( fs.flag_notset(feature_recorder_set::DISABLE_FILE_RECORDERS )) {
  568         std::stringstream ss;
  569         ss << pos0.shift( feature_recorder::offset_add).str() << '\t' << feature;
  570         if (flag_notset( FLAG_NO_CONTEXT ) && ( context.size()>0 )) ss << '\t' << context;
  571         this->write( ss.str() );
  572     }
  573 }
  574 
  575 
  576 /**
  577  * the main entry point of writing a feature and its context to the feature file.
  578  * processes the stop list
  579  */
  580 
  581 void feature_recorder::quote_if_necessary(std::string &feature,std::string &context)
  582 {
  583     /* By default quote string that is not UTF-8, and quote backslashes. */
  584     bool escape_bad_utf8  = true;
  585     bool escape_backslash = true;
  586 
  587     if(flags & FLAG_NO_QUOTE){          // don't quote either
  588         escape_bad_utf8  = false;
  589         escape_backslash = false;
  590     }
  591 
  592     if(flags & FLAG_XML){               // only quote bad utf8
  593         escape_bad_utf8  = true;
  594         escape_backslash = false;
  595     }
  596 
  597     feature = validateOrEscapeUTF8(feature, escape_bad_utf8,escape_backslash);
  598     if(feature.size() > opt_max_feature_size) feature.resize(opt_max_feature_size);
  599     if(flag_notset(FLAG_NO_CONTEXT)){
  600         context = validateOrEscapeUTF8(context,escape_bad_utf8,escape_backslash);
  601         if(context.size() > opt_max_context_size) context.resize(opt_max_context_size);
  602     }
  603 }
  604 
  605 /**
  606  * write() is the main entry point for writing a feature at a given position with context.
  607  * write() checks the stoplist and escapes non-UTF8 characters, then calls write0().
  608  */
  609 void feature_recorder::write(const pos0_t &pos0,const std::string &feature_,const std::string &context_)
  610 {
  611     if(flags & FLAG_DISABLED) return;           // disabled
  612     if(debug & DEBUG_PEDANTIC){
  613         if(feature_.size() > opt_max_feature_size){
  614             std::cerr << "feature_recorder::write : feature_.size()=" << feature_.size() << "\n";
  615             assert(0);
  616         }
  617         if(context_.size() > opt_max_context_size){
  618             std::cerr << "feature_recorder::write : context_.size()=" << context_.size() << "\n";
  619             assert(0);
  620         }
  621     }
  622 
  623     std::string feature = feature_;
  624     std::string context = flag_set(FLAG_NO_CONTEXT) ? "" : context_;
  625     std::string *feature_utf8 = HistogramMaker::make_utf8(feature); // a utf8 feature
  626 
  627     quote_if_necessary(feature,context);
  628 
  629     if(feature.size()==0){
  630         std::cerr << name << ": zero length feature at " << pos0 << "\n";
  631         if(debug & DEBUG_PEDANTIC) assert(0);
  632         return;
  633     }
  634     if(debug & DEBUG_PEDANTIC){
  635         /* Check for tabs or newlines in feature and and context */
  636         for(size_t i=0;i<feature.size();i++){
  637             if(feature[i]=='\t') assert(0);
  638             if(feature[i]=='\n') assert(0);
  639             if(feature[i]=='\r') assert(0);
  640         }
  641         for(size_t i=0;i<context.size();i++){
  642             if(context[i]=='\t') assert(0);
  643             if(context[i]=='\n') assert(0);
  644             if(context[i]=='\r') assert(0);
  645         }
  646     }
  647         
  648     /* First check to see if the feature is on the stop list.
  649      * Only do this if we have a stop_list_recorder (the stop list recorder itself
  650      * does not have a stop list recorder. If it did we would infinitely recurse.
  651      */
  652     if(flag_notset(FLAG_NO_STOPLIST) && stop_list_recorder){          
  653         if(fs.stop_list
  654            && fs.stop_list->check_feature_context(*feature_utf8,context)){
  655             stop_list_recorder->write(pos0,feature,context);
  656             delete feature_utf8;
  657             return;
  658         }
  659     }
  660 
  661     /* The alert list is a special features that are called out.
  662      * If we have one of those, write it to the redlist.
  663      */
  664     if(flag_notset(FLAG_NO_ALERTLIST)
  665        && fs.alert_list
  666        && fs.alert_list->check_feature_context(*feature_utf8,context)){
  667         std::string alert_fn = fs.get_outdir() + "/ALERTS_found.txt";
  668         cppmutex::lock lock(Mr);                // notice we are locking the alert list
  669         std::ofstream rf(alert_fn.c_str(),std::ios_base::app);
  670         if(rf.is_open()){
  671             rf << pos0.shift(feature_recorder::offset_add).str() << '\t' << feature << '\t' << "\n";
  672         }
  673     }
  674 
  675     /* Support in-memory histograms */
  676     for(mhistograms_t::iterator it = mhistograms.begin(); it!=mhistograms.end();it++){
  677         const histogram_def &def = it->first;
  678         mhistogram_t *m = it->second;
  679         std::string new_feature = *feature_utf8;
  680         if(def.require.size()==0 || new_feature.find_first_of(def.require)!=std::string::npos){
  681             /* If there is a pattern to use, use it */
  682             if(def.pattern.size()){
  683                 if(!def.reg.search(new_feature,&new_feature,0,0)){
  684                     // no search match; avoid this feature
  685                     new_feature = "";
  686                 }
  687             }
  688             if(new_feature.size()) m->add(new_feature,1);
  689         }
  690     }
  691 
  692     /* Finally write out the feature and the context */
  693     if(flag_notset(FLAG_NO_FEATURES)){
  694         this->write0(pos0,feature,context);
  695     }
  696     delete feature_utf8;
  697 }
  698 
  699 /**
  700  * Given a buffer, an offset into that buffer of the feature, and the length
  701  * of the feature, make the context and write it out. This is mostly used
  702  * for writing from within the lexical analyzers.
  703  */
  704 
  705 void feature_recorder::write_buf(const sbuf_t &sbuf,size_t pos,size_t len)
  706 {
  707 #ifdef DEBUG_SCANNER
  708     if(debug & DEBUG_SCANNER){
  709         std::cerr << "*** write_buf " << name << " sbuf=" << sbuf << " pos=" << pos << " len=" << len << "\n";
  710         // for debugging, print Imagine that when pos= the location where the crash is happening.
  711         // then set a breakpoint at std::cerr.
  712         if(pos==9999999){
  713             std::cerr << "Imagine that\n";
  714         }
  715     }
  716 #endif
  717 
  718     /* If we are in the margin, ignore; it will be processed again */
  719     if(pos >= sbuf.pagesize && pos < sbuf.bufsize){
  720         return;
  721     }
  722 
  723     if(pos >= sbuf.bufsize){    /* Sanity checks */
  724         std::cerr << "*** write_buf: WRITE OUTSIDE BUFFER. "
  725                   << " pos="  << pos
  726                   << " sbuf=" << sbuf << "\n";
  727         return;
  728     }
  729 
  730     /* Asked to write beyond bufsize; bring it in */
  731     if(pos+len > sbuf.bufsize){
  732         len = sbuf.bufsize - pos;
  733     }
  734 
  735     std::string feature = sbuf.substr(pos,len);
  736     std::string context;
  737 
  738     if((flags & FLAG_NO_CONTEXT)==0){
  739         /* Context write; create a clean context */
  740         size_t p0 = context_window_before < pos ? pos-context_window_before : 0;
  741         size_t p1 = pos+len+context_window_after;
  742         
  743         if(p1>sbuf.bufsize) p1 = sbuf.bufsize;
  744         assert(p0<=p1);
  745         context = sbuf.substr(p0,p1-p0);
  746     }
  747     this->write(sbuf.pos0+pos,feature,context);
  748 #ifdef DEBUG_SCANNER
  749     if(debug & DEBUG_SCANNER){
  750         std::cerr << ".\n";
  751     }
  752 #endif
  753 }
  754 
  755 
  756 /**
  757  * replace a character in a string with another
  758  */
  759 std::string replace(const std::string &src,char f,char t)
  760 {
  761     std::string ret;
  762     for(size_t i=0;i<src.size();i++){
  763         if(src[i]==f) ret.push_back(t);
  764         else ret.push_back(src[i]);
  765     }
  766     return ret;
  767 }
  768 
  769 /****************************************************************
  770  *** CARVING SUPPORT
  771  ****************************************************************
  772  *
  773  * Carving support.
  774  * 2014-04-24 - $ is no longer valid either
  775  * 2013-08-29 - replace invalid characters in filenames
  776  * 2013-07-30 - automatically bin directories
  777  * 2013-06-08 - filenames are the forensic path.
  778  */
  779 
  780 std::string valid_dosname(std::string in)
  781 {
  782     std::string out;
  783     for(size_t i=0;i<in.size();i++){
  784         uint8_t ch = in.at(i);
  785         if(ch<=32 || ch>=128
  786            || ch=='"' || ch=='*' || ch=='+' || ch==','
  787            || ch=='/' || ch==':' || ch==';' || ch=='<'
  788            || ch=='=' || ch=='>' || ch=='?' || ch=='\\'
  789            || ch=='[' || ch==']' || ch=='|' || ch=='$' ){
  790             out.push_back('_');
  791         } else {
  792             out.push_back(ch);
  793         }
  794     }
  795     return out;
  796 }
  797         
  798 
  799 //const feature_recorder::hash_def &feature_recorder::hasher()
  800 //{
  801 //    return fs.hasher;
  802 //}
  803 
  804 
  805 
  806 #include <iomanip>
  807 /**
  808  * @param sbuf   - the buffer to carve
  809  * @param pos    - offset in the buffer to carve
  810  * @param len    - how many bytes to carve
  811  *
  812  */
  813 std::string feature_recorder::carve(const sbuf_t &sbuf,size_t pos,size_t len,
  814                                     const std::string &ext)
  815 {
  816     if(flags & FLAG_DISABLED) return std::string();           // disabled
  817 
  818     /* If we are in the margin, ignore; it will be processed again */
  819     if(pos >= sbuf.pagesize && pos < sbuf.bufsize){
  820         return std::string();
  821     }
  822     assert(pos < sbuf.bufsize);
  823     
  824 
  825 
  826     /* Carve to a file depending on the carving mode.  The purpose
  827      * of CARVE_ENCODED is to allow us to carve JPEGs when they are
  828      * embedded in, say, GZIP files, but not carve JPEGs that are
  829      * bare.  The difficulty arises when you have a tool that can go
  830      * into, say, ZIP files. In this case, we don't want to carve
  831      * every ZIP file, just the (for example) XORed ZIP files. So the
  832      * ZIP carver doesn't carve every ZIP file, just the ZIP files
  833      * that are in HIBER files.  That is, we want to not carve a path
  834      * of ZIP-234234 but we do want to carve a path of
  835      * 1000-HIBER-33423-ZIP-2343.  This is implemented by having an
  836      * ignore_encoding. the ZIP carver sets it to ZIP so it won't
  837      * carve things that are just found in a ZIP file. This means that
  838      * it won't carve disembodied ZIP files found in unallocated
  839      * space. You might want to do that.  If so, set ZIP's carve mode
  840      * to CARVE_ALL.
  841      */
  842     switch(carve_mode){
  843     case CARVE_NONE:
  844         return std::string();                         // carve nothing
  845     case CARVE_ENCODED:
  846         if(sbuf.pos0.path.size()==0) return std::string(); // not encoded
  847         if(sbuf.pos0.alphaPart()==ignore_encoding) return std::string(); // ignore if it is just encoded with this
  848         break;                                      // otherwise carve
  849     case CARVE_ALL:
  850         break;
  851     }
  852 
  853     /* If the directory doesn't exist, make it.
  854      * If two threads try to make the directory,
  855      * that's okay, because the second one will fail.
  856      */
  857 
  858     sbuf_t cbuf(sbuf,pos,len);          // the buf we are going to carve
  859     std::string carved_hash_hexvalue = (*fs.hasher.func)(cbuf.buf,cbuf.bufsize);
  860 
  861     /* See if this is in the cache */
  862     bool in_cache = carve_cache.check_for_presence_and_insert(carved_hash_hexvalue);
  863 
  864 
  865     uint64_t this_file_number = file_number_add(in_cache ? 0 : 1); // increment if we are not in the cache
  866     std::string dirname1 = fs.get_outdir() + "/" + name;
  867 
  868     std::stringstream ss;
  869     ss << dirname1 << "/" << std::setw(3) << std::setfill('0') << (this_file_number / 1000);
  870 
  871     std::string dirname2 = ss.str(); 
  872     std::string fname         = dirname2 + std::string("/") + valid_dosname(cbuf.pos0.str() + ext);
  873     std::string fname_feature = fname.substr(fs.get_outdir().size()+1); 
  874 
  875     /* Record what was found in the feature file.
  876      */
  877     if (in_cache){
  878         fname="";             // no filename
  879         fname_feature="<CACHED>";
  880     }
  881 
  882     // write to the feature file
  883     ss.str(std::string()); // clear the stringstream
  884     ss << "<fileobject>";
  885     if (!in_cache) ss << "<filename>" << fname << "</filename>";
  886     ss << "<filesize>" << len << "</filesize>";
  887     ss << "<hashdigest type='" << fs.hasher.name << "'>" << carved_hash_hexvalue << "</hashdigest></fileobject>";
  888     this->write(cbuf.pos0,fname_feature,ss.str());
  889     
  890     if (in_cache) return fname;               // do not make directories or write out if we are cached
  891 
  892     /* Make the directory if it doesn't exist.  */
  893     if (access(dirname2.c_str(),R_OK)!=0){
  894 #ifdef WIN32
  895         mkdir(dirname1.c_str());
  896         mkdir(dirname2.c_str());
  897 #else   
  898         mkdir(dirname1.c_str(),0777);
  899         mkdir(dirname2.c_str(),0777);
  900 #endif
  901     }
  902     /* Check to make sure that directory is there. We don't just the return code
  903      * because there could have been two attempts to make the directory simultaneously,
  904      * so the mkdir could fail but the directory could nevertheless exist. We need to
  905      * remember the error number because the access() call may clear it.
  906      */
  907     int oerrno = errno;                 // remember error number
  908     if (access(dirname2.c_str(),R_OK)!=0){
  909         std::cerr << "Could not make directory " << dirname2 << ": " << strerror(oerrno) << "\n";
  910         return std::string();
  911     }
  912 
  913     /* Write the file into the directory */
  914     int fd = ::open(fname.c_str(),O_CREAT|O_BINARY|O_RDWR,0666);
  915     if(fd<0){
  916         std::cerr << "*** carve: Cannot create " << fname << ": " << strerror(errno) << "\n";
  917         return std::string();
  918     }
  919 
  920     ssize_t ret = cbuf.write(fd,0,len);
  921     if(ret<0){
  922         std::cerr << "*** carve: Cannot write(pos=" << fd << "," << pos << " len=" << len << "): "<< strerror(errno) << "\n";
  923     }
  924     ::close(fd);
  925     return fname;
  926 }
  927 
  928 /**
  929  * Currently, we need strptime() and utimes() to set the time.
  930  */
  931 void feature_recorder::set_carve_mtime(const std::string &fname, const std::string &mtime_iso8601) 
  932 {
  933     if(flags & FLAG_DISABLED) return;           // disabled
  934 #if defined(HAVE_STRPTIME) && defined(HAVE_UTIMES)
  935     if(fname.size()){
  936         struct tm tm;
  937         if(strptime(mtime_iso8601.c_str(),"%Y-%m-%dT%H:%M:%S",&tm)){
  938             time_t t = mktime(&tm);
  939             if(t>0){
  940                 const struct timeval times[2] = {{t,0},{t,0}};
  941                 utimes(fname.c_str(),times);
  942             }
  943         }
  944     }
  945 #endif
  946 }
  947