"Fossies" - the Fresh Open Source Software Archive

Member "tcpflow-1.6.1/src/be13_api/feature_recorder.h" (19 Feb 2021, 15791 Bytes) of package /linux/misc/tcpflow-1.6.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "feature_recorder.h", see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.4.5_vs_1.5.0.

    1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
    2 
    3 #ifndef FEATURE_RECORDER_H
    4 #define FEATURE_RECORDER_H
    5 
    6 /**
    7  * \addtogroup bulk_extractor_APIs
    8  * @{
    9  */
   10 
   11 /**
   12  * feature_recorder.h:
   13  *
   14  * System for recording features from the scanners into the feature files.
   15  *
   16  * There is one feature_recorder per feature file. It is used both to record
   17  * the features and to perform the histogram calculation.
   18  * (That should probably be moved to a different class.) It also previously
   19  * had the ability to do a merge sort, but we took that out because it was
   20  * not necessary.
   21  *
   22  * The feature recorders can also check the global alert_list to see
   23  * if the feature should be written to the alert file. It's opened on
   24  * demand and immediately flushed and closed.  A special mutex is used
   25  * to protect it.
   26  *
   27  * Finally, the feature recorder supports the global stop_list, which
   28  * is a list of features that are not written to the main file but are
   29  * written to a stop list.  That is implemented with a second
   30  * feature_recorder.
   31  *
   32  * There is one feature_recorder_set per process.
   33  * The file assumes that bulk_extractor.h is being included.
   34  */
   35  
   36 #include <string>
   37 #include <cstdarg>
   38 #include <fstream>
   39 #include <set>
   40 #include <map>
   41 #include <cassert>
   42 #include <pthread.h>
   43 
   44 #ifdef HAVE_SQLITE3_H               // use the real SQLite types only when sqlite3.h is available
   45 #include <sqlite3.h>
   46 #ifndef BEAPI_SQLITE                // NOTE(review): guard tests BEAPI_SQLITE but the macros defined below are BEAPI_SQLITE3* -- confirm this is intended
   47 #  define BEAPI_SQLITE3 sqlite3
   48 #  define BEAPI_SQLITE3_STMT sqlite3_stmt
   49 #endif
   50 #endif
   51 
   52 #ifndef BEAPI_SQLITE3               // fallback: both handle types become void, so pointers to them still compile
   53 #define BEAPI_SQLITE3      void
   54 #define BEAPI_SQLITE3_STMT void
   55 #endif
   56 
   57 
   58 #include "cppmutex.h"
   59 #include "dfxml/src/dfxml_writer.h"
   60 #include "dfxml/src/hash_t.h"
   61 #include "atomic_set_map.h"
   62 #include "beregex.h"
   63 
   64 /**
   65  * histogram_def defines the histograms that will be made by a feature recorder.
   66  * If the mhistogram is set, the histogram is generated when features are recorded
   67  * and kept in memory. If mhistogram is not set, the histogram is generated when the feature recorder is closed.
   68  */
   69 
   70 struct histogram_def {
   71     /**
   72      * @param feature_ - the feature file to histogram (no .txt)
   73      * @param re_      - the regular expression to extract
   74      * @param require_ - require this string on the line (usually in context); left empty by the first constructor
   75      * @param suffix_  - the suffix to add to the histogram file after feature name before .txt
   76      * @param flags_   - any flags (defined in histogram.h); defaults to 0
   77      * Strings are taken by const reference, so each argument is copied once, into its member.
   78      */
   79     histogram_def(const std::string &feature_,const std::string &re_,const std::string &suffix_,uint32_t flags_=0):
   80         feature(feature_),pattern(re_),require(),suffix(suffix_),flags(flags_),reg(pattern,REG_EXTENDED){}
   81     histogram_def(const std::string &feature_,const std::string &re_,const std::string &require_,const std::string &suffix_,uint32_t flags_=0):
   82         feature(feature_),pattern(re_),require(require_),suffix(suffix_),flags(flags_),reg(pattern,REG_EXTENDED){}
   83     const std::string feature;      /* feature file name */
   84     const std::string pattern;      /* extract pattern; "" means use entire feature; keep declared before reg */
   85     const std::string require;      /* text required somewhere on the feature line; used for IP histograms */
   86     const std::string suffix;       /* suffix to append; "" means "histogram" */
   87     const uint32_t    flags;        // defined in histogram.h
   88     const beregex     reg;          // regular expression for pattern; built from the pattern member above
   89 };
   90 
   91 /* NOTE:
   92  * 1 - This typedef must remain outside the feature_recorder due
   93  *     to historical reasons and cannot be made a vector
   94  * 2 - Do not make histogram_def const!  It breaks some compilers.
   95  */
   96 
   97 typedef  std::set<histogram_def> histogram_defs_t; // a set of histogram definitions, ordered by operator< on histogram_def
   98 
   99 
  100 inline bool operator <(const histogram_def &h1,const histogram_def &h2)  {
  101     // Lexicographic order on (feature, pattern, suffix); no other member takes part.
  102     const int by_feature = h1.feature.compare(h2.feature);
  103     if (by_feature != 0) return by_feature < 0;
  104     const int by_pattern = h1.pattern.compare(h2.pattern);
  105     if (by_pattern != 0) return by_pattern < 0;
  106     // Feature and pattern are equal: the suffix decides, and equal suffixes give false.
  107     return h1.suffix.compare(h2.suffix) < 0;
  108 }
  109 
  110 inline bool operator !=(const histogram_def &h1,const histogram_def &h2)  {  // compares feature, pattern and suffix only
  111     return !(h1.feature==h2.feature && h1.pattern==h2.pattern && h1.suffix==h2.suffix);
  112 }
  113 
  114 
  115 /* carve object cache: an atomic_set of strings */
  116 typedef atomic_set<std::string> carve_cache_t;
  117 
  118 /* in-memory histograms */
  119 typedef atomic_histogram<std::string,uint64_t> mhistogram_t;             // memory histogram; presumably string key -> uint64_t count
  120 typedef std::map<histogram_def,mhistogram_t *> mhistograms_t;            // maps a histogram_def to a pointer to its memory histogram
  121 
  122 
  123 class feature_recorder {              // one recorder per feature file: records features and builds their histograms
  124     // Non-copyable: the copy constructor and copy assignment are declared
  125     // private and are not implemented.
  126     feature_recorder(const feature_recorder &);
  127     feature_recorder &operator=(const feature_recorder &);
  128 
  129     static uint32_t debug;              // debug value shared by all recorders; assigned by set_debug()
  130     static pthread_t main_threadid;     // main thread's ID; recorded by set_main_threadid()
  131     static void MAINTHREAD();           // guard for main-thread-only calls; asserts the caller is the main thread
  132     uint32_t flags;                     // FLAG_* bits for this feature recorder; tested by flag_set()/flag_notset()
  133     /****************************************************************/
  134 
  135 public:
  136     class besql_stmt {                 // a prepared SQL statement together with the mutex that protects it
  137         besql_stmt(const besql_stmt &);            // copying is disabled (declared private)
  138         besql_stmt &operator=(const besql_stmt &);
  139 public:
  140         cppmutex           Mstmt;      // a mutex to protect stmt
  141         BEAPI_SQLITE3_STMT *stmt;      // the prepared statement
  142         besql_stmt(BEAPI_SQLITE3 *db3,const char *sql);
  143         virtual ~besql_stmt();
  144         void insert_feature(const pos0_t &pos, // insert it into this table!
  145                             const std::string &feature,const std::string &feature8, const std::string &context);
  146     };
  147 
  148     typedef int (dump_callback_t)(void *user,const feature_recorder &fr,const histogram_def &def,
  149                                   const std::string &feature,const uint64_t &count); // callback type taken by the dump_histogram*() methods
  150     static void set_main_threadid(){
  151 #ifndef WIN32
  152         main_threadid=pthread_self();
  153 #endif
  154     };             // record the calling thread as the main thread (does nothing on WIN32)
  155     static void set_debug(uint32_t ndebug){debug=ndebug;}
  156     typedef std::string offset_t;
  157 
  158     /**
  159      * \name Flags that control scanners
  160      * @{
  161      * These flags control scanners.  Set them with set_flag().
  162      */
  163     /** Disable this recorder. */
  164     static const int FLAG_DISABLED         = 0x01;      // feature recorder is Disabled
  165     static const int FLAG_NO_CONTEXT       = 0x02;      // Do not write context.
  166     static const int FLAG_NO_STOPLIST      = 0x04;      // Do not honor the stoplist/alertlist.
  167     static const int FLAG_NO_ALERTLIST     = 0x08;      // Do not honor the stoplist/alertlist.
  168     /**
  169      * Normally feature recorders automatically quote non-UTF8 characters
  170      * with \x00 notation and quote "\" as \x5C. Specify FLAG_NO_QUOTE to
  171      * disable this behavior.
  172      */
  173     static const int FLAG_NO_QUOTE         = 0x10;         // do not escape UTF8 codes
  174 
  175     /**
  176      * Use this flag when the feature recorder is sending UTF-8 XML.
  177      * Non-UTF8 will be quoted but "\" will not be escaped.
  178      */
  179     static const int FLAG_XML              = 0x20;         // will be sending XML
  180 
  181     /**
  182      * histogram support.
  183      */
  184     static const uint32_t FLAG_NO_FEATURES = 0x40;  // do not record features (just memory histogram)
  185     static const uint32_t FLAG_NO_FEATURES_SQL = 0x80;  // do not write features to SQL
  186 
  187     /** @} */
  188     static const int max_histogram_files = 10;  // don't make more than 10 files in low-memory conditions
  189     static const std::string histogram_file_header;
  190     static const std::string feature_file_header;
  191     static const std::string bulk_extractor_version_header;
  192 
  193     // These must only be changed in the main thread:
  194     static uint32_t    opt_max_context_size;
  195     static uint32_t    opt_max_feature_size;
  196     static int64_t     offset_add;          // added to every reported offset, for use with hadoop
  197     static std::string banner_file;         // banner for top of every file
  198     static std::string extract_feature(const std::string &line);
  199 
  200     feature_recorder(class feature_recorder_set &fs,
  201                      const std::string &name);
  202     virtual        ~feature_recorder();
  203     virtual void   set_flag(uint32_t flags_);
  204     virtual void   unset_flag(uint32_t flags_);
  205     void           enable_memory_histograms();              // only called from feature_recorder_set
  206     virtual void   set_memhist_limit(int64_t limit_);
  207     bool           flag_set(uint32_t f)    const {return flags & f;}      // true if any bit of f is set in flags
  208     bool           flag_notset(uint32_t f) const {return !(flags & f);}   // true if no bit of f is set in flags
  209     uint32_t       get_flags()             const {return flags;}
  210     virtual const std::string &get_outdir() const;
  211 
  212     static size_t context_window_default; // global option
  213     const  std::string name;                  // name of this feature recorder 
  214 
  215 private:
  216     std::string  ignore_encoding;            // encoding to ignore for carving
  217     std::fstream ios;                        // where features are written 
  218     
  219     class besql_stmt *bs;                    // prepared beapi sql statement
  220 
  221 protected:;
  222     histogram_defs_t      histogram_defs;    // histograms that are to be created for this feature recorder
  223 public:
  224     class        feature_recorder_set &fs;   // the set in which this feature_recorder resides
  225 protected:
  226     int64_t      count_;                     /* number of records written */
  227     size_t       context_window_before;      // context window
  228     size_t       context_window_after;       // context window
  229 
  230     mutable cppmutex Mf;                     // protects the file  & file_number_
  231     mutable cppmutex Mr;                     // protects the redlist 
  232     mhistograms_t mhistograms;               // the memory histograms, if we are using them
  233     uint64_t      mhistogram_limit;          // how many we want (per feature recorder limit, rather than per histogram)
  234 
  235     
  236     class feature_recorder *stop_list_recorder; // where stopped features get written
  237     int64_t                file_number_;            /* starts at 0; gets incremented by carve(); */
  238     carve_cache_t          carve_cache;
  239 public:
  240     /* these are not threadsafe and should only be called in startup; each one calls MAINTHREAD() first */
  241     void set_stop_list_recorder(class feature_recorder *fr){
  242         MAINTHREAD();
  243         stop_list_recorder = fr;
  244     }
  245     void set_context_window(size_t win){
  246         MAINTHREAD();
  247         context_window_before = win;
  248         context_window_after = win;
  249     }
  250     void set_context_window_before(size_t win){ MAINTHREAD(); context_window_before = win;}
  251     void set_context_window_after(size_t win){ MAINTHREAD(); context_window_after = win; }
  252     void set_carve_ignore_encoding(const std::string &encoding){ MAINTHREAD();ignore_encoding = encoding;}
  253     /* End non-threadsafe */
  254 
  255     uint64_t file_number_add(uint64_t i){    // adds i to file_number_ and returns the new value
  256 #ifdef HAVE___SYNC_ADD_AND_FETCH
  257         return __sync_add_and_fetch(&file_number_,i);
  258 #else
  259         cppmutex::lock lock(Mf);             // no atomic builtin: do the update while holding Mf
  260         file_number_ += i;
  261         return file_number_;
  262 #endif
  263     }
  264 
  265     void   banner_stamp(std::ostream &os,const std::string &header) const; // stamp banner, and header
  266 
  267     /* where stopped items (on stop_list or context_stop_list) get recorded: */
  268     std::string        fname_counter(std::string suffix) const;
  269     static std::string quote_string(const std::string &feature); // turns unprintable characters to octal escape (NOTE(review): FLAG_NO_QUOTE docs describe \x00-style escapes -- confirm)
  270     static std::string unquote_string(const std::string &feature); // turns octal escape back to binary characters
  271 
  272     //virtual const feature_recorder_set::hash_def &hasher();   // returns hasher in feature_recorder_set
  273 
  274     /* feature file management */
  275     virtual void open();
  276     virtual void close();                       
  277     virtual void flush();
  278     static  int  dump_callback_test(void *user,const feature_recorder &fr,
  279                                     const std::string &str,const uint64_t &count); // test callback for you to use! NOTE(review): lacks the histogram_def parameter of dump_callback_t -- confirm
  280     
  281 
  282     /* TK: The histogram_def should be provided at the beginning, so it can be used for in-memory histograms.
  283      * The callback needs to have the specific atomic set as the callback as well.
  284      */
  285     virtual void add_histogram(const histogram_def &def); // adds a histogram to process
  286     virtual void dump_histogram_file(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const;
  287     virtual void dump_histogram_db(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const;
  288     virtual void dump_histogram(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const;
  289     typedef void (*xml_notifier_t)(const std::string &xmlstring);
  290     virtual void dump_histograms(void *user,feature_recorder::dump_callback_t cb, xml_notifier_t xml_error_notifier) const;
  291     
  292     /* Methods to get info */
  293     uint64_t count() const {return count_;}
  294 
  295     /* Methods to write.
  296      * write() is the basic write - you say where, and it does it.
  297      * write_buf() writes from a position within the buffer, with context.
  298      *             It won't write a feature that starts in the margin.
  299      * pos0 gives the location and prefix for the beginning of the buffer
  300      */ 
  301 
  302 
  303     /****************************************************************
  304      *** External entry points.
  305      ****************************************************************/
  306 
  307     /**
  308      * write() actually does the writing to the file.
  309      * It uses locks and is threadsafe.
  310      * Callers therefore do not need locks.
  311      */
  312     virtual void write(const std::string &str);
  313 
  314     /**
  315      * support for writing features
  316      */
  317 
  318     void quote_if_necessary(std::string &feature,std::string &context);
  319 
  320     // only virtual functions may be called by plug-ins
  321     // printf() prints to the feature file.
  322     virtual void printf(const char *fmt_,...) __attribute__((format(printf, 2, 3))); // indices are 2,3 because argument 1 is the implicit this
  323     // 
  324     // write a feature and its context; the feature may be in the context, but doesn't need to be.
  325     // write() calls write0() after histogram, quoting, and stoplist processing
  326     // write0() calls write0_sql() if sqlwriting is enabled (NOTE(review): only db_write0() is declared in this class -- confirm the name)
  327     virtual void write0(const pos0_t &pos0,const std::string &feature,const std::string &context);  
  328 private:
  329     virtual void db_write0(const pos0_t &pos0,const std::string &feature,const std::string &context);  
  330     static const char *db_insert_stmt;
  331 public:
  332 
  333     // write a feature and its context; the feature may be in the context, but doesn't need to be.
  334     // entries processed by write below will be processed by histogram system
  335     virtual void write(const pos0_t &pos0,const std::string &feature,const std::string &context);  
  336 
  337     // write a feature located at a given place within an sbuf.
  338     // Context is written automatically
  339     virtual void write_buf(const sbuf_t &sbuf,size_t pos,size_t len); /* writes with context */
  340 
  341     /**
  342      * support for carving.
  343      * Carving writes the filename to the feature file; the context is the file's hash using the provided function.
  344      * Automatically de-duplicates.
  345      */
  346     enum carve_mode_t {
  347         CARVE_NONE=0,
  348         CARVE_ENCODED=1,
  349         CARVE_ALL=2};
  350 #define CARVE_MODE_DESCRIPTION "0=carve none; 1=carve encoded; 2=carve all"
  351     carve_mode_t carve_mode;
  352     typedef      std::string (*hashing_function_t)(const sbuf_t &sbuf); // returns a hex value
  353     void         set_carve_mode(carve_mode_t aMode){MAINTHREAD();carve_mode=aMode;}
  354 
  355     // Carve a file; returns filename of carved file or empty string if nothing carved
  356     virtual std::string carve(const sbuf_t &sbuf,size_t pos,size_t len, 
  357                               const std::string &ext); // appended to forensic path
  358     // Set the modification time of the carved file fname; mtime_iso8601 is presumably an ISO 8601 timestamp string
  359     virtual void set_carve_mtime(const std::string &fname, const std::string &mtime_iso8601);
  360 };
  361 
  362 // Main-thread guard: asserts that the caller is the thread stored in main_threadid (no check on WIN32 or with NDEBUG).
  363 inline void feature_recorder::MAINTHREAD()
  364 {
  365 #ifndef WIN32
  366         assert(pthread_equal(main_threadid,pthread_self())); // pthread_t is opaque: compare with pthread_equal(), not ==
  367 #endif
  368 }
  369 
  370 
  371 /** @} */
  372 
  373 #endif