tcpflow  1.6.1
About: tcpflow is a TCP/IP packet demultiplexer that captures data transmitted as part of TCP connections (flows), and stores the data in a way that is convenient for protocol analysis and debugging.
  Fossies Dox: tcpflow-1.6.1.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

feature_recorder.h
Go to the documentation of this file.
1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 
3 #ifndef FEATURE_RECORDER_H
4 #define FEATURE_RECORDER_H
5 
6 /**
7  * \addtogroup bulk_extractor_APIs
8  * @{
9  */
10 
11 /**
12  * feature_recorder.h:
13  *
14  * System for recording features from the scanners into the feature files.
15  *
16  * There is one feature_recorder per feature file. It is used both to record
17  * the features and to perform the histogram calculation.
18  * (That should probably be moved to a different class.) It also previously
19  * had the ability to do a merge sort, but we took that out because it was
20  * not necessary.
21  *
22  * The feature recorders can also check the global alert_list to see
23  * if the feature should be written to the alert file. It's opened on
24  * demand and immediately flushed and closed. A special mutex is used
25  * to protect it.
26  *
27  * Finally, the feature recorder supports the global stop_list, which
28  * is a list of features that are not written to the main file but are
29  * written to a stop list. That is implemented with a second
30  * feature_recorder.
31  *
32  * There is one feature_recorder_set per process.
33  * The file assumes that bulk_extractor.h is being included.
34  */
35 
36 #include <string>
37 #include <cstdarg>
38 #include <fstream>
39 #include <set>
40 #include <map>
41 #include <cassert>
42 #include <pthread.h>
43 
44 #ifdef HAVE_SQLITE3_H
45 #include <sqlite3.h>
46 #ifndef BEAPI_SQLITE
47 # define BEAPI_SQLITE3 sqlite3
48 # define BEAPI_SQLITE3_STMT sqlite3_stmt
49 #endif
50 #endif
51 
52 #ifndef BEAPI_SQLITE3
53 #define BEAPI_SQLITE3 void
54 #define BEAPI_SQLITE3_STMT void
55 #endif
56 
57 
58 #include "cppmutex.h"
59 #include "dfxml/src/dfxml_writer.h"
60 #include "dfxml/src/hash_t.h"
61 #include "atomic_set_map.h"
62 #include "beregex.h"
63 
64 /**
65  * histogram_def defines the histograms that will be made by a feature recorder.
66  * If the mhistogram is set, the histogram is generated when features are recorded
67  * and kept in memory. If mhistogram is not set, the histogram is generated when the feature recorder is closed.
68  */
69 
70 struct histogram_def {
71  /**
72  * @param feature- the feature file to histogram (no .txt)
73  * @param re - the regular expression to extract
74  * @param require- require this string on the line (usually in context)
75  * @param suffix - the suffix to add to the histogram file after feature name before .txt
76  * @param flags - any flags (see above)
77  */
78 
79  histogram_def(std::string feature_,std::string re_,std::string suffix_,uint32_t flags_=0):
80  feature(feature_),pattern(re_),require(),suffix(suffix_),flags(flags_),reg(pattern,REG_EXTENDED){}
81  histogram_def(std::string feature_,std::string re_,std::string require_,std::string suffix_,uint32_t flags_=0):
82  feature(feature_),pattern(re_),require(require_),suffix(suffix_),flags(flags_),reg(pattern,REG_EXTENDED){ }
83  const std::string feature; /* feature file name */
84  const std::string pattern; /* extract pattern; "" means use entire feature */
85  const std::string require; /* text required somewhere on the feature line; used for IP histograms */
86  const std::string suffix; /* suffix to append; "" means "histogram" */
87  const uint32_t flags; // defined in histogram.h
88  const beregex reg; // regular expression for pattern
89 };
90 
91 /* NOTE:
92  * 1 - This typedef must remain outside the feature_recorder due
93  * to historical reasons and cannot be made a vector
94  * 2 - Do not make histogram_def const! It breaks some compilers.
95  */
96 
97 typedef std::set<histogram_def> histogram_defs_t; // a set of histogram definitions
98 
99 
100 inline bool operator <(const histogram_def &h1,const histogram_def &h2) {
101  if (h1.feature<h2.feature) return true;
102  if (h1.feature>h2.feature) return false;
103  if (h1.pattern<h2.pattern) return true;
104  if (h1.pattern>h2.pattern) return false;
105  if (h1.suffix<h2.suffix) return true;
106  if (h1.suffix>h2.suffix) return false;
107  return false; /* equal */
108 };
109 
110 inline bool operator !=(const histogram_def &h1,const histogram_def &h2) {
111  return h1.feature!=h2.feature || h1.pattern!=h2.pattern || h1.suffix!=h2.suffix;
112 };
113 
114 
115 /* carve object cache */
117 
118 /* in-memory histograms */
120 typedef std::map<histogram_def,mhistogram_t *> mhistograms_t;
121 
122 
124  // default copy construction and assignment are meaningless
125  // and not implemented
128 
129  static uint32_t debug; // are we debugging?
130  static pthread_t main_threadid; // main threads ID
131  static void MAINTHREAD(); // called if can only be run in the main thread
132  uint32_t flags; // flags for this feature recorder
133  /****************************************************************/
134 
135 public:
136  class besql_stmt {
139 public:
140  cppmutex Mstmt; // a mutext to protect it
141  BEAPI_SQLITE3_STMT *stmt; // the prepared statement
142  besql_stmt(BEAPI_SQLITE3 *db3,const char *sql);
143  virtual ~besql_stmt();
144  void insert_feature(const pos0_t &pos, // insert it into this table!
145  const std::string &feature,const std::string &feature8, const std::string &context);
146  };
147 
148  typedef int (dump_callback_t)(void *user,const feature_recorder &fr,const histogram_def &def,
149  const std::string &feature,const uint64_t &count);
150  static void set_main_threadid(){
151 #ifndef WIN32
152  main_threadid=pthread_self();
153 #endif
154  }; // set the main
155  static void set_debug(uint32_t ndebug){debug=ndebug;}
156  typedef std::string offset_t;
157 
158  /**
159  * \name Flags that control scanners
160  * @{
161  * These flags control scanners. Set them with set_flag().
162  */
163  /** Disable this recorder. */
164  static const int FLAG_DISABLED = 0x01; // feature recorder is Disabled
165  static const int FLAG_NO_CONTEXT = 0x02; // Do not write context.
166  static const int FLAG_NO_STOPLIST = 0x04; // Do not honor the stoplist/alertlist.
167  static const int FLAG_NO_ALERTLIST = 0x08; // Do not honor the stoplist/alertlist.
168  /**
169  * Normally feature recorders automatically quote non-UTF8 characters
170  * with \x00 notation and quote "\" as \x5C. Specify FLAG_NO_QUOTE to
171  * disable this behavior.
172  */
173  static const int FLAG_NO_QUOTE = 0x10; // do not escape UTF8 codes
174 
175  /**
176  * Use this flag when the feature recorder is sending UTF-8 XML.
177  * non-UTF8 will be quoted but "\" will not be escaped.
178  */
179  static const int FLAG_XML = 0x20; // will be sending XML
180 
181  /**
182  * histogram support.
183  */
184  static const uint32_t FLAG_NO_FEATURES = 0x40; // do not record features (just memory histogram)
185  static const uint32_t FLAG_NO_FEATURES_SQL = 0x80; // do not write features to SQL
186 
187  /** @} */
188  static const int max_histogram_files = 10; // don't make more than 10 files in low-memory conditions
189  static const std::string histogram_file_header;
190  static const std::string feature_file_header;
191  static const std::string bulk_extractor_version_header;
192 
193  // These must only be changed in the main thread:
196  static int64_t offset_add; // added to every reported offset, for use with hadoop
197  static std::string banner_file; // banner for top of every file
198  static std::string extract_feature(const std::string &line);
199 
201  const std::string &name);
202  virtual ~feature_recorder();
203  virtual void set_flag(uint32_t flags_);
204  virtual void unset_flag(uint32_t flags_);
205  void enable_memory_histograms(); // only called from feature_recorder_set
206  virtual void set_memhist_limit(int64_t limit_);
207  bool flag_set(uint32_t f) const {return flags & f;}
208  bool flag_notset(uint32_t f) const {return !(flags & f);}
209  uint32_t get_flags() const {return flags;}
210  virtual const std::string &get_outdir() const;
211 
212  static size_t context_window_default; // global option
213  const std::string name; // name of this feature recorder
214 
215 private:
216  std::string ignore_encoding; // encoding to ignore for carving
217  std::fstream ios; // where features are written
218 
219  class besql_stmt *bs; // prepared beapi sql statement
220 
221 protected:;
222  histogram_defs_t histogram_defs; // histograms that are to be created for this feature recorder
223 public:
224  class feature_recorder_set &fs; // the set in which this feature_recorder resides
225 protected:
226  int64_t count_; /* number of records written */
227  size_t context_window_before; // context window
228  size_t context_window_after; // context window
229 
230  mutable cppmutex Mf; // protects the file & file_number_
231  mutable cppmutex Mr; // protects the redlist
232  mhistograms_t mhistograms; // the memory histograms, if we are using them
233  uint64_t mhistogram_limit; // how many we want (per feature recorder limit, rather than per histogram)
234 
235 
236  class feature_recorder *stop_list_recorder; // where stopped features get written
237  int64_t file_number_; /* starts at 0; gets incremented by carve(); */
239 public:
240  /* these are not threadsafe and should only be called in startup */
242  MAINTHREAD();
243  stop_list_recorder = fr;
244  }
245  void set_context_window(size_t win){
246  MAINTHREAD();
247  context_window_before = win;
248  context_window_after = win;
249  }
252  void set_carve_ignore_encoding(const std::string &encoding){ MAINTHREAD();ignore_encoding = encoding;}
253  /* End non-threadsafe */
254 
255  uint64_t file_number_add(uint64_t i){
256 #ifdef HAVE___SYNC_ADD_AND_FETCH
257  return __sync_add_and_fetch(&file_number_,i);
258 #else
259  cppmutex::lock lock(Mf);
260  file_number_ += i;
261  return file_number_;
262 #endif
263  }
264 
265  void banner_stamp(std::ostream &os,const std::string &header) const; // stamp banner, and header
266 
267  /* where stopped items (on stop_list or context_stop_list) get recorded: */
268  std::string fname_counter(std::string suffix) const;
269  static std::string quote_string(const std::string &feature); // turns unprintable characters to octal escape
270  static std::string unquote_string(const std::string &feature); // turns octal escape back to binary characters
271 
272  //virtual const feature_recorder_set::hash_def &hasher(); // returns hasher in feature_recorder_set
273 
274  /* feature file management */
275  virtual void open();
276  virtual void close();
277  virtual void flush();
278  static int dump_callback_test(void *user,const feature_recorder &fr,
279  const std::string &str,const uint64_t &count); // test callback for you to use!
280 
281 
282  /* TK: The histogram_def should be provided at the beginning, so it can be used for in-memory histograms.
283  * The callback needs to have the specific atomic set as the callback as well.
284  */
285  virtual void add_histogram(const histogram_def &def); // adds a histogram to process
286  virtual void dump_histogram_file(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const;
287  virtual void dump_histogram_db(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const;
288  virtual void dump_histogram(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const;
289  typedef void (*xml_notifier_t)(const std::string &xmlstring);
290  virtual void dump_histograms(void *user,feature_recorder::dump_callback_t cb, xml_notifier_t xml_error_notifier) const;
291 
292  /* Methods to get info */
293  uint64_t count() const {return count_;}
294 
295  /* Methods to write.
296  * write() is the basic write - you say where, and it does it.
297  * write_buf() writes from a position within the buffer, with context.
298  * It won't write a feature that starts in the margin.
299  * pos0 gives the location and prefix for the beginning of the buffer
300  */
301 
302 
303  /****************************************************************
304  *** External entry points.
305  ****************************************************************/
306 
307  /**
308  * write() actually does the writing to the file.
309  * It uses locks and is threadsafe.
310  * Callers therefore do not need locks.
311  */
312  virtual void write(const std::string &str);
313 
314  /**
315  * support for writing features
316  */
317 
318  void quote_if_necessary(std::string &feature,std::string &context);
319 
320  // only virtual functions may be called by plug-ins
321  // printf() prints to the feature file.
322  virtual void printf(const char *fmt_,...) __attribute__((format(printf, 2, 3)));
323  //
324  // write a feature and its context; the feature may be in the context, but doesn't need to be.
325  // write() calls write0() after histogram, quoting, and stoplist processing
326  // write0() calls write0_sql() if sqlwriting is enabled
327  virtual void write0(const pos0_t &pos0,const std::string &feature,const std::string &context);
328 private:
329  virtual void db_write0(const pos0_t &pos0,const std::string &feature,const std::string &context);
330  static const char *db_insert_stmt;
331 public:
332 
333  // write a feature and its context; the feature may be in the context, but doesn't need to be.
334  // entries processed by write below will be processed by histogram system
335  virtual void write(const pos0_t &pos0,const std::string &feature,const std::string &context);
336 
337  // write a feature located at a given place within an sbuf.
338  // Context is written automatically
339  virtual void write_buf(const sbuf_t &sbuf,size_t pos,size_t len); /* writes with context */
340 
341  /**
342  * support for carving.
343  * Carving writes the filename to the feature file; the context is the file's hash using the provided function.
344  * Automatically de-duplicates.
345  */
350 #define CARVE_MODE_DESCRIPTION "0=carve none; 1=carve encoded; 2=carve all"
352  typedef std::string (*hashing_function_t)(const sbuf_t &sbuf); // returns a hex value
354 
355  // Carve a file; returns filename of carved file or empty string if nothing carved
356  virtual std::string carve(const sbuf_t &sbuf,size_t pos,size_t len,
357  const std::string &ext); // appended to forensic path
358  // Set the mtime of the carved file to the given ISO 8601 time
359  virtual void set_carve_mtime(const std::string &fname, const std::string &mtime_iso8601);
360 };
361 
362 // function that can only be called from main thread
364 {
365 #ifndef WIN32
366  assert(main_threadid==pthread_self());
367 #endif
368 };
369 
370 
371 /** @} */
372 
373 #endif
void insert_feature(const pos0_t &pos, const std::string &feature, const std::string &feature8, const std::string &context)
besql_stmt(const besql_stmt &)
besql_stmt & operator=(const besql_stmt &)
virtual void write_buf(const sbuf_t &sbuf, size_t pos, size_t len)
static std::string banner_file
static uint32_t opt_max_context_size
virtual void db_write0(const pos0_t &pos0, const std::string &feature, const std::string &context)
static const uint32_t FLAG_NO_FEATURES_SQL
uint64_t file_number_add(uint64_t i)
virtual void dump_histogram(const histogram_def &def, void *user, feature_recorder::dump_callback_t cb) const
virtual ~feature_recorder()
void(* xml_notifier_t)(const std::string &xmlstring)
static const uint32_t FLAG_NO_FEATURES
static int64_t offset_add
class besql_stmt * bs
virtual void set_flag(uint32_t flags_)
void banner_stamp(std::ostream &os, const std::string &header) const
virtual void unset_flag(uint32_t flags_)
virtual void write(const std::string &str)
static const int FLAG_NO_QUOTE
std::string(* hashing_function_t)(const sbuf_t &sbuf)
uint32_t get_flags() const
virtual const std::string & get_outdir() const
static const int FLAG_NO_ALERTLIST
static const char * db_insert_stmt
static const int max_histogram_files
static size_t context_window_default
static std::string extract_feature(const std::string &line)
virtual void write0(const pos0_t &pos0, const std::string &feature, const std::string &context)
std::string offset_t
void set_carve_ignore_encoding(const std::string &encoding)
const std::string name
static const int FLAG_NO_CONTEXT
static std::string quote_string(const std::string &feature)
uint64_t count() const
feature_recorder(const feature_recorder &)
static const int FLAG_NO_STOPLIST
static const int FLAG_XML
virtual void close()
static pthread_t main_threadid
virtual void set_memhist_limit(int64_t limit_)
class feature_recorder * stop_list_recorder
void set_context_window_after(size_t win)
virtual void add_histogram(const histogram_def &def)
int() dump_callback_t(void *user, const feature_recorder &fr, const histogram_def &def, const std::string &feature, const uint64_t &count)
static int dump_callback_test(void *user, const feature_recorder &fr, const std::string &str, const uint64_t &count)
virtual void dump_histograms(void *user, feature_recorder::dump_callback_t cb, xml_notifier_t xml_error_notifier) const
static void set_debug(uint32_t ndebug)
void quote_if_necessary(std::string &feature, std::string &context)
std::string ignore_encoding
bool flag_notset(uint32_t f) const
virtual std::string carve(const sbuf_t &sbuf, size_t pos, size_t len, const std::string &ext)
static void set_main_threadid()
void set_carve_mode(carve_mode_t aMode)
static uint32_t opt_max_feature_size
virtual void flush()
virtual void dump_histogram_file(const histogram_def &def, void *user, feature_recorder::dump_callback_t cb) const
static uint32_t debug
static const int FLAG_DISABLED
void set_context_window_before(size_t win)
static const std::string bulk_extractor_version_header
static std::string unquote_string(const std::string &feature)
mhistograms_t mhistograms
std::string fname_counter(std::string suffix) const
carve_cache_t carve_cache
carve_mode_t carve_mode
void set_context_window(size_t win)
class feature_recorder_set & fs
virtual void open()
static const std::string feature_file_header
feature_recorder & operator=(const feature_recorder &)
void set_stop_list_recorder(class feature_recorder *fr)
virtual void printf(const char *fmt_,...)
histogram_defs_t histogram_defs
static const std::string histogram_file_header
virtual void dump_histogram_db(const histogram_def &def, void *user, feature_recorder::dump_callback_t cb) const
bool flag_set(uint32_t f) const
virtual void set_carve_mtime(const std::string &fname, const std::string &mtime_iso8601)
Definition: sbuf.h:70
Definition: sbuf.h:221
std::map< histogram_def, mhistogram_t * > mhistograms_t
atomic_set< std::string > carve_cache_t
std::set< histogram_def > histogram_defs_t
atomic_histogram< std::string, uint64_t > mhistogram_t
bool operator<(const histogram_def &h1, const histogram_def &h2)
bool operator!=(const histogram_def &h1, const histogram_def &h2)
static void MAINTHREAD()
#define BEAPI_SQLITE3_STMT
#define BEAPI_SQLITE3
flags
Definition: http_parser.h:216
unsigned int uint32_t
Definition: core.h:40
const std::string feature
histogram_def(std::string feature_, std::string re_, std::string suffix_, uint32_t flags_=0)
histogram_def(std::string feature_, std::string re_, std::string require_, std::string suffix_, uint32_t flags_=0)
const std::string pattern
const uint32_t flags
const std::string require
const std::string suffix
const beregex reg