"Fossies" - the Fresh Open Source Software Archive 
Member "tcpflow-1.6.1/src/be13_api/feature_recorder.h" (19 Feb 2021, 15791 Bytes) of package /linux/misc/tcpflow-1.6.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "feature_recorder.h" see the
Fossies "Dox" file reference documentation and the last
Fossies "Diffs" side-by-side code changes report:
1.4.5_vs_1.5.0.
1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 #ifndef FEATURE_RECORDER_H
4 #define FEATURE_RECORDER_H
5
6 /**
7 * \addtogroup bulk_extractor_APIs
8 * @{
9 */
10
11 /**
12 * feature_recorder.h:
13 *
14 * System for recording features from the scanners into the feature files.
15 *
16 * There is one feature_recorder per feature file. It is used both to record
17 * the features and to perform the histogram calculation.
18 * (That should probably be moved to a different class.) It also previously
19 * had the ability to do a merge sort, but we took that out because it was
20 * not necessary.
21 *
22 * The feature recorders can also check the global alert_list to see
23 * if the feature should be written to the alert file. It's opened on
24 * demand and immediately flushed and closed. A special mutex is used
25 * to protect it.
26 *
27 * Finally, the feature recorder supports the global stop_list, which
28 * is a list of features that are not written to the main file but are
29 * written to a stop list. That is implemented with a second
30 * feature_recorder.
31 *
32 * There is one feature_recorder_set per process.
33 * The file assumes that bulk_extractor.h is being included.
34 */
35
36 #include <string>
37 #include <cstdarg>
38 #include <fstream>
39 #include <set>
40 #include <map>
41 #include <cassert>
42 #include <pthread.h>
43
/*
 * BEAPI_SQLITE3 / BEAPI_SQLITE3_STMT name the SQLite handle types used in
 * declarations in this header.  When sqlite3.h is available they map to the
 * real sqlite3 types; otherwise they fall back to void so that pointers to
 * them can still be declared.
 *
 * The guard checks BEAPI_SQLITE3 (the macro actually being defined) so an
 * earlier definition is respected.  The historical BEAPI_SQLITE spelling is
 * still honored so existing builds that define it keep their behavior.
 */
#ifdef HAVE_SQLITE3_H
#include <sqlite3.h>
#if !defined(BEAPI_SQLITE) && !defined(BEAPI_SQLITE3)
#  define BEAPI_SQLITE3 sqlite3
#  define BEAPI_SQLITE3_STMT sqlite3_stmt
#endif
#endif

#ifndef BEAPI_SQLITE3
#define BEAPI_SQLITE3 void
#define BEAPI_SQLITE3_STMT void
#endif
56
57
58 #include "cppmutex.h"
59 #include "dfxml/src/dfxml_writer.h"
60 #include "dfxml/src/hash_t.h"
61 #include "atomic_set_map.h"
62 #include "beregex.h"
63
64 /**
65 * histogram_def defines the histograms that will be made by a feature recorder.
66 * If the mhistogram is set, the histogram is generated when features are recorded
67 * and kept in memory. If mhistogram is not set, the histogram is generated when the feature recorder is closed.
68 */
69
70 struct histogram_def {
71 /**
72 * @param feature- the feature file to histogram (no .txt)
73 * @param re - the regular expression to extract
74 * @param require- require this string on the line (usually in context)
75 * @param suffix - the suffix to add to the histogram file after feature name before .txt
76 * @param flags - any flags (see above)
77 */
78
79 histogram_def(std::string feature_,std::string re_,std::string suffix_,uint32_t flags_=0):
80 feature(feature_),pattern(re_),require(),suffix(suffix_),flags(flags_),reg(pattern,REG_EXTENDED){}
81 histogram_def(std::string feature_,std::string re_,std::string require_,std::string suffix_,uint32_t flags_=0):
82 feature(feature_),pattern(re_),require(require_),suffix(suffix_),flags(flags_),reg(pattern,REG_EXTENDED){ }
83 const std::string feature; /* feature file name */
84 const std::string pattern; /* extract pattern; "" means use entire feature */
85 const std::string require; /* text required somewhere on the feature line; used for IP histograms */
86 const std::string suffix; /* suffix to append; "" means "histogram" */
87 const uint32_t flags; // defined in histogram.h
88 const beregex reg; // regular expression for pattern
89 };
90
91 /* NOTE:
92 * 1 - This typedef must remain outside the feature_recorder due
93 * to historical reasons and cannot be made a vector
94 * 2 - Do not make histogram_def const! It breaks some compilers.
95 */
96
97 typedef std::set<histogram_def> histogram_defs_t; // a set of histogram definitions
98
99
100 inline bool operator <(const histogram_def &h1,const histogram_def &h2) {
101 if (h1.feature<h2.feature) return true;
102 if (h1.feature>h2.feature) return false;
103 if (h1.pattern<h2.pattern) return true;
104 if (h1.pattern>h2.pattern) return false;
105 if (h1.suffix<h2.suffix) return true;
106 if (h1.suffix>h2.suffix) return false;
107 return false; /* equal */
108 };
109
110 inline bool operator !=(const histogram_def &h1,const histogram_def &h2) {
111 return h1.feature!=h2.feature || h1.pattern!=h2.pattern || h1.suffix!=h2.suffix;
112 };
113
114
115 /* carve object cache */
116 typedef atomic_set<std::string> carve_cache_t;
117
118 /* in-memory histograms */
119 typedef atomic_histogram<std::string,uint64_t> mhistogram_t; // memory histogram
120 typedef std::map<histogram_def,mhistogram_t *> mhistograms_t;
121
122
123 class feature_recorder {
124 // default copy construction and assignment are meaningless
125 // and not implemented
126 feature_recorder(const feature_recorder &);
127 feature_recorder &operator=(const feature_recorder &);
128
129 static uint32_t debug; // are we debugging?
130 static pthread_t main_threadid; // main threads ID
131 static void MAINTHREAD(); // called if can only be run in the main thread
132 uint32_t flags; // flags for this feature recorder
133 /****************************************************************/
134
135 public:
136 class besql_stmt {
137 besql_stmt(const besql_stmt &);
138 besql_stmt &operator=(const besql_stmt &);
139 public:
140 cppmutex Mstmt; // a mutext to protect it
141 BEAPI_SQLITE3_STMT *stmt; // the prepared statement
142 besql_stmt(BEAPI_SQLITE3 *db3,const char *sql);
143 virtual ~besql_stmt();
144 void insert_feature(const pos0_t &pos, // insert it into this table!
145 const std::string &feature,const std::string &feature8, const std::string &context);
146 };
147
148 typedef int (dump_callback_t)(void *user,const feature_recorder &fr,const histogram_def &def,
149 const std::string &feature,const uint64_t &count);
150 static void set_main_threadid(){
151 #ifndef WIN32
152 main_threadid=pthread_self();
153 #endif
154 }; // set the main
155 static void set_debug(uint32_t ndebug){debug=ndebug;}
156 typedef std::string offset_t;
157
158 /**
159 * \name Flags that control scanners
160 * @{
161 * These flags control scanners. Set them with set_flag().
162 */
163 /** Disable this recorder. */
164 static const int FLAG_DISABLED = 0x01; // feature recorder is Disabled
165 static const int FLAG_NO_CONTEXT = 0x02; // Do not write context.
166 static const int FLAG_NO_STOPLIST = 0x04; // Do not honor the stoplist/alertlist.
167 static const int FLAG_NO_ALERTLIST = 0x08; // Do not honor the stoplist/alertlist.
168 /**
169 * Normally feature recorders automatically quote non-UTF8 characters
170 * with \x00 notation and quote "\" as \x5C. Specify FLAG_NO_QUOTE to
171 * disable this behavior.
172 */
173 static const int FLAG_NO_QUOTE = 0x10; // do not escape UTF8 codes
174
175 /**
176 * Use this flag the feature recorder is sending UTF-8 XML.
177 * non-UTF8 will be quoted but "\" will not be escaped.
178 */
179 static const int FLAG_XML = 0x20; // will be sending XML
180
181 /**
182 * histogram support.
183 */
184 static const uint32_t FLAG_NO_FEATURES = 0x40; // do not record features (just memory histogram)
185 static const uint32_t FLAG_NO_FEATURES_SQL = 0x80; // do not write features to SQL
186
187 /** @} */
188 static const int max_histogram_files = 10; // don't make more than 10 files in low-memory conditions
189 static const std::string histogram_file_header;
190 static const std::string feature_file_header;
191 static const std::string bulk_extractor_version_header;
192
193 // These must only be changed in the main thread:
194 static uint32_t opt_max_context_size;
195 static uint32_t opt_max_feature_size;
196 static int64_t offset_add; // added to every reported offset, for use with hadoop
197 static std::string banner_file; // banner for top of every file
198 static std::string extract_feature(const std::string &line);
199
200 feature_recorder(class feature_recorder_set &fs,
201 const std::string &name);
202 virtual ~feature_recorder();
203 virtual void set_flag(uint32_t flags_);
204 virtual void unset_flag(uint32_t flags_);
205 void enable_memory_histograms(); // only called from feature_recorder_set
206 virtual void set_memhist_limit(int64_t limit_);
207 bool flag_set(uint32_t f) const {return flags & f;}
208 bool flag_notset(uint32_t f) const {return !(flags & f);}
209 uint32_t get_flags() const {return flags;}
210 virtual const std::string &get_outdir() const;
211
212 static size_t context_window_default; // global option
213 const std::string name; // name of this feature recorder
214
215 private:
216 std::string ignore_encoding; // encoding to ignore for carving
217 std::fstream ios; // where features are written
218
219 class besql_stmt *bs; // prepared beapi sql statement
220
221 protected:;
222 histogram_defs_t histogram_defs; // histograms that are to be created for this feature recorder
223 public:
224 class feature_recorder_set &fs; // the set in which this feature_recorder resides
225 protected:
226 int64_t count_; /* number of records written */
227 size_t context_window_before; // context window
228 size_t context_window_after; // context window
229
230 mutable cppmutex Mf; // protects the file & file_number_
231 mutable cppmutex Mr; // protects the redlist
232 mhistograms_t mhistograms; // the memory histograms, if we are using them
233 uint64_t mhistogram_limit; // how many we want (per feature recorder limit, rather than per histogram)
234
235
236 class feature_recorder *stop_list_recorder; // where stopped features get written
237 int64_t file_number_; /* starts at 0; gets incremented by carve(); */
238 carve_cache_t carve_cache;
239 public:
240 /* these are not threadsafe and should only be called in startup */
241 void set_stop_list_recorder(class feature_recorder *fr){
242 MAINTHREAD();
243 stop_list_recorder = fr;
244 }
245 void set_context_window(size_t win){
246 MAINTHREAD();
247 context_window_before = win;
248 context_window_after = win;
249 }
250 void set_context_window_before(size_t win){ MAINTHREAD(); context_window_before = win;}
251 void set_context_window_after(size_t win){ MAINTHREAD(); context_window_after = win; }
252 void set_carve_ignore_encoding(const std::string &encoding){ MAINTHREAD();ignore_encoding = encoding;}
253 /* End non-threadsafe */
254
255 uint64_t file_number_add(uint64_t i){
256 #ifdef HAVE___SYNC_ADD_AND_FETCH
257 return __sync_add_and_fetch(&file_number_,i);
258 #else
259 cppmutex::lock lock(Mf);
260 file_number_ += i;
261 return file_number_;
262 #endif
263 }
264
265 void banner_stamp(std::ostream &os,const std::string &header) const; // stamp banner, and header
266
267 /* where stopped items (on stop_list or context_stop_list) get recorded: */
268 std::string fname_counter(std::string suffix) const;
269 static std::string quote_string(const std::string &feature); // turns unprintable characters to octal escape
270 static std::string unquote_string(const std::string &feature); // turns octal escape back to binary characters
271
272 //virtual const feature_recorder_set::hash_def &hasher(); // returns hasher in feature_recorder_set
273
274 /* feature file management */
275 virtual void open();
276 virtual void close();
277 virtual void flush();
278 static int dump_callback_test(void *user,const feature_recorder &fr,
279 const std::string &str,const uint64_t &count); // test callback for you to use!
280
281
282 /* TK: The histogram_def should be provided at the beginning, so it can be used for in-memory histograms.
283 * The callback needs to have the specific atomic set as the callback as well.
284 */
285 virtual void add_histogram(const histogram_def &def); // adds a histogram to process
286 virtual void dump_histogram_file(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const;
287 virtual void dump_histogram_db(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const;
288 virtual void dump_histogram(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const;
289 typedef void (*xml_notifier_t)(const std::string &xmlstring);
290 virtual void dump_histograms(void *user,feature_recorder::dump_callback_t cb, xml_notifier_t xml_error_notifier) const;
291
292 /* Methods to get info */
293 uint64_t count() const {return count_;}
294
295 /* Methods to write.
296 * write() is the basic write - you say where, and it does it.
297 * write_buf() writes from a position within the buffer, with context.
298 * It won't write a feature that starts in the margin.
299 * pos0 gives the location and prefix for the beginning of the buffer
300 */
301
302
303 /****************************************************************
304 *** External entry points.
305 ****************************************************************/
306
307 /**
308 * write() actually does the writing to the file.
309 * It uses locks and is threadsafe.
310 * Callers therefore do not need locks.
311 */
312 virtual void write(const std::string &str);
313
314 /**
315 * support for writing features
316 */
317
318 void quote_if_necessary(std::string &feature,std::string &context);
319
320 // only virtual functions may be called by plug-ins
321 // printf() prints to the feature file.
322 virtual void printf(const char *fmt_,...) __attribute__((format(printf, 2, 3)));
323 //
324 // write a feature and its context; the feature may be in the context, but doesn't need to be.
325 // write() calls write0() after histogram, quoting, and stoplist processing
326 // write0() calls write0_sql() if sqlwriting is enabled
327 virtual void write0(const pos0_t &pos0,const std::string &feature,const std::string &context);
328 private:
329 virtual void db_write0(const pos0_t &pos0,const std::string &feature,const std::string &context);
330 static const char *db_insert_stmt;
331 public:
332
333 // write a feature and its context; the feature may be in the context, but doesn't need to be.
334 // entries processed by write below will be processed by histogram system
335 virtual void write(const pos0_t &pos0,const std::string &feature,const std::string &context);
336
337 // write a feature located at a given place within an sbuf.
338 // Context is written automatically
339 virtual void write_buf(const sbuf_t &sbuf,size_t pos,size_t len); /* writes with context */
340
341 /**
342 * support for carving.
343 * Carving writes the filename to the feature file; the context is the file's hash using the provided function.
344 * Automatically de-duplicates.
345 */
346 enum carve_mode_t {
347 CARVE_NONE=0,
348 CARVE_ENCODED=1,
349 CARVE_ALL=2};
350 #define CARVE_MODE_DESCRIPTION "0=carve none; 1=carve encoded; 2=carve all"
351 carve_mode_t carve_mode;
352 typedef std::string (*hashing_function_t)(const sbuf_t &sbuf); // returns a hex value
353 void set_carve_mode(carve_mode_t aMode){MAINTHREAD();carve_mode=aMode;}
354
355 // Carve a file; returns filename of carved file or empty string if nothing carved
356 virtual std::string carve(const sbuf_t &sbuf,size_t pos,size_t len,
357 const std::string &ext); // appended to forensic path
358 // Set the time of the carved file to iso8601 file
359 virtual void set_carve_mtime(const std::string &fname, const std::string &mtime_iso8601);
360 };
361
362 // function that can only be called from main thread
363 inline void feature_recorder::MAINTHREAD()
364 {
365 #ifndef WIN32
366 assert(main_threadid==pthread_self());
367 #endif
368 };
369
370
371 /** @} */
372
373 #endif