tcpflow  1.6.1
About: tcpflow is a TCP/IP packet demultiplexer that captures data transmitted as part of TCP connections (flows), and stores the data in a way that is convenient for protocol analysis and debugging.
  Fossies Dox: tcpflow-1.6.1.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

feature_recorder.cpp
Go to the documentation of this file.
1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 
3 #include "config.h"
4 #include "bulk_extractor_i.h"
5 #include "unicode_escape.h"
6 #include "histogram.h"
7 
8 #include <unistd.h>
9 #include <fcntl.h>
10 #include <sys/stat.h>
11 
12 #ifdef HAVE_STDARG_H
13 #include <stdarg.h>
14 #endif
15 
16 #ifndef MAXPATHLEN
17 #define MAXPATHLEN 65536
18 #endif
19 
20 #ifndef O_BINARY
21 #define O_BINARY 0
22 #endif
23 
24 #ifndef DEBUG_PEDANTIC
25 #define DEBUG_PEDANTIC 0x0001// check values more rigorously
26 #endif
27 
28 #ifndef WIN32
30 #endif
31 size_t feature_recorder::context_window_default=16; /* number of bytes of context */
37 
38 
39 /**
40  * Create a feature recorder object. Each recorder records a certain
41  * kind of feature. Features are stored in a file. The filename is
42  * permutated based on the total number of threads and the current
43  * thread that's recording. Each thread records to a different file,
44  * and thus a different feature recorder, to avoid locking
45  * problems.
46  *
47  * @param feature_recorder_set &fs - common information for all of the feature recorders
48  * @param name - the name of the feature being recorded.
49  */
50 
52  const std::string &name_):
53  flags(0),
54  name(name_),ignore_encoding(),ios(),bs(),
55  histogram_defs(),
56  fs(fs_),
57  count_(0),context_window_before(context_window_default),context_window_after(context_window_default),
58  Mf(),Mr(),mhistograms(),mhistogram_limit(),
59  stop_list_recorder(0),
60  file_number_(0),carve_cache(),carve_mode(CARVE_ENCODED)
61 {
62  //std::cerr << "feature_recorder(" << name << ") created\n";
63  open(); // open if we are created
64 }
65 
66 /* Don't have to delete the stop_list_recorder because it is in the
67  * feature_recorder_set and will be separately deleted.
68  */
70 {
71  if(ios.is_open()){
72  ios.close();
73  }
74 }
75 
76 void feature_recorder::banner_stamp(std::ostream &os,const std::string &header) const
77 {
78  int banner_lines = 0;
79  if(banner_file.size()>0){
80  std::ifstream i(banner_file.c_str());
81  if(i.is_open()){
82  std::string line;
83  while(getline(i,line)){
84  if(line.size()>0 && ((*line.end()=='\r') || (*line.end()=='\n'))){
85  line.erase(line.end()); /* remove the last character while it is a \n or \r */
86  }
87  os << "# " << line << "\n";
88  banner_lines++;
89  }
90  i.close();
91  }
92  }
93  if(banner_lines==0){
94  os << "# BANNER FILE NOT PROVIDED (-b option)\n";
95  }
96 
98  os << "# Feature-Recorder: " << name << "\n";
99  if(fs.get_input_fname().size()) os << "# Filename: " << fs.get_input_fname() << "\n";
100  if(debug!=0){
101  os << "# DEBUG: " << debug << " (";
102  if(debug & DEBUG_PEDANTIC) os << " DEBUG_PEDANTIC ";
103  os << ")\n";
104  }
105  os << header;
106 }
107 
108 
109 
110 /**
111  * Return the filename with a counter
112  */
113 std::string feature_recorder::fname_counter(std::string suffix) const
114 {
115  return fs.get_outdir() + "/" + this->name + (suffix.size()>0 ? (std::string("_") + suffix) : "") + ".txt";
116 }
117 
118 
119 const std::string &feature_recorder::get_outdir() const
120 {
121  return fs.get_outdir();
122 }
123 
124 /**
125  * open a feature recorder file in the specified output directory.
126  * Called by create_name(). Not clear why it isn't called when created.
127  */
128 
130 {
131  if (fs.flag_set(feature_recorder_set::SET_DISABLED)) return; // feature recorder set is disabled
132 
133  /* write to a database? Create tables if necessary and create a prepared statement */
135  char buf[1024];
137  snprintf(buf,sizeof(buf),db_insert_stmt,name.c_str());
138  bs = new besql_stmt(fs.db3,buf);
139  }
140 
141  /* Write to a file? Open the file and seek to the last line if it exist, otherwise just open database */
143  /* Open the file recorder */
144  std::string fname = fname_counter("");
145  ios.open(fname.c_str(),std::ios_base::in|std::ios_base::out|std::ios_base::ate);
146  if(ios.is_open()){ // opened existing stream
147  ios.seekg(0L,std::ios_base::end);
148  while(ios.is_open()){
149  /* Get current position */
150  if(int(ios.tellg())==0){ // at beginning of file; stamp and return
151  ios.seekp(0L,std::ios_base::beg); // be sure we are at the beginning of the file
152  return;
153  }
154  ios.seekg(-1,std::ios_base::cur); // backup to once less than the end of the file
155  if (ios.peek()=='\n'){ // we are finally on the \n
156  ios.seekg(1L,std::ios_base::cur); // move the getting one forward
157  ios.seekp(ios.tellg(),std::ios_base::beg); // put the putter at the getter location
158  count_ = 1; // greater than zero
159  return;
160  }
161  }
162  }
163  // Just open the stream for output
164  ios.open(fname.c_str(),std::ios_base::out);
165  if(!ios.is_open()){
166  std::cerr << "*** feature_recorder::open CANNOT OPEN FEATURE FILE FOR WRITING "
167  << fname << ":" << strerror(errno) << "\n";
168  exit(1);
169  }
170  }
171 }
172 
174 {
175  if(ios.is_open()){
176  ios.close();
177  }
178 }
179 
181 {
182  cppmutex::lock lock(Mf); // get the lock; released when object is deallocated.
183  ios.flush();
184 }
185 
186 
/** True when c is one of the octal digits '0' through '7'. */
static inline bool isodigit(char c)
{
    if (c < '0') {
        return false;
    }
    return c <= '7';
}
191 
192 /* statics */
193 const std::string feature_recorder::feature_file_header("# Feature-File-Version: 1.1\n");
194 const std::string feature_recorder::histogram_file_header("# Histogram-File-Version: 1.1\n");
195 const std::string feature_recorder::bulk_extractor_version_header("# " PACKAGE_NAME "-Version: " PACKAGE_VERSION " ($Rev: 10844 $)\n");
196 
/**
 * Numeric value of a hexadecimal digit (either case).
 * Any character that is not a hex digit yields 0.
 */
static inline int hexval(char ch)
{
    if (ch >= '0' && ch <= '9') {
        return ch - '0';
    }
    if (ch >= 'a' && ch <= 'f') {
        return 10 + (ch - 'a');
    }
    if (ch >= 'A' && ch <= 'F') {
        return 10 + (ch - 'A');
    }
    return 0;
}
219 
220 /**
221  * Unquote Python or octal-style quoting of a string
222  */
223 std::string feature_recorder::unquote_string(const std::string &s)
224 {
225  size_t len = s.size();
226  if(len<4) return s; // too small for a quote
227 
228  std::string out;
229  for(size_t i=0;i<len;i++){
230  /* Look for octal coding */
231  if(i+3<len && s[i]=='\\' && isodigit(s[i+1]) && isodigit(s[i+2]) && isodigit(s[i+3])){
232  uint8_t code = (s[i+1]-'0') * 64 + (s[i+2]-'0') * 8 + (s[i+3]-'0');
233  out.push_back(code);
234  i += 3; // skip over the digits
235  continue;
236  }
237  /* Look for hex coding */
238  if(i+3<len && s[i]=='\\' && s[i+1]=='x' && isxdigit(s[i+2]) && isxdigit(s[i+3])){
239  uint8_t code = (hexval(s[i+2])*16) | hexval(s[i+3]);
240  out.push_back(code);
241  i += 3; // skip over the digits
242  continue;
243  }
244  out.push_back(s[i]);
245  }
246  return out;
247 }
248 
249 /**
250  * Get the feature which is defined as being between a \t and [\t\n]
251  */
252 
253 /*static*/ std::string feature_recorder::extract_feature(const std::string &line)
254 {
255  size_t tab1 = line.find('\t');
256  if(tab1==std::string::npos) return ""; // no feature
257  size_t feature_start = tab1+1;
258  size_t tab2 = line.find('\t',feature_start);
259  if(tab2!=std::string::npos) return line.substr(feature_start,tab2-feature_start);
260  return line.substr(feature_start); // no context to remove
261 }
262 
264 {
265  MAINTHREAD();
266  flags|=flags_;
267 }
268 
270 {
271  MAINTHREAD();
272  flags &= (~flags_);
273 }
274 
276 {
277  MAINTHREAD();
278  mhistogram_limit = limit_;
279 }
280 
281 
282 // add a memory histogram; assume the position in the mhistograms is stable
284 {
285  for(histogram_defs_t::const_iterator it=histogram_defs.begin();it!=histogram_defs.end();it++){
286  mhistograms[*it] = new mhistogram_t();
287  }
288 }
289 
290 
291 /**
292  * Create a histogram for this feature recorder and an extraction pattern.
293  */
294 
295 /* dump_callback_test is a simple callback that just prints to stderr. It's for testing */
297  const std::string &str,const uint64_t &count)
298 {
299  (void)user;
300  std::cerr << "dump_cb: user=" << user << " " << str << ": " << count << "\n";
301  return 0;
302 }
303 
304 /* Make a histogram. If a callback is provided, send the output there. */
308 public:
309  mhistogram_callback(void *user_,
311  const histogram_def &def_,
312  const feature_recorder &fr_,
313  uint64_t limit_):user(user_),cb(cb_),def(def_),fr(fr_),callback_count(0),limit(limit_){}
314  void *user;
318  uint64_t callback_count;
319  uint64_t limit;
320  int do_callback(const std::string &str,const uint64_t &tally){
321  (*cb)(user,fr,def,str,tally);
322  if(limit && ++callback_count >= limit) return -1;
323  return 0;
324  }
325  static int callback(void *ptr,const std::string &str,const uint64_t &tally) {
326  return ((mhistogram_callback *)(ptr))->do_callback(str,tally);
327  }
328 };
329 
330 /****************************************************************
331  *** PHASE HISTOGRAM (formerly phase 3): Create the histograms
332  ****************************************************************/
333 
334 /**
335  * We now have three kinds of histograms:
336  * 1 - Traditional post-processing histograms specified by the histogram library
337  * 1a - feature-file based traditional ones
338  * 1b - SQL-based traditional ones.
339  * 2 - In-memory histograms (used primarily by beapi)
340  */
341 
342 
343 /** Dump a specific histogram */
345 {
346  /* This is a file based histogram. We will be reading from one file and writing to another */
347  std::string ifname = fname_counter(""); // source of features
348  std::ifstream f(ifname.c_str());
349  if(!f.is_open()){
350  std::cerr << "Cannot open histogram input file: " << ifname << "\n";
351  return;
352  }
353 
354  /* Read each line of the feature file and add it to the histogram.
355  * If we run out of memory, dump that histogram to a file and start
356  * on the next histogram.
357  */
358  for(int histogram_counter = 0;histogram_counter<max_histogram_files;histogram_counter++){
359 
360  HistogramMaker h(def.flags); /* of seen features, created in pass two */
361  try {
362  std::string line;
363  while(getline(f,line)){
364  if(line.size()==0) continue; // empty line
365  if(line[0]=='#') continue; // comment line
366  truncate_at(line,'\r'); // truncate at a \r if there is one.
367 
368  /** If there is a string required in the line and it isn't present, don't use this line */
369  if(def.require.size()){
370  if(line.find_first_of(def.require)==std::string::npos){
371  continue;
372  }
373  }
374 
375  std::string feature = extract_feature(line);
376  if(feature.find('\\')!=std::string::npos){
377  feature = unquote_string(feature); // reverse \xxx encoding
378  }
379  /** If there is a pattern to use to prune down the feature, use it */
380  if(def.pattern.size()){
381  std::string new_feature = feature;
382  if(!def.reg.search(feature,&new_feature,0,0)){
383  // no search match; avoid this feature
384  continue;
385  }
386  feature = new_feature;
387  }
388 
389  /* Remove what follows after \t if this is a context file */
390  size_t tab=feature.find('\t');
391  if(tab!=std::string::npos) feature.erase(tab); // erase from tab to end
392  h.add(feature);
393  }
394  f.close();
395  }
396  catch (const std::exception &e) {
397  std::cerr << "ERROR: " << e.what() << " generating histogram "
398  << name << "\n";
399  }
400 
401  /* Output what we have to a new file ofname */
402  std::stringstream real_suffix;
403 
404  real_suffix << def.suffix;
405  if(histogram_counter>0) real_suffix << histogram_counter;
406  std::string ofname = fname_counter(real_suffix.str()); // histogram name
407  std::ofstream o;
408  o.open(ofname.c_str()); // open the file
409  if(!o.is_open()){
410  std::cerr << "Cannot open histogram output file: " << ofname << "\n";
411  return;
412  }
413 
415  if(fr->size()>0){
417  o << *fr; // sends the entire histogram
418  }
419 
420  for(size_t i = 0;i<fr->size();i++){
421  delete fr->at(i);
422  }
423  delete fr;
424  o.close();
425 
426  if(f.is_open()==false){
427  return; // input file was closed
428  }
429  }
430  std::cerr << "Looped " << max_histogram_files
431  << " times on histogram; something seems wrong\n";
432 }
433 
434 
436 {
437  /* Inform that we are dumping this histogram */
438  if(cb) cb(user,*this,def,"",0);
439 
440  /* If this is a memory histogram, dump it and return */
441  mhistograms_t::const_iterator it = mhistograms.find(def);
442  if(it!=mhistograms.end()){
443  assert(cb!=0);
444  mhistogram_callback mcbo(user,cb,def,*this,mhistogram_limit);
445  it->second->dump_sorted(static_cast<void *>(&mcbo),mhistogram_callback::callback);
446  return;
447  }
448 
450  dump_histogram_db(def,user,cb);
451  }
452 
453 
455  dump_histogram_file(def,user,cb);
456  }
457 }
458 
459 
460 /* Dump all of this feature recorders histograms */
461 
462 
464  feature_recorder_set::xml_notifier_t xml_error_notifier) const
465 {
466  /* If we are recording features to SQL and we have a histogram defintion
467  * for this feature recorder, we need to create a base histogram first,
468  * then we can create the extracted histograms if they are presented.
469  */
470 
471 
472  /* Loop through all the histograms and dump each one.
473  * This now works for both memory histograms and non-memory histograms.
474  */
475  for(histogram_defs_t::const_iterator it = histogram_defs.begin();it!=histogram_defs.end();it++){
476  try {
477  dump_histogram((*it),user,cb);
478  }
479  catch (const std::exception &e) {
480  std::cerr << "ERROR: histogram " << name << ": " << e.what() << "\n";
481  if(xml_error_notifier){
482  std::string error = std::string("<error function='phase3' histogram='")
483  + name + std::string("</error>");
484  (*xml_error_notifier)(error);
485  }
486  }
487  }
488 }
489 
490 
492 {
493  histogram_defs.insert(def);
494 }
495 
496 
497 
498 /****************************************************************
499  *** WRITING SUPPORT
500  ****************************************************************/
501 
502 /* Write to the file.
503  * This is the only place where writing happens.
504  * So it's an easy place to do UTF-8 validation in debug mode.
505  */
506 void feature_recorder::write(const std::string &str)
507 {
508  if(debug & DEBUG_PEDANTIC){
509  if(utf8::find_invalid(str.begin(),str.end()) != str.end()){
510  std::cerr << "******************************************\n";
511  std::cerr << "feature recorder: " << name << "\n";
512  std::cerr << "invalid UTF-8 in write: " << str << "\n";
513  assert(0);
514  }
515  }
516 
517  /* This is where the writing happens. Lock the output and write */
519  return;
520  }
521 
522  cppmutex::lock lock(Mf);
523  if(ios.is_open()){
524  if(count_==0){
526  }
527 
528  ios << str << '\n';
529  if(ios.fail()){
530  std::cerr << "DISK FULL\n";
531  ios.close();
532  }
533  count_++;
534  }
535 }
536 
537 void feature_recorder::printf(const char *fmt, ...)
538 {
539  const int maxsize = 65536;
540  managed_malloc<char>p(maxsize);
541 
542  if(p.buf==0) return;
543 
544  va_list ap;
545  va_start(ap,fmt);
546  vsnprintf(p.buf,maxsize,fmt,ap);
547  va_end(ap);
548  this->write(p.buf);
549 }
550 
551 
552 /**
553  * Combine the pos0, feature and context into a single line and write it to the feature file.
554  *
555  * @param feature - The feature, which is valid UTF8 (but may not be exactly the bytes on the disk)
556  * @param context - The context, which is valid UTF8 (but may not be exactly the bytes on the disk)
557  *
558  * Interlocking is done in write().
559  */
560 
561 void feature_recorder::write0(const pos0_t &pos0,const std::string &feature,const std::string &context)
562 {
564  this->flag_notset(feature_recorder::FLAG_NO_FEATURES_SQL) ) {
565  db_write0( pos0, feature, context);
566  }
568  std::stringstream ss;
569  ss << pos0.shift( feature_recorder::offset_add).str() << '\t' << feature;
570  if (flag_notset( FLAG_NO_CONTEXT ) && ( context.size()>0 )) ss << '\t' << context;
571  this->write( ss.str() );
572  }
573 }
574 
575 
576 /**
577  * the main entry point of writing a feature and its context to the feature file.
578  * processes the stop list
579  */
580 
581 void feature_recorder::quote_if_necessary(std::string &feature,std::string &context)
582 {
583  /* By default quote string that is not UTF-8, and quote backslashes. */
584  bool escape_bad_utf8 = true;
585  bool escape_backslash = true;
586 
587  if(flags & FLAG_NO_QUOTE){ // don't quote either
588  escape_bad_utf8 = false;
589  escape_backslash = false;
590  }
591 
592  if(flags & FLAG_XML){ // only quote bad utf8
593  escape_bad_utf8 = true;
594  escape_backslash = false;
595  }
596 
597  feature = validateOrEscapeUTF8(feature, escape_bad_utf8,escape_backslash);
598  if(feature.size() > opt_max_feature_size) feature.resize(opt_max_feature_size);
600  context = validateOrEscapeUTF8(context,escape_bad_utf8,escape_backslash);
602  }
603 }
604 
605 /**
606  * write() is the main entry point for writing a feature at a given position with context.
607  * write() checks the stoplist and escapes non-UTF8 characters, then calls write0().
608  */
609 void feature_recorder::write(const pos0_t &pos0,const std::string &feature_,const std::string &context_)
610 {
611  if(flags & FLAG_DISABLED) return; // disabled
612  if(debug & DEBUG_PEDANTIC){
613  if(feature_.size() > opt_max_feature_size){
614  std::cerr << "feature_recorder::write : feature_.size()=" << feature_.size() << "\n";
615  assert(0);
616  }
617  if(context_.size() > opt_max_context_size){
618  std::cerr << "feature_recorder::write : context_.size()=" << context_.size() << "\n";
619  assert(0);
620  }
621  }
622 
623  std::string feature = feature_;
624  std::string context = flag_set(FLAG_NO_CONTEXT) ? "" : context_;
625  std::string *feature_utf8 = HistogramMaker::make_utf8(feature); // a utf8 feature
626 
627  quote_if_necessary(feature,context);
628 
629  if(feature.size()==0){
630  std::cerr << name << ": zero length feature at " << pos0 << "\n";
631  if(debug & DEBUG_PEDANTIC) assert(0);
632  return;
633  }
634  if(debug & DEBUG_PEDANTIC){
635  /* Check for tabs or newlines in feature and and context */
636  for(size_t i=0;i<feature.size();i++){
637  if(feature[i]=='\t') assert(0);
638  if(feature[i]=='\n') assert(0);
639  if(feature[i]=='\r') assert(0);
640  }
641  for(size_t i=0;i<context.size();i++){
642  if(context[i]=='\t') assert(0);
643  if(context[i]=='\n') assert(0);
644  if(context[i]=='\r') assert(0);
645  }
646  }
647 
648  /* First check to see if the feature is on the stop list.
649  * Only do this if we have a stop_list_recorder (the stop list recorder itself
650  * does not have a stop list recorder. If it did we would infinitely recurse.
651  */
653  if(fs.stop_list
654  && fs.stop_list->check_feature_context(*feature_utf8,context)){
655  stop_list_recorder->write(pos0,feature,context);
656  delete feature_utf8;
657  return;
658  }
659  }
660 
661  /* The alert list is a special features that are called out.
662  * If we have one of those, write it to the redlist.
663  */
665  && fs.alert_list
666  && fs.alert_list->check_feature_context(*feature_utf8,context)){
667  std::string alert_fn = fs.get_outdir() + "/ALERTS_found.txt";
668  cppmutex::lock lock(Mr); // notice we are locking the alert list
669  std::ofstream rf(alert_fn.c_str(),std::ios_base::app);
670  if(rf.is_open()){
671  rf << pos0.shift(feature_recorder::offset_add).str() << '\t' << feature << '\t' << "\n";
672  }
673  }
674 
675  /* Support in-memory histograms */
676  for(mhistograms_t::iterator it = mhistograms.begin(); it!=mhistograms.end();it++){
677  const histogram_def &def = it->first;
678  mhistogram_t *m = it->second;
679  std::string new_feature = *feature_utf8;
680  if(def.require.size()==0 || new_feature.find_first_of(def.require)!=std::string::npos){
681  /* If there is a pattern to use, use it */
682  if(def.pattern.size()){
683  if(!def.reg.search(new_feature,&new_feature,0,0)){
684  // no search match; avoid this feature
685  new_feature = "";
686  }
687  }
688  if(new_feature.size()) m->add(new_feature,1);
689  }
690  }
691 
692  /* Finally write out the feature and the context */
694  this->write0(pos0,feature,context);
695  }
696  delete feature_utf8;
697 }
698 
699 /**
700  * Given a buffer, an offset into that buffer of the feature, and the length
701  * of the feature, make the context and write it out. This is mostly used
702  * for writing from within the lexical analyzers.
703  */
704 
705 void feature_recorder::write_buf(const sbuf_t &sbuf,size_t pos,size_t len)
706 {
707 #ifdef DEBUG_SCANNER
708  if(debug & DEBUG_SCANNER){
709  std::cerr << "*** write_buf " << name << " sbuf=" << sbuf << " pos=" << pos << " len=" << len << "\n";
710  // for debugging, print Imagine that when pos= the location where the crash is happening.
711  // then set a breakpoint at std::cerr.
712  if(pos==9999999){
713  std::cerr << "Imagine that\n";
714  }
715  }
716 #endif
717 
718  /* If we are in the margin, ignore; it will be processed again */
719  if(pos >= sbuf.pagesize && pos < sbuf.bufsize){
720  return;
721  }
722 
723  if(pos >= sbuf.bufsize){ /* Sanity checks */
724  std::cerr << "*** write_buf: WRITE OUTSIDE BUFFER. "
725  << " pos=" << pos
726  << " sbuf=" << sbuf << "\n";
727  return;
728  }
729 
730  /* Asked to write beyond bufsize; bring it in */
731  if(pos+len > sbuf.bufsize){
732  len = sbuf.bufsize - pos;
733  }
734 
735  std::string feature = sbuf.substr(pos,len);
736  std::string context;
737 
738  if((flags & FLAG_NO_CONTEXT)==0){
739  /* Context write; create a clean context */
740  size_t p0 = context_window_before < pos ? pos-context_window_before : 0;
741  size_t p1 = pos+len+context_window_after;
742 
743  if(p1>sbuf.bufsize) p1 = sbuf.bufsize;
744  assert(p0<=p1);
745  context = sbuf.substr(p0,p1-p0);
746  }
747  this->write(sbuf.pos0+pos,feature,context);
748 #ifdef DEBUG_SCANNER
749  if(debug & DEBUG_SCANNER){
750  std::cerr << ".\n";
751  }
752 #endif
753 }
754 
755 
/**
 * Return a copy of src in which every occurrence of the character f
 * has been replaced by the character t.
 */
std::string replace(const std::string &src,char f,char t)
{
    std::string result(src);
    for (std::string::iterator it = result.begin(); it != result.end(); ++it) {
        if (*it == f) {
            *it = t;
        }
    }
    return result;
}
768 
769 /****************************************************************
770  *** CARVING SUPPORT
771  ****************************************************************
772  *
773  * Carving support.
774  * 2014-04-24 - $ is no longer valid either
775  * 2013-08-29 - replace invalid characters in filenames
776  * 2013-07-30 - automatically bin directories
777  * 2013-06-08 - filenames are the forensic path.
778  */
779 
/**
 * Make a string usable as a file name by replacing each unsuitable byte
 * with an underscore. A byte is unsuitable when it is a control character
 * or space (<= 32), is non-ASCII (>= 128), or is one of
 *   " * + , / : ; < = > ? \ [ ] | $
 * All other bytes are kept as they are.
 */
std::string valid_dosname(std::string in)
{
    const std::string forbidden("\"*+,/:;<=>?\\[]|$");

    for (size_t k = 0; k < in.size(); ++k) {
        const uint8_t byte = static_cast<uint8_t>(in[k]);
        const bool in_range = (byte > 32) && (byte < 128);
        if (!in_range || forbidden.find(static_cast<char>(byte)) != std::string::npos) {
            in[k] = '_';
        }
    }
    return in;
}
797 
798 
799 //const feature_recorder::hash_def &feature_recorder::hasher()
800 //{
801 // return fs.hasher;
802 //}
803 
804 
805 
806 #include <iomanip>
807 /**
808  * @param sbuf - the buffer to carve
809  * @param pos - offset in the buffer to carve
810  * @param len - how many bytes to carve
811  *
812  */
813 std::string feature_recorder::carve(const sbuf_t &sbuf,size_t pos,size_t len,
814  const std::string &ext)
815 {
816  if(flags & FLAG_DISABLED) return std::string(); // disabled
817 
818  /* If we are in the margin, ignore; it will be processed again */
819  if(pos >= sbuf.pagesize && pos < sbuf.bufsize){
820  return std::string();
821  }
822  assert(pos < sbuf.bufsize);
823 
824 
825 
826  /* Carve to a file depending on the carving mode. The purpose
827  * of CARVE_ENCODED is to allow us to carve JPEGs when they are
828  * embedded in, say, GZIP files, but not carve JPEGs that are
829  * bare. The difficulty arises when you have a tool that can go
830  * into, say, ZIP files. In this case, we don't want to carve
831  * every ZIP file, just the (for example) XORed ZIP files. So the
832  * ZIP carver doesn't carve every ZIP file, just the ZIP files
833  * that are in HIBER files. That is, we want to not carve a path
834  * of ZIP-234234 but we do want to carve a path of
835  * 1000-HIBER-33423-ZIP-2343. This is implemented by having an
836  * ignore_encoding. the ZIP carver sets it to ZIP so it won't
837  * carve things that are just found in a ZIP file. This means that
838  * it won't carve disembodied ZIP files found in unallocated
839  * space. You might want to do that. If so, set ZIP's carve mode
840  * to CARVE_ALL.
841  */
842  switch(carve_mode){
843  case CARVE_NONE:
844  return std::string(); // carve nothing
845  case CARVE_ENCODED:
846  if(sbuf.pos0.path.size()==0) return std::string(); // not encoded
847  if(sbuf.pos0.alphaPart()==ignore_encoding) return std::string(); // ignore if it is just encoded with this
848  break; // otherwise carve
849  case CARVE_ALL:
850  break;
851  }
852 
853  /* If the directory doesn't exist, make it.
854  * If two threads try to make the directory,
855  * that's okay, because the second one will fail.
856  */
857 
858  sbuf_t cbuf(sbuf,pos,len); // the buf we are going to carve
859  std::string carved_hash_hexvalue = (*fs.hasher.func)(cbuf.buf,cbuf.bufsize);
860 
861  /* See if this is in the cache */
862  bool in_cache = carve_cache.check_for_presence_and_insert(carved_hash_hexvalue);
863 
864 
865  uint64_t this_file_number = file_number_add(in_cache ? 0 : 1); // increment if we are not in the cache
866  std::string dirname1 = fs.get_outdir() + "/" + name;
867 
868  std::stringstream ss;
869  ss << dirname1 << "/" << std::setw(3) << std::setfill('0') << (this_file_number / 1000);
870 
871  std::string dirname2 = ss.str();
872  std::string fname = dirname2 + std::string("/") + valid_dosname(cbuf.pos0.str() + ext);
873  std::string fname_feature = fname.substr(fs.get_outdir().size()+1);
874 
875  /* Record what was found in the feature file.
876  */
877  if (in_cache){
878  fname=""; // no filename
879  fname_feature="<CACHED>";
880  }
881 
882  // write to the feature file
883  ss.str(std::string()); // clear the stringstream
884  ss << "<fileobject>";
885  if (!in_cache) ss << "<filename>" << fname << "</filename>";
886  ss << "<filesize>" << len << "</filesize>";
887  ss << "<hashdigest type='" << fs.hasher.name << "'>" << carved_hash_hexvalue << "</hashdigest></fileobject>";
888  this->write(cbuf.pos0,fname_feature,ss.str());
889 
890  if (in_cache) return fname; // do not make directories or write out if we are cached
891 
892  /* Make the directory if it doesn't exist. */
893  if (access(dirname2.c_str(),R_OK)!=0){
894 #ifdef WIN32
895  mkdir(dirname1.c_str());
896  mkdir(dirname2.c_str());
897 #else
898  mkdir(dirname1.c_str(),0777);
899  mkdir(dirname2.c_str(),0777);
900 #endif
901  }
902  /* Check to make sure that directory is there. We don't just the return code
903  * because there could have been two attempts to make the directory simultaneously,
904  * so the mkdir could fail but the directory could nevertheless exist. We need to
905  * remember the error number because the access() call may clear it.
906  */
907  int oerrno = errno; // remember error number
908  if (access(dirname2.c_str(),R_OK)!=0){
909  std::cerr << "Could not make directory " << dirname2 << ": " << strerror(oerrno) << "\n";
910  return std::string();
911  }
912 
913  /* Write the file into the directory */
914  int fd = ::open(fname.c_str(),O_CREAT|O_BINARY|O_RDWR,0666);
915  if(fd<0){
916  std::cerr << "*** carve: Cannot create " << fname << ": " << strerror(errno) << "\n";
917  return std::string();
918  }
919 
920  ssize_t ret = cbuf.write(fd,0,len);
921  if(ret<0){
922  std::cerr << "*** carve: Cannot write(pos=" << fd << "," << pos << " len=" << len << "): "<< strerror(errno) << "\n";
923  }
924  ::close(fd);
925  return fname;
926 }
927 
928 /**
929  * Currently, we need strptime() and utimes() to set the time.
930  */
931 void feature_recorder::set_carve_mtime(const std::string &fname, const std::string &mtime_iso8601)
932 {
933  if(flags & FLAG_DISABLED) return; // disabled
934 #if defined(HAVE_STRPTIME) && defined(HAVE_UTIMES)
935  if(fname.size()){
936  struct tm tm;
937  if(strptime(mtime_iso8601.c_str(),"%Y-%m-%dT%H:%M:%S",&tm)){
938  time_t t = mktime(&tm);
939  if(t>0){
940  const struct timeval times[2] = {{t,0},{t,0}};
941  utimes(fname.c_str(),times);
942  }
943  }
944  }
945 #endif
946 }
947 
#define DEBUG_SCANNER
#define DEBUG_PEDANTIC
CTYPE add(const TYPE &val, const CTYPE &count)
bool check_for_presence_and_insert(const TYPE &s)
int search(const std::string &line, std::string *found, size_t *offset, size_t *len) const
Definition: beregex.cpp:85
virtual void write_buf(const sbuf_t &sbuf, size_t pos, size_t len)
static std::string banner_file
static uint32_t opt_max_context_size
virtual void db_write0(const pos0_t &pos0, const std::string &feature, const std::string &context)
static const uint32_t FLAG_NO_FEATURES_SQL
uint64_t file_number_add(uint64_t i)
virtual void dump_histogram(const histogram_def &def, void *user, feature_recorder::dump_callback_t cb) const
virtual ~feature_recorder()
static const uint32_t FLAG_NO_FEATURES
static int64_t offset_add
class besql_stmt * bs
virtual void set_flag(uint32_t flags_)
void banner_stamp(std::ostream &os, const std::string &header) const
virtual void unset_flag(uint32_t flags_)
virtual void write(const std::string &str)
static const int FLAG_NO_QUOTE
virtual const std::string & get_outdir() const
static const int FLAG_NO_ALERTLIST
static const char * db_insert_stmt
static const int max_histogram_files
static size_t context_window_default
static std::string extract_feature(const std::string &line)
virtual void write0(const pos0_t &pos0, const std::string &feature, const std::string &context)
const std::string name
static const int FLAG_NO_CONTEXT
uint64_t count() const
feature_recorder(const feature_recorder &)
static const int FLAG_NO_STOPLIST
static const int FLAG_XML
virtual void close()
static pthread_t main_threadid
virtual void set_memhist_limit(int64_t limit_)
class feature_recorder * stop_list_recorder
virtual void add_histogram(const histogram_def &def)
int() dump_callback_t(void *user, const feature_recorder &fr, const histogram_def &def, const std::string &feature, const uint64_t &count)
static int dump_callback_test(void *user, const feature_recorder &fr, const std::string &str, const uint64_t &count)
virtual void dump_histograms(void *user, feature_recorder::dump_callback_t cb, xml_notifier_t xml_error_notifier) const
void quote_if_necessary(std::string &feature, std::string &context)
std::string ignore_encoding
bool flag_notset(uint32_t f) const
virtual std::string carve(const sbuf_t &sbuf, size_t pos, size_t len, const std::string &ext)
static uint32_t opt_max_feature_size
virtual void flush()
virtual void dump_histogram_file(const histogram_def &def, void *user, feature_recorder::dump_callback_t cb) const
static uint32_t debug
static const int FLAG_DISABLED
static const std::string bulk_extractor_version_header
static std::string unquote_string(const std::string &feature)
mhistograms_t mhistograms
std::string fname_counter(std::string suffix) const
carve_cache_t carve_cache
carve_mode_t carve_mode
class feature_recorder_set & fs
virtual void open()
static const std::string feature_file_header
virtual void printf(const char *fmt_,...)
histogram_defs_t histogram_defs
static const std::string histogram_file_header
virtual void dump_histogram_db(const histogram_def &def, void *user, feature_recorder::dump_callback_t cb) const
bool flag_set(uint32_t f) const
virtual void set_carve_mtime(const std::string &fname, const std::string &mtime_iso8601)
static int callback(void *ptr, const std::string &str, const uint64_t &tally)
feature_recorder::dump_callback_t * cb
mhistogram_callback & operator=(const mhistogram_callback &)
mhistogram_callback(void *user_, feature_recorder::dump_callback_t *cb_, const histogram_def &def_, const feature_recorder &fr_, uint64_t limit_)
mhistogram_callback(const mhistogram_callback &)
int do_callback(const std::string &str, const uint64_t &tally)
const feature_recorder & fr
const histogram_def & def
Definition: sbuf.h:70
Definition: sbuf.h:221
static bool isodigit(char c)
#define O_BINARY
std::string replace(const std::string &src, char f, char t)
std::string valid_dosname(std::string in)
static int hexval(char ch)
size_t bufsize
Definition: sbuf.h:248
std::string substr(size_t loc, size_t len) const
Definition: sbuf.cpp:172
TYPE * buf
Definition: sbuf.h:191
pos0_t shift(int64_t s) const
Definition: sbuf.h:125
std::string alphaPart() const
Definition: sbuf.h:100
atomic_histogram< std::string, uint64_t > mhistogram_t
std::string str() const
Definition: sbuf.h:79
ssize_t write(int fd, size_t loc, size_t len) const
Definition: sbuf.cpp:156
size_t pagesize
Definition: sbuf.h:249
static void MAINTHREAD()
pos0_t pos0
Definition: sbuf.h:235
const uint8_t * buf
Definition: sbuf.h:246
const std::string path
Definition: sbuf.h:72
std::string validateOrEscapeUTF8(const std::string &input, bool escape_bad_utf8, bool escape_backslash)
std::string(* func)(const uint8_t *buf, const size_t bufsize)
bool flag_set(uint32_t f) const
FrequencyReportVector * makeReport() const
Definition: histogram.cpp:28
static const uint32_t DISABLE_FILE_RECORDERS
static const uint32_t SET_DISABLED
const word_and_context_list * alert_list
static std::string * make_utf8(const std::string &key)
Definition: histogram.cpp:128
static const uint32_t ENABLE_SQLITE3_RECORDERS
void db_create_table(const std::string &name)
void add(const std::string &key)
Definition: histogram.cpp:142
void(* xml_notifier_t)(const std::string &xmlstring)
bool flag_notset(uint32_t f) const
virtual const std::string & get_outdir() const
std::string get_input_fname() const
bool check_feature_context(const std::string &probe, const std::string &context) const
const word_and_context_list * stop_list
std::vector< ReportElement * > FrequencyReportVector
Definition: histogram.h:124
void truncate_at(std::string &line, char ch)
int isxdigit(int c)
const char * name
Definition: http_parser.c:465
flags
Definition: http_parser.h:216
unsigned int uint32_t
Definition: core.h:40
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
Definition: core.h:317
const std::string pattern
const uint32_t flags
const std::string require
const std::string suffix
const beregex reg
int c
Definition: tcpdemux.cpp:366
#define PACKAGE_NAME
Definition: tcpflow.h:28
#define PACKAGE_VERSION
Definition: tcpflow.h:24
unsigned char uint8_t
Definition: util.h:6