"Fossies" - the Fresh Open Source Software Archive 
Member "tcpflow-1.6.1/src/scan_http.cpp" (19 Feb 2021, 18954 Bytes) of package /linux/misc/tcpflow-1.6.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can
view or
download the uninterpreted source code file here.
For more information about "scan_http.cpp" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
1.5.0_vs_1.6.1.
1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /**
3 *
4 * scan_http:
5 * Decodes HTTP responses
6 */
7
8 #include "config.h"
9
10 #include "tcpflow.h"
11 #include "tcpip.h"
12 #include "tcpdemux.h"
13
14 #include "http-parser/http_parser.h"
15
16 #include "mime_map.h"
17
18 #ifdef HAVE_SYS_WAIT_H
19 #include <sys/wait.h>
20 #endif
21
22
23 #ifdef HAVE_LIBZ
24 # define ZLIB_CONST
25 # ifdef GNUC_HAS_DIAGNOSTIC_PRAGMA
26 # pragma GCC diagnostic ignored "-Wundef"
27 # pragma GCC diagnostic ignored "-Wcast-qual"
28 # endif
29 # ifdef HAVE_ZLIB_H
30 # include <zlib.h>
31 # endif
32 #else
33 # define z_stream void * // prevents z_stream from generating an error
34 #endif
35
36 #define MIN_HTTP_BUFSIZE 80 // don't bother parsing smaller than this
37
38 #include <sys/types.h>
39 #include <iostream>
40 #include <algorithm>
41 #include <map>
42 #include <iomanip>
43
44 #define HTTP_CMD "http_cmd"
45 #define HTTP_ALERT_FD "http_alert_fd"
46
47 /* options */
48 std::string http_cmd; // command to run on each http object
49 int http_subproc_max = 10; // how many subprocesses are we allowed?
50 int http_subproc = 0; // how many do we currently have?
51 int http_alert_fd = -1; // where should we send alerts?
52
53
54 /* define a callback object for sharing state between scan_http() and its callbacks
55 */
56 class scan_http_cbo {
57 private:
58 typedef enum {NOTHING,FIELD,VALUE} last_on_header_t;
59 scan_http_cbo(const scan_http_cbo& c); // not implemented
60 scan_http_cbo &operator=(const scan_http_cbo &c); // not implemented
61
62 public:
63 virtual ~scan_http_cbo(){
64 on_message_complete(); // make sure message was ended
65 }
66 scan_http_cbo(const std::string& path_,const char *base_,std::stringstream *xmlstream_) :
67 path(path_), base(base_),xmlstream(xmlstream_),xml_fo(),request_no(0),
68 headers(), last_on_header(NOTHING), header_value(), header_field(),
69 output_path(), fd(-1), first_body(true),bytes_written(0),unzip(false),zs(),zinit(false),zfail(false){};
70 private:
71
72 const std::string path; // where data gets written
73 const char *base; // where data started in memory
74 std::stringstream *xmlstream; // if present, where to put the fileobject annotations
75 std::stringstream xml_fo; // xml stream for this file object
76 int request_no; // request number
77
78 /* parsed headers */
79 std::map<std::string, std::string> headers;
80
81 /* placeholders for possibly-incomplete header data */
82 last_on_header_t last_on_header;
83 std::string header_value, header_field;
84 std::string output_path;
85 int fd; // fd for writing
86 bool first_body; // first call to on_body after headers
87 uint64_t bytes_written;
88
89 /* decompression for gzip-encoded streams. */
90 bool unzip; // should we be decompressing?
91 z_stream zs; // zstream (avoids casting and memory allocation)
92 bool zinit; // we have initialized the zstream
93 bool zfail; // zstream failed in some manner, so ignore the rest of this stream
94
95 /* The static functions are callbacks; they wrap the method calls */
96 #define CBO (reinterpret_cast<scan_http_cbo*>(parser->data))
97 public:
98 static int scan_http_cb_on_message_begin(http_parser * parser) { return CBO->on_message_begin();}
99 static int scan_http_cb_on_url(http_parser * parser, const char *at, size_t length) { return 0;}
100 static int scan_http_cb_on_header_field(http_parser * parser, const char *at, size_t length) { return CBO->on_header_field(at,length);}
101 static int scan_http_cb_on_header_value(http_parser * parser, const char *at, size_t length) { return CBO->on_header_value(at,length); }
102 static int scan_http_cb_on_headers_complete(http_parser * parser) { return CBO->on_headers_complete();}
103 static int scan_http_cb_on_body(http_parser * parser, const char *at, size_t length) { return CBO->on_body(at,length);}
104 static int scan_http_cb_on_message_complete(http_parser * parser) {return CBO->on_message_complete();}
105 #undef CBO
106 private:
107 int on_message_begin();
108 int on_url(const char *at, size_t length);
109 int on_header_field(const char *at, size_t length);
110 int on_header_value(const char *at, size_t length);
111 int on_headers_complete();
112 int on_body(const char *at, size_t length);
113 int on_message_complete();
114 };
115
116
117 /**
118 * on_message_begin:
119 * Increment request nubmer. Note that the first request is request_no = 1
120 */
121
122 int scan_http_cbo::on_message_begin()
123 {
124 request_no ++;
125 return 0;
126 }
127
128 /**
129 * on_url currently not implemented.
130 */
131
132 int scan_http_cbo::on_url(const char *at, size_t length)
133 {
134 return 0;
135 }
136
137
138 /* Note 1: The state machine is defined in http-parser/README.md
139 * Note 2: All header field names are converted to lowercase.
140 * This is consistent with the RFC.
141 */
142
143 int scan_http_cbo::on_header_field(const char *at,size_t length)
144 {
145 std::string field(at,length);
146 std::transform(field.begin(), field.end(), field.begin(), ::tolower);
147
148 switch(last_on_header){
149 case NOTHING:
150 // Allocate new buffer and copy callback data into it
151 header_field = field;
152 break;
153 case VALUE:
154 // New header started.
155 // Copy current name,value buffers to headers
156 // list and allocate new buffer for new name
157 headers[header_field] = header_value;
158 header_field = field;
159 break;
160 case FIELD:
161 // Previous name continues. Reallocate name
162 // buffer and append callback data to it
163 header_field.append(field);
164 break;
165 }
166 last_on_header = FIELD;
167 return 0;
168 }
169
170 int scan_http_cbo::on_header_value(const char *at, size_t length)
171 {
172 const std::string value(at,length);
173 switch(last_on_header){
174 case FIELD:
175 //Value for current header started. Allocate
176 //new buffer and copy callback data to it
177 header_value = value;
178 break;
179 case VALUE:
180 //Value continues. Reallocate value buffer
181 //and append callback data to it
182 header_value.append(value);
183 break;
184 case NOTHING:
185 // this shouldn't happen
186 DEBUG(10)("Internal error in http-parser");
187 break;
188 }
189 last_on_header = VALUE;
190
191 return 0;
192 }
193
194 /**
195 * called when last header is read.
196 * Determine the filename based on request_no and extension.
197 * Also see if decompressing is happening...
198 */
199
200 int scan_http_cbo::on_headers_complete()
201 {
202 tcpdemux *demux = tcpdemux::getInstance();
203
204 /* Add the most recently read header to the map, if any */
205 if (last_on_header==VALUE) {
206 headers[header_field] = header_value;
207 header_field="";
208 }
209
210 /* Set output path to <path>-HTTPBODY-nnn.ext for each part.
211 * This is not consistent with tcpflow <= 1.3.0, which supported only one HTTPBODY,
212 * but it's correct...
213 */
214
215 std::stringstream os;
216 os << path << "-HTTPBODY-" << std::setw(3) << std::setfill('0') << request_no << std::setw(0);
217
218 /* See if we can guess a file extension */
219 std::string extension = get_extension_for_mime_type(headers["content-type"]);
220 if (extension.size()) {
221 os << "." << extension;
222 }
223
224 output_path = os.str();
225
226 /* Choose an output function based on the content encoding */
227 std::string content_encoding(headers["content-encoding"]);
228
229 if ((content_encoding == "gzip" || content_encoding == "deflate") && (demux->opt.gzip_decompress)){
230 #ifdef HAVE_LIBZ
231 DEBUG(10) ( "%s: detected zlib content, decompressing", output_path.c_str());
232 unzip = true;
233 #else
234 /* We can't decompress, so just give it a .gz */
235 output_path.append(".gz");
236 DEBUG(5) ( "%s: refusing to decompress since zlib is unavailable", output_path.c_str() );
237 #endif
238 }
239
240 /* Open the output path */
241 fd = demux->retrying_open(output_path.c_str(), O_WRONLY|O_CREAT|O_BINARY|O_TRUNC, 0644);
242 if (fd < 0) {
243 DEBUG(1) ("unable to open HTTP body file %s", output_path.c_str());
244 }
245 if(http_alert_fd>=0){
246 std::stringstream ss;
247 ss << "open\t" << output_path << "\n";
248 const std::string &sso = ss.str();
249 if(write(http_alert_fd,sso.c_str(),sso.size())!=(int)sso.size()){
250 perror("write");
251 }
252 }
253
254 first_body = true; // next call to on_body will be the first one
255
256 /* We can do something smart with the headers here.
257 *
258 * For example, we could:
259 * - Record all headers into the report.xml
260 * - Pick the intended filename if we see Content-Disposition: attachment; name="..."
261 * - Record headers into filesystem extended attributes on the body file
262 */
263 return 0;
264 }
265
266 /* Write to fd, optionally decompressing as we go */
267 int scan_http_cbo::on_body(const char *at,size_t length)
268 {
269 if (fd < 0) return -1; // no open fd? (internal error)x
270 if (length==0) return 0; // nothing to write
271
272 if(first_body){ // stuff for first time on_body is called
273 xml_fo << " <byte_run file_offset='" << (at-base) << "'><fileobject><filename>" << output_path << "</filename>";
274 first_body = false;
275 }
276
277 /* If not decompressing, just write the data and return. */
278 if(unzip==false){
279 size_t offset = 0;
280 while (offset < length) {
281 int rv = write(fd, at + offset, length - offset);
282 if (rv < 0) return -1; // write error; that's bad
283 offset += rv;
284 }
285 bytes_written += offset;
286 return 0;
287 }
288
289 #ifndef HAVE_LIBZ
290 assert(0); // shoudln't have gotten here
291 #endif
292 if(zfail) return 0; // stream was corrupt; ignore rest
293 /* set up this round of decompression, using a small local buffer */
294
295 /* Call init if we are not initialized */
296 char decompressed[65536]; // where decompressed data goes
297 if (!zinit) {
298 memset(&zs,0,sizeof(zs));
299 zs.next_in = (Bytef*)at;
300 zs.avail_in = length;
301 zs.next_out = (Bytef*)decompressed;
302 zs.avail_out = sizeof(decompressed);
303
304 int rv = inflateInit2(&zs, 32 + MAX_WBITS); /* 32 auto-detects gzip or deflate */
305 if (rv != Z_OK) {
306 /* fail! */
307 DEBUG(3) ("decompression failed at stream initialization; rv=%d bad Content-Encoding?",rv);
308 zfail = true;
309 return 0;
310 }
311 zinit = true; // successfully initted
312 } else {
313 zs.next_in = (Bytef*)at;
314 zs.avail_in = length;
315 zs.next_out = (Bytef*)decompressed;
316 zs.avail_out = sizeof(decompressed);
317 }
318
319 /* iteratively decompress, writing each time */
320 while (zs.avail_in > 0) {
321 /* decompress as much as possible */
322 int rv = inflate(&zs, Z_SYNC_FLUSH);
323
324 if (rv == Z_STREAM_END) {
325 /* are we done with the stream? */
326 if (zs.avail_in > 0) {
327 /* ...no. */
328 DEBUG(3) ("decompression completed, but with trailing garbage");
329 return 0;
330 }
331 } else if (rv != Z_OK) {
332 /* some other error */
333 DEBUG(3) ("decompression failed (corrupted stream?)");
334 zfail = true; // ignore the rest of this stream
335 return 0;
336 }
337
338 /* successful decompression, at least partly */
339 /* write the result */
340 int bytes_decompressed = sizeof(decompressed) - zs.avail_out;
341 ssize_t written = write(fd, decompressed, bytes_decompressed);
342
343 if (written < bytes_decompressed) {
344 DEBUG(3) ("writing decompressed data failed");
345 zfail= true;
346 return 0;
347 }
348 bytes_written += written;
349
350 /* reset the buffer for the next iteration */
351 zs.next_out = (Bytef*)decompressed;
352 zs.avail_out = sizeof(decompressed);
353 }
354 return 0;
355 }
356
357
358 /**
359 * called at the conclusion of each HTTP body.
360 * Clean out all of the state for this HTTP header/body pair.
361 */
362
363 int scan_http_cbo::on_message_complete()
364 {
365 /* Close the file */
366 headers.clear();
367 header_field = "";
368 header_value = "";
369 last_on_header = NOTHING;
370 if(fd >= 0) {
371 if (::close(fd) != 0) {
372 perror("close() of http body");
373 }
374 fd = -1;
375 }
376
377 /* Erase zero-length files and update the DFXML */
378 if(bytes_written>0){
379 /* Update DFXML */
380 if(xmlstream){
381 xml_fo << "<filesize>" << bytes_written << "</filesize></fileobject></byte_run>\n";
382 if(xmlstream) *xmlstream << xml_fo.str();
383 }
384 if(http_alert_fd>=0){
385 std::stringstream ss;
386 ss << "close\t" << output_path << "\n";
387 const std::string &sso = ss.str();
388 if(write(http_alert_fd,sso.c_str(),sso.size()) != (int)sso.size()){
389 perror("write");
390 }
391 }
392 if(http_cmd.size()>0 && output_path.size()>0){
393 /* If we are at maximum number of subprocesses, wait for one to exit */
394 std::string cmd = http_cmd + " " + output_path;
395 #ifdef HAVE_FORK
396 int status=0;
397 pid_t pid = 0;
398 while(http_subproc >= http_subproc_max){
399 pid = wait(&status);
400 http_subproc--;
401 }
402 /* Fork off a child */
403 pid = fork();
404 if(pid<0) die("Cannot fork child");
405 if(pid==0){
406 /* We are the child */
407 exit(system(cmd.c_str()));
408 }
409 http_subproc++;
410 #else
411 system(cmd.c_str());
412 #endif
413 }
414 } else {
415 /* Nothing written; erase the file */
416 if(output_path.size() > 0){
417 ::unlink(output_path.c_str());
418 }
419 }
420
421 /* Erase the state variables for this part */
422 xml_fo.str("");
423 output_path = "";
424 bytes_written=0;
425 unzip = false;
426 if(zinit){
427 inflateEnd(&zs);
428 zinit = false;
429 }
430 zfail = false;
431 return 0;
432 }
433
434
435 /***
436 * the HTTP scanner plugin itself
437 */
438
439 extern "C"
440 void scan_http(const class scanner_params &sp,const recursion_control_block &rcb)
441 {
442 if(sp.sp_version!=scanner_params::CURRENT_SP_VERSION){
443 std::cerr << "scan_http requires sp version " << scanner_params::CURRENT_SP_VERSION << "; "
444 << "got version " << sp.sp_version << "\n";
445 exit(1);
446 }
447
448 if(sp.phase==scanner_params::PHASE_STARTUP){
449 sp.info->name = "http";
450 sp.info->flags = scanner_info::SCANNER_DISABLED; // default disabled
451 sp.info->get_config(HTTP_CMD,&http_cmd,"Command to execute on each HTTP attachment");
452 sp.info->get_config(HTTP_ALERT_FD,&http_alert_fd,"File descriptor to send information about completed HTTP attachments");
453 return; /* No feature files created */
454 }
455
456 if(sp.phase==scanner_params::PHASE_SCAN){
457 /* See if there is an HTTP response */
458 if(sp.sbuf.bufsize>=MIN_HTTP_BUFSIZE && sp.sbuf.memcmp(reinterpret_cast<const uint8_t *>("HTTP/1."),0,7)==0){
459 /* Smells enough like HTTP to try parsing */
460 /* Set up callbacks */
461 http_parser_settings scan_http_parser_settings;
462 memset(&scan_http_parser_settings,0,sizeof(scan_http_parser_settings)); // in the event that new callbacks get created
463 scan_http_parser_settings.on_message_begin = scan_http_cbo::scan_http_cb_on_message_begin;
464 scan_http_parser_settings.on_url = scan_http_cbo::scan_http_cb_on_url;
465 scan_http_parser_settings.on_header_field = scan_http_cbo::scan_http_cb_on_header_field;
466 scan_http_parser_settings.on_header_value = scan_http_cbo::scan_http_cb_on_header_value;
467 scan_http_parser_settings.on_headers_complete = scan_http_cbo::scan_http_cb_on_headers_complete;
468 scan_http_parser_settings.on_body = scan_http_cbo::scan_http_cb_on_body;
469 scan_http_parser_settings.on_message_complete = scan_http_cbo::scan_http_cb_on_message_complete;
470
471 if(sp.sxml) (*sp.sxml) << "\n <byte_runs>\n";
472 for(size_t offset=0;;){
473 /* Set up a parser instance for the next chunk of HTTP responses and data.
474 * This might be repeated several times due to connection re-use and multiple requests.
475 * Note that the parser is not a C++ library but it can pass a "data" to the
476 * callback. We put the address for the scan_http_cbo object in the data and
477 * recover it with a cast in each of the callbacks.
478 */
479
480 /* Make an sbuf for the remaining data.
481 * Note that this may not be necessary, because in our test runs the parser
482 * processed all of the data the first time through...
483 */
484 sbuf_t sub_buf(sp.sbuf, offset);
485
486 const char *base = reinterpret_cast<const char*>(sub_buf.buf);
487 http_parser parser;
488 http_parser_init(&parser, HTTP_RESPONSE);
489
490 scan_http_cbo cbo(sp.sbuf.pos0.path,base,sp.sxml);
491 parser.data = &cbo;
492
493 /* Parse */
494 size_t parsed = http_parser_execute(&parser, &scan_http_parser_settings,
495 base, sub_buf.size());
496 assert(parsed <= sub_buf.size());
497
498 /* Indicate EOF (flushing callbacks) and terminate if we parsed the entire buffer.
499 */
500 if (parsed == sub_buf.size()) {
501 http_parser_execute(&parser, &scan_http_parser_settings, NULL, 0);
502 break;
503 }
504
505 /* Stop parsing if we parsed nothing, as that indicates something header! */
506 if (parsed == 0) {
507 break;
508 }
509
510 /* Stop parsing if we're a connection upgrade (e.g. WebSockets) */
511 if (parser.upgrade) {
512 DEBUG(9) ("upgrade connection detected (WebSockets?); cowardly refusing to dump further");
513 break;
514 }
515
516 /* Bump the offset for next iteration */
517 offset += parsed;
518 }
519 if(sp.sxml) (*sp.sxml) << " </byte_runs>";
520 }
521 }
522 }