libextractor  1.11
About: GNU libextractor is a library used to extract meta-data from files of arbitrary type.
  Fossies Dox: libextractor-1.11.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

html_extractor.c
Go to the documentation of this file.
1 /*
2  This file is part of libextractor.
3  Copyright (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff
4 
5  libextractor is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published
7  by the Free Software Foundation; either version 2, or (at your
8  option) any later version.
9 
10  libextractor is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with libextractor; see the file COPYING. If not, write to the
17  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  Boston, MA 02110-1301, USA.
19 
20  */
21 /**
22  * @file plugins/html_extractor.c
23  * @brief plugin to support HTML files
24  * @author Christian Grothoff
25  */
26 #include "platform.h"
27 #include "extractor.h"
28 #include <magic.h>
29 #include <tidy/tidy.h>
30 #include <tidy/tidybuffio.h>
31 
32 /**
33  * Mapping of HTML META names to LE types.
 *
 * The table is consulted by tag_to_type(), which compares names
 * case-insensitively and stops scanning at the first entry whose
 * name is NULL.
 *
 * NOTE(review): the numbering embedded in this listing jumps over
 * 45, 55, 65 and 69, so the 'type' member read by tag_to_type(),
 * two entries and (presumably) the NULL-named terminator are not
 * visible here -- confirm against the original file.
34  */
35 static struct
36 {
37  /**
38  * HTML META name.
39  */
40  const char *name;
41 
42  /**
43  * Corresponding LE type.
44  */
46 } tagmap[] = {
47  { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
48  { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
49  { "title", EXTRACTOR_METATYPE_TITLE },
50  { "dc.title", EXTRACTOR_METATYPE_TITLE},
51  { "description", EXTRACTOR_METATYPE_DESCRIPTION },
52  { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
53  { "subject", EXTRACTOR_METATYPE_SUBJECT},
54  { "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
56  { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
57  { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
58  { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
59  { "rights", EXTRACTOR_METATYPE_RIGHTS },
60  { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
61  { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
62  { "language", EXTRACTOR_METATYPE_LANGUAGE },
63  { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
64  { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
66  { "dc.creator", EXTRACTOR_METATYPE_CREATOR},
67  { "dc.identifier", EXTRACTOR_METATYPE_URI },
68  { "dc.format", EXTRACTOR_METATYPE_FORMAT },
70 };
71 
72 
73 /**
74  * Global handle to MAGIC data.
75  */
76 static magic_t magic;
77 
78 
79 /**
80  * Map 'meta' tag to LE type.
81  *
82  * @param tag tag to map
83  * @return EXTRACTOR_METATYPE_RESERVED if the type was not found
84  */
85 static enum EXTRACTOR_MetaType
86 tag_to_type (const char *tag)
87 {
88  unsigned int i;
89 
90  for (i = 0; NULL != tagmap[i].name; i++)
91  if (0 == strcasecmp (tag,
92  tagmap[i].name))
93  return tagmap[i].type;
95 }
96 
97 
98 /**
99  * Function called by libtidy for error reporting.
100  *
101  * @param doc tidy doc being processed
102  * @param lvl report level
103  * @param line input line
104  * @param col input column
105  * @param mssg message
106  * @return FALSE (no output)
107  */
108 static Bool TIDY_CALL
109 report_cb (TidyDoc doc,
110  TidyReportLevel lvl,
111  uint line,
112  uint col,
113  ctmbstr mssg)
114 {
115  return 0;
116 }
117 
118 
119 /**
120  * Input callback: get next byte of input.
121  *
122  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
123  * @return next byte of input, EndOfStream on errors and EOF
124  */
125 static int TIDY_CALL
126 get_byte_cb (void *sourceData)
127 {
128  struct EXTRACTOR_ExtractContext *ec = sourceData;
129  void *data;
130 
131  if (1 !=
132  ec->read (ec->cls,
133  &data, 1))
134  return EndOfStream;
135  return *(unsigned char*) data;
136 }
137 
138 
139 /**
140  * Input callback: unget last byte of input.
141  *
142  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
143  * @param bt byte to unget (ignored)
144  */
145 static void TIDY_CALL
146 unget_byte_cb (void *sourceData, byte bt)
147 {
148  struct EXTRACTOR_ExtractContext *ec = sourceData;
149 
150  (void) ec->seek (ec->cls, -1, SEEK_CUR);
151 }
152 
153 
154 /**
155  * Input callback: check for EOF.
156  *
157  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
158  * @return true if we are at the EOF
159  */
160 static Bool TIDY_CALL
161 eof_cb (void *sourceData)
162 {
163  struct EXTRACTOR_ExtractContext *ec = sourceData;
164 
165  return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls);
166 }
167 
168 
169 /**
170  * Main entry method for the 'text/html' extraction plugin.
171  *
172  * @param ec extraction context provided to the plugin
173  */
174 void
176 {
177  TidyDoc doc;
178  TidyNode head;
179  TidyNode child;
180  TidyNode title;
181  TidyInputSource src;
182  const char *name;
183  TidyBuffer tbuf;
184  TidyAttr attr;
186  ssize_t iret;
187  void *data;
188  const char *mime;
189 
190  if (-1 == (iret = ec->read (ec->cls,
191  &data,
192  16 * 1024)))
193  return;
194  if (NULL == (mime = magic_buffer (magic, data, iret)))
195  return;
196  if (0 != strncmp (mime,
197  "text/html",
198  strlen ("text/html")))
199  return; /* not HTML */
200 
201  if (0 != ec->seek (ec->cls, 0, SEEK_SET))
202  return; /* seek failed !? */
203 
204  tidyInitSource (&src, ec,
205  &get_byte_cb,
206  &unget_byte_cb,
207  &eof_cb);
208  if (NULL == (doc = tidyCreate ()))
209  return;
210  tidySetReportFilter (doc, &report_cb);
211  tidySetAppData (doc, ec);
212  if (0 > tidyParseSource (doc, &src))
213  {
214  tidyRelease (doc);
215  return;
216  }
217  if (1 != tidyStatus (doc))
218  {
219  tidyRelease (doc);
220  return;
221  }
222  if (NULL == (head = tidyGetHead (doc)))
223  {
224  fprintf (stderr, "no head\n");
225  tidyRelease (doc);
226  return;
227  }
228  for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
229  {
230  switch (tidyNodeGetType (child))
231  {
232  case TidyNode_Root:
233  break;
234  case TidyNode_DocType:
235  break;
236  case TidyNode_Comment:
237  break;
238  case TidyNode_ProcIns:
239  break;
240  case TidyNode_Text:
241  break;
242  case TidyNode_CDATA:
243  break;
244  case TidyNode_Section:
245  break;
246  case TidyNode_Asp:
247  break;
248  case TidyNode_Jste:
249  break;
250  case TidyNode_Php:
251  break;
252  case TidyNode_XmlDecl:
253  break;
254  case TidyNode_Start:
255  case TidyNode_StartEnd:
256  name = tidyNodeGetName (child);
257  if ( (0 == strcasecmp (name, "title")) &&
258  (NULL != (title = tidyGetChild (child))) )
259  {
260  tidyBufInit (&tbuf);
261  tidyNodeGetValue (doc, title, &tbuf);
262  /* add 0-termination */
263  tidyBufPutByte (&tbuf, 0);
264  if (0 !=
265  ec->proc (ec->cls,
266  "html",
269  "text/plain",
270  (const char *) tbuf.bp,
271  tbuf.size))
272  {
273  tidyBufFree (&tbuf);
274  goto CLEANUP;
275  }
276  tidyBufFree (&tbuf);
277  break;
278  }
279  if (0 == strcasecmp (name, "meta"))
280  {
281  if (NULL == (attr = tidyAttrGetById (child,
282  TidyAttr_NAME)))
283  break;
285  (type = tag_to_type (tidyAttrValue (attr))))
286  break;
287  if (NULL == (attr = tidyAttrGetById (child,
288  TidyAttr_CONTENT)))
289  break;
290  name = tidyAttrValue (attr);
291  if (0 !=
292  ec->proc (ec->cls,
293  "html",
294  type,
296  "text/plain",
297  name,
298  strlen (name) + 1))
299  goto CLEANUP;
300  break;
301  }
302  break;
303  case TidyNode_End:
304  break;
305  default:
306  break;
307  }
308  }
309 CLEANUP:
310  tidyRelease (doc);
311 }
312 
313 
314 #if OLD
315 
316 
317 /* ******************** parser helper functions ************** */
318 
/**
 * Case-insensitive comparison of a 0-terminated tag name with the
 * character range [s, e).
 *
 * @param tag 0-terminated name to look for
 * @param s first character of the range
 * @param e one past the last character of the range
 * @return 1 if the range has exactly the length of 'tag' and equals
 *         it ignoring case, 0 otherwise
 */
static int
tagMatch (const char *tag, const char *s, const char *e)
{
  if ((e - s) != strlen (tag))
    return 0;
  return 0 == strncasecmp (tag, s, e - s);
}
324 
325 
/**
 * Advance '*pos' to the next occurrence of 'c' in 'data'.
 *
 * @param c character to search for
 * @param pos in: offset to start at; out: offset of the match, or
 *        'size' if the end was reached; left untouched when a 0 byte
 *        other than 'c' itself stops the search
 * @param data buffer to search
 * @param size number of bytes in 'data'
 * @return 1 if 'c' was found before 'size', 0 otherwise
 */
static int
lookFor (char c, size_t *pos, const char *data, size_t size)
{
  size_t cur;

  for (cur = *pos; cur < size; cur++)
  {
    if (c == data[cur])
      break;
    if ('\0' == data[cur])
      return 0;
  }
  *pos = cur;
  return cur < size;
}
340 
341 
/**
 * Advance '*pos' past a run of whitespace (as judged by isspace()).
 *
 * @param pos in: offset to start at; out: offset of the first byte
 *        that is not whitespace, or 'size' if the end was reached
 * @param data buffer to scan
 * @param size number of bytes in 'data'
 * @return 1 if the resulting offset is still below 'size', 0 otherwise
 */
static int
skipWhitespace (size_t *pos, const char *data, size_t size)
{
  size_t cur;
  unsigned char ch;

  for (cur = *pos; cur < size; cur++)
  {
    ch = (unsigned char) data[cur];
    if (! isspace (ch))
      break;
    /* kept from the original: bail out without updating '*pos' if a
       0 byte were ever classified as whitespace */
    if ('\0' == ch)
      return 0;
  }
  *pos = cur;
  return cur < size;
}
356 
357 
/**
 * Advance '*pos' past a run of letters (as judged by isalpha()).
 *
 * @param pos in: offset to start at; out: offset of the first byte
 *        that is not a letter, or 'size' if the end was reached
 * @param data buffer to scan
 * @param size number of bytes in 'data'
 * @return 1 if the resulting offset is still below 'size', 0 otherwise
 */
static int
skipLetters (size_t *pos, const char *data, size_t size)
{
  size_t cur;
  unsigned char ch;

  for (cur = *pos; cur < size; cur++)
  {
    ch = (unsigned char) data[cur];
    if (! isalpha (ch))
      break;
    /* kept from the original: bail out without updating '*pos' if a
       0 byte were ever classified as a letter */
    if ('\0' == ch)
      return 0;
  }
  *pos = cur;
  return cur < size;
}
372 
373 
/**
 * Advance '*pos' to the next byte of 'data' that is one of the
 * characters listed in 'c'.
 *
 * @param c 0-terminated set of characters to search for
 * @param pos in: offset to start at; out: offset of the match, or
 *        'size' if the end was reached; left untouched when a 0 byte
 *        in 'data' stops the search
 * @param data buffer to search
 * @param size number of bytes in 'data'
 * @return 1 if one of the characters was found before 'size',
 *         0 if the end or a 0 byte was hit first
 */
static int
lookForMultiple (const char *c, size_t *pos, const char *data, size_t size)
{
  size_t p = *pos;

  while (p < size)
  {
    /* the 0 byte has to be tested before strchr(): strchr() also
       matches the terminator of 'c', which would report an embedded
       0 byte as a hit */
    if ('\0' == data[p])
      return 0;
    if (NULL != strchr (c, data[p]))
      break;
    p++;
  }
  *pos = p;
  return p < size;
}
388 
389 
/**
 * Locate the value of attribute 'key' inside the range [start, end).
 *
 * The search begins at start + 1 and looks for 'key' (ignoring case)
 * directly followed by '='.  A value that opens with a single or
 * double quote extends to the matching quote (or to 'end'); any other
 * value extends to the next whitespace (or to 'end').
 *
 * @param key 0-terminated attribute name
 * @param start beginning of the range to search
 * @param end one past the last character of the range
 * @param mstart set to the first character of the value (after an
 *        opening quote), NULL if 'key' was not found
 * @param mend set to one past the last character of the value,
 *        NULL if 'key' was not found
 */
static void
findEntry (const char *key,
           const char *start,
           const char *end, const char **mstart, const char **mend)
{
  size_t len;
  char quote;

  *mstart = NULL;
  *mend = NULL;
  len = strlen (key);
  /* compare the remaining length instead of computing
     'end - len - 1', which may point before the buffer when the
     range is shorter than the key */
  while ((start < end) && ((size_t) (end - start) > len + 1))
  {
    start++;
    if (start[len] != '=')
      continue;
    if (0 != strncasecmp (start, key, len))
      continue;
    start += len + 1;
    *mstart = start;
    /* 'start' may equal 'end' here: do not read past the range */
    if ((start < end) && (('\"' == *start) || ('\'' == *start)))
    {
      quote = *start;
      start++;
      while ((start < end) && (*start != quote))
        start++;
      (*mstart)++; /* skip quote */
    }
    else
    {
      while ((start < end) && (! isspace ((unsigned char) *start)))
        start++;
    }
    *mend = start;
    return;
  }
}
426 
427 
428 /**
429  * Search all tags that correspond to "tagname". Example:
430  * If the tag is <meta name="foo" desc="bar">, and
431  * tagname == "meta", keyname="name", keyvalue="foo",
432  * and searchname="desc", then this function returns a
433  * copy (!) of "bar". Easy enough?
434  *
435  * @return NULL if nothing is found
436  */
437 static char *
438 findInTags (struct TagInfo *t,
439  const char *tagname,
440  const char *keyname, const char *keyvalue, const char *searchname)
441 {
442  const char *pstart;
443  const char *pend;
444 
445  while (t != NULL)
446  {
447  if (tagMatch (tagname, t->tagStart, t->tagEnd))
448  {
449  findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
450  if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
451  {
452  findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
453  if (pstart != NULL)
454  {
455  char *ret = malloc (pend - pstart + 1);
456  if (ret == NULL)
457  return NULL;
458  memcpy (ret, pstart, pend - pstart);
459  ret[pend - pstart] = '\0';
460  return ret;
461  }
462  }
463  }
464  t = t->next;
465  }
466  return NULL;
467 }
468 
469 
470 /* mimetype = text/html */
/*
 * Legacy buffer-based extractor; it sits inside '#if OLD' and is
 * therefore only compiled when OLD evaluates to non-zero.
 *
 * Phase 1 tokenizes the start of 'data' and keeps every tag whose name
 * is listed in 'relevantTags' in a linked list of 'struct TagInfo'.
 * Phase 2 looks for a META http-equiv="content-type" entry to report
 * the MIME type and to pick up a character set.  Phase 3 reports the
 * META entries named in 'tagmap', phase 4 reports TITLE text and
 * frees the list.  Once a proc call has returned non-zero no further
 * values are reported.
 *
 * Returns 0 early (empty input, allocation failure, no relevant
 * tags); otherwise the value of 'ret', which holds the result of the
 * most recent proc call (0 if none was made).
 *
 * NOTE(review): the numbering embedded in this listing skips several
 * lines of this function (e.g. 474, 572-573, 594, 601, 609), so one
 * parameter and some call arguments are not visible here.
 * NOTE(review): 'relevantTags' and 'struct TagInfo' are not defined in
 * the visible part of the file.
 */
471 int
472 EXTRACTOR_html_extract (const char *data,
473  size_t size,
475  void *proc_cls,
476  const char *options)
477 {
478  size_t xsize;
479  struct TagInfo *tags;
480  struct TagInfo *t;
481  struct TagInfo tag;
482  size_t pos;
483  size_t tpos;
484  int i;
485  char *charset;
486  char *tmp;
487  char *xtmp;
488  int ret;
489 
490  ret = 0;
491  if (size == 0)
492  return 0;
493  /* only scan first 32k */
494  if (size > 1024 * 32)
495  xsize = 1024 * 32;
496  else
497  xsize = size;
498  tags = NULL;
499  tag.next = NULL;
500  pos = 0;
/* Phase 1: each round finds '<', the tag name, the closing '>' (quoted
   attribute values are skipped) and the text up to the next '<'.
   NOTE(review): the loop is bounded by 'xsize', but the helpers below
   receive the full 'size', so a tag may extend past the first 32k. */
501  while (pos < xsize)
502  {
503  if (! lookFor ('<', &pos, data, size))
504  break;
505  tag.tagStart = &data[++pos];
506  if (! skipLetters (&pos, data, size))
507  break;
508  tag.tagEnd = &data[pos];
509  if (! skipWhitespace (&pos, data, size))
510  break;
511 STEP3:
512  if (! lookForMultiple (">\"\'", &pos, data, size))
513  break;
514  if (data[pos] != '>')
515  {
516  /* find end-quote, ignore escaped quotes (\') */
517  do
518  {
519  tpos = pos;
520  pos++;
521  if (! lookFor (data[tpos], &pos, data, size))
522  break;
523  }
524  while (data[pos - 1] == '\\');
525  pos++;
526  goto STEP3;
527  }
528  pos++;
529  if (! skipWhitespace (&pos, data, size))
530  break;
531  tag.dataStart = &data[pos];
532  if (! lookFor ('<', &pos, data, size))
533  break;
534  tag.dataEnd = &data[pos];
535  i = 0;
/* keep a heap copy of the tag if its name is one of 'relevantTags';
   new entries are pushed to the front of the 'tags' list */
536  while (relevantTags[i] != NULL)
537  {
538  if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
539  (0 == strncasecmp (relevantTags[i],
540  tag.tagStart, tag.tagEnd - tag.tagStart)))
541  {
542  t = malloc (sizeof (struct TagInfo));
/* NOTE(review): this early return does not free the entries that are
   already linked into 'tags' */
543  if (t == NULL)
544  return 0;
545  *t = tag;
546  t->next = tags;
547  tags = t;
548  break;
549  }
550  i++;
551  }
552  /* abort early if we hit the body tag */
553  if (tagMatch ("body", tag.tagStart, tag.tagEnd))
554  break;
555  }
556 
557  /* fast exit */
558  if (tags == NULL)
559  return 0;
560 
561  charset = NULL;
562  /* first, try to determine mime type and/or character set */
563  tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
564  if (tmp != NULL)
565  {
566  /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
567  if text/html is present, we take that as the mime-type; if charset=
568  is present, we try to use that for character set conversion. */
569  if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
570  ret = proc (proc_cls,
571  "html",
574  "text/plain",
575  "text/html",
576  strlen ("text/html") + 1);
/* 'charset' becomes a heap copy of everything after "charset=";
   it is released at the end of the function */
577  charset = strcasestr (tmp, "charset=");
578  if (charset != NULL)
579  charset = strdup (&charset[strlen ("charset=")]);
580  free (tmp);
581  }
/* Phase 3: one lookup per 'tagmap' entry; the value is reported as is
   when no charset is known, otherwise only if 'xtmp' is non-NULL */
582  i = 0;
583  while (tagmap[i].name != NULL)
584  {
585  tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
586  if ( (tmp != NULL) &&
587  (ret == 0) )
588  {
589  if (charset == NULL)
590  {
591  ret = proc (proc_cls,
592  "html",
593  tagmap[i].type,
595  "text/plain",
596  tmp,
597  strlen (tmp) + 1);
598  }
599  else
600  {
602  strlen (tmp),
603  charset);
604  if (xtmp != NULL)
605  {
606  ret = proc (proc_cls,
607  "html",
608  tagmap[i].type,
610  "text/plain",
611  xtmp,
612  strlen (xtmp) + 1);
613  free (xtmp);
614  }
615  }
616  }
617  if (tmp != NULL)
618  free (tmp);
619  i++;
620  }
/* Phase 4: report the text of TITLE tags (while 'ret' is still 0) and
   free every list entry on the way */
621  while (tags != NULL)
622  {
623  t = tags;
624  if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
625  (ret == 0) )
626  {
627  if (charset == NULL)
628  {
629  xtmp = malloc (t->dataEnd - t->dataStart + 1);
630  if (xtmp != NULL)
631  {
632  memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
633  xtmp[t->dataEnd - t->dataStart] = '\0';
634  ret = proc (proc_cls,
635  "html",
638  "text/plain",
639  xtmp,
640  strlen (xtmp) + 1);
641  free (xtmp);
642  }
643  }
644  else
645  {
646  xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
647  t->dataEnd - t->dataStart,
648  charset);
649  if (xtmp != NULL)
650  {
651  ret = proc (proc_cls,
652  "html",
655  "text/plain",
656  xtmp,
657  strlen (xtmp) + 1);
658  free (xtmp);
659  }
660  }
661  }
662  tags = t->next;
663  free (t);
664  }
665  if (charset != NULL)
666  free (charset);
667  return ret;
668 }
669 
670 
671 #endif
672 
673 
674 /**
675  * Initialize glib and load magic file.
676  */
677 void __attribute__ ((constructor))
679 {
680  magic = magic_open (MAGIC_MIME_TYPE);
681  if (0 != magic_load (magic, NULL))
682  {
683  /* FIXME: how to deal with errors? */
684  }
685 }
686 
687 
688 /**
689  * Destructor for the library, cleans up.
690  */
691 void __attribute__ ((destructor))
693 {
694  if (NULL != magic)
695  {
696  magic_close (magic);
697  magic = NULL;
698  }
699 }
700 
701 
702 /* end of html_extractor.c */
char * EXTRACTOR_common_convert_to_utf8(const char *input, size_t len, const char *charset)
Definition: convert.c:39
int(* EXTRACTOR_MetaDataProcessor)(void *cls, const char *plugin_name, enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat format, const char *data_mime_type, const char *data, size_t data_len)
Definition: extractor.h:460
@ EXTRACTOR_METAFORMAT_C_STRING
Definition: extractor.h:113
@ EXTRACTOR_METAFORMAT_UTF8
Definition: extractor.h:102
#define NULL
Definition: getopt1.c:60
EXTRACTOR_MetaType
Definition: extractor.h:126
@ EXTRACTOR_METATYPE_UNKNOWN_DATE
Definition: extractor.h:195
@ EXTRACTOR_METATYPE_URI
Definition: extractor.h:162
@ EXTRACTOR_METATYPE_FORMAT
Definition: extractor.h:190
@ EXTRACTOR_METATYPE_AUTHOR_NAME
Definition: extractor.h:143
@ EXTRACTOR_METATYPE_LANGUAGE
Definition: extractor.h:157
@ EXTRACTOR_METATYPE_TITLE
Definition: extractor.h:134
@ EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE
Definition: extractor.h:194
@ EXTRACTOR_METATYPE_CREATOR
Definition: extractor.h:189
@ EXTRACTOR_METATYPE_ABSTRACT
Definition: extractor.h:186
@ EXTRACTOR_METATYPE_KEYWORDS
Definition: extractor.h:185
@ EXTRACTOR_METATYPE_COPYRIGHT
Definition: extractor.h:183
@ EXTRACTOR_METATYPE_RIGHTS
Definition: extractor.h:184
@ EXTRACTOR_METATYPE_MIMETYPE
Definition: extractor.h:129
@ EXTRACTOR_METATYPE_PUBLISHER
Definition: extractor.h:146
@ EXTRACTOR_METATYPE_SUBJECT
Definition: extractor.h:188
@ EXTRACTOR_METATYPE_RESERVED
Definition: extractor.h:128
@ EXTRACTOR_METATYPE_DESCRIPTION
Definition: extractor.h:182
void html_ltdl_fini()
void EXTRACTOR_html_extract_method(struct EXTRACTOR_ExtractContext *ec)
static Bool TIDY_CALL report_cb(TidyDoc doc, TidyReportLevel lvl, uint line, uint col, ctmbstr mssg)
void html_gobject_init()
static enum EXTRACTOR_MetaType tag_to_type(const char *tag)
static struct @1 tagmap[]
static void TIDY_CALL unget_byte_cb(void *sourceData, byte bt)
static int TIDY_CALL get_byte_cb(void *sourceData)
static Bool TIDY_CALL eof_cb(void *sourceData)
const char * name
enum EXTRACTOR_MetaType type
static magic_t magic
platform specifics
int64_t(* seek)(void *cls, int64_t pos, int whence)
Definition: extractor.h:509
uint64_t(* get_size)(void *cls)
Definition: extractor.h:520
EXTRACTOR_MetaDataProcessor proc
Definition: extractor.h:525
ssize_t(* read)(void *cls, void **data, size_t size)
Definition: extractor.h:494