"Fossies" - the Fresh Open Source Software Archive

Member "libextractor-1.11/src/plugins/html_extractor.c" (30 Jan 2021, 16728 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "html_extractor.c", see the Fossies "Dox" file reference documentation.

    1 /*
    2      This file is part of libextractor.
    3      Copyright (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff
    4 
    5      libextractor is free software; you can redistribute it and/or modify
    6      it under the terms of the GNU General Public License as published
    7      by the Free Software Foundation; either version 2, or (at your
    8      option) any later version.
    9 
   10      libextractor is distributed in the hope that it will be useful, but
   11      WITHOUT ANY WARRANTY; without even the implied warranty of
   12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13      General Public License for more details.
   14 
   15      You should have received a copy of the GNU General Public License
   16      along with libextractor; see the file COPYING.  If not, write to the
   17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   18      Boston, MA 02110-1301, USA.
   19 
   20  */
   21 /**
   22  * @file plugins/html_extractor.c
   23  * @brief plugin to support HTML files
   24  * @author Christian Grothoff
   25  */
   26 #include "platform.h"
   27 #include "extractor.h"
   28 #include <magic.h>
   29 #include <tidy/tidy.h>
   30 #include <tidy/tidybuffio.h>
   31 
   32 /**
   33  * Mapping of HTML META names to LE types.
   34  */
   35 static struct
   36 {
   37   /**
   38    * HTML META name.
   39    */
   40   const char *name;
   41 
   42   /**
   43    * Corresponding LE type.
   44    */
   45   enum EXTRACTOR_MetaType type;
   46 } tagmap[] = {
   47   { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
   48   { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
   49   { "title", EXTRACTOR_METATYPE_TITLE },
   50   { "dc.title", EXTRACTOR_METATYPE_TITLE},
   51   { "description", EXTRACTOR_METATYPE_DESCRIPTION },
   52   { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
   53   { "subject", EXTRACTOR_METATYPE_SUBJECT},
   54   { "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
   55   { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
   56   { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
   57   { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
   58   { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
   59   { "rights", EXTRACTOR_METATYPE_RIGHTS },
   60   { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
   61   { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
   62   { "language", EXTRACTOR_METATYPE_LANGUAGE },
   63   { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
   64   { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
   65   { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
   66   { "dc.creator", EXTRACTOR_METATYPE_CREATOR},
   67   { "dc.identifier", EXTRACTOR_METATYPE_URI },
   68   { "dc.format", EXTRACTOR_METATYPE_FORMAT },
   69   { NULL, EXTRACTOR_METATYPE_RESERVED }
   70 };
   71 
   72 
   73 /**
   74  * Global handle to MAGIC data.
   75  */
   76 static magic_t magic;
   77 
   78 
   79 /**
   80  * Map 'meta' tag to LE type.
   81  *
   82  * @param tag tag to map
   83  * @return EXTRACTOR_METATYPE_RESERVED if the type was not found
   84  */
   85 static enum EXTRACTOR_MetaType
   86 tag_to_type (const char *tag)
   87 {
   88   unsigned int i;
   89 
   90   for (i = 0; NULL != tagmap[i].name; i++)
   91     if (0 == strcasecmp (tag,
   92                          tagmap[i].name))
   93       return tagmap[i].type;
   94   return EXTRACTOR_METATYPE_RESERVED;
   95 }
   96 
   97 
   98 /**
   99  * Function called by libtidy for error reporting.
  100  *
  101  * @param doc tidy doc being processed
  102  * @param lvl report level
  103  * @param line input line
  104  * @param col input column
  105  * @param mssg message
  106  * @return FALSE (no output)
  107  */
  108 static Bool TIDY_CALL
  109 report_cb (TidyDoc doc,
  110            TidyReportLevel lvl,
  111            uint line,
  112            uint col,
  113            ctmbstr mssg)
  114 {
  115   return 0;
  116 }
  117 
  118 
  119 /**
  120  * Input callback: get next byte of input.
  121  *
  122  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
  123  * @return next byte of input, EndOfStream on errors and EOF
  124  */
  125 static int TIDY_CALL
  126 get_byte_cb (void *sourceData)
  127 {
  128   struct EXTRACTOR_ExtractContext *ec = sourceData;
  129   void *data;
  130 
  131   if (1 !=
  132       ec->read (ec->cls,
  133                 &data, 1))
  134     return EndOfStream;
  135   return *(unsigned char*) data;
  136 }
  137 
  138 
  139 /**
  140  * Input callback: unget last byte of input.
  141  *
  142  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
  143  * @param bt byte to unget (ignored)
  144  */
  145 static void TIDY_CALL
  146 unget_byte_cb (void *sourceData, byte bt)
  147 {
  148   struct EXTRACTOR_ExtractContext *ec = sourceData;
  149 
  150   (void) ec->seek (ec->cls, -1, SEEK_CUR);
  151 }
  152 
  153 
  154 /**
  155  * Input callback: check for EOF.
  156  *
  157  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
  158  * @return true if we are at the EOF
  159  */
  160 static Bool TIDY_CALL
  161 eof_cb (void *sourceData)
  162 {
  163   struct EXTRACTOR_ExtractContext *ec = sourceData;
  164 
  165   return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls);
  166 }
  167 
  168 
  169 /**
  170  * Main entry method for the 'text/html' extraction plugin.
  171  *
  172  * @param ec extraction context provided to the plugin
  173  */
  174 void
  175 EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
  176 {
  177   TidyDoc doc;
  178   TidyNode head;
  179   TidyNode child;
  180   TidyNode title;
  181   TidyInputSource src;
  182   const char *name;
  183   TidyBuffer tbuf;
  184   TidyAttr attr;
  185   enum EXTRACTOR_MetaType type;
  186   ssize_t iret;
  187   void *data;
  188   const char *mime;
  189 
  190   if (-1 == (iret = ec->read (ec->cls,
  191                               &data,
  192                               16 * 1024)))
  193     return;
  194   if (NULL == (mime = magic_buffer (magic, data, iret)))
  195     return;
  196   if (0 != strncmp (mime,
  197                     "text/html",
  198                     strlen ("text/html")))
  199     return; /* not HTML */
  200 
  201   if (0 != ec->seek (ec->cls, 0, SEEK_SET))
  202     return; /* seek failed !? */
  203 
  204   tidyInitSource (&src, ec,
  205                   &get_byte_cb,
  206                   &unget_byte_cb,
  207                   &eof_cb);
  208   if (NULL == (doc = tidyCreate ()))
  209     return;
  210   tidySetReportFilter (doc, &report_cb);
  211   tidySetAppData (doc, ec);
  212   if (0 > tidyParseSource (doc, &src))
  213   {
  214     tidyRelease (doc);
  215     return;
  216   }
  217   if (1 != tidyStatus (doc))
  218   {
  219     tidyRelease (doc);
  220     return;
  221   }
  222   if (NULL == (head = tidyGetHead (doc)))
  223   {
  224     fprintf (stderr, "no head\n");
  225     tidyRelease (doc);
  226     return;
  227   }
  228   for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
  229   {
  230     switch (tidyNodeGetType (child))
  231     {
  232     case TidyNode_Root:
  233       break;
  234     case TidyNode_DocType:
  235       break;
  236     case TidyNode_Comment:
  237       break;
  238     case TidyNode_ProcIns:
  239       break;
  240     case TidyNode_Text:
  241       break;
  242     case TidyNode_CDATA:
  243       break;
  244     case TidyNode_Section:
  245       break;
  246     case TidyNode_Asp:
  247       break;
  248     case TidyNode_Jste:
  249       break;
  250     case TidyNode_Php:
  251       break;
  252     case TidyNode_XmlDecl:
  253       break;
  254     case TidyNode_Start:
  255     case TidyNode_StartEnd:
  256       name = tidyNodeGetName (child);
  257       if ( (0 == strcasecmp (name, "title")) &&
  258            (NULL != (title = tidyGetChild (child))) )
  259       {
  260         tidyBufInit (&tbuf);
  261         tidyNodeGetValue (doc, title, &tbuf);
  262         /* add 0-termination */
  263         tidyBufPutByte (&tbuf, 0);
  264         if (0 !=
  265             ec->proc (ec->cls,
  266                       "html",
  267                       EXTRACTOR_METATYPE_TITLE,
  268                       EXTRACTOR_METAFORMAT_UTF8,
  269                       "text/plain",
  270                       (const char *) tbuf.bp,
  271                       tbuf.size))
  272         {
  273           tidyBufFree (&tbuf);
  274           goto CLEANUP;
  275         }
  276         tidyBufFree (&tbuf);
  277         break;
  278       }
  279       if (0 == strcasecmp (name, "meta"))
  280       {
  281         if (NULL == (attr = tidyAttrGetById (child,
  282                                              TidyAttr_NAME)))
  283           break;
  284         if (EXTRACTOR_METATYPE_RESERVED ==
  285             (type = tag_to_type (tidyAttrValue (attr))))
  286           break;
  287         if (NULL == (attr = tidyAttrGetById (child,
  288                                              TidyAttr_CONTENT)))
  289           break;
  290         name = tidyAttrValue (attr);
  291         if (0 !=
  292             ec->proc (ec->cls,
  293                       "html",
  294                       type,
  295                       EXTRACTOR_METAFORMAT_UTF8,
  296                       "text/plain",
  297                       name,
  298                       strlen (name) + 1))
  299           goto CLEANUP;
  300         break;
  301       }
  302       break;
  303     case TidyNode_End:
  304       break;
  305     default:
  306       break;
  307     }
  308   }
  309 CLEANUP:
  310   tidyRelease (doc);
  311 }
  312 
  313 
  314 #if OLD
  315 
  316 
  317 /* ******************** parser helper functions ************** */
  318 
  /**
   * Check whether the character range [s, e) spells @a tag, ignoring
   * case.
   *
   * @param tag 0-terminated tag name to compare against
   * @param s first character of the range
   * @param e one past the last character of the range
   * @return 1 if the range has the same length as @a tag and matches it
   *         case-insensitively, 0 otherwise
   */
  static int
  tagMatch (const char *tag, const char *s, const char *e)
  {
    const size_t span = (size_t) (e - s);

    if (span != strlen (tag))
      return 0;
    return 0 == strncasecmp (tag, s, span);
  }
  324 
  325 
  /**
   * Advance *pos to the next occurrence of @a c in @a data.
   *
   * @param c character to search for
   * @param pos in: offset to start at; out: offset of the match, or
   *        @a size if the end was reached (left untouched if a 0-byte
   *        is encountered first)
   * @param data buffer to scan
   * @param size number of bytes in @a data
   * @return 1 if @a c was found, 0 if the end of the buffer or a 0-byte
   *         was reached first
   */
  static int
  lookFor (char c, size_t *pos, const char *data, size_t size)
  {
    size_t idx;

    for (idx = *pos; idx < size; idx++)
    {
      const char cur = data[idx];

      if (cur == c)
        break;
      if ('\0' == cur)
        return 0;
    }
    *pos = idx;
    return idx < size;
  }
  340 
  341 
  /**
   * Advance *pos past any whitespace in @a data.
   *
   * A 0-byte is not whitespace, so it simply ends the scan like any
   * other non-space character (the former in-loop check for it could
   * never trigger and has been removed).
   *
   * @param pos in: offset to start at; out: offset of the first
   *        non-whitespace byte, or @a size if there is none
   * @param data buffer to scan
   * @param size number of bytes in @a data
   * @return 1 if *pos is still inside the buffer, 0 if the end was
   *         reached
   */
  static int
  skipWhitespace (size_t *pos, const char *data, size_t size)
  {
    size_t p;

    for (p = *pos; p < size; p++)
      if (! isspace ( (unsigned char) data[p]))
        break;
    *pos = p;
    return p < size;
  }
  356 
  357 
  /**
   * Advance *pos past any alphabetic characters in @a data.
   *
   * A 0-byte is not alphabetic, so it simply ends the scan like any
   * other non-letter (the former in-loop check for it could never
   * trigger and has been removed).
   *
   * @param pos in: offset to start at; out: offset of the first
   *        non-alphabetic byte, or @a size if there is none
   * @param data buffer to scan
   * @param size number of bytes in @a data
   * @return 1 if *pos is still inside the buffer, 0 if the end was
   *         reached
   */
  static int
  skipLetters (size_t *pos, const char *data, size_t size)
  {
    size_t p;

    for (p = *pos; p < size; p++)
      if (! isalpha ( (unsigned char) data[p]))
        break;
    *pos = p;
    return p < size;
  }
  372 
  373 
  /**
   * Advance *pos to the next byte of @a data that is one of the
   * characters listed in @a c.
   *
   * @param c 0-terminated set of characters to search for
   * @param pos in: offset to start at; out: offset of the match, or
   *        @a size if the end was reached (left untouched if a 0-byte
   *        is encountered first)
   * @param data buffer to scan
   * @param size number of bytes in @a data
   * @return 1 if one of the characters was found, 0 if the end of the
   *         buffer or a 0-byte was reached first
   */
  static int
  lookForMultiple (const char *c, size_t *pos, const char *data, size_t size)
  {
    size_t p;

    for (p = *pos; p < size; p++)
    {
      /* test for the 0-byte first: strchr() also "finds" the
         terminator of 'c', which would report a bogus match */
      if ('\0' == data[p])
        return 0;
      if (NULL != strchr (c, data[p]))
        break;
    }
    *pos = p;
    return p < size;
  }
  388 
  389 
  /**
   * Locate the value of the attribute @a key ("key=value") inside the
   * character range [start, end).
   *
   * The search begins one character after @a start.  The key is
   * compared case-insensitively and must be directly followed by '='.
   * A value that starts with a single or double quote extends to the
   * matching quote (quotes excluded); any other value extends to the
   * next whitespace character.  In both cases the value ends at
   * @a end at the latest.
   *
   * @param key 0-terminated attribute name
   * @param start beginning of the range to search
   * @param end one past the last character of the range
   * @param mstart set to the first character of the value, or to NULL
   *        if the key was not found
   * @param mend set to one past the last character of the value, or to
   *        NULL if the key was not found
   */
  static void
  findEntry (const char *key,
             const char *start,
             const char *end, const char **mstart, const char **mend)
  {
    size_t len;

    *mstart = NULL;
    *mend = NULL;
    len = strlen (key);
    /* same test as "start < end - len - 1", but written as a length
       comparison so that no pointer before the range is ever formed */
    while ( (start < end) &&
            ((size_t) (end - start) > len + 1) )
    {
      start++;
      if (start[len] != '=')
        continue;
      if (0 != strncasecmp (start, key, len))
        continue;
      start += len + 1;
      *mstart = start;
      /* 'start' may be equal to 'end' here (empty value at the very
         end of the range); do not look at *start in that case */
      if ( (start < end) &&
           ((*start == '\"') || (*start == '\'')) )
      {
        const char quote = *start;

        start++;
        while ((start < end) && (*start != quote))
          start++;
        (*mstart)++;            /* skip quote */
      }
      else
      {
        while ((start < end) && (! isspace ( (unsigned char) *start)))
          start++;
      }
      *mend = start;
      return;
    }
  }
  426 
  427 
  428 /**
  429  * Search all tags that correspond to "tagname".  Example:
  430  * If the tag is <meta name="foo" desc="bar">, and
  431  * tagname == "meta", keyname="name", keyvalue="foo",
  432  * and searchname="desc", then this function returns a
  433  * copy (!) of "bar".  Easy enough?
  434  *
  435  * @return NULL if nothing is found
  436  */
  437 static char *
  438 findInTags (struct TagInfo *t,
  439             const char *tagname,
  440             const char *keyname, const char *keyvalue, const char *searchname)
  441 {
  442   const char *pstart;
  443   const char *pend;
  444 
  445   while (t != NULL)
  446   {
  447     if (tagMatch (tagname, t->tagStart, t->tagEnd))
  448     {
  449       findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
  450       if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
  451       {
  452         findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
  453         if (pstart != NULL)
  454         {
  455           char *ret = malloc (pend - pstart + 1);
  456           if (ret == NULL)
  457             return NULL;
  458           memcpy (ret, pstart, pend - pstart);
  459           ret[pend - pstart] = '\0';
  460           return ret;
  461         }
  462       }
  463     }
  464     t = t->next;
  465   }
  466   return NULL;
  467 }
  468 
  469 
  470 /* mimetype = text/html */
  471 int
  472 EXTRACTOR_html_extract (const char *data,
  473                         size_t size,
  474                         EXTRACTOR_MetaDataProcessor proc,
  475                         void *proc_cls,
  476                         const char *options)
  477 {
  478   size_t xsize;
  479   struct TagInfo *tags;
  480   struct TagInfo *t;
  481   struct TagInfo tag;
  482   size_t pos;
  483   size_t tpos;
  484   int i;
  485   char *charset;
  486   char *tmp;
  487   char *xtmp;
  488   int ret;
  489 
  490   ret = 0;
  491   if (size == 0)
  492     return 0;
  493   /* only scan first 32k */
  494   if (size > 1024 * 32)
  495     xsize = 1024 * 32;
  496   else
  497     xsize = size;
  498   tags = NULL;
  499   tag.next = NULL;
  500   pos = 0;
  501   while (pos < xsize)
  502   {
  503     if (! lookFor ('<', &pos, data, size))
  504       break;
  505     tag.tagStart = &data[++pos];
  506     if (! skipLetters (&pos, data, size))
  507       break;
  508     tag.tagEnd = &data[pos];
  509     if (! skipWhitespace (&pos, data, size))
  510       break;
  511 STEP3:
  512     if (! lookForMultiple (">\"\'", &pos, data, size))
  513       break;
  514     if (data[pos] != '>')
  515     {
  516       /* find end-quote, ignore escaped quotes (\') */
  517       do
  518       {
  519         tpos = pos;
  520         pos++;
  521         if (! lookFor (data[tpos], &pos, data, size))
  522           break;
  523       }
  524       while (data[pos - 1] == '\\');
  525       pos++;
  526       goto STEP3;
  527     }
  528     pos++;
  529     if (! skipWhitespace (&pos, data, size))
  530       break;
  531     tag.dataStart = &data[pos];
  532     if (! lookFor ('<', &pos, data, size))
  533       break;
  534     tag.dataEnd = &data[pos];
  535     i = 0;
  536     while (relevantTags[i] != NULL)
  537     {
  538       if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
  539           (0 == strncasecmp (relevantTags[i],
  540                              tag.tagStart, tag.tagEnd - tag.tagStart)))
  541       {
  542         t = malloc (sizeof (struct TagInfo));
  543         if (t == NULL)
  544           return 0;
  545         *t = tag;
  546         t->next = tags;
  547         tags = t;
  548         break;
  549       }
  550       i++;
  551     }
  552     /* abort early if we hit the body tag */
  553     if (tagMatch ("body", tag.tagStart, tag.tagEnd))
  554       break;
  555   }
  556 
  557   /* fast exit */
  558   if (tags == NULL)
  559     return 0;
  560 
  561   charset = NULL;
  562   /* first, try to determine mime type and/or character set */
  563   tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
  564   if (tmp != NULL)
  565   {
  566     /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
  567        if text/html is present, we take that as the mime-type; if charset=
  568        is present, we try to use that for character set conversion. */
  569     if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
  570       ret = proc (proc_cls,
  571                   "html",
  572                   EXTRACTOR_METATYPE_MIMETYPE,
  573                   EXTRACTOR_METAFORMAT_UTF8,
  574                   "text/plain",
  575                   "text/html",
  576                   strlen ("text/html") + 1);
  577     charset = strcasestr (tmp, "charset=");
  578     if (charset != NULL)
  579       charset = strdup (&charset[strlen ("charset=")]);
  580     free (tmp);
  581   }
  582   i = 0;
  583   while (tagmap[i].name != NULL)
  584   {
  585     tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
  586     if ( (tmp != NULL) &&
  587          (ret == 0) )
  588     {
  589       if (charset == NULL)
  590       {
  591         ret = proc (proc_cls,
  592                     "html",
  593                     tagmap[i].type,
  594                     EXTRACTOR_METAFORMAT_C_STRING,
  595                     "text/plain",
  596                     tmp,
  597                     strlen (tmp) + 1);
  598       }
  599       else
  600       {
  601         xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
  602                                                  strlen (tmp),
  603                                                  charset);
  604         if (xtmp != NULL)
  605         {
  606           ret = proc (proc_cls,
  607                       "html",
  608                       tagmap[i].type,
  609                       EXTRACTOR_METAFORMAT_UTF8,
  610                       "text/plain",
  611                       xtmp,
  612                       strlen (xtmp) + 1);
  613           free (xtmp);
  614         }
  615       }
  616     }
  617     if (tmp != NULL)
  618       free (tmp);
  619     i++;
  620   }
  621   while (tags != NULL)
  622   {
  623     t = tags;
  624     if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
  625          (ret == 0) )
  626     {
  627       if (charset == NULL)
  628       {
  629         xtmp = malloc (t->dataEnd - t->dataStart + 1);
  630         if (xtmp != NULL)
  631         {
  632           memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
  633           xtmp[t->dataEnd - t->dataStart] = '\0';
  634           ret = proc (proc_cls,
  635                       "html",
  636                       EXTRACTOR_METATYPE_TITLE,
  637                       EXTRACTOR_METAFORMAT_C_STRING,
  638                       "text/plain",
  639                       xtmp,
  640                       strlen (xtmp) + 1);
  641           free (xtmp);
  642         }
  643       }
  644       else
  645       {
  646         xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
  647                                                  t->dataEnd - t->dataStart,
  648                                                  charset);
  649         if (xtmp != NULL)
  650         {
  651           ret = proc (proc_cls,
  652                       "html",
  653                       EXTRACTOR_METATYPE_TITLE,
  654                       EXTRACTOR_METAFORMAT_UTF8,
  655                       "text/plain",
  656                       xtmp,
  657                       strlen (xtmp) + 1);
  658           free (xtmp);
  659         }
  660       }
  661     }
  662     tags = t->next;
  663     free (t);
  664   }
  665   if (charset != NULL)
  666     free (charset);
  667   return ret;
  668 }
  669 
  670 
  671 #endif
  672 
  673 
  674 /**
  675  * Initialize glib and load magic file.
  676  */
  677 void __attribute__ ((constructor))
  678 html_gobject_init ()
  679 {
  680   magic = magic_open (MAGIC_MIME_TYPE);
  681   if (0 != magic_load (magic, NULL))
  682   {
  683     /* FIXME: how to deal with errors? */
  684   }
  685 }
  686 
  687 
  688 /**
  689  * Destructor for the library, cleans up.
  690  */
  691 void __attribute__ ((destructor))
  692 html_ltdl_fini ()
  693 {
  694   if (NULL != magic)
  695   {
  696     magic_close (magic);
  697     magic = NULL;
  698   }
  699 }
  700 
  701 
  702 /* end of html_extractor.c */