"Fossies" - the Fresh Open Source Software Archive

Member "libextractor-1.11/src/plugins/man_extractor.c" (30 Jan 2021, 7055 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "man_extractor.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.10_vs_1.11.

    1 /*
    2      This file is part of libextractor.
    3      Copyright (C) 2002, 2003, 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
    4 
    5      libextractor is free software; you can redistribute it and/or modify
    6      it under the terms of the GNU General Public License as published
    7      by the Free Software Foundation; either version 3, or (at your
    8      option) any later version.
    9 
   10      libextractor is distributed in the hope that it will be useful, but
   11      WITHOUT ANY WARRANTY; without even the implied warranty of
   12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13      General Public License for more details.
   14 
   15      You should have received a copy of the GNU General Public License
   16      along with libextractor; see the file COPYING.  If not, write to the
   17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   18      Boston, MA 02110-1301, USA.
   19  */
   20 /**
   21  * @file plugins/man_extractor.c
   22  * @brief plugin to support man pages
   23  * @author Christian Grothoff
   24  */
   25 #include "platform.h"
   26 #include "extractor.h"
   27 #include <ctype.h>
   28 
   29 
   30 /**
   31  * Create string from first 'n' characters of 'str'.  See 'strndup'.
   32  *
   33  * @param str input string
   34  * @param n desired output length (plus 0-termination)
   35  * @return copy of first 'n' bytes from 'str' plus 0-terminator, NULL on error
   36  */
   37 static char *
   38 stndup (const char *str, size_t n)
   39 {
   40   char *tmp;
   41 
   42   if (NULL == (tmp = malloc (n + 1)))
   43     return NULL;
   44   tmp[n] = '\0';
   45   memcpy (tmp, str, n);
   46   return tmp;
   47 }
   48 
   49 
   50 /**
   51  * Give a metadata item to LE.  Removes double-quotes and
   52  * makes sure we don't pass empty strings or NULL pointers.
   53  *
   54  * @param type metadata type to use
   55  * @param keyword metadata value; freed in the process
   56  * @param proc function to call with meta data
   57  * @param proc_cls closure for 'proc'
   58  * @return 0 to continue extracting, 1 if we are done
   59  */
   60 static int
   61 add_keyword (enum EXTRACTOR_MetaType type,
   62              char *keyword,
   63              EXTRACTOR_MetaDataProcessor proc,
   64              void *proc_cls)
   65 {
   66   int ret;
   67   char *value;
   68 
   69   if (NULL == keyword)
   70     return 0;
   71   if ( (keyword[0] == '\"') &&
   72        (keyword[strlen (keyword) - 1] == '\"') )
   73   {
   74     keyword[strlen (keyword) - 1] = '\0';
   75     value = &keyword[1];
   76   }
   77   else
   78     value = keyword;
   79   if (0 == strlen (value))
   80   {
   81     free (keyword);
   82     return 0;
   83   }
   84   ret = proc (proc_cls,
   85               "man",
   86               type,
   87               EXTRACTOR_METAFORMAT_UTF8,
   88               "text/plain",
   89               value,
   90               strlen (value) + 1);
   91   free (keyword);
   92   return ret;
   93 }
   94 
   95 
   96 /**
   97  * Find the end of the current token (which may be quoted).
   98  *
   99  * @param end beginning of the current token, updated to its end; set to size + 1 if the token does not end properly
  100  * @param buf input buffer with the characters
  101  * @param size number of bytes in buf
  102  */
  103 static void
  104 find_end_of_token (size_t *end,
  105                    const char *buf,
  106                    const size_t size)
  107 {
  108   int quot;
  109 
  110   quot = 0;
  111   while ( (*end < size) &&
  112           ( (0 != (quot & 1)) ||
  113             ((' ' != buf[*end])) ) )
  114   {
  115     if ('\"' == buf[*end])
  116       quot++;
  117     (*end)++;
  118   }
  119   if (1 == (quot & 1))
  120     (*end) = size + 1;
  121 }
  122 
  123 
  124 /**
  125  * How many bytes do we actually try to scan? (from the beginning
  126  * of the file).
  127  */
  128 #define MAX_READ (16 * 1024)
  129 
  130 
  131 /**
  132  * Add a keyword to LE.
  133  *
  134  * @param t type to use
  135  * @param s keyword to give to LE
  136  */
  137 #define ADD(t,s) do { if (0 != add_keyword (t, s, ec->proc, ec->cls)) return; \
  138 } while (0)
  139 
  140 
  141 /**
  142  * Main entry method for the man page extraction plugin.
  143  *
  144  * @param ec extraction context provided to the plugin
  145  */
  146 void
  147 EXTRACTOR_man_extract_method (struct EXTRACTOR_ExtractContext *ec)
  148 {
  149   const size_t xlen = strlen (".TH ");
  150   size_t pos;
  151   size_t xsize;
  152   size_t end;
  153   void *data;
  154   ssize_t size;
  155   char *buf;
  156 
  157   if (0 >= (size = ec->read (ec->cls, &data, MAX_READ)))
  158     return;
  159   buf = data;
  160   pos = 0;
  161   if (size < xlen)
  162     return;
  163   /* find actual beginning of the man page (.TH);
  164      abort if we find non-printable characters */
  165   while ( (pos < size - xlen) &&
  166           ( (0 != strncmp (".TH ",
  167                            &buf[pos],
  168                            xlen)) ||
  169             ( (0 != pos) &&
  170               (buf[pos - 1] != '\n') ) ) )
  171   {
  172     if ( (! isgraph ((unsigned char) buf[pos])) &&
  173          (! isspace ((unsigned char) buf[pos])) )
  174       return;
  175     pos++;
  176   }
  177   if (0 != strncmp (".TH ", &buf[pos], xlen))
  178     return;
  179 
  180   /* find end of ".TH"-line */
  181   xsize = pos;
  182   while ( (xsize < size) && ('\n' != buf[xsize]) )
  183     xsize++;
  184   /* limit processing to ".TH" line */
  185   size = xsize;
  186 
  187   /* skip over ".TH" */
  188   pos += xlen;
  189 
  190   /* first token is the title */
  191   end = pos;
  192   find_end_of_token (&end, buf, size);
  193   if (end > size)
  194     return;
  195   if (end > pos)
  196   {
  197     ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos));
  198     pos = end + 1;
  199   }
  200   if (pos >= size)
  201     return;
  202 
  203   /* next token is the section */
  204   end = pos;
  205   find_end_of_token (&end, buf, size);
  206   if (end > size)
  207     return;
  208   if ('\"' == buf[pos])
  209     pos++;
  210   if ((end - pos >= 1) && (end - pos <= 4))
  211   {
  212     switch (buf[pos])
  213     {
  214     case '1':
  215       ADD (EXTRACTOR_METATYPE_SECTION,
  216            strdup (_ ("Commands")));
  217       break;
  218     case '2':
  219       ADD (EXTRACTOR_METATYPE_SECTION,
  220            strdup (_ ("System calls")));
  221       break;
  222     case '3':
  223       ADD (EXTRACTOR_METATYPE_SECTION,
  224            strdup (_ ("Library calls")));
  225       break;
  226     case '4':
  227       ADD (EXTRACTOR_METATYPE_SECTION,
  228            strdup (_ ("Special files")));
  229       break;
  230     case '5':
  231       ADD (EXTRACTOR_METATYPE_SECTION,
  232            strdup (_ ("File formats and conventions")));
  233       break;
  234     case '6':
  235       ADD (EXTRACTOR_METATYPE_SECTION,
  236            strdup (_ ("Games")));
  237       break;
  238     case '7':
  239       ADD (EXTRACTOR_METATYPE_SECTION,
  240            strdup (_ ("Conventions and miscellaneous")));
  241       break;
  242     case '8':
  243       ADD (EXTRACTOR_METATYPE_SECTION,
  244            strdup (_ ("System management commands")));
  245       break;
  246     case '9':
  247       ADD (EXTRACTOR_METATYPE_SECTION,
  248            strdup (_ ("Kernel routines")));
  249       break;
  250     default:
  251       ADD (EXTRACTOR_METATYPE_SECTION,
  252            stndup (&buf[pos], 1));
  253     }
  254     pos = end + 1;
  255   }
  256   end = pos;
  257 
  258   /* next token is the modification date */
  259   find_end_of_token (&end, buf, size);
  260   if (end > size)
  261     return;
  262   if (end > pos)
  263   {
  264     ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE, stndup (&buf[pos], end - pos));
  265     pos = end + 1;
  266   }
  267 
  268   /* next token is the source of the man page */
  269   end = pos;
  270   find_end_of_token (&end, buf, size);
  271   if (end > size)
  272     return;
  273   if (end > pos)
  274   {
  275     ADD (EXTRACTOR_METATYPE_SOURCE,
  276          stndup (&buf[pos], end - pos));
  277     pos = end + 1;
  278   }
  279 
  280   /* last token is the title of the book the man page belongs to */
  281   end = pos;
  282   find_end_of_token (&end, buf, size);
  283   if (end > size)
  284     return;
  285   if (end > pos)
  286   {
  287     ADD (EXTRACTOR_METATYPE_BOOK_TITLE,
  288          stndup (&buf[pos], end - pos));
  289     pos = end + 1;
  290   }
  291 }
  292 
  293 
  294 /* end of man_extractor.c */