libextractor  1.11
About: GNU libextractor is a library used to extract meta-data from files of arbitrary type.
  Fossies Dox: libextractor-1.11.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

man_extractor.c
Go to the documentation of this file.
1 /*
2  This file is part of libextractor.
3  Copyright (C) 2002, 2003, 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
4 
5  libextractor is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published
7  by the Free Software Foundation; either version 3, or (at your
8  option) any later version.
9 
10  libextractor is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with libextractor; see the file COPYING. If not, write to the
17  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  Boston, MA 02110-1301, USA.
19  */
20 /**
21  * @file plugins/man_extractor.c
22  * @brief plugin to support man pages
23  * @author Christian Grothoff
24  */
25 #include "platform.h"
26 #include "extractor.h"
27 #include <ctype.h>
28 
29 
/**
 * Create a 0-terminated string from the first 'n' bytes of 'str'.
 * Similar to 'strndup', except that exactly 'n' bytes are copied
 * without looking for an earlier 0-terminator in 'str'.
 *
 * @param str input buffer; must provide at least 'n' readable bytes
 * @param n number of bytes to copy (the 0-terminator is added on top)
 * @return freshly allocated copy of the first 'n' bytes of 'str' plus
 *         0-terminator (caller must 'free' it); NULL if the allocation
 *         fails or 'n + 1' does not fit into a size_t
 */
static char *
stndup (const char *str, size_t n)
{
  const size_t total = n + 1;
  char *tmp;

  /* unsigned wrap-around: 'n + 1' became 0, so the buffer obtained from
     malloc would be far too small for the writes below */
  if (0 == total)
    return NULL;
  if (NULL == (tmp = malloc (total)))
    return NULL;
  memcpy (tmp, str, n);
  tmp[n] = '\0';
  return tmp;
}
48 
49 
50 /**
51  * Give a metadata item to LE. Removes double-quotes and
52  * makes sure we don't pass empty strings or NULL pointers.
53  *
54  * @param type metadata type to use
55  * @param keyword metadata value; freed in the process
56  * @param proc function to call with meta data
57  * @param proc_cls closure for 'proc'
58  * @return 0 to continue extracting, 1 if we are done
59  */
60 static int
62  char *keyword,
64  void *proc_cls)
65 {
66  int ret;
67  char *value;
68 
69  if (NULL == keyword)
70  return 0;
71  if ( (keyword[0] == '\"') &&
72  (keyword[strlen (keyword) - 1] == '\"') )
73  {
74  keyword[strlen (keyword) - 1] = '\0';
75  value = &keyword[1];
76  }
77  else
78  value = keyword;
79  if (0 == strlen (value))
80  {
81  free (keyword);
82  return 0;
83  }
84  ret = proc (proc_cls,
85  "man",
86  type,
88  "text/plain",
89  value,
90  strlen (value) + 1);
91  free (keyword);
92  return ret;
93 }
94 
95 
/**
 * Locate the end of the token starting at '*end'.  A token runs up to
 * (but not including) the first space that is not enclosed in
 * double-quotes, or up to the end of the buffer.
 *
 * @param end in: offset of the first character of the token;
 *            out: offset just past the token, or 'size + 1' if the
 *            buffer ended while a double-quote was still open
 * @param buf input buffer with the characters
 * @param size number of bytes in buf
 */
static void
find_end_of_token (size_t *end,
                   const char *buf,
                   const size_t size)
{
  size_t cursor;
  int in_quotes = 0;

  for (cursor = *end; cursor < size; cursor++)
  {
    const char c = buf[cursor];

    /* an unquoted space terminates the token */
    if ( (' ' == c) && (! in_quotes) )
      break;
    if ('\"' == c)
      in_quotes = ! in_quotes;
  }
  /* an open quote can only survive the loop if we hit 'size' */
  *end = in_quotes ? size + 1 : cursor;
}
122 
123 
124 /**
125  * Maximum number of bytes (16 KiB) requested from the beginning of
126  * the file; the ".TH" line is only searched for within this prefix.
127  */
128 #define MAX_READ (16 * 1024)
129 
130 
/**
 * Hand one keyword to LE via 'add_keyword' and leave the enclosing
 * function as soon as 'add_keyword' returns non-zero.  Expands to a
 * bare 'return', so it may only be used in functions returning void,
 * and it expects a variable 'ec' (the extraction context) in scope.
 *
 * @param t metadata type to use
 * @param s keyword to give to LE; freed by 'add_keyword'
 */
#define ADD(t,s) \
  do { \
    if (0 != add_keyword ((t), (s), ec->proc, ec->cls)) \
      return; \
  } while (0)
139 
140 
141 /**
142  * Main entry method for the man page extraction plugin.
143  *
144  * @param ec extraction context provided to the plugin
145  */
146 void
148 {
149  const size_t xlen = strlen (".TH ");
150  size_t pos;
151  size_t xsize;
152  size_t end;
153  void *data;
154  ssize_t size;
155  char *buf;
156 
157  if (0 >= (size = ec->read (ec->cls, &data, MAX_READ)))
158  return;
159  buf = data;
160  pos = 0;
161  if (size < xlen)
162  return;
163  /* find actual beginning of the man page (.TH);
164  abort if we find non-printable characters */
165  while ( (pos < size - xlen) &&
166  ( (0 != strncmp (".TH ",
167  &buf[pos],
168  xlen)) ||
169  ( (0 != pos) &&
170  (buf[pos - 1] != '\n') ) ) )
171  {
172  if ( (! isgraph ((unsigned char) buf[pos])) &&
173  (! isspace ((unsigned char) buf[pos])) )
174  return;
175  pos++;
176  }
177  if (0 != strncmp (".TH ", &buf[pos], xlen))
178  return;
179 
180  /* find end of ".TH"-line */
181  xsize = pos;
182  while ( (xsize < size) && ('\n' != buf[xsize]) )
183  xsize++;
184  /* limit processing to ".TH" line */
185  size = xsize;
186 
187  /* skip over ".TH" */
188  pos += xlen;
189 
190  /* first token is the title */
191  end = pos;
192  find_end_of_token (&end, buf, size);
193  if (end > size)
194  return;
195  if (end > pos)
196  {
197  ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos));
198  pos = end + 1;
199  }
200  if (pos >= size)
201  return;
202 
203  /* next token is the section */
204  end = pos;
205  find_end_of_token (&end, buf, size);
206  if (end > size)
207  return;
208  if ('\"' == buf[pos])
209  pos++;
210  if ((end - pos >= 1) && (end - pos <= 4))
211  {
212  switch (buf[pos])
213  {
214  case '1':
216  strdup (_ ("Commands")));
217  break;
218  case '2':
220  strdup (_ ("System calls")));
221  break;
222  case '3':
224  strdup (_ ("Library calls")));
225  break;
226  case '4':
228  strdup (_ ("Special files")));
229  break;
230  case '5':
232  strdup (_ ("File formats and conventions")));
233  break;
234  case '6':
236  strdup (_ ("Games")));
237  break;
238  case '7':
240  strdup (_ ("Conventions and miscellaneous")));
241  break;
242  case '8':
244  strdup (_ ("System management commands")));
245  break;
246  case '9':
248  strdup (_ ("Kernel routines")));
249  break;
250  default:
252  stndup (&buf[pos], 1));
253  }
254  pos = end + 1;
255  }
256  end = pos;
257 
258  /* next token is the modification date */
259  find_end_of_token (&end, buf, size);
260  if (end > size)
261  return;
262  if (end > pos)
263  {
264  ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE, stndup (&buf[pos], end - pos));
265  pos = end + 1;
266  }
267 
268  /* next token is the source of the man page */
269  end = pos;
270  find_end_of_token (&end, buf, size);
271  if (end > size)
272  return;
273  if (end > pos)
274  {
276  stndup (&buf[pos], end - pos));
277  pos = end + 1;
278  }
279 
280  /* last token is the title of the book the man page belongs to */
281  end = pos;
282  find_end_of_token (&end, buf, size);
283  if (end > size)
284  return;
285  if (end > pos)
286  {
288  stndup (&buf[pos], end - pos));
289  pos = end + 1;
290  }
291 }
292 
293 
294 /* end of man_extractor.c */
int(* EXTRACTOR_MetaDataProcessor)(void *cls, const char *plugin_name, enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat format, const char *data_mime_type, const char *data, size_t data_len)
Definition: extractor.h:460
@ EXTRACTOR_METAFORMAT_UTF8
Definition: extractor.h:102
#define NULL
Definition: getopt1.c:60
EXTRACTOR_MetaType
Definition: extractor.h:126
@ EXTRACTOR_METATYPE_SECTION
Definition: extractor.h:212
@ EXTRACTOR_METATYPE_BOOK_TITLE
Definition: extractor.h:135
@ EXTRACTOR_METATYPE_TITLE
Definition: extractor.h:134
@ EXTRACTOR_METATYPE_SOURCE
Definition: extractor.h:261
@ EXTRACTOR_METATYPE_MODIFICATION_DATE
Definition: extractor.h:197
enum EXTRACTOR_MetaType type
#define MAX_READ
static void find_end_of_token(size_t *end, const char *buf, const size_t size)
static char * stndup(const char *str, size_t n)
Definition: man_extractor.c:38
static int add_keyword(enum EXTRACTOR_MetaType type, char *keyword, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
Definition: man_extractor.c:61
void EXTRACTOR_man_extract_method(struct EXTRACTOR_ExtractContext *ec)
#define ADD(t, s)
platform specifics
#define _(a)
Definition: platform.h:32
ssize_t(* read)(void *cls, void **data, size_t size)
Definition: extractor.h:494