libextractor  1.11
About: GNU libextractor is a library used to extract meta-data from files of arbitrary type.
  Fossies Dox: libextractor-1.11.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

pdf_extractor.c
Go to the documentation of this file.
1 /*
2  This file is part of libextractor.
3  Copyright (C) 2016 Christian Grothoff
4 
5  libextractor is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published
7  by the Free Software Foundation; either version 3, or (at your
8  option) any later version.
9 
10  libextractor is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with libextractor; see the file COPYING. If not, write to the
17  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  Boston, MA 02110-1301, USA.
19  */
20 /**
21  * @file plugins/pdf_extractor.c
22  * @brief plugin to support PDF files
23  * @author Christian Grothoff
24  *
25  * PDF libraries today are a nightmare (TM). So instead of doing the
26  * fast thing and calling some library functions to parse the PDF,
27  * we execute 'pdfinfo' and parse the output. Because that's 21st
28  * century plumbing: nobody writes reasonable code anymore.
29  */
30 #include "platform.h"
31 #include <extractor.h>
32 #include <sys/types.h>
33 #include <sys/wait.h>
34 #include <signal.h>
35 #include <unistd.h>
36 
37 /**
38  * Entry in the mapping from control data to LE types.
39  */
40 struct Matches
41 {
42  /**
43  * Key in the Pdfian control file.
44  */
45  const char *text;
46 
47  /**
48  * Corresponding type in LE.
49  */
51 };
52 
53 
54 /**
55  * Map from pdf-control entries to LE types.
56  *
57  * See output of 'pdfinfo'.
58  */
59 static struct Matches tmap[] = {
60  {"Title", EXTRACTOR_METATYPE_TITLE},
61  {"Subject", EXTRACTOR_METATYPE_SUBJECT},
62  {"Keywords", EXTRACTOR_METATYPE_KEYWORDS},
64  {"Creator", EXTRACTOR_METATYPE_CREATOR},
66  {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE},
68  {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION},
70  {NULL, 0}
71 };
72 
73 
74 /**
75  * Process the "stdout" file from pdfinfo.
76  *
77  * @param fout stdout of pdfinfo
78  * @param proc function to call with meta data
79  * @param proc_cls closure for @e proc
80  */
81 static void
82 process_stdout (FILE *fout,
84  void *proc_cls)
85 {
86  unsigned int i;
87  char line[1025];
88  const char *psuffix;
89  const char *colon;
90 
91  while (! feof (fout))
92  {
93  if (NULL == fgets (line, sizeof (line) - 1, fout))
94  break;
95  if (0 == strlen (line))
96  continue;
97  if ('\n' == line[strlen (line) - 1])
98  line[strlen (line) - 1] = '\0';
99  colon = strchr (line, (int) ':');
100  if (NULL == colon)
101  break;
102  psuffix = colon + 1;
103  while (isblank ((unsigned char) psuffix[0]))
104  psuffix++;
105  if (0 == strlen (psuffix))
106  continue;
107  for (i = 0; NULL != tmap[i].text; i++)
108  {
109  if (0 != strncasecmp (line,
110  tmap[i].text,
111  colon - line))
112  continue;
113  if (0 != proc (proc_cls,
114  "pdf",
115  tmap[i].type,
117  "text/plain",
118  psuffix,
119  strlen (psuffix) + 1))
120  return;
121  break;
122  }
123  }
124 }
125 
126 
127 /**
128  * Main entry method for the PDF extraction plugin.
129  *
130  * @param ec extraction context provided to the plugin
131  */
132 void
134 {
135  uint64_t fsize;
136  void *data;
137  pid_t pid;
138  int in[2];
139  int out[2];
140  FILE *fout;
141  uint64_t pos;
142 
143  fsize = ec->get_size (ec->cls);
144  if (fsize < 128)
145  return;
146  if (4 !=
147  ec->read (ec->cls, &data, 4))
148  return;
149  if (0 != strncmp ("%PDF", data, 4))
150  return;
151  if (0 !=
152  ec->seek (ec->cls, 0, SEEK_SET))
153  return;
154  if (0 != pipe (in))
155  return;
156  if (0 != pipe (out))
157  {
158  close (in[0]);
159  close (in[1]);
160  return;
161  }
162  pid = fork ();
163  if (-1 == pid)
164  {
165  close (in[0]);
166  close (in[1]);
167  close (out[0]);
168  close (out[1]);
169  return;
170  }
171  if (0 == pid)
172  {
173  char *const args[] = {
174  "pdfinfo",
175  "-",
176  NULL
177  };
178  /* am child, exec 'pdfinfo' */
179  close (0);
180  close (1);
181  if ( (-1 == dup2 (in[0], 0)) ||
182  (-1 == dup2 (out[1], 1)) )
183  exit (1);
184  close (in[0]);
185  close (in[1]);
186  close (out[0]);
187  close (out[1]);
188  execvp ("pdfinfo", args);
189  exit (1);
190  }
191  /* am parent, send file */
192  close (in[0]);
193  close (out[1]);
194  fout = fdopen (out[0], "r");
195  if (NULL == fout)
196  {
197  close (in[1]);
198  close (out[0]);
199  kill (pid, SIGKILL);
200  waitpid (pid, NULL, 0);
201  return;
202  }
203  pos = 0;
204  while (pos < fsize)
205  {
206  ssize_t got;
207  size_t wpos;
208 
209  data = NULL;
210  got = ec->read (ec->cls,
211  &data,
212  fsize - pos);
213  if ( (-1 == got) ||
214  (NULL == data) )
215  break;
216  wpos = 0;
217  while (wpos < got)
218  {
219  ssize_t out;
220 
221  out = write (in[1], data + wpos, got - wpos);
222  if (out <= 0)
223  break;
224  wpos += out;
225  }
226  if (wpos < got)
227  break;
228  pos += got;
229  }
230  close (in[1]);
231  process_stdout (fout, ec->proc, ec->cls);
232  fclose (fout);
233  kill (pid, SIGKILL);
234  waitpid (pid, NULL, 0);
235 }
236 
237 
238 /* end of pdf_extractor.c */
int(* EXTRACTOR_MetaDataProcessor)(void *cls, const char *plugin_name, enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat format, const char *data_mime_type, const char *data, size_t data_len)
Definition: extractor.h:460
@ EXTRACTOR_METAFORMAT_UTF8
Definition: extractor.h:102
#define NULL
Definition: getopt1.c:60
EXTRACTOR_MetaType
Definition: extractor.h:126
@ EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE
Definition: extractor.h:258
@ EXTRACTOR_METATYPE_AUTHOR_NAME
Definition: extractor.h:143
@ EXTRACTOR_METATYPE_ENCODER_VERSION
Definition: extractor.h:350
@ EXTRACTOR_METATYPE_TITLE
Definition: extractor.h:134
@ EXTRACTOR_METATYPE_CREATOR
Definition: extractor.h:189
@ EXTRACTOR_METATYPE_CREATION_DATE
Definition: extractor.h:196
@ EXTRACTOR_METATYPE_KEYWORDS
Definition: extractor.h:185
@ EXTRACTOR_METATYPE_MODIFICATION_DATE
Definition: extractor.h:197
@ EXTRACTOR_METATYPE_PAGE_COUNT
Definition: extractor.h:141
@ EXTRACTOR_METATYPE_SUBJECT
Definition: extractor.h:188
enum EXTRACTOR_MetaType type
static void process_stdout(FILE *fout, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
Definition: pdf_extractor.c:82
void EXTRACTOR_pdf_extract_method(struct EXTRACTOR_ExtractContext *ec)
static struct Matches tmap[]
Definition: pdf_extractor.c:59
platform specifics
int64_t(* seek)(void *cls, int64_t pos, int whence)
Definition: extractor.h:509
uint64_t(* get_size)(void *cls)
Definition: extractor.h:520
EXTRACTOR_MetaDataProcessor proc
Definition: extractor.h:525
ssize_t(* read)(void *cls, void **data, size_t size)
Definition: extractor.h:494
const char * text
Definition: deb_extractor.c:77
enum EXTRACTOR_MetaType type
Definition: deb_extractor.c:82