"Fossies" - the Fresh Open Source Software Archive 
Member "libextractor-1.11/src/plugins/pdf_extractor.c" (30 Jan 2021, 5530 Bytes) of package /linux/privat/libextractor-1.11.tar.gz:
As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "pdf_extractor.c" see the
Fossies "Dox" file reference documentation and the last
Fossies "Diffs" side-by-side code changes report:
1.5_vs_1.6.
1 /*
2 This file is part of libextractor.
3 Copyright (C) 2016 Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 Boston, MA 02110-1301, USA.
19 */
20 /**
21 * @file plugins/pdf_extractor.c
22 * @brief plugin to support PDF files
23 * @author Christian Grothoff
24 *
25 * PDF libraries today are a nightmare (TM). So instead of doing the
26 * fast thing and calling some library functions to parse the PDF,
27 * we execute 'pdfinfo' and parse the output. Because that's 21st
28 * century plumbing: nobody writes reasonable code anymore.
29 */
30 #include "platform.h"
31 #include <extractor.h>
32 #include <sys/types.h>
33 #include <sys/wait.h>
34 #include <signal.h>
35 #include <unistd.h>
36
37 /**
38 * Entry in the mapping from control data to LE types.
39 */
40 struct Matches
41 {
42 /**
43 * Key in the Pdfian control file.
44 */
45 const char *text;
46
47 /**
48 * Corresponding type in LE.
49 */
50 enum EXTRACTOR_MetaType type;
51 };
52
53
54 /**
55 * Map from pdf-control entries to LE types.
56 *
57 * See output of 'pdfinfo'.
58 */
59 static struct Matches tmap[] = {
60 {"Title", EXTRACTOR_METATYPE_TITLE},
61 {"Subject", EXTRACTOR_METATYPE_SUBJECT},
62 {"Keywords", EXTRACTOR_METATYPE_KEYWORDS},
63 {"Author", EXTRACTOR_METATYPE_AUTHOR_NAME},
64 {"Creator", EXTRACTOR_METATYPE_CREATOR},
65 {"Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
66 {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE},
67 {"ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE},
68 {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION},
69 {"Pages", EXTRACTOR_METATYPE_PAGE_COUNT},
70 {NULL, 0}
71 };
72
73
74 /**
75 * Process the "stdout" file from pdfinfo.
76 *
77 * @param fout stdout of pdfinfo
78 * @param proc function to call with meta data
79 * @param proc_cls closure for @e proc
80 */
81 static void
82 process_stdout (FILE *fout,
83 EXTRACTOR_MetaDataProcessor proc,
84 void *proc_cls)
85 {
86 unsigned int i;
87 char line[1025];
88 const char *psuffix;
89 const char *colon;
90
91 while (! feof (fout))
92 {
93 if (NULL == fgets (line, sizeof (line) - 1, fout))
94 break;
95 if (0 == strlen (line))
96 continue;
97 if ('\n' == line[strlen (line) - 1])
98 line[strlen (line) - 1] = '\0';
99 colon = strchr (line, (int) ':');
100 if (NULL == colon)
101 break;
102 psuffix = colon + 1;
103 while (isblank ((unsigned char) psuffix[0]))
104 psuffix++;
105 if (0 == strlen (psuffix))
106 continue;
107 for (i = 0; NULL != tmap[i].text; i++)
108 {
109 if (0 != strncasecmp (line,
110 tmap[i].text,
111 colon - line))
112 continue;
113 if (0 != proc (proc_cls,
114 "pdf",
115 tmap[i].type,
116 EXTRACTOR_METAFORMAT_UTF8,
117 "text/plain",
118 psuffix,
119 strlen (psuffix) + 1))
120 return;
121 break;
122 }
123 }
124 }
125
126
127 /**
128 * Main entry method for the PDF extraction plugin.
129 *
130 * @param ec extraction context provided to the plugin
131 */
132 void
133 EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
134 {
135 uint64_t fsize;
136 void *data;
137 pid_t pid;
138 int in[2];
139 int out[2];
140 FILE *fout;
141 uint64_t pos;
142
143 fsize = ec->get_size (ec->cls);
144 if (fsize < 128)
145 return;
146 if (4 !=
147 ec->read (ec->cls, &data, 4))
148 return;
149 if (0 != strncmp ("%PDF", data, 4))
150 return;
151 if (0 !=
152 ec->seek (ec->cls, 0, SEEK_SET))
153 return;
154 if (0 != pipe (in))
155 return;
156 if (0 != pipe (out))
157 {
158 close (in[0]);
159 close (in[1]);
160 return;
161 }
162 pid = fork ();
163 if (-1 == pid)
164 {
165 close (in[0]);
166 close (in[1]);
167 close (out[0]);
168 close (out[1]);
169 return;
170 }
171 if (0 == pid)
172 {
173 char *const args[] = {
174 "pdfinfo",
175 "-",
176 NULL
177 };
178 /* am child, exec 'pdfinfo' */
179 close (0);
180 close (1);
181 if ( (-1 == dup2 (in[0], 0)) ||
182 (-1 == dup2 (out[1], 1)) )
183 exit (1);
184 close (in[0]);
185 close (in[1]);
186 close (out[0]);
187 close (out[1]);
188 execvp ("pdfinfo", args);
189 exit (1);
190 }
191 /* am parent, send file */
192 close (in[0]);
193 close (out[1]);
194 fout = fdopen (out[0], "r");
195 if (NULL == fout)
196 {
197 close (in[1]);
198 close (out[0]);
199 kill (pid, SIGKILL);
200 waitpid (pid, NULL, 0);
201 return;
202 }
203 pos = 0;
204 while (pos < fsize)
205 {
206 ssize_t got;
207 size_t wpos;
208
209 data = NULL;
210 got = ec->read (ec->cls,
211 &data,
212 fsize - pos);
213 if ( (-1 == got) ||
214 (NULL == data) )
215 break;
216 wpos = 0;
217 while (wpos < got)
218 {
219 ssize_t out;
220
221 out = write (in[1], data + wpos, got - wpos);
222 if (out <= 0)
223 break;
224 wpos += out;
225 }
226 if (wpos < got)
227 break;
228 pos += got;
229 }
230 close (in[1]);
231 process_stdout (fout, ec->proc, ec->cls);
232 fclose (fout);
233 kill (pid, SIGKILL);
234 waitpid (pid, NULL, 0);
235 }
236
237
238 /* end of pdf_extractor.c */