"Fossies" - the Fresh Open Source Software Archive 
Member "dehtml-1.8/dehtml.c" (11 Jan 2011, 13841 Bytes) of package /linux/www/old/dehtml-1.8.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can
view or
download the uninterpreted source code file here.
For more information about "dehtml.c" see the
Fossies "Dox" file reference documentation.
1 /* #includes */ /*{{{C}}}*//*{{{*/
2 #include "config.h"
3
4 #include <sys/types.h>
5 #include <ctype.h>
6 #include <errno.h>
7 #ifdef HAVE_GETTEXT
8 #include <libintl.h>
9 #define _(String) gettext(String)
10 #else
11 #define _(String) String
12 #endif
13 #include <locale.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <unistd.h>
18
19 #include "getopt.h"
20 /*}}}*/
21 /* #defines */ /*{{{*/
/*
 * Locale-independent test for an ASCII letter.  Every use of the argument
 * is parenthesized so that an expression argument keeps its meaning; the
 * argument is still evaluated more than once, so it must be free of side
 * effects.
 */
#define ISALPHA(c) ((((c)>='a') && ((c)<='z')) || (((c)>='A') && ((c)<='Z')))
23 /*}}}*/
24
25 /* types */ /*{{{*/
/* One entry of the singly linked list of link targets collected by dehtml(). */
struct Url
{
  char *url;         /* heap-allocated copy of the href value */
  int number;        /* reference number printed as "[number]" */
  struct Url *next;  /* following entry, or a null pointer at the end */
};
32 /*}}}*/
33 /* variables */ /*{{{*/
/* parser state, reset by dehtml() for every input */
static int intitle;          /* set between <title> and </title> */
static int inlist;           /* nesting depth of <dl>, <ul> and <ol> */
static int inheader;         /* nesting depth of <h1>, <h2>, ... */
static int inpre;            /* set between <pre> and </pre> */
/* word-list state, maintained by wordputchar() */
static int inword;           /* the last character emitted belonged to a word */
static int inwhite;          /* the separator after the last word was emitted */
/* output modes, selected on the command line */
static int words=0;          /* -w: output a word list */
static int skipheaders=0;    /* -s: do not output headers */
static int skiplists=0;      /* -l: do not output lists */
static int pretty=0;         /* -p: pretty printed output */
static int first=1;
/* position in the input, set by dehtml() */
static const char *file;     /* name of the current input */
static int line;             /* current line number */
39 /*}}}*/
40
/*
 * Compare two strings ignoring case.
 *
 * Returns zero if s and t are equal after folding each character with
 * tolower(), otherwise the difference of the first pair of folded
 * characters that differ (negative if s sorts first, positive if t does).
 */
static int mystrcasecmp(const char *s, const char *t) /*{{{*/
{
  int diff;

  for (;;)
  {
    /* tolower() is only defined for unsigned char values and EOF */
    diff=tolower((unsigned char)*s)-tolower((unsigned char)*t);
    if (diff!=0 || *s=='\0') return diff;
    ++s;
    ++t;
  }
}
48 /*}}}*/
49 static int condputchar(int c) /*{{{*/
50 {
51 static int nls=2;
52 static int lastc='\n';
53
54 if (words || ((!skiplists || !inlist) && (!skipheaders || (!inheader && !intitle))))
55 {
56 if (pretty)
57 {
58 if (c=='\n')
59 {
60 if (++nls>2) return c;
61 }
62 else
63 {
64 nls=0;
65 }
66 if (c=='\n') ++line;
67 lastc=c;
68 return putchar(c);
69 }
70 else return putchar(c);
71 }
72 else if (!pretty && c=='\n') return putchar(c);
73 }
74 /*}}}*/
75 static void wordputchar(int c) /*{{{*/
76 {
77 if (words)
78 {
79 if (isalpha(c) || c=='_') { inword=1; inwhite=0; condputchar(c); }
80 else if (inword && !inwhite) { inword=0; inwhite=1; condputchar('\n'); }
81 }
82 else condputchar(c);
83 }
84 /*}}}*/
85 static void dehtml(FILE *fp, const char *fileName) /*{{{*/
86 {
87 int c;
88 char href[512];
89 struct Url *urls,**lasturl;
90
91 line=1;
92 file=fileName;
93 intitle=inlist=inheader=inpre=0;
94 href[0]='\0';
95 urls=(struct Url*)0;
96 lasturl=&urls;
97 while ((c=getc(fp))!=EOF)
98 {
99 if (c=='<') /* tag */ /*{{{*/
100 {
101 char tag[sizeof("/address")];
102 char attribute[sizeof("href")];
103 int i;
104
105 if (inword) inword=0;
106 /* tag name */ /*{{{*/
107 i=0;
108 while ((c=getc(fp))!=EOF && c!='>' && c!=' ' && c!='\n')
109 {
110 if (i<sizeof(tag)-1) tag[i++]=tolower(c);
111 }
112 tag[i]='\0';
113 if (c=='\n')
114 {
115 ++line;
116 condputchar('\n');
117 }
118 if (i && i<sizeof(tag))
119 {
120 if (mystrcasecmp(tag,"p")==0 || mystrcasecmp(tag,"hr")==0) /*{{{*/
121 {
122 if (!words && pretty) { condputchar('\n'); condputchar('\n'); }
123 }
124 /*}}}*/
125 else if (mystrcasecmp(tag,"br")==0) /*{{{*/
126 {
127 if (!words && pretty) condputchar('\n');
128 }
129 /*}}}*/
130 else if (mystrcasecmp(tag,"title")==0) intitle=1;
131 else if (mystrcasecmp(tag,"/title")==0) intitle=0;
132 else if (tolower(tag[0])=='h' && isdigit(tag[1]) && tag[2]=='\0') /*{{{*/
133 {
134 if (!words && pretty)
135 {
136 condputchar('\n');
137 condputchar('\n');
138 }
139 ++inheader;
140 }
141 /*}}}*/
142 else if (tag[0]=='/' && tolower(tag[1])=='h' && isdigit(tag[2]) && tag[3]=='\0') /*{{{*/
143 {
144 if (!words && pretty)
145 {
146 condputchar('\n');
147 condputchar('\n');
148 }
149 if (inheader) --inheader;
150 }
151 /*}}}*/
152 else if (mystrcasecmp(tag,"pre")==0) inpre=1;
153 else if (mystrcasecmp(tag,"/pre")==0) inpre=0;
154 else if (mystrcasecmp(tag,"dl")==0) ++inlist;
155 else if (mystrcasecmp(tag,"/dl")==0) { if (inlist) --inlist; }
156 else if (mystrcasecmp(tag,"ul")==0) ++inlist;
157 else if (mystrcasecmp(tag,"/ul")==0) { if (inlist) --inlist; }
158 else if (mystrcasecmp(tag,"ol")==0) ++inlist;
159 else if (mystrcasecmp(tag,"/ol")==0) { if (inlist) --inlist; }
160 else if (mystrcasecmp(tag,"/a")==0 && href[0]) /*{{{*/
161 {
162 struct Url *u;
163 char n[32],*s;
164 int number=0;
165
166 for (u=urls; u && strcmp(u->url,href); u=u->next) number=u->number;
167 if (u==(struct Url*)0)
168 {
169 u=malloc(sizeof(struct Url));
170 u->number=number+1;
171 u->url=strcpy(malloc(strlen(href)+1),href);
172 u->next=(struct Url*)0;
173 *lasturl=u;
174 lasturl=&u->next;
175 }
176 snprintf(n,sizeof(n)," [%d]",u->number);
177 for (s=n; *s; ++s) wordputchar(*s);
178 href[0]='\0';
179 }
180 /*}}}*/
181 }
182 /*}}}*/
183 if (c!=EOF && c!='>') /* tag attributes */ /*{{{*/
184 {
185 enum { EMPTY, ATTRIBUTE, EQ, VALUE, QUOTEDVALUE } state=EMPTY;
186 int output_value=0;
187 int a_href=0;
188
189 do
190 {
191 c=getc(fp);
192 if (c=='\n')
193 {
194 ++line;
195 condputchar('\n');
196 }
197 switch (state)
198 {
199 case EMPTY: /*{{{*/
200 {
201 if (ISALPHA(c))
202 {
203 state=ATTRIBUTE;
204 i=0;
205 attribute[i++]=c;
206 }
207 break;
208 }
209 /*}}}*/
210 case ATTRIBUTE: /*{{{*/
211 {
212 if (ISALPHA(c))
213 {
214 if (i<sizeof(attribute)-1) attribute[i++]=tolower(c);
215 }
216 else
217 {
218 attribute[i]='\0';
219 if (c=='=')
220 {
221 state=EQ;
222 a_href=(strcmp(tag,"a")==0) && (strcmp(attribute,"href")==0);
223 output_value=(strcmp(tag,"img")==0) && (strcmp(attribute,"alt")==0);
224 }
225 else state=EMPTY;
226 }
227 break;
228 }
229 /*}}}*/
230 case EQ: /*{{{*/
231 {
232 i=0;
233 if (c=='"') state=QUOTEDVALUE;
234 else
235 {
236 state=VALUE;
237 if (output_value) wordputchar(c);
238 }
239 break;
240 }
241 /*}}}*/
242 case QUOTEDVALUE: /*{{{*/
243 {
244 if (c=='"')
245 {
246 if (a_href)
247 {
248 href[i]='\0';
249 a_href=0;
250 }
251 output_value=0;
252 state=EMPTY;
253 }
254 else if (a_href)
255 {
256 if (i<sizeof(href)-1) href[i++]=c;
257 }
258 else if (output_value) condputchar(c);
259 break;
260 }
261 /*}}}*/
262 case VALUE: /*{{{*/
263 {
264 if (c==' ')
265 {
266 if (a_href)
267 {
268 a_href=0;
269 href[i]='\0';
270 }
271 output_value=0;
272 state=EMPTY;
273 }
274 else if (a_href)
275 {
276 if (i<sizeof(href)-1) href[i++]=c;
277 }
278 else if (output_value) wordputchar(c);
279 break;
280 }
281 /*}}}*/
282 }
283 } while (c!=EOF && c!='>');
284 }
285 /*}}}*/
286 }
287 /*}}}*/
288 else if (c=='&') /* entity */ /*{{{*/
289 {
290 char entity[73];
291 int i=0;
292
293 if ((c=getc(fp))=='#')
294 {
295 c=getc(fp);
296 if (isdigit(c))
297 {
298 int numeric=c-'0';
299
300 while ((c=getc(fp))!=EOF && isdigit(c))
301 {
302 numeric=numeric*10+(c-'0');
303 }
304 wordputchar(numeric);
305 if (c!=';') wordputchar(c);
306 }
307 else
308 {
309 wordputchar('&');
310 wordputchar('#');
311 }
312 }
313 else if (ISALPHA(c) || isdigit(c) || c=='.' || c=='-')
314 {
315 /* variables */ /*{{{*/
316 struct
317 {
318 const char *name;
319 char value;
320 }
321 const *eptr,
322 entities[]=
323 {
324 { "gt", '>' },
325 { "lt", '<' },
326 { "amp", '&' },
327 { "quot", '"' },
328 { "AElig", 'Æ' },
329 { "Aacute", 'Á' },
330 { "Acirc", 'Â' },
331 { "Agrave", 'À' },
332 { "Aring", 'Å' },
333 { "Atilde", 'Ã' },
334 { "Auml", 'Ä' },
335 { "Ccedil", 'Ç' },
336 { "ETH", 'Ð' },
337 { "Eacute", 'É' },
338 { "Ecirc", 'Ê' },
339 { "Egrave", 'È' },
340 { "Euml", 'Ë' },
341 { "Iacute", 'Í' },
342 { "Icirc", 'Î' },
343 { "Igrave", 'Ì' },
344 { "Iuml", 'Ï' },
345 { "Ntilde", 'Ñ' },
346 { "Oacute", 'Ó' },
347 { "Ocirc", 'Ô' },
348 { "Ograve", 'Ò' },
349 { "Oslash", 'Ø' },
350 { "Otilde", 'Õ' },
351 { "Ouml", 'Ö' },
352 { "THORN", 'Þ' },
353 { "Uacute", 'Ú' },
354 { "Ucirc", 'Û' },
355 { "Ugrave", 'Ù' },
356 { "Uuml", 'Ü' },
357 { "Yacute", 'Ý' },
358 { "aacute", 'á' },
359 { "acirc", 'â' },
360 { "aelig", 'æ' },
361 { "agrave", 'à' },
362 { "aring", 'å' },
363 { "atilde", 'ã' },
364 { "auml", 'ä' },
365 { "ccedil", 'ç' },
366 { "eacute", 'é' },
367 { "ecirc", 'ê' },
368 { "egrave", 'è' },
369 { "eth", 'ð' },
370 { "euml", 'ë' },
371 { "iacute", 'í' },
372 { "icirc", 'î' },
373 { "igrave", 'ì' },
374 { "iuml", 'ï' },
375 { "nbsp", ' ' },
376 { "ntilde", 'ñ' },
377 { "oacute", 'ó' },
378 { "ocirc", 'ô' },
379 { "ograve", 'ò' },
380 { "oslash", 'ø' },
381 { "otilde", 'õ' },
382 { "ouml", 'ö' },
383 { "szlig", 'ß' },
384 { "thorn", 'þ' },
385 { "uacute", 'ú' },
386 { "ucirc", 'û' },
387 { "ugrave", 'ù' },
388 { "uuml", 'ü' },
389 { "yacute", 'ý' },
390 { "yuml", 'ÿ' }
391 };
392 /*}}}*/
393
394 entity[i++]=c;
395 while ((c=getc(fp))!=EOF && (ISALPHA(c) || isdigit(c) || c=='.' || c=='-'))
396 {
397 if (i<sizeof(entity)-1) entity[i++]=c;
398 }
399 entity[i]='\0';
400 for (eptr=entities; eptr<entities+sizeof(entities)/sizeof(entities[0]); ++eptr)
401 {
402 if (strcmp(eptr->name,entity)==0)
403 {
404 wordputchar(eptr->value);
405 if (c!=';') wordputchar(c);
406 goto continueLoop;
407 }
408 else if (strcmp(entity,"hellip")==0)
409 {
410 wordputchar('.');
411 wordputchar('.');
412 wordputchar('.');
413 goto continueLoop;
414 }
415 }
416 wordputchar('&');
417 for (i=0; entity[i]; ++i) wordputchar(entity[i]);
418 wordputchar(c);
419 }
420 else
421 {
422 wordputchar('&');
423 wordputchar(c);
424 }
425 }
426 /*}}}*/
427 else if (c=='\n') /* new line */ /*{{{*/
428 {
429 ++line;
430 wordputchar(c);
431 }
432 /*}}}*/
433 else wordputchar(c);
434 continueLoop:;
435 }
436 wordputchar('\n');
437 while (urls)
438 {
439 char n[32],*s;
440 struct Url *f;
441
442 snprintf(n,sizeof(n),"[%d] ",urls->number);
443 for (s=n; *s; ++s) wordputchar(*s);
444 for (s=urls->url; *s; ++s) wordputchar(*s);
445 wordputchar('\n');
446 free(urls->url);
447 f=urls;
448 urls=urls->next;
449 free(f);
450 }
451 }
452 /*}}}*/
453
454 int main(int argc, char *argv[]) /*{{{*/
455 {
456 /* variable declarations */ /*{{{*/
457 FILE *in;
458 int usage=0;
459 int c;
460 static struct option lopts[]=
461 {
462 { "word-list", no_argument, 0, 'w' },
463 { "skip-headers", no_argument, 0, 's' },
464 { "skip-lists", no_argument, 0, 'l' },
465 { "prettyprint", no_argument, 0, 'p' },
466 { "urls", no_argument, 0, 'u' },
467 { "help", no_argument, 0, 'h' },
468 { "version", no_argument, 0, 'v' },
469 { (const char*)0, 0, 0, '\0' }
470 };
471 /*}}}*/
472
473 setlocale(LC_MESSAGES,"");
474 setlocale(LC_CTYPE,"");
475 #ifdef HAVE_GETTEXT
476 bindtextdomain("dehtml",LOCALEDIR);
477 textdomain("dehtml");
478 #endif
479 /* parse arguments */ /*{{{*/
480 while ((c=getopt_long(argc,argv,"wslp?h",lopts,(int*)0))!=EOF) switch(c)
481 {
482 case 'w': words=1; break;
483 case 's': skipheaders=1; break;
484 case 'l': skiplists=1; break;
485 case 'p': pretty=1; break;
486 case 'h': usage=2; break;
487 case 'v': printf("dehtml " VERSION "\n"); exit(0);
488 default: usage=1;
489 }
490 if (usage==1)
491 {
492 fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [file ...]\n"));
493 fprintf(stderr,"\n");
494 fprintf(stderr,_("Try `dehtml -h' or `dehtml --help' for more information.\n"));
495 exit(1);
496 }
497 if (usage==2)
498 {
499 fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [file ...]\n"));
500 fprintf(stderr,"\n");
501 fprintf(stderr,_("Remove HTML constructs from documents.\n"));
502 fprintf(stderr,"\n");
503 fprintf(stderr,_("-w, --word-list output a word list\n"));
504 fprintf(stderr,_("-s, --skip-headers do not output headers\n"));
505 fprintf(stderr,_("-l, --skip-lists do not output lists\n"));
506 fprintf(stderr,_("-p, --pretty-print pretty printed output\n"));
507 fprintf(stderr,_("-h, --help display this help and exit\n"));
508 fprintf(stderr,_(" --version display version and exit\n"));
509 fprintf(stderr,"\n");
510 fprintf(stderr,_("Report bugs to <michael@moria.de>.\n"));
511 exit(0);
512 }
513 /*}}}*/
514 /* dehtml stdin or files, if any */ /*{{{*/
515 if (optind<argc) while (optind<argc)
516 {
517 if ((in=fopen(argv[optind],"r"))==(FILE*)0)
518 {
519 fprintf(stderr,_("dehtml: Opening `%s' failed (%s).\n"),argv[optind],strerror(errno));
520 exit(1);
521 }
522 dehtml(in,argv[optind]);
523 fclose(in);
524 ++optind;
525 }
526 else dehtml(stdin,(const char*)0);
527 if (fclose(stdout)==-1)
528 {
529 fprintf(stderr,_("dehtml: Closing standard output failed (%s).\n"),strerror(errno));
530 return 1;
531 }
532 /*}}}*/
533 return 0;
534 }
535 /*}}}*/