"Fossies" - the Fresh Open Source Software Archive 
Member "dehtml-1.8/dehtml.c" (11 Jan 2011, 13841 Bytes) of package /linux/www/old/dehtml-1.8.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
1 /* #includes */ /*{{{C}}}*//*{{{*/
2 #include "config.h"
3
4 #include <sys/types.h>
5 #include <ctype.h>
6 #include <errno.h>
7 #ifdef HAVE_GETTEXT
8 #include <libintl.h>
9 #define _(String) gettext(String)
10 #else
11 #define _(String) String
12 #endif
13 #include <locale.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <unistd.h>
18
19 #include "getopt.h"
20 /*}}}*/
/* #defines */ /*{{{*/
/*
 * ISALPHA(c): non-zero when c lies in 'a'..'z' or 'A'..'Z'.  Unlike
 * isalpha() it does not consult the locale and accepts any int value,
 * including EOF.  The argument is fully parenthesised so that compound
 * expressions work, but it is still evaluated up to four times: do not
 * pass expressions with side effects.
 */
#define ISALPHA(c) ((((c)>='a') && ((c)<='z')) || (((c)>='A') && ((c)<='Z')))
/*}}}*/
24
/* types */ /*{{{*/
/*
 * One entry of the singly linked list of link targets collected while a
 * document is processed.  Entries are appended in order of first
 * appearance and printed as a numbered reference list at the end.
 */
struct Url
{
  char *url;          /* heap-allocated copy of the link target */
  int number;         /* reference number shown as " [number]" in the text */
  struct Url *next;   /* following entry, or a null pointer at the tail */
};
/*}}}*/
/* variables */ /*{{{*/
/* Parser state, reset by dehtml() at the start of every document. */
static int intitle;        /* 1 between <title> and </title> */
static int inlist;         /* nesting depth of <dl>, <ul> and <ol> */
static int inheader;       /* nesting depth of <h1>..<h9> style tags */
static int inpre;          /* 1 between <pre> and </pre> */

/* Word-list mode state maintained by wordputchar(). */
static int inword;         /* the last character written belonged to a word */
static int inwhite;        /* a word separator has already been written */

/* Settings selected on the command line. */
static int words=0;        /* -w: output a word list */
static int skipheaders=0;  /* -s: suppress title and header text */
static int skiplists=0;    /* -l: suppress list text */
static int pretty=0;       /* -p: pretty printed output */

static int first=1;        /* not referenced by the code visible in this file */

/* Position information; assigned by the code in this file but not read by it. */
static const char *file;   /* name passed to dehtml(), null for stdin */
static int line;           /* line counter */
/*}}}*/
40
/*
 * Compare two NUL-terminated strings without regard to case.
 *
 * Returns zero when the strings are equal, otherwise the difference
 * between the lower-cased values of the first pair of characters that
 * differ (negative when s sorts before t, positive otherwise).
 */
static int mystrcasecmp(const char *s, const char *t) /*{{{*/
{
  int x;

  /*
   * The <ctype.h> functions are only defined for EOF and values that fit
   * an unsigned char; a plain char may be negative for bytes >= 0x80, so
   * convert before calling tolower().
   */
  while ((x=tolower((unsigned char)*s)-tolower((unsigned char)*t))==0 && *s) { ++s; ++t; }
  return x;
}
/*}}}*/
49 static int condputchar(int c) /*{{{*/
50 {
51 static int nls=2;
52 static int lastc='\n';
53
54 if (words || ((!skiplists || !inlist) && (!skipheaders || (!inheader && !intitle))))
55 {
56 if (pretty)
57 {
58 if (c=='\n')
59 {
60 if (++nls>2) return c;
61 }
62 else
63 {
64 nls=0;
65 }
66 if (c=='\n') ++line;
67 lastc=c;
68 return putchar(c);
69 }
70 else return putchar(c);
71 }
72 else if (!pretty && c=='\n') return putchar(c);
73 }
74 /*}}}*/
75 static void wordputchar(int c) /*{{{*/
76 {
77 if (words)
78 {
79 if (isalpha(c) || c=='_') { inword=1; inwhite=0; condputchar(c); }
80 else if (inword && !inwhite) { inword=0; inwhite=1; condputchar('\n'); }
81 }
82 else condputchar(c);
83 }
84 /*}}}*/
85 static void dehtml(FILE *fp, const char *fileName) /*{{{*/
86 {
87 int c;
88 char href[512];
89 struct Url *urls,**lasturl;
90
91 line=1;
92 file=fileName;
93 intitle=inlist=inheader=inpre=0;
94 href[0]='\0';
95 urls=(struct Url*)0;
96 lasturl=&urls;
97 while ((c=getc(fp))!=EOF)
98 {
99 if (c=='<') /* tag */ /*{{{*/
100 {
101 char tag[sizeof("/address")];
102 char attribute[sizeof("href")];
103 int i;
104
105 if (inword) inword=0;
106 /* tag name */ /*{{{*/
107 i=0;
108 while ((c=getc(fp))!=EOF && c!='>' && c!=' ' && c!='\n')
109 {
110 if (i<sizeof(tag)-1) tag[i++]=tolower(c);
111 }
112 tag[i]='\0';
113 if (c=='\n')
114 {
115 ++line;
116 condputchar('\n');
117 }
118 if (i && i<sizeof(tag))
119 {
120 if (mystrcasecmp(tag,"p")==0 || mystrcasecmp(tag,"hr")==0) /*{{{*/
121 {
122 if (!words && pretty) { condputchar('\n'); condputchar('\n'); }
123 }
124 /*}}}*/
125 else if (mystrcasecmp(tag,"br")==0) /*{{{*/
126 {
127 if (!words && pretty) condputchar('\n');
128 }
129 /*}}}*/
130 else if (mystrcasecmp(tag,"title")==0) intitle=1;
131 else if (mystrcasecmp(tag,"/title")==0) intitle=0;
132 else if (tolower(tag[0])=='h' && isdigit(tag[1]) && tag[2]=='\0') /*{{{*/
133 {
134 if (!words && pretty)
135 {
136 condputchar('\n');
137 condputchar('\n');
138 }
139 ++inheader;
140 }
141 /*}}}*/
142 else if (tag[0]=='/' && tolower(tag[1])=='h' && isdigit(tag[2]) && tag[3]=='\0') /*{{{*/
143 {
144 if (!words && pretty)
145 {
146 condputchar('\n');
147 condputchar('\n');
148 }
149 if (inheader) --inheader;
150 }
151 /*}}}*/
152 else if (mystrcasecmp(tag,"pre")==0) inpre=1;
153 else if (mystrcasecmp(tag,"/pre")==0) inpre=0;
154 else if (mystrcasecmp(tag,"dl")==0) ++inlist;
155 else if (mystrcasecmp(tag,"/dl")==0) { if (inlist) --inlist; }
156 else if (mystrcasecmp(tag,"ul")==0) ++inlist;
157 else if (mystrcasecmp(tag,"/ul")==0) { if (inlist) --inlist; }
158 else if (mystrcasecmp(tag,"ol")==0) ++inlist;
159 else if (mystrcasecmp(tag,"/ol")==0) { if (inlist) --inlist; }
160 else if (mystrcasecmp(tag,"/a")==0 && href[0]) /*{{{*/
161 {
162 struct Url *u;
163 char n[32],*s;
164 int number=0;
165
166 for (u=urls; u && strcmp(u->url,href); u=u->next) number=u->number;
167 if (u==(struct Url*)0)
168 {
169 u=malloc(sizeof(struct Url));
170 u->number=number+1;
171 u->url=strcpy(malloc(strlen(href)+1),href);
172 u->next=(struct Url*)0;
173 *lasturl=u;
174 lasturl=&u->next;
175 }
176 snprintf(n,sizeof(n)," [%d]",u->number);
177 for (s=n; *s; ++s) wordputchar(*s);
178 href[0]='\0';
179 }
180 /*}}}*/
181 }
182 /*}}}*/
183 if (c!=EOF && c!='>') /* tag attributes */ /*{{{*/
184 {
185 enum { EMPTY, ATTRIBUTE, EQ, VALUE, QUOTEDVALUE } state=EMPTY;
186 int output_value=0;
187 int a_href=0;
188
189 do
190 {
191 c=getc(fp);
192 if (c=='\n')
193 {
194 ++line;
195 condputchar('\n');
196 }
197 switch (state)
198 {
199 case EMPTY: /*{{{*/
200 {
201 if (ISALPHA(c))
202 {
203 state=ATTRIBUTE;
204 i=0;
205 attribute[i++]=c;
206 }
207 break;
208 }
209 /*}}}*/
210 case ATTRIBUTE: /*{{{*/
211 {
212 if (ISALPHA(c))
213 {
214 if (i<sizeof(attribute)-1) attribute[i++]=tolower(c);
215 }
216 else
217 {
218 attribute[i]='\0';
219 if (c=='=')
220 {
221 state=EQ;
222 a_href=(strcmp(tag,"a")==0) && (strcmp(attribute,"href")==0);
223 output_value=(strcmp(tag,"img")==0) && (strcmp(attribute,"alt")==0);
224 }
225 else state=EMPTY;
226 }
227 break;
228 }
229 /*}}}*/
230 case EQ: /*{{{*/
231 {
232 i=0;
233 if (c=='"') state=QUOTEDVALUE;
234 else
235 {
236 state=VALUE;
237 if (output_value) wordputchar(c);
238 }
239 break;
240 }
241 /*}}}*/
242 case QUOTEDVALUE: /*{{{*/
243 {
244 if (c=='"')
245 {
246 if (a_href)
247 {
248 href[i]='\0';
249 a_href=0;
250 }
251 output_value=0;
252 state=EMPTY;
253 }
254 else if (a_href)
255 {
256 if (i<sizeof(href)-1) href[i++]=c;
257 }
258 else if (output_value) condputchar(c);
259 break;
260 }
261 /*}}}*/
262 case VALUE: /*{{{*/
263 {
264 if (c==' ')
265 {
266 if (a_href)
267 {
268 a_href=0;
269 href[i]='\0';
270 }
271 output_value=0;
272 state=EMPTY;
273 }
274 else if (a_href)
275 {
276 if (i<sizeof(href)-1) href[i++]=c;
277 }
278 else if (output_value) wordputchar(c);
279 break;
280 }
281 /*}}}*/
282 }
283 } while (c!=EOF && c!='>');
284 }
285 /*}}}*/
286 }
287 /*}}}*/
288 else if (c=='&') /* entity */ /*{{{*/
289 {
290 char entity[73];
291 int i=0;
292
293 if ((c=getc(fp))=='#')
294 {
295 c=getc(fp);
296 if (isdigit(c))
297 {
298 int numeric=c-'0';
299
300 while ((c=getc(fp))!=EOF && isdigit(c))
301 {
302 numeric=numeric*10+(c-'0');
303 }
304 wordputchar(numeric);
305 if (c!=';') wordputchar(c);
306 }
307 else
308 {
309 wordputchar('&');
310 wordputchar('#');
311 }
312 }
313 else if (ISALPHA(c) || isdigit(c) || c=='.' || c=='-')
314 {
315 /* variables */ /*{{{*/
316 struct
317 {
318 const char *name;
319 char value;
320 }
321 const *eptr,
322 entities[]=
323 {
324 { "gt", '>' },
325 { "lt", '<' },
326 { "amp", '&' },
327 { "quot", '"' },
328 { "AElig", 'Æ' },
329 { "Aacute", 'Á' },
330 { "Acirc", 'Â' },
331 { "Agrave", 'À' },
332 { "Aring", 'Å' },
333 { "Atilde", 'Ã' },
334 { "Auml", 'Ä' },
335 { "Ccedil", 'Ç' },
336 { "ETH", 'Ð' },
337 { "Eacute", 'É' },
338 { "Ecirc", 'Ê' },
339 { "Egrave", 'È' },
340 { "Euml", 'Ë' },
341 { "Iacute", 'Í' },
342 { "Icirc", 'Î' },
343 { "Igrave", 'Ì' },
344 { "Iuml", 'Ï' },
345 { "Ntilde", 'Ñ' },
346 { "Oacute", 'Ó' },
347 { "Ocirc", 'Ô' },
348 { "Ograve", 'Ò' },
349 { "Oslash", 'Ø' },
350 { "Otilde", 'Õ' },
351 { "Ouml", 'Ö' },
352 { "THORN", 'Þ' },
353 { "Uacute", 'Ú' },
354 { "Ucirc", 'Û' },
355 { "Ugrave", 'Ù' },
356 { "Uuml", 'Ü' },
357 { "Yacute", 'Ý' },
358 { "aacute", 'á' },
359 { "acirc", 'â' },
360 { "aelig", 'æ' },
361 { "agrave", 'à' },
362 { "aring", 'å' },
363 { "atilde", 'ã' },
364 { "auml", 'ä' },
365 { "ccedil", 'ç' },
366 { "eacute", 'é' },
367 { "ecirc", 'ê' },
368 { "egrave", 'è' },
369 { "eth", 'ð' },
370 { "euml", 'ë' },
371 { "iacute", 'í' },
372 { "icirc", 'î' },
373 { "igrave", 'ì' },
374 { "iuml", 'ï' },
375 { "nbsp", ' ' },
376 { "ntilde", 'ñ' },
377 { "oacute", 'ó' },
378 { "ocirc", 'ô' },
379 { "ograve", 'ò' },
380 { "oslash", 'ø' },
381 { "otilde", 'õ' },
382 { "ouml", 'ö' },
383 { "szlig", 'ß' },
384 { "thorn", 'þ' },
385 { "uacute", 'ú' },
386 { "ucirc", 'û' },
387 { "ugrave", 'ù' },
388 { "uuml", 'ü' },
389 { "yacute", 'ý' },
390 { "yuml", 'ÿ' }
391 };
392 /*}}}*/
393
394 entity[i++]=c;
395 while ((c=getc(fp))!=EOF && (ISALPHA(c) || isdigit(c) || c=='.' || c=='-'))
396 {
397 if (i<sizeof(entity)-1) entity[i++]=c;
398 }
399 entity[i]='\0';
400 for (eptr=entities; eptr<entities+sizeof(entities)/sizeof(entities[0]); ++eptr)
401 {
402 if (strcmp(eptr->name,entity)==0)
403 {
404 wordputchar(eptr->value);
405 if (c!=';') wordputchar(c);
406 goto continueLoop;
407 }
408 else if (strcmp(entity,"hellip")==0)
409 {
410 wordputchar('.');
411 wordputchar('.');
412 wordputchar('.');
413 goto continueLoop;
414 }
415 }
416 wordputchar('&');
417 for (i=0; entity[i]; ++i) wordputchar(entity[i]);
418 wordputchar(c);
419 }
420 else
421 {
422 wordputchar('&');
423 wordputchar(c);
424 }
425 }
426 /*}}}*/
427 else if (c=='\n') /* new line */ /*{{{*/
428 {
429 ++line;
430 wordputchar(c);
431 }
432 /*}}}*/
433 else wordputchar(c);
434 continueLoop:;
435 }
436 wordputchar('\n');
437 while (urls)
438 {
439 char n[32],*s;
440 struct Url *f;
441
442 snprintf(n,sizeof(n),"[%d] ",urls->number);
443 for (s=n; *s; ++s) wordputchar(*s);
444 for (s=urls->url; *s; ++s) wordputchar(*s);
445 wordputchar('\n');
446 free(urls->url);
447 f=urls;
448 urls=urls->next;
449 free(f);
450 }
451 }
452 /*}}}*/
453
454 int main(int argc, char *argv[]) /*{{{*/
455 {
456 /* variable declarations */ /*{{{*/
457 FILE *in;
458 int usage=0;
459 int c;
460 static struct option lopts[]=
461 {
462 { "word-list", no_argument, 0, 'w' },
463 { "skip-headers", no_argument, 0, 's' },
464 { "skip-lists", no_argument, 0, 'l' },
465 { "prettyprint", no_argument, 0, 'p' },
466 { "urls", no_argument, 0, 'u' },
467 { "help", no_argument, 0, 'h' },
468 { "version", no_argument, 0, 'v' },
469 { (const char*)0, 0, 0, '\0' }
470 };
471 /*}}}*/
472
473 setlocale(LC_MESSAGES,"");
474 setlocale(LC_CTYPE,"");
475 #ifdef HAVE_GETTEXT
476 bindtextdomain("dehtml",LOCALEDIR);
477 textdomain("dehtml");
478 #endif
479 /* parse arguments */ /*{{{*/
480 while ((c=getopt_long(argc,argv,"wslp?h",lopts,(int*)0))!=EOF) switch(c)
481 {
482 case 'w': words=1; break;
483 case 's': skipheaders=1; break;
484 case 'l': skiplists=1; break;
485 case 'p': pretty=1; break;
486 case 'h': usage=2; break;
487 case 'v': printf("dehtml " VERSION "\n"); exit(0);
488 default: usage=1;
489 }
490 if (usage==1)
491 {
492 fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [file ...]\n"));
493 fprintf(stderr,"\n");
494 fprintf(stderr,_("Try `dehtml -h' or `dehtml --help' for more information.\n"));
495 exit(1);
496 }
497 if (usage==2)
498 {
499 fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [file ...]\n"));
500 fprintf(stderr,"\n");
501 fprintf(stderr,_("Remove HTML constructs from documents.\n"));
502 fprintf(stderr,"\n");
503 fprintf(stderr,_("-w, --word-list output a word list\n"));
504 fprintf(stderr,_("-s, --skip-headers do not output headers\n"));
505 fprintf(stderr,_("-l, --skip-lists do not output lists\n"));
506 fprintf(stderr,_("-p, --pretty-print pretty printed output\n"));
507 fprintf(stderr,_("-h, --help display this help and exit\n"));
508 fprintf(stderr,_(" --version display version and exit\n"));
509 fprintf(stderr,"\n");
510 fprintf(stderr,_("Report bugs to <michael@moria.de>.\n"));
511 exit(0);
512 }
513 /*}}}*/
514 /* dehtml stdin or files, if any */ /*{{{*/
515 if (optind<argc) while (optind<argc)
516 {
517 if ((in=fopen(argv[optind],"r"))==(FILE*)0)
518 {
519 fprintf(stderr,_("dehtml: Opening `%s' failed (%s).\n"),argv[optind],strerror(errno));
520 exit(1);
521 }
522 dehtml(in,argv[optind]);
523 fclose(in);
524 ++optind;
525 }
526 else dehtml(stdin,(const char*)0);
527 if (fclose(stdout)==-1)
528 {
529 fprintf(stderr,_("dehtml: Closing standard output failed (%s).\n"),strerror(errno));
530 return 1;
531 }
532 /*}}}*/
533 return 0;
534 }
535 /*}}}*/