"Fossies" - the Fresh Open Source Software Archive

Member "dehtml-1.8/dehtml.c" (11 Jan 2011, 13841 Bytes) of package /linux/www/old/dehtml-1.8.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "dehtml.c", see the Fossies "Dox" file reference documentation.

    1 /* #includes */ /*{{{C}}}*//*{{{*/
    2 #include "config.h"
    3 
    4 #include <sys/types.h>
    5 #include <ctype.h>
    6 #include <errno.h>
    7 #ifdef HAVE_GETTEXT
    8 #include <libintl.h>
    9 #define _(String) gettext(String)
   10 #else
   11 #define _(String) String
   12 #endif
   13 #include <locale.h>
   14 #include <stdio.h>
   15 #include <stdlib.h>
   16 #include <string.h>
   17 #include <unistd.h>
   18 
   19 #include "getopt.h"
   20 /*}}}*/
   21 /* #defines */ /*{{{*/
   22 #define ISALPHA(c) ((c>='a' && c<='z') || (c>='A' && c<='Z'))
   23 /*}}}*/
   24 
   25 /* types */ /*{{{*/
   26 struct Url
   27 {
   28   char *url;
   29   int number;
   30   struct Url *next;
   31 };
   32 /*}}}*/
   33 /* variables */ /*{{{*/
   34 static int intitle,inlist,inheader,inpre,inword,inwhite;
   35 static int words=0,skipheaders=0,skiplists=0,pretty=0;
   36 static int first=1;
   37 static const char *file;
   38 static int line;
   39 /*}}}*/
   40 
   41 static int mystrcasecmp(const char *s, const char *t) /*{{{*/
   42 {
   43   int x;
   44 
   45   while ((x=tolower(*s)-tolower(*t))==0 && *s) { ++s; ++t; }
   46   return x;
   47 }
   48 /*}}}*/
   49 static int condputchar(int c) /*{{{*/
   50 {
   51   static int nls=2;
   52   static int lastc='\n';
   53 
   54   if (words || ((!skiplists || !inlist) && (!skipheaders || (!inheader && !intitle))))
   55   {
   56     if (pretty)
   57     {
   58       if (c=='\n')
   59       {
   60         if (++nls>2) return c;
   61       }
   62       else
   63       {
   64         nls=0;
   65       }
   66       if (c=='\n') ++line;
   67       lastc=c;
   68       return putchar(c);
   69     }
   70     else return putchar(c);
   71   }
   72   else if (!pretty && c=='\n') return putchar(c);
   73 }
   74 /*}}}*/
   75 static void wordputchar(int c) /*{{{*/
   76 {
   77   if (words)
   78   {
   79     if (isalpha(c) || c=='_') { inword=1; inwhite=0; condputchar(c); }
   80     else if (inword && !inwhite) { inword=0; inwhite=1; condputchar('\n'); }
   81   }
   82   else condputchar(c);
   83 }
   84 /*}}}*/
   85 static void dehtml(FILE *fp, const char *fileName) /*{{{*/
   86 {
   87   int c;
   88   char href[512];
   89   struct Url *urls,**lasturl;
   90 
   91   line=1;
   92   file=fileName;
   93   intitle=inlist=inheader=inpre=0;
   94   href[0]='\0';
   95   urls=(struct Url*)0;
   96   lasturl=&urls;
   97   while ((c=getc(fp))!=EOF)
   98   {
   99     if (c=='<') /* tag */ /*{{{*/
  100     {
  101       char tag[sizeof("/address")];
  102       char attribute[sizeof("href")];
  103       int i;
  104 
  105       if (inword) inword=0;
  106       /* tag name */ /*{{{*/
  107       i=0;
  108       while ((c=getc(fp))!=EOF && c!='>' && c!=' ' && c!='\n')
  109       {
  110         if (i<sizeof(tag)-1) tag[i++]=tolower(c);
  111       }
  112       tag[i]='\0';
  113       if (c=='\n')
  114       {
  115         ++line;
  116         condputchar('\n');
  117       }
  118       if (i && i<sizeof(tag))
  119       {
  120         if (mystrcasecmp(tag,"p")==0 || mystrcasecmp(tag,"hr")==0) /*{{{*/
  121         {
  122           if (!words && pretty) { condputchar('\n'); condputchar('\n'); }
  123         }
  124         /*}}}*/
  125         else if (mystrcasecmp(tag,"br")==0) /*{{{*/
  126         {
  127           if (!words && pretty) condputchar('\n');
  128         }
  129         /*}}}*/
  130         else if (mystrcasecmp(tag,"title")==0) intitle=1;
  131         else if (mystrcasecmp(tag,"/title")==0) intitle=0;
  132         else if (tolower(tag[0])=='h' && isdigit(tag[1]) && tag[2]=='\0') /*{{{*/
  133         {
  134           if (!words && pretty)
  135           {
  136             condputchar('\n');
  137             condputchar('\n');
  138           }
  139           ++inheader;
  140         }
  141         /*}}}*/
  142         else if (tag[0]=='/' && tolower(tag[1])=='h' && isdigit(tag[2]) && tag[3]=='\0') /*{{{*/
  143         {
  144           if (!words && pretty)
  145           {
  146             condputchar('\n');
  147             condputchar('\n');
  148           }
  149           if (inheader) --inheader;
  150         }
  151         /*}}}*/
  152         else if (mystrcasecmp(tag,"pre")==0) inpre=1;
  153         else if (mystrcasecmp(tag,"/pre")==0) inpre=0;
  154         else if (mystrcasecmp(tag,"dl")==0) ++inlist;
  155         else if (mystrcasecmp(tag,"/dl")==0) { if (inlist) --inlist; }
  156         else if (mystrcasecmp(tag,"ul")==0) ++inlist;
  157         else if (mystrcasecmp(tag,"/ul")==0) { if (inlist) --inlist; }
  158         else if (mystrcasecmp(tag,"ol")==0) ++inlist;
  159         else if (mystrcasecmp(tag,"/ol")==0) { if (inlist) --inlist; }
  160         else if (mystrcasecmp(tag,"/a")==0 && href[0]) /*{{{*/
  161         {
  162           struct Url *u;
  163           char n[32],*s;
  164           int number=0;
  165 
  166           for (u=urls; u && strcmp(u->url,href); u=u->next) number=u->number;
  167           if (u==(struct Url*)0)
  168           {
  169             u=malloc(sizeof(struct Url));
  170             u->number=number+1;
  171             u->url=strcpy(malloc(strlen(href)+1),href);
  172             u->next=(struct Url*)0;
  173             *lasturl=u;
  174             lasturl=&u->next;
  175           }
  176           snprintf(n,sizeof(n)," [%d]",u->number);
  177           for (s=n; *s; ++s) wordputchar(*s);
  178           href[0]='\0';
  179         }
  180         /*}}}*/
  181       }
  182       /*}}}*/
  183       if (c!=EOF && c!='>') /* tag attributes */ /*{{{*/
  184       {
  185         enum { EMPTY, ATTRIBUTE, EQ, VALUE, QUOTEDVALUE } state=EMPTY;
  186         int output_value=0;
  187         int a_href=0;
  188 
  189         do
  190         {
  191           c=getc(fp);
  192           if (c=='\n')
  193           {
  194             ++line;
  195             condputchar('\n');
  196           }
  197           switch (state)
  198           {
  199             case EMPTY: /*{{{*/
  200             {
  201               if (ISALPHA(c))
  202               {
  203                 state=ATTRIBUTE;
  204                 i=0;
  205                 attribute[i++]=c;
  206               }
  207               break;
  208             }
  209             /*}}}*/
  210             case ATTRIBUTE: /*{{{*/
  211             {
  212               if (ISALPHA(c))
  213               {
  214                 if (i<sizeof(attribute)-1) attribute[i++]=tolower(c);
  215               }
  216               else
  217               {
  218                 attribute[i]='\0';
  219                 if (c=='=')
  220                 {
  221                   state=EQ;
  222                   a_href=(strcmp(tag,"a")==0) && (strcmp(attribute,"href")==0);
  223                   output_value=(strcmp(tag,"img")==0) && (strcmp(attribute,"alt")==0);
  224                 }
  225                 else state=EMPTY;
  226               }
  227               break;
  228             }
  229             /*}}}*/
  230             case EQ: /*{{{*/
  231             {
  232               i=0;
  233               if (c=='"') state=QUOTEDVALUE;
  234               else
  235               {
  236                 state=VALUE;
  237                 if (output_value) wordputchar(c);
  238               }
  239               break;
  240             }
  241             /*}}}*/
  242             case QUOTEDVALUE: /*{{{*/
  243             {
  244               if (c=='"')
  245               {
  246                 if (a_href)
  247                 {
  248                   href[i]='\0';
  249                   a_href=0;
  250                 }
  251                 output_value=0;
  252                 state=EMPTY;
  253               }
  254               else if (a_href)
  255               {
  256                 if (i<sizeof(href)-1) href[i++]=c;
  257               }
  258               else if (output_value) condputchar(c);
  259               break;
  260             }
  261             /*}}}*/
  262             case VALUE: /*{{{*/
  263             {
  264               if (c==' ')
  265               {
  266                 if (a_href)
  267                 {
  268                   a_href=0;
  269                   href[i]='\0';
  270                 }
  271                 output_value=0;
  272                 state=EMPTY;
  273               }
  274               else if (a_href)
  275               {
  276                 if (i<sizeof(href)-1) href[i++]=c;
  277               }
  278               else if (output_value) wordputchar(c);
  279               break;
  280             }
  281             /*}}}*/
  282           }
  283         } while (c!=EOF && c!='>');
  284       }
  285       /*}}}*/
  286     }
  287     /*}}}*/
  288     else if (c=='&') /* entity */ /*{{{*/
  289     {
  290       char entity[73];
  291       int i=0;
  292 
  293       if ((c=getc(fp))=='#')
  294       {
  295         c=getc(fp);
  296         if (isdigit(c))
  297         {
  298           int numeric=c-'0';
  299 
  300           while ((c=getc(fp))!=EOF && isdigit(c))
  301           {
  302             numeric=numeric*10+(c-'0');
  303           }
  304           wordputchar(numeric);
  305           if (c!=';') wordputchar(c);
  306         }
  307         else
  308         {
  309           wordputchar('&');
  310           wordputchar('#');
  311         }
  312       }
  313       else if (ISALPHA(c) || isdigit(c) || c=='.' || c=='-')
  314       {
  315         /* variables */ /*{{{*/
  316           struct
  317           {
  318             const char *name;
  319             char value;
  320           }
  321           const *eptr,
  322           entities[]=
  323           {
  324             { "gt",     '>' },
  325             { "lt",     '<' },
  326             { "amp",    '&' },
  327             { "quot",   '"' },
  328             { "AElig",  'Æ' },
  329             { "Aacute", 'Á' },
  330             { "Acirc",  'Â' },
  331             { "Agrave", 'À' },
  332             { "Aring",  'Å' },
  333             { "Atilde", 'Ã' },
  334             { "Auml",   'Ä' },
  335             { "Ccedil", 'Ç' },
  336             { "ETH",    'Ð' },
  337             { "Eacute", 'É' },
  338             { "Ecirc",  'Ê' },
  339             { "Egrave", 'È' },
  340             { "Euml",   'Ë' },
  341             { "Iacute", 'Í' },
  342             { "Icirc",  'Î' },
  343             { "Igrave", 'Ì' },
  344             { "Iuml",   'Ï' },
  345             { "Ntilde", 'Ñ' },
  346             { "Oacute", 'Ó' },
  347             { "Ocirc",  'Ô' },
  348             { "Ograve", 'Ò' },
  349             { "Oslash", 'Ø' },
  350             { "Otilde", 'Õ' },
  351             { "Ouml",   'Ö' },
  352             { "THORN",  'Þ' },
  353             { "Uacute", 'Ú' },
  354             { "Ucirc",  'Û' },
  355             { "Ugrave", 'Ù' },
  356             { "Uuml",   'Ü' },
  357             { "Yacute", 'Ý' },
  358             { "aacute", 'á' },
  359             { "acirc",  'â' },
  360             { "aelig",  'æ' },
  361             { "agrave", 'à' },
  362             { "aring",  'å' },
  363             { "atilde", 'ã' },
  364             { "auml",   'ä' },
  365             { "ccedil", 'ç' },
  366             { "eacute", 'é' },
  367             { "ecirc",  'ê' },
  368             { "egrave", 'è' },
  369             { "eth",    'ð' },
  370             { "euml",   'ë' },
  371             { "iacute", 'í' },
  372             { "icirc",  'î' },
  373             { "igrave", 'ì' },
  374             { "iuml",   'ï' },
  375             { "nbsp",   ' ' },
  376             { "ntilde", 'ñ' },
  377             { "oacute", 'ó' },
  378             { "ocirc",  'ô' },
  379             { "ograve", 'ò' },
  380             { "oslash", 'ø' },
  381             { "otilde", 'õ' },
  382             { "ouml",   'ö' },
  383             { "szlig",  'ß' },
  384             { "thorn",  'þ' },
  385             { "uacute", 'ú' },
  386             { "ucirc",  'û' },
  387             { "ugrave", 'ù' },
  388             { "uuml",   'ü' },
  389             { "yacute", 'ý' },
  390             { "yuml",   'ÿ' }
  391           };
  392           /*}}}*/
  393 
  394         entity[i++]=c;
  395         while ((c=getc(fp))!=EOF && (ISALPHA(c) || isdigit(c) || c=='.' || c=='-'))
  396         {
  397           if (i<sizeof(entity)-1) entity[i++]=c;
  398         }
  399         entity[i]='\0';
  400         for (eptr=entities; eptr<entities+sizeof(entities)/sizeof(entities[0]); ++eptr)
  401         {
  402           if (strcmp(eptr->name,entity)==0)
  403           {
  404             wordputchar(eptr->value);
  405             if (c!=';') wordputchar(c);
  406             goto continueLoop;
  407           }
  408           else if (strcmp(entity,"hellip")==0)
  409           {
  410             wordputchar('.');
  411             wordputchar('.');
  412             wordputchar('.');
  413             goto continueLoop;
  414           }
  415         }
  416         wordputchar('&');
  417         for (i=0; entity[i]; ++i) wordputchar(entity[i]);
  418         wordputchar(c);
  419       }
  420       else
  421       {
  422         wordputchar('&');
  423         wordputchar(c);
  424       }
  425     }
  426     /*}}}*/
  427     else if (c=='\n') /* new line */ /*{{{*/
  428     {
  429       ++line;
  430       wordputchar(c);
  431     }
  432     /*}}}*/
  433     else wordputchar(c);
  434     continueLoop:;
  435   }
  436   wordputchar('\n');
  437   while (urls)
  438   {
  439     char n[32],*s;
  440     struct Url *f;
  441 
  442     snprintf(n,sizeof(n),"[%d] ",urls->number);
  443     for (s=n; *s; ++s) wordputchar(*s);
  444     for (s=urls->url; *s; ++s) wordputchar(*s);
  445     wordputchar('\n');
  446     free(urls->url);
  447     f=urls;
  448     urls=urls->next;
  449     free(f);
  450   }
  451 }
  452 /*}}}*/
  453 
  454 int main(int argc, char *argv[]) /*{{{*/
  455 {
  456   /* variable declarations */ /*{{{*/
  457   FILE *in;
  458   int usage=0;
  459   int c;
  460   static struct option lopts[]=
  461   {
  462     { "word-list", no_argument, 0, 'w' },
  463     { "skip-headers", no_argument, 0, 's' },
  464     { "skip-lists", no_argument, 0, 'l' },
  465     { "prettyprint", no_argument, 0, 'p' },
  466     { "urls", no_argument, 0, 'u' },
  467     { "help", no_argument, 0, 'h' },
  468     { "version", no_argument, 0, 'v' },
  469     { (const char*)0, 0, 0, '\0' }
  470   };
  471   /*}}}*/
  472 
  473   setlocale(LC_MESSAGES,"");
  474   setlocale(LC_CTYPE,"");
  475 #ifdef HAVE_GETTEXT
  476   bindtextdomain("dehtml",LOCALEDIR);
  477   textdomain("dehtml");
  478 #endif
  479   /* parse arguments */ /*{{{*/
  480   while ((c=getopt_long(argc,argv,"wslp?h",lopts,(int*)0))!=EOF) switch(c)
  481   {
  482     case 'w': words=1; break;
  483     case 's': skipheaders=1; break;
  484     case 'l': skiplists=1; break;
  485     case 'p': pretty=1; break;
  486     case 'h': usage=2; break;
  487     case 'v': printf("dehtml " VERSION "\n"); exit(0);
  488     default: usage=1;
  489   }
  490   if (usage==1)
  491   {
  492     fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [file ...]\n"));
  493     fprintf(stderr,"\n");
  494     fprintf(stderr,_("Try `dehtml -h' or `dehtml --help' for more information.\n"));
  495     exit(1);
  496   }
  497   if (usage==2)
  498   {
  499     fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [file ...]\n"));
  500     fprintf(stderr,"\n");
  501     fprintf(stderr,_("Remove HTML constructs from documents.\n"));
  502     fprintf(stderr,"\n");
  503     fprintf(stderr,_("-w, --word-list     output a word list\n"));
  504     fprintf(stderr,_("-s, --skip-headers  do not output headers\n"));
  505     fprintf(stderr,_("-l, --skip-lists    do not output lists\n"));
  506     fprintf(stderr,_("-p, --pretty-print  pretty printed output\n"));
  507     fprintf(stderr,_("-h, --help          display this help and exit\n"));
  508     fprintf(stderr,_("    --version       display version and exit\n"));
  509     fprintf(stderr,"\n");
  510     fprintf(stderr,_("Report bugs to <michael@moria.de>.\n"));
  511     exit(0);
  512   }
  513   /*}}}*/
  514   /* dehtml stdin or files, if any */ /*{{{*/
  515   if (optind<argc) while (optind<argc)
  516   {
  517     if ((in=fopen(argv[optind],"r"))==(FILE*)0)
  518     {
  519       fprintf(stderr,_("dehtml: Opening `%s' failed (%s).\n"),argv[optind],strerror(errno));
  520       exit(1);
  521     }
  522     dehtml(in,argv[optind]);
  523     fclose(in);
  524     ++optind;
  525   }
  526   else dehtml(stdin,(const char*)0);
  527   if (fclose(stdout)==-1)
  528   {
  529     fprintf(stderr,_("dehtml: Closing standard output failed (%s).\n"),strerror(errno));
  530     return 1;
  531   }
  532   /*}}}*/
  533   return 0;
  534 }
  535 /*}}}*/