"Fossies" - the Fresh Open Source Software Archive

Member "dehtml-1.8/dehtml.c" (11 Jan 2011, 13841 Bytes) of package /linux/www/old/dehtml-1.8.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 /* #includes */ /*{{{C}}}*//*{{{*/
    2 #include "config.h"
    3 
    4 #include <sys/types.h>
    5 #include <ctype.h>
    6 #include <errno.h>
    7 #ifdef HAVE_GETTEXT
    8 #include <libintl.h>
    9 #define _(String) gettext(String)
   10 #else
   11 #define _(String) String
   12 #endif
   13 #include <locale.h>
   14 #include <stdio.h>
   15 #include <stdlib.h>
   16 #include <string.h>
   17 #include <unistd.h>
   18 
   19 #include "getopt.h"
   20 /*}}}*/
   21 /* #defines */ /*{{{*/
   22 #define ISALPHA(c) ((c>='a' && c<='z') || (c>='A' && c<='Z'))
   23 /*}}}*/
   24 
   25 /* types */ /*{{{*/
   26 struct Url
   27 {
   28   char *url;
   29   int number;
   30   struct Url *next;
   31 };
   32 /*}}}*/
   33 /* variables */ /*{{{*/
   34 static int intitle,inlist,inheader,inpre,inword,inwhite;
   35 static int words=0,skipheaders=0,skiplists=0,pretty=0;
   36 static int first=1;
   37 static const char *file;
   38 static int line;
   39 /*}}}*/
   40 
   41 static int mystrcasecmp(const char *s, const char *t) /*{{{*/
   42 {
   43   int x;
   44 
   45   while ((x=tolower(*s)-tolower(*t))==0 && *s) { ++s; ++t; }
   46   return x;
   47 }
   48 /*}}}*/
   49 static int condputchar(int c) /*{{{*/
   50 {
   51   static int nls=2;
   52   static int lastc='\n';
   53 
   54   if (words || ((!skiplists || !inlist) && (!skipheaders || (!inheader && !intitle))))
   55   {
   56     if (pretty)
   57     {
   58       if (c=='\n')
   59       {
   60         if (++nls>2) return c;
   61       }
   62       else
   63       {
   64         nls=0;
   65       }
   66       if (c=='\n') ++line;
   67       lastc=c;
   68       return putchar(c);
   69     }
   70     else return putchar(c);
   71   }
   72   else if (!pretty && c=='\n') return putchar(c);
   73 }
   74 /*}}}*/
   75 static void wordputchar(int c) /*{{{*/
   76 {
   77   if (words)
   78   {
   79     if (isalpha(c) || c=='_') { inword=1; inwhite=0; condputchar(c); }
   80     else if (inword && !inwhite) { inword=0; inwhite=1; condputchar('\n'); }
   81   }
   82   else condputchar(c);
   83 }
   84 /*}}}*/
   85 static void dehtml(FILE *fp, const char *fileName) /*{{{*/
   86 {
   87   int c;
   88   char href[512];
   89   struct Url *urls,**lasturl;
   90 
   91   line=1;
   92   file=fileName;
   93   intitle=inlist=inheader=inpre=0;
   94   href[0]='\0';
   95   urls=(struct Url*)0;
   96   lasturl=&urls;
   97   while ((c=getc(fp))!=EOF)
   98   {
   99     if (c=='<') /* tag */ /*{{{*/
  100     {
  101       char tag[sizeof("/address")];
  102       char attribute[sizeof("href")];
  103       int i;
  104 
  105       if (inword) inword=0;
  106       /* tag name */ /*{{{*/
  107       i=0;
  108       while ((c=getc(fp))!=EOF && c!='>' && c!=' ' && c!='\n')
  109       {
  110         if (i<sizeof(tag)-1) tag[i++]=tolower(c);
  111       }
  112       tag[i]='\0';
  113       if (c=='\n')
  114       {
  115         ++line;
  116         condputchar('\n');
  117       }
  118       if (i && i<sizeof(tag))
  119       {
  120         if (mystrcasecmp(tag,"p")==0 || mystrcasecmp(tag,"hr")==0) /*{{{*/
  121         {
  122           if (!words && pretty) { condputchar('\n'); condputchar('\n'); }
  123         }
  124         /*}}}*/
  125         else if (mystrcasecmp(tag,"br")==0) /*{{{*/
  126         {
  127           if (!words && pretty) condputchar('\n');
  128         }
  129         /*}}}*/
  130         else if (mystrcasecmp(tag,"title")==0) intitle=1;
  131         else if (mystrcasecmp(tag,"/title")==0) intitle=0;
  132         else if (tolower(tag[0])=='h' && isdigit(tag[1]) && tag[2]=='\0') /*{{{*/
  133         {
  134           if (!words && pretty)
  135           {
  136             condputchar('\n');
  137             condputchar('\n');
  138           }
  139           ++inheader;
  140         }
  141         /*}}}*/
  142         else if (tag[0]=='/' && tolower(tag[1])=='h' && isdigit(tag[2]) && tag[3]=='\0') /*{{{*/
  143         {
  144           if (!words && pretty)
  145           {
  146             condputchar('\n');
  147             condputchar('\n');
  148           }
  149           if (inheader) --inheader;
  150         }
  151         /*}}}*/
  152         else if (mystrcasecmp(tag,"pre")==0) inpre=1;
  153         else if (mystrcasecmp(tag,"/pre")==0) inpre=0;
  154         else if (mystrcasecmp(tag,"dl")==0) ++inlist;
  155         else if (mystrcasecmp(tag,"/dl")==0) { if (inlist) --inlist; }
  156         else if (mystrcasecmp(tag,"ul")==0) ++inlist;
  157         else if (mystrcasecmp(tag,"/ul")==0) { if (inlist) --inlist; }
  158         else if (mystrcasecmp(tag,"ol")==0) ++inlist;
  159         else if (mystrcasecmp(tag,"/ol")==0) { if (inlist) --inlist; }
  160         else if (mystrcasecmp(tag,"/a")==0 && href[0]) /*{{{*/
  161         {
  162           struct Url *u;
  163           char n[32],*s;
  164           int number=0;
  165 
  166           for (u=urls; u && strcmp(u->url,href); u=u->next) number=u->number;
  167           if (u==(struct Url*)0)
  168           {
  169             u=malloc(sizeof(struct Url));
  170             u->number=number+1;
  171             u->url=strcpy(malloc(strlen(href)+1),href);
  172             u->next=(struct Url*)0;
  173             *lasturl=u;
  174             lasturl=&u->next;
  175           }
  176           snprintf(n,sizeof(n)," [%d]",u->number);
  177           for (s=n; *s; ++s) wordputchar(*s);
  178           href[0]='\0';
  179         }
  180         /*}}}*/
  181       }
  182       /*}}}*/
  183       if (c!=EOF && c!='>') /* tag attributes */ /*{{{*/
  184       {
  185         enum { EMPTY, ATTRIBUTE, EQ, VALUE, QUOTEDVALUE } state=EMPTY;
  186         int output_value=0;
  187         int a_href=0;
  188 
  189         do
  190         {
  191           c=getc(fp);
  192           if (c=='\n')
  193           {
  194             ++line;
  195             condputchar('\n');
  196           }
  197           switch (state)
  198           {
  199             case EMPTY: /*{{{*/
  200             {
  201               if (ISALPHA(c))
  202               {
  203                 state=ATTRIBUTE;
  204                 i=0;
  205                 attribute[i++]=c;
  206               }
  207               break;
  208             }
  209             /*}}}*/
  210             case ATTRIBUTE: /*{{{*/
  211             {
  212               if (ISALPHA(c))
  213               {
  214                 if (i<sizeof(attribute)-1) attribute[i++]=tolower(c);
  215               }
  216               else
  217               {
  218                 attribute[i]='\0';
  219                 if (c=='=')
  220                 {
  221                   state=EQ;
  222                   a_href=(strcmp(tag,"a")==0) && (strcmp(attribute,"href")==0);
  223                   output_value=(strcmp(tag,"img")==0) && (strcmp(attribute,"alt")==0);
  224                 }
  225                 else state=EMPTY;
  226               }
  227               break;
  228             }
  229             /*}}}*/
  230             case EQ: /*{{{*/
  231             {
  232               i=0;
  233               if (c=='"') state=QUOTEDVALUE;
  234               else
  235               {
  236                 state=VALUE;
  237                 if (output_value) wordputchar(c);
  238               }
  239               break;
  240             }
  241             /*}}}*/
  242             case QUOTEDVALUE: /*{{{*/
  243             {
  244               if (c=='"')
  245               {
  246                 if (a_href)
  247                 {
  248                   href[i]='\0';
  249                   a_href=0;
  250                 }
  251                 output_value=0;
  252                 state=EMPTY;
  253               }
  254               else if (a_href)
  255               {
  256                 if (i<sizeof(href)-1) href[i++]=c;
  257               }
  258               else if (output_value) condputchar(c);
  259               break;
  260             }
  261             /*}}}*/
  262             case VALUE: /*{{{*/
  263             {
  264               if (c==' ')
  265               {
  266                 if (a_href)
  267                 {
  268                   a_href=0;
  269                   href[i]='\0';
  270                 }
  271                 output_value=0;
  272                 state=EMPTY;
  273               }
  274               else if (a_href)
  275               {
  276                 if (i<sizeof(href)-1) href[i++]=c;
  277               }
  278               else if (output_value) wordputchar(c);
  279               break;
  280             }
  281             /*}}}*/
  282           }
  283         } while (c!=EOF && c!='>');
  284       }
  285       /*}}}*/
  286     }
  287     /*}}}*/
  288     else if (c=='&') /* entity */ /*{{{*/
  289     {
  290       char entity[73];
  291       int i=0;
  292 
  293       if ((c=getc(fp))=='#')
  294       {
  295         c=getc(fp);
  296         if (isdigit(c))
  297         {
  298           int numeric=c-'0';
  299 
  300           while ((c=getc(fp))!=EOF && isdigit(c))
  301           {
  302             numeric=numeric*10+(c-'0');
  303           }
  304           wordputchar(numeric);
  305           if (c!=';') wordputchar(c);
  306         }
  307         else
  308         {
  309           wordputchar('&');
  310           wordputchar('#');
  311         }
  312       }
  313       else if (ISALPHA(c) || isdigit(c) || c=='.' || c=='-')
  314       {
  315         /* variables */ /*{{{*/
  316           struct
  317           {
  318             const char *name;
  319             char value;
  320           }
  321           const *eptr,
  322           entities[]=
  323           {
  324             { "gt",     '>' },
  325             { "lt",     '<' },
  326             { "amp",    '&' },
  327             { "quot",   '"' },
  328             { "AElig",  'Æ' },
  329             { "Aacute", 'Á' },
  330             { "Acirc",  'Â' },
  331             { "Agrave", 'À' },
  332             { "Aring",  'Å' },
  333             { "Atilde", 'Ã' },
  334             { "Auml",   'Ä' },
  335             { "Ccedil", 'Ç' },
  336             { "ETH",    'Ð' },
  337             { "Eacute", 'É' },
  338             { "Ecirc",  'Ê' },
  339             { "Egrave", 'È' },
  340             { "Euml",   'Ë' },
  341             { "Iacute", 'Í' },
  342             { "Icirc",  'Î' },
  343             { "Igrave", 'Ì' },
  344             { "Iuml",   'Ï' },
  345             { "Ntilde", 'Ñ' },
  346             { "Oacute", 'Ó' },
  347             { "Ocirc",  'Ô' },
  348             { "Ograve", 'Ò' },
  349             { "Oslash", 'Ø' },
  350             { "Otilde", 'Õ' },
  351             { "Ouml",   'Ö' },
  352             { "THORN",  'Þ' },
  353             { "Uacute", 'Ú' },
  354             { "Ucirc",  'Û' },
  355             { "Ugrave", 'Ù' },
  356             { "Uuml",   'Ü' },
  357             { "Yacute", 'Ý' },
  358             { "aacute", 'á' },
  359             { "acirc",  'â' },
  360             { "aelig",  'æ' },
  361             { "agrave", 'à' },
  362             { "aring",  'å' },
  363             { "atilde", 'ã' },
  364             { "auml",   'ä' },
  365             { "ccedil", 'ç' },
  366             { "eacute", 'é' },
  367             { "ecirc",  'ê' },
  368             { "egrave", 'è' },
  369             { "eth",    'ð' },
  370             { "euml",   'ë' },
  371             { "iacute", 'í' },
  372             { "icirc",  'î' },
  373             { "igrave", 'ì' },
  374             { "iuml",   'ï' },
  375             { "nbsp",   ' ' },
  376             { "ntilde", 'ñ' },
  377             { "oacute", 'ó' },
  378             { "ocirc",  'ô' },
  379             { "ograve", 'ò' },
  380             { "oslash", 'ø' },
  381             { "otilde", 'õ' },
  382             { "ouml",   'ö' },
  383             { "szlig",  'ß' },
  384             { "thorn",  'þ' },
  385             { "uacute", 'ú' },
  386             { "ucirc",  'û' },
  387             { "ugrave", 'ù' },
  388             { "uuml",   'ü' },
  389             { "yacute", 'ý' },
  390             { "yuml",   'ÿ' }
  391           };
  392           /*}}}*/
  393 
  394         entity[i++]=c;
  395         while ((c=getc(fp))!=EOF && (ISALPHA(c) || isdigit(c) || c=='.' || c=='-'))
  396         {
  397           if (i<sizeof(entity)-1) entity[i++]=c;
  398         }
  399         entity[i]='\0';
  400         for (eptr=entities; eptr<entities+sizeof(entities)/sizeof(entities[0]); ++eptr)
  401         {
  402           if (strcmp(eptr->name,entity)==0)
  403           {
  404             wordputchar(eptr->value);
  405             if (c!=';') wordputchar(c);
  406             goto continueLoop;
  407           }
  408           else if (strcmp(entity,"hellip")==0)
  409           {
  410             wordputchar('.');
  411             wordputchar('.');
  412             wordputchar('.');
  413             goto continueLoop;
  414           }
  415         }
  416         wordputchar('&');
  417         for (i=0; entity[i]; ++i) wordputchar(entity[i]);
  418         wordputchar(c);
  419       }
  420       else
  421       {
  422         wordputchar('&');
  423         wordputchar(c);
  424       }
  425     }
  426     /*}}}*/
  427     else if (c=='\n') /* new line */ /*{{{*/
  428     {
  429       ++line;
  430       wordputchar(c);
  431     }
  432     /*}}}*/
  433     else wordputchar(c);
  434     continueLoop:;
  435   }
  436   wordputchar('\n');
  437   while (urls)
  438   {
  439     char n[32],*s;
  440     struct Url *f;
  441 
  442     snprintf(n,sizeof(n),"[%d] ",urls->number);
  443     for (s=n; *s; ++s) wordputchar(*s);
  444     for (s=urls->url; *s; ++s) wordputchar(*s);
  445     wordputchar('\n');
  446     free(urls->url);
  447     f=urls;
  448     urls=urls->next;
  449     free(f);
  450   }
  451 }
  452 /*}}}*/
  453 
  454 int main(int argc, char *argv[]) /*{{{*/
  455 {
  456   /* variable declarations */ /*{{{*/
  457   FILE *in;
  458   int usage=0;
  459   int c;
  460   static struct option lopts[]=
  461   {
  462     { "word-list", no_argument, 0, 'w' },
  463     { "skip-headers", no_argument, 0, 's' },
  464     { "skip-lists", no_argument, 0, 'l' },
  465     { "prettyprint", no_argument, 0, 'p' },
  466     { "urls", no_argument, 0, 'u' },
  467     { "help", no_argument, 0, 'h' },
  468     { "version", no_argument, 0, 'v' },
  469     { (const char*)0, 0, 0, '\0' }
  470   };
  471   /*}}}*/
  472 
  473   setlocale(LC_MESSAGES,"");
  474   setlocale(LC_CTYPE,"");
  475 #ifdef HAVE_GETTEXT
  476   bindtextdomain("dehtml",LOCALEDIR);
  477   textdomain("dehtml");
  478 #endif
  479   /* parse arguments */ /*{{{*/
  480   while ((c=getopt_long(argc,argv,"wslp?h",lopts,(int*)0))!=EOF) switch(c)
  481   {
  482     case 'w': words=1; break;
  483     case 's': skipheaders=1; break;
  484     case 'l': skiplists=1; break;
  485     case 'p': pretty=1; break;
  486     case 'h': usage=2; break;
  487     case 'v': printf("dehtml " VERSION "\n"); exit(0);
  488     default: usage=1;
  489   }
  490   if (usage==1)
  491   {
  492     fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [file ...]\n"));
  493     fprintf(stderr,"\n");
  494     fprintf(stderr,_("Try `dehtml -h' or `dehtml --help' for more information.\n"));
  495     exit(1);
  496   }
  497   if (usage==2)
  498   {
  499     fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [file ...]\n"));
  500     fprintf(stderr,"\n");
  501     fprintf(stderr,_("Remove HTML constructs from documents.\n"));
  502     fprintf(stderr,"\n");
  503     fprintf(stderr,_("-w, --word-list     output a word list\n"));
  504     fprintf(stderr,_("-s, --skip-headers  do not output headers\n"));
  505     fprintf(stderr,_("-l, --skip-lists    do not output lists\n"));
  506     fprintf(stderr,_("-p, --pretty-print  pretty printed output\n"));
  507     fprintf(stderr,_("-h, --help          display this help and exit\n"));
  508     fprintf(stderr,_("    --version       display version and exit\n"));
  509     fprintf(stderr,"\n");
  510     fprintf(stderr,_("Report bugs to <michael@moria.de>.\n"));
  511     exit(0);
  512   }
  513   /*}}}*/
  514   /* dehtml stdin or files, if any */ /*{{{*/
  515   if (optind<argc) while (optind<argc)
  516   {
  517     if ((in=fopen(argv[optind],"r"))==(FILE*)0)
  518     {
  519       fprintf(stderr,_("dehtml: Opening `%s' failed (%s).\n"),argv[optind],strerror(errno));
  520       exit(1);
  521     }
  522     dehtml(in,argv[optind]);
  523     fclose(in);
  524     ++optind;
  525   }
  526   else dehtml(stdin,(const char*)0);
  527   if (fclose(stdout)==-1)
  528   {
  529     fprintf(stderr,_("dehtml: Closing standard output failed (%s).\n"),strerror(errno));
  530     return 1;
  531   }
  532   /*}}}*/
  533   return 0;
  534 }
  535 /*}}}*/