"Fossies" - the Fresh Open Source Software Archive

Member "htmlrecode-1.3.1/htmlrecode.cc" (21 Jul 2009, 44689 Bytes) of package /linux/www/htmlrecode-1.3.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "htmlrecode.cc", see the Fossies "Dox" file reference documentation.

    1 #include <cstdio>      // *printf
    2 #include <unistd.h>    // fileno,fork,pipe
    3 #include <csignal>     // signal,kill
    4 #include <sys/wait.h>  // waitpid
    5 #include <cctype>      // isalpha,...
    6 #include <map>         // map
    7 #include <set>         // set
    8 #include <list>        // list
    9 #include <cerrno>      // errno
   10 #include <cstdlib>     // perror
   11 #include <cstring>     // memmove
   12 
   13 #include "htmlrecode.hh"
   14 
   15 #define DEBUG 0
   16 
   17 //#define GXBLT
   18 
   19 using namespace std;
   20 
   21 static const char *const midset  // do endian tests and find internal type
   22      = (sizeof(wchar_t) == 4)  // if wchar_t is 32-bit
   23              ? ((*(const short *)"\1\0\0\0" == 1) ? "UCS-4LE" : "UCS-4BE")
   24      : (sizeof(wchar_t) == 2)  // if wchar_t is 16-bit
   25              ? ((*(const short *)"\1\0\0\0" == 1) ? "UCS-2LE" : "UCS-2BE")
   26      : NULL;                   // otherwise we are lost
   27 
   28 
   29 #if 0
   30 static bool operator==(const wstring &s1, const char *s2)
   31 {
   32     for(unsigned a=0; a<s1.size(); ++a, ++s2)
   33     {
   34         if(!*s2 || s1[a] != (unsigned char)*s2)return false;
   35     }
   36     return *s2 ? false : true;
   37 }
   38 #endif
   39 static void operator+= (wstring &s1, const char *s2)
   40 {
   41     while(*s2)s1 += *s2++;
   42 }
   43 static bool IsEqual(const wstring &s1, const char *s2)
   44 {
   45     for(size_t a=0; a<s1.size(); ++a, ++s2)
   46     {
   47         if(!*s2)return false;
   48         if(s1[a] >= 0x100)return false;
   49         char c1 = (char)s1[a];
   50         if(toupper(c1) != toupper(*s2))return false;
   51     }
   52     return !*s2;
   53 }
   54 static bool IsEqual(const wstring &s1, const wstring &s2)
   55 {
   56     if(s1.size() != s2.size()) return false;
   57     for(unsigned a=0; a<s1.size(); ++a)
   58     {
   59         if(s1[a] == s2[a]
   60         || (s1[a] <= 0x100 && s2[a] <= 0x100
   61         && toupper((char)s1[a]) == toupper((char)s2[a]))
   62           ) continue;
   63         return false;
   64     }
   65     return true;
   66 }
   67 
   68 static const string Stringify(const wstring &s)
   69 {
   70     string result(s.size(), '?');
   71     for(unsigned a=0; a<s.size(); ++a)
   72         if(s[a] < 0x100) result[a] = (char)s[a];
   73     return result;
   74 }
   75 
   76 static ucs4 Getc(FILE *fp)
   77 {
   78     ucs4 p; fread(&p, 1, 4, fp);
   79     return p;
   80 }
   81 
   82 static const struct { const char *ent; ucs4 ch; } EnTab[] = {
   83 #include "entities.h"
   84 };
   85 #define ENTITYCOUNT (sizeof(EnTab) / sizeof(EnTab[0]))
   86 static ucs4 FindEntity(const wstring &ws)
   87 {
   88     string s(ws.size(), ' ');
   89     for(unsigned a=0; a<ws.size(); ++a)
   90         s[a] = (char)ws[a];
   91     
   92     unsigned eka=0, vika=ENTITYCOUNT-1;
   93     for(;;)
   94     {
   95         unsigned n = (eka+vika)/2;
   96         if(s == EnTab[n].ent)return EnTab[n].ch;
   97         if(eka >= vika)break;
   98         if(s > EnTab[n].ent) { eka=n+1; continue; }
   99         if(s < EnTab[n].ent) { vika=n; continue; }
  100     }
  101     return ilseq;
  102 }
  103 
  104 #undef putc
  105 #undef puts
  106 
  107 static unsigned FixedStyleScript = 0;
  108 static unsigned ParamViolations  = 0;
  109 
  110 static bool lossless = true;
  111 static bool usehex = false;
  112 static bool strict = false;
  113 static int verbose = 1;
  114 static bool xmlmode = false;
  115 static bool signature = false;
  116 
  117 bool Page::Dumper::OpenConv(iconv_t &conv, const char *set1, const char *set2)
  118 {
  119     conv = iconv_open(set1, set2);
  120     if(conv == (iconv_t)-1)
  121     {
  122         fprintf(stderr, "iconv_open failed to create '%s' to '%s' converter. Aborting.\n",
  123             set1, set2);
  124         return true;
  125     }
  126     return false;
  127 }
  128 
  129 Page::Dumper::Dumper() : charset(midset)
  130 {
  131     if(OpenConv(converter, charset.c_str(), midset)
  132     || OpenConv(tester,    charset.c_str(), midset))
  133         exit(EINVAL);
  134 }
  135 Page::Dumper::~Dumper()
  136 {
  137     iconv_close(converter);
  138     iconv_close(tester);
  139 }
  140 
  141 void Page::Dumper::putc(ucs4 p) const
  142 {
  143     wstring tmp;
  144     tmp += p;
  145     puts(tmp);
  146 }
  147 
  148 bool Page::Dumper::isok(ucs4 p) const
  149 {
  150     char OutBuf[256], *outptr = OutBuf, *tmp = (char *)&p;
  151     size_t outsize = sizeof OutBuf;
  152     size_t insize = sizeof(p);
  153     size_t retval = iconv(tester, &tmp, &insize, &outptr, &outsize);
  154     if(retval == (size_t)-1)return false;
  155     return true;
  156 }
  157 
  158 static const wstring makewstr(const char *s) { wstring tmp; tmp += s; return tmp; }
  159 
  160 struct Page::Tag
  161 {
  162     class Param: public ptrable
  163     {
  164     public:
  165         Param() {}
  166         virtual ~Param() {}
  167     };
  168     class ParamParam: public Param
  169     {
  170     public:
  171         wstring name, value;
  172     };
  173     class ParamKey: public Param
  174     {
  175     public:
  176         wstring name;
  177     };
  178     class ParamComm: public Param
  179     {
  180     public:
  181         wstring data;
  182     };
  183     class ParamSpace: public Param
  184     {
  185     public:
  186         wstring data;
  187     };
  188 
  189     typedef autoptr<Param> itemp_t;
  190     typedef list<itemp_t> list_t;
  191     list_t items;
  192     wstring Name;
  193     bool terminating;
  194     
  195     typedef list_t::const_iterator const_iterator;
  196     typedef list_t::iterator iterator;
  197     
  198     void SetParam(const wstring &name,
  199                   const wstring &value)
  200     {
  201         ParamParam *tmp = new ParamParam;
  202         tmp->name = name;
  203         tmp->value = value;
  204         items.push_back(tmp);
  205     }
  206     void SetKey(const wstring &name)
  207     {
  208         ParamKey *tmp = new ParamKey;
  209         tmp->name = name;
  210         items.push_back(tmp);
  211     }
  212     void AddComment(const wstring &data)
  213     {
  214         ParamComm *tmp = new ParamComm;
  215         tmp->data = data,
  216         items.push_back(tmp);
  217     }
  218     void AddSpace(const wstring &data)
  219     {
  220         ParamSpace *tmp = new ParamSpace;
  221         tmp->data = data,
  222         items.push_back(tmp);
  223     }
  224     
  225     template<typename str>
  226     bool HasParamCalled(const str &name) const
  227     {
  228         for(const_iterator i=items.begin(); i!=items.end(); ++i)
  229         {
  230             const Param *p = *i;
  231             if(const ParamParam *param = dynamic_cast<const ParamParam *> (p))
  232                 if(IsEqual(param->name, name)) return true;
  233         }
  234         return false;
  235     }
  236     template<typename str>
  237     const wstring GetParamValue(const str &name) const
  238     {
  239         for(const_iterator i=items.begin(); i!=items.end(); ++i)
  240         {
  241             const Param *p = *i;
  242             if(const ParamParam *param = dynamic_cast<const ParamParam *> (p))
  243                 if(IsEqual(param->name, name))
  244                     return param->value;
  245         }
  246         wstring tmp;
  247         return tmp;
  248     }
  249     template<typename str>
  250     wstring &GetParamValue(const str &name)
  251     {
  252         wstring *tmp = NULL;
  253         for(iterator i=items.begin(); i!=items.end(); ++i)
  254         {
  255             Param *p = *i;
  256             if(ParamParam *param = dynamic_cast<ParamParam *> (p))
  257             {
  258                 tmp = &param->value;
  259                 if(IsEqual(param->name, name))
  260                     break;
  261             }
  262         }
  263         return *tmp;
  264     }
  265     template<typename str>
  266     void ReplaceParam(const str &name, const wstring &value)
  267     {
  268         for(iterator i=items.begin(); i!=items.end(); ++i)
  269         {
  270             Param *p = *i;
  271             if(ParamParam *param = dynamic_cast<ParamParam *> (p))
  272                 if(IsEqual(param->name, name))
  273                     param->value = value;
  274         }
  275     }
  276     
  277     void clear()
  278     {
  279         CLEARSTR(Name);
  280         items.clear();
  281         terminating = false;
  282     }
  283     bool Is(const char *t) const { return IsEqual(Name, t); }
  284     bool Is(const wstring &s) const { return IsEqual(Name, s); }
  285 };
  286 
  287 struct Page::PI : public Page::Tag
  288 {
  289     void ParseParams()
  290     {
  291         wstring param;
  292         wstring value;
  293         
  294         int state=0;
  295         
  296         items.clear();
  297         
  298         for(unsigned a=0; a<Content.size(); ++a)
  299         {
  300             ucs4 c = Content[a];
  301             switch(state)
  302             {
  303                 case 0:
  304                     if(c==' ' || c=='\t' || c == '\v' || c == '\r' || c == '\n')break;
  305                     CLEARSTR(param);
  306                     state=1; //passthru
  307                 case 1:
  308                     if(c=='=') {CLEARSTR(value);state=2;break;}
  309                     param += c;
  310                     break;
  311                 case 2:
  312                     if(c=='"') {state=3;break;}
  313                     // non-" is invalid
  314                     SetParam(param, value);
  315                     state=0;
  316                     break;
  317                 case 3:
  318                     if(c=='"') {state=2;break;}
  319                     value += c;
  320                     break;
  321             }
  322         }
  323         if(param.size())
  324             if(value.size()) SetParam(param, value);
  325             else SetKey(param);
  326     }
  327 public:
  328     wstring Content;
  329     PI() {}
  330     PI(const wstring &nam, const wstring &con) : Content(con)
  331     {
  332         Name = nam;
  333         ParseParams();
  334     }
  335     
  336     void Reconstruct()
  337     {
  338         CLEARSTR(Content);
  339         
  340         const_iterator i;
  341         for(i=items.begin(); i!=items.end(); ++i)
  342         {
  343             const Tag::Param *p = *i;
  344             if(const Tag::ParamParam *param = dynamic_cast<const Tag::ParamParam *> (p))
  345             {
  346                 if(Content.size())Content += ' ';
  347                 Content += param->name;
  348                 Content += "=\"";
  349                 Content += param->value;
  350                 Content += '"';
  351             }
  352             else if(const Tag::ParamKey *key = dynamic_cast<const Tag::ParamKey *> (p))
  353             {
  354                 if(Content.size())Content += ' ';
  355                 Content += key->name;
  356             }
  357         }
  358         ParseParams();
  359     }
  360 };
  361 
  362 class Page::ElemBody : public Page::Element
  363 {
  364 public:
  365     wstring body;
  366     ElemBody(const wstring &b) : body(b)
  367     {
  368     }
  369 };
  370 
  371 class Page::ElemTag : public Page::Element
  372 {
  373 public:
  374     struct Tag tag;
  375     ElemTag(const struct Tag &t) : tag(t)
  376     {
  377     }
  378 };
  379 
  380 class Page::ElemPI : public Page::Element
  381 {
  382     // XML processing information (<? ?>)
  383 public:
  384     struct PI pi;
  385     ElemPI(const struct PI &p) : pi(p)
  386     {
  387     }
  388 };
  389 
  390 class Page::ElemRaw : public Page::Element
  391 {
  392 public:
  393     wstring data;
  394     ElemRaw(const wstring &b) : data(b)
  395     {
  396     }
  397 };
  398 
  399 void Page::DumpTag(const Tag &tag) const
  400 {
  401     Putc('<');
  402     Dump(tag.Name);
  403     
  404     Tag::const_iterator i;
  405 
  406     for(i=tag.items.begin(); i!=tag.items.end(); ++i)
  407     {
  408         const Tag::Param *p = *i;
  409         if(const Tag::ParamParam *param = dynamic_cast<const Tag::ParamParam *> (p))
  410         {
  411             Dump(param->name);
  412             Putc('=');
  413             const wstring &s = param->value;
  414             bool needquotes = false;
  415             
  416             unsigned countq2 = 0;
  417             unsigned countq1 = 0;
  418             
  419             for(unsigned a=0; a<s.size(); ++a)
  420             {
  421                 if(s[a] >= 'A' && s[a] <= 'Z')continue;
  422                 if(s[a] >= 'a' && s[a] <= 'z')continue;
  423                 if(s[a] >= '0' && s[a] <= '9')continue;
  424                 if(s[a] == '-' || s[a] == '.'
  425                 || s[a] == '_' || s[a] == ':')continue;
  426                 if(s[a] == '"') ++countq2;
  427                 if(s[a] == '\'') ++countq1;
  428                 needquotes = true;
  429                 break;
  430             }
  431             
  432             if(!s.size()) needquotes = true;
  433             
  434             if(needquotes || xmlmode)
  435             {
  436                 char quotetype = (countq2 <= countq1 || xmlmode) ? '"' : '\'';
  437                 Putc(quotetype);
  438                 DumpHTML(s, quotetype);
  439                 Putc(quotetype);
  440             }
  441             else
  442                 DumpHTML(s);
  443         }
  444         else if(const Tag::ParamKey *key = dynamic_cast<const Tag::ParamKey *> (p))
  445         {
  446             Dump(key->name);
  447         }
  448         else if(const Tag::ParamComm *comm = dynamic_cast<const Tag::ParamComm *> (p))
  449         {
  450             Putc('-'); Putc('-');
  451             Dump(comm->data);
  452             Putc('-'); Putc('-');
  453         }
  454         else if(const Tag::ParamSpace *spc = dynamic_cast<const Tag::ParamSpace *> (p))
  455         {
  456             Dump(spc->data);
  457         }
  458     }
  459 
  460     if(tag.Name.size() > 0
  461     && tag.Name[0] == '?')
  462     {
  463         Putc('?');
  464     }
  465     if(tag.terminating)Putc(' '),Putc('/');
  466     Putc('>');
  467 }
  468 
  469 void Page::DumpRaw(const wstring &data) const
  470 {
  471     bool needcomments = false;
  472     for(unsigned a=0; a<data.size(); ++a)
  473     {
  474         if(data[a] == ' ' || data[a] == '\n'
  475         || data[a] == '\t' || data[a] == '\r')continue;
  476         needcomments = !IsEqual(data.substr(a, 4), "<!--");
  477         break;
  478     }
  479     if(needcomments)
  480     {
  481         FixedStyleScript++;
  482     }
  483     if(needcomments)
  484         Putc('<'), Putc('!'), Putc('-'), Putc('-'), Putc('\n');
  485     Dump(data);
  486     if(needcomments)
  487         Putc('\n'), Putc('-'), Putc('-'), Putc('>');
  488 }
  489 
  490 void Page::DumpPI(const PI &pi) const
  491 {
  492     Putc('<');
  493     Dump(pi.Name);
  494     Putc(' ');
  495     Dump(pi.Content);
  496     Putc('?');
  497     Putc('>');
  498     /* Debug - don't use
  499     Tag::const_iterator i;
  500     for(i=pi.items.begin(); i!=pi.items.end(); ++i)
  501     {
  502         const Tag::Param *p = *i;
  503         if(const Tag::ParamParam *param = dynamic_cast<const Tag::ParamParam *> (p))
  504         {
  505             Putc('[');
  506             Dump(param->name);
  507             Putc('=');
  508             Dump(param->value);
  509             Putc(']');
  510         }
  511         else if(const Tag::ParamKey *key = dynamic_cast<const Tag::ParamKey *> (p))
  512         {
  513             Putc('[');
  514             Dump(key->name);
  515             Putc(']');
  516         }
  517     }
  518     */
  519 }
  520 
  521 void Page::Dumper::puts(const wstring &s) const
  522 {
  523     char *input = (char *) (const_cast<ucs4 *> (s.data()));
  524     size_t left = s.size() * sizeof(ucs4);
  525     while(left > 0)
  526     {
  527         char OutBuf[4096], *outptr = OutBuf;
  528         size_t outsize = sizeof OutBuf;
  529     #if DEBUG
  530         fprintf(stderr, "P1:Converting %u bytes to %u bytes space\n", left, outsize);
  531         size_t bytesread = left, converted = outsize;
  532     #endif
  533         size_t retval = iconv(converter, &input, &left, &outptr, &outsize);
  534     #if DEBUG
  535         bytesread -= left; converted -= outsize;
  536         fprintf(stderr, "%u bytes read, %u bytes left, %u bytes generated, %u bytes  space left\n", bytesread, left, converted, outsize);
  537     #endif
  538         fwrite(OutBuf, 1, outptr-OutBuf, stdout);
  539         if(retval == (size_t)-1)
  540         {
  541 #if DEBUG
  542             perror("iconv");
  543 #endif
  544             if(errno == E2BIG)
  545             {
  546                 continue;
  547             }
  548             if(errno == EILSEQ)
  549             {
  550                 input += sizeof(ucs4);
  551                 left -= sizeof(ucs4);
  552                 putchar('?');
  553             }
  554             if(errno == EINVAL)
  555             {
  556                 /* Got partial byte and the sequence terminates after that */
  557                 putchar('?');
  558                 return;
  559             }
  560         }
  561     }
  562 }
  563 
  564 
  565 void Page::Dumper::SetSet(const char *setname)
  566 {
  567     if(verbose >= 1)
  568     {
  569         fprintf(stderr, "Recoding %s to output (%s)\n", midset, setname);
  570         fflush(stderr);
  571     }
  572     iconv_close(converter);
  573     iconv_close(tester);
  574     charset = setname;
  575     if(OpenConv(converter, setname, midset)
  576     || OpenConv(tester,    setname, midset))
  577         exit(EINVAL);
  578 }
  579 
  580 
  581 wstring Page::htmlencode(const wstring &s) const
  582 {
  583     wstring res;
  584     unsigned a, b;
  585     for(a=b=0; a<s.size(); ++a)
  586     {
  587         if(s[a]=='<' && !underquote) /* Must be encoded: Could start a tag otherwise. */
  588         {
  589             if(a>b)res += s.substr(b, a-b);
  590             res += "&lt;";
  591             b = a+1; continue;
  592         }
  593         if(s[a]=='"' && underquote == '"') /* Must be encoded: Could end a parameter otherwise. */
  594         {
  595             if(a>b)res += s.substr(b, a-b);
  596             res += "&quot;";
  597             b = a+1; continue;
  598         }
  599         if(s[a]=='\'' && underquote == '\'') /* Must be encoded: Could end a parameter otherwise. */
  600         {
  601             if(a>b)res += s.substr(b, a-b);
  602             res += "&#39;";
  603             b = a+1; continue;
  604         }
  605         if(s[a]=='&') /* Must be encoded: Could start an entity otherwise. */
  606         {
  607             if(a>b)res += s.substr(b, a-b);
  608             res += "&amp;";
  609             b = a+1; continue;
  610         }
  611         if(s[a]==160) /* Not necessary, but nice: &nbsp; is commonly known. */
  612         {
  613             /* FIXME: invent a better condition here */
  614             if(!strict)
  615             {
  616                 if(a>b)res += s.substr(b, a-b);
  617                 res += "&nbsp;";
  618                 b = a+1; continue;
  619             }
  620         }
  621         if(lossless && !CanDump(s[a]))
  622         {
  623             char Buf[64];
  624             if(usehex)
  625                 sprintf(Buf, "&#x%X;", s[a]);
  626             else
  627                 sprintf(Buf, "&#%u;", s[a]);
  628             if(a>b)res += s.substr(b, a-b);
  629             res += Buf;
  630             b = a+1; continue;
  631         }
  632     }
  633     if(a>b)res += s.substr(b, a-b);
  634     return res;
  635 }
  636 
  637 wstring Page::htmldecode(const wstring &s) const
  638 {
  639     wstring res;
  640     unsigned a, b;
  641     for(a=b=0; a<s.size(); )
  642     {
  643         if(s[a] != '&')
  644         {
  645             ++a;
  646             continue;
  647         }
  648         ucs4 specialchar = 0;
  649         unsigned c=a+1, e=0;
  650         if(c < s.size() && s[c] == '#')
  651         {
  652             ++c;
  653             if(c < s.size() && s[c] == 'x')
  654             {
  655                 for(e=2; ++c < s.size(); ++e)
  656                 {
  657                     if(s[c] >= '0' && s[c] <= '9') specialchar = specialchar*16+ (s[c]-'0');
  658                     else if(s[c] >= 'A' && s[c] <= 'F') specialchar = specialchar*16+ (s[c]-'A'+10);
  659                     else if(s[c] >= 'a' && s[c] <= 'f') specialchar = specialchar*16+ (s[c]-'a'+10);
  660                     else break;
  661                 }
  662                 goto AddChar;
  663             }
  664             for(e=1; c < s.size(); ++e, ++c)
  665             {
  666                 if(s[c] >= '0' && s[c] <= '9') specialchar = specialchar*10+ (s[c]-'0');
  667                 else break;
  668             }
  669             goto AddChar;
  670         }
  671         for(e=0; (c < s.size() && s[c]!=';'); ++e, ++c)
  672         {
  673             if(s[c] >= '0' && s[c] <= '9') { if(!e)break; continue; }
  674             if(s[c] >= 'A' && s[c] <= 'Z')continue;
  675             if(s[c] >= 'a' && s[c] <= 'z')continue;
  676             if(s[c] == '.')continue;
  677             break;
  678         }
  679         {wstring entname = s.substr(a+1, e);
  680          specialchar = FindEntity(entname);
  681         }
  682         if(specialchar != ilseq)
  683         {
  684 AddChar:    res += s.substr(b, a-b);
  685             res += specialchar;
  686             a += e+1;
  687             if(a == s.size())break;
  688             if(s[a] == ';')
  689                 ++a;
  690             else
  691             {
  692                 /* FIXME: What if &abc; -code did not have ';' ? */
  693             }
  694             b = a;
  695             continue;
  696         }
  697         /* Unrecognized &..; -code. Don't parse it. */
  698         ++a;
  699         continue;
  700     }
  701     if(a > b) res += s.substr(b, a-b);
  702     return res;
  703 }
  704 
  705 
  706 void Page::Dump() const
  707 {
  708     if(signature)
  709     {
  710         if(Dumper.isok(ucsig))
  711             Putc(ucsig);
  712         else
  713         {
  714             fprintf(stderr,
  715                 "Warning: Target encoding can't express unicode signature character. Not signing.\n");
  716         }
  717     }
  718     
  719     for(unsigned a=0; a<Structure.size(); ++a)
  720     {
  721         const Element *e = Structure[a];
  722         if(const ElemBody *elem = dynamic_cast<const ElemBody *> (e))
  723         {
  724             DumpHTML(elem->body);
  725         }
  726         else if(const ElemTag *elem = dynamic_cast<const ElemTag *> (e))
  727         {
  728             DumpTag(elem->tag);
  729         }
  730         else if(const ElemRaw *elem = dynamic_cast<const ElemRaw *> (e))
  731         {
  732             DumpRaw(elem->data);
  733         }
  734         else if(const ElemPI *elem = dynamic_cast<const ElemPI *> (e))
  735         {
  736             DumpPI(elem->pi);
  737         }
  738     }
  739 }
  740 
  741 void Page::SetOut(const char *outset)
  742 {
  743     Dumper.SetSet(outset);
  744 
  745     for(unsigned a=0; a<Structure.size(); ++a)
  746     {
  747         Element *e = Structure[a];
  748         if(ElemTag *elem = dynamic_cast<ElemTag *> (e))
  749         {
  750             struct Tag &tag = elem->tag;
  751             if(!tag.Is("META"))
  752             {
  753                 /* We are only interested in meta-tags. */
  754                 continue;
  755             }
  756             
  757             Tag::iterator i;
  758 
  759             if(!tag.HasParamCalled("HTTP-EQUIV")
  760             || !IsEqual(tag.GetParamValue("HTTP-EQUIV"), "CONTENT-TYPE")) continue;
  761             
  762             wstring &s = tag.GetParamValue("CONTENT");
  763                
  764             wstring tmp; tmp += "charset=";
  765             size_t a = s.find(tmp);
  766             if(a == s.npos) { continue; }
  767             a += 8;
  768             
  769             tmp = s.substr(0, a);
  770             tmp += outset;
  771             s = tmp;
  772         }
  773         else if(ElemPI *elem = dynamic_cast<ElemPI *> (e))
  774         {
  775             struct PI &pi = elem->pi;
  776             if(pi.Is("?XML"))
  777             {
  778                 if(!pi.HasParamCalled("ENCODING")) continue;
  779                 
  780                 wstring tmp; tmp += outset;
  781                 pi.ReplaceParam("ENCODING", tmp);
  782                 
  783                 pi.Reconstruct();
  784             }
  785         }
  786     }
  787 }
  788 
  789 void Page::FilterText(wstring (*proc)(const wstring &))
  790 {
  791     for(unsigned a=0; a<Structure.size(); ++a)
  792     {
  793         Element *e = Structure[a];
  794         if(ElemBody *elem = dynamic_cast<ElemBody *> (e))
  795         {
  796             wstring &body = elem->body;
  797             body = proc(body);
  798         }
  799         else if(ElemTag *elem = dynamic_cast<ElemTag *> (e))
  800         {
  801             struct Tag &tag = elem->tag;
  802             if(tag.Is("IMG"))
  803             {
  804                 Tag::iterator i;
  805                 for(i=tag.items.begin(); i!=tag.items.end(); ++i)
  806                 {
  807                     Tag::Param *p = *i;
  808                     if(Tag::ParamParam *param = dynamic_cast<Tag::ParamParam *> (p))
  809                         if(IsEqual(param->name, "ALT"))
  810                             param->value = proc(param->value);
  811                 }
  812             }
  813             else if(tag.Is("A"))
  814             {
  815                 Tag::iterator i;
  816                 for(i=tag.items.begin(); i!=tag.items.end(); ++i)
  817                 {
  818                     Tag::Param *p = *i;
  819                     if(Tag::ParamParam *param = dynamic_cast<Tag::ParamParam *> (p))
  820                         if(IsEqual(param->name, "TITLE"))
  821                             param->value = proc(param->value);
  822                 }
  823             }
  824         }
  825         else
  826         {
  827             // Nothing interesting in ElemRaw
  828             // Are there other types of elements?
  829         }
  830     }
  831 }
  832     
  833 static void ParseInConv(FILE *fp, const char *inset, int fd)
  834 {
  835     /* Deallocate all possible unnecessary resources */
  836     if(fileno(fp) != 0 && fd != 0) close(0);
  837     if(fileno(fp) != 1 && fd != 0) close(1);
  838     chdir("/");
  839     //signal(SIGHUP, _exit); - probably unsafe and redundant
  840     
  841     iconv_t converter = iconv_open(midset, inset);
  842     if(converter == (iconv_t)(-1))
  843     {
  844         perror("iconv_open");
  845         _exit(1);
  846     }
  847     
  848     char InBuf[4096];
  849     char OutBuf[4096];
  850     
  851     char *bufptr = InBuf;
  852     size_t bytes = 0;
  853     
  854     for(;;)
  855     {
  856         size_t code = fread(bufptr+bytes, 1, sizeof InBuf - bytes, fp);
  857         if(code <= 0)
  858         {
  859             if(!bytes)break;
  860         }
  861         else
  862             bytes += code;
  863 
  864     ReCode:
  865         char *outptr = OutBuf;
  866         size_t outsize = sizeof OutBuf;
  867         
  868         bool needspace = false;
  869         bool gotilseq = false;
  870     #if DEBUG
  871         fprintf(stderr, "P2:Converting %u bytes to %u bytes space\n", bytes, outsize);
  872         size_t bytesread = bytes;
  873     #endif
  874         size_t converted = outsize;
  875         size_t retval = iconv(converter, &bufptr, &bytes, &outptr, &outsize);
  876         converted -= outsize;
  877     #if DEBUG
  878         bytesread -= bytes;
  879     #endif
  880         write(fd, OutBuf, converted);
  881         
  882         if(retval == (size_t)-1)
  883         {
  884     #if DEBUG
  885             perror("iconv");
  886     #endif
  887             if(errno == E2BIG)
  888             {
  889                 needspace = true;
  890             }
  891             if(errno == EILSEQ)
  892             {
  893                 gotilseq = true;
  894             }
  895             if(errno == EINVAL)
  896             {
  897                 /* Got partial byte and the sequence terminates after that */
  898                 if(code == 0)
  899                 {
  900                     /* It's an error if we're at eof */
  901                     gotilseq = true;
  902                 }
  903             }
  904         }
  905     #if DEBUG
  906         fprintf(stderr, "%u bytes read, %u bytes left, %u bytes generated, %u bytes space left\n", bytesread, bytes, converted, outsize);
  907         fflush(stderr);
  908     #endif
  909         if(gotilseq)
  910         {
  911             write(fd, &ilseq, sizeof(ilseq));
  912             /* Skip the invalid byte */
  913             --bytes; ++bufptr;
  914         }
  915         
  916         if(needspace)
  917         {
  918             /* No need to retry reading, just want more space */
  919             goto ReCode;
  920         }
  921         
  922         memmove(&InBuf[0], bufptr, bytes);
  923         bufptr = InBuf;
  924     }
  925     
  926     iconv_close(converter);
  927 }
  928 
  929 void Page::Parse(FILE *fp, const char *charset)
  930 {
  931     string inset = charset;
  932 ReHandle:
  933     if(verbose >= 1)
  934     {
  935         fprintf(stderr, "Recoding input (%s) to %s\n", inset.c_str(), midset);
  936         fflush(stderr);
  937     }
  938     int pip[2]; pipe(pip);
  939     int pid = fork();
  940     if(!pid)
  941     {
  942         close(pip[0]);
  943         ParseInConv(fp, inset.c_str(), pip[1]);
  944         _exit(0);
  945     }
  946     close(pip[1]);
  947     FILE *pipfp = fdopen(pip[0], "rb");
  948     
  949     Structure.clear();
  950     ParseUCS4(pipfp);
  951     
  952     fclose(pipfp);
  953     kill(pid, SIGHUP);
  954     waitpid(pid, NULL, 0);
  955 
  956     set<wstring> newcharset;
  957     for(unsigned a=0; a<Structure.size(); ++a)
  958     {
  959         const Element *e = Structure[a];
  960         if(const ElemTag *elem = dynamic_cast<const ElemTag *> (e))
  961         {
  962             const struct Tag &tag = elem->tag;
  963             if(!tag.Is("META"))
  964             {
  965                 /* We are only interested in meta-tags. */
  966                 continue;
  967             }
  968             
  969             if(!tag.HasParamCalled("HTTP-EQUIV")
  970             || !IsEqual(tag.GetParamValue("HTTP-EQUIV"), "CONTENT-TYPE")) continue;
  971                
  972             wstring s = tag.GetParamValue("CONTENT");
  973 
  974             wstring tmp; tmp += "charset=";
  975             size_t a = s.find(tmp);
  976             if(a == s.npos) { continue; }
  977             
  978             wstring way = s.substr(a+8);
  979             for(a=0; a<way.size(); ++a)
  980                 if(way[a] < 0x100)
  981                     way[a] = toupper(way[a]);
  982             newcharset.insert(way);
  983         }
  984         else if(const ElemPI *elem = dynamic_cast<const ElemPI *> (e))
  985         {
  986             const struct PI &pi = elem->pi;
  987             if(pi.Is("?XML"))
  988             {
  989                 if(!pi.HasParamCalled("ENCODING")) continue;
  990                 
  991                 wstring way = pi.GetParamValue("ENCODING");
  992                 for(unsigned a=0; a<way.size(); ++a)
  993                     if(way[a] < 0x100)
  994                         way[a] = toupper(way[a]);
  995                 newcharset.insert(way);
  996             }
  997         }
  998     }
  999     
 1000     if(newcharset.size() > 0)
 1001     {
 1002         if(newcharset.size() > 1)
 1003         {
 1004             fprintf(stderr, "Error: The document is schizophrenic and claims to be encoded in various ways:");
 1005             set<wstring>::const_iterator i;
 1006             size_t c=newcharset.size();
 1007             for(i=newcharset.begin(); i!=newcharset.end(); ++i)
 1008             {
 1009                 string s = Stringify(*i);
 1010                 fprintf(stderr, " %s%s",
 1011                     s.c_str(),
 1012                     (--c==1 ? " and" : c>1 ? "," : ".\n"));
 1013             }
 1014             return;
 1015         }
 1016         wstring newset = *newcharset.begin();
 1017         
 1018         if(!IsEqual(newset, inset.c_str()))
 1019         {
 1020             inset = Stringify(newset);
 1021             
 1022             if(verbose >= 0)
 1023             {
 1024                 fprintf(stderr,
 1025                     "Warning: Document character encoding seems to be %s, which differs from what you specified. Rereading.\n",
 1026                     inset.c_str());
 1027             }
 1028             
 1029             if(fseek(fp, 0, SEEK_SET) == -1)
 1030             {
 1031                 if(errno == ESPIPE
 1032                 || errno == EBADF)
 1033                 {
 1034                     fprintf(stderr, "Error: stdin is not seekable. Can not reread! You should use the -I%s option. Or perhaps you committed a Useless Use Of Cat.\n",
 1035                         inset.c_str());
 1036                     return;
 1037                 }
 1038                 else
 1039                     perror("fseek");
 1040             }
 1041            
 1042             goto ReHandle;
 1043         }
 1044     }
 1045 }
 1046 
 1047 void Page::ParseUCS4(FILE *fp)
 1048 {
 1049     wstring body, comment, rawcontent;
 1050     
 1051     enum states
 1052     {
 1053         stBody,
 1054         stTagName,
 1055         stTagSpace,
 1056         stTagSGML,
 1057         stTagSGMLquoted,
 1058         stTagSGMLquoted2,
 1059         stTagParam,
 1060         stTagParamValue,
 1061         stTagParamValueQuoted,
 1062         stTagParamValueQuoted2,
 1063 #if 1
 1064         stRawContent,
 1065         stMaybeEndRawContent,
 1066         stRawContentTagName,
 1067 #endif
 1068         stMaybeComment,
 1069         stComment,
 1070         stMaybeEndComment,
 1071         stXMLpiname,
 1072         stXMLpi,
 1073         stXMLpi2
 1074     } state = stBody;
 1075     
 1076     wstring ParamSpace;
 1077     wstring ParamName;
 1078     wstring ParamValue;
 1079     wstring RawCurrentTag;
 1080     wstring ParamContent;
 1081     Tag Tag;
 1082     
 1083     bool firstbyte = true;
 1084     for(;;)
 1085     {
 1086         int c = Getc(fp);
 1087         if(feof(fp))break;
 1088         
 1089         if(firstbyte && c == ucsig)
 1090         {
 1091             if(!signature)
 1092                 fprintf(stderr,
 1093                     "Warning: Found an unicode signature. Will put one to the output too.\n");
 1094             signature = true;
 1095             continue;
 1096         }
 1097         
 1098         firstbyte = false;
 1099         
 1100 Statechange:
 1101         if(ParamSpace.size() && state != stTagSpace)
 1102         {
 1103             Tag.AddSpace(ParamSpace);
 1104             CLEARSTR(ParamSpace);
 1105         }
 1106         //fprintf(stderr, "State=%d, c=%c\n", state, c);
 1107         switch(state)
 1108         {
 1109             case stBody:
 1110                 if(c != '<')
 1111                 {
 1112                     body += c;
 1113                     break;
 1114                 }
 1115                 if(body.size())
 1116                 {
 1117                     push_back(new ElemBody(htmldecode(body)));
 1118                     CLEARSTR(body);
 1119                 }
 1120                 
 1121                 Tag.clear();
 1122                 state = stTagName;
 1123                 break;
 1124 
 1125             case stTagName:
 1126                 if(c == '?' && !Tag.Name.size())
 1127                 {
 1128                     state = stXMLpiname;
 1129                     CLEARSTR(ParamName);
 1130                     goto Statechange;
 1131                 }
 1132                 // Note: this code is almost duplicated in stRawContentTagName
 1133                 if((c == '!' && !Tag.Name.size())
 1134                 || (c == '/' && !Tag.Name.size())
 1135                 || (c >= 'A' && c <= 'Z')
 1136                 || (c >= 'a' && c <= 'z')
 1137                 || (c >= '0' && c <= '9' && Tag.Name.size())
 1138                 || c == '_'
 1139                 || c == '.'
 1140                 || c == ':' /* No - here, it breaks comments */
 1141                   )
 1142                 {
 1143                     Tag.Name += c;
 1144                     break;
 1145                 }
 1146                 CLEARSTR(ParamName);
 1147                 state = stTagSpace;
 1148                 goto Statechange;
 1149                 
 1150             case stTagSpace:
 1151                 if(c == '-')
 1152                 {
 1153                     state = stMaybeComment;
 1154                     break;
 1155                 }
 1156                 if(c == '>')
 1157                 {
 1158                     if(ParamSpace.size())
 1159                     {
 1160                         Tag.AddSpace(ParamSpace);
 1161                         CLEARSTR(ParamSpace);
 1162                     }
 1163                     if(ParamName.size())
 1164                     {
 1165                         Tag.SetKey(ParamName);
 1166                         CLEARSTR(ParamName);
 1167                     }
 1168                     push_back(new ElemTag(Tag));
 1169                     
 1170                     state = stBody;
 1171                     
 1172                     if(!strict)
 1173                     {
 1174                         if(Tag.Is("SCRIPT")
 1175                         || Tag.Is("STYLE"))
 1176                         {
 1177                             RawCurrentTag = Tag.Name;
 1178                             RawCurrentTag.insert(0, 1, (ucs4)'/');
 1179                             state = stRawContent;
 1180                         }
 1181                     }
 1182 
 1183                     Tag.clear();
 1184                     
 1185                     CLEARSTR(body);
 1186                     break;
 1187                 }
 1188                 if(Tag.Name.size() && Tag.Name[0] == '!')
 1189                 {
 1190                     if(c != ' ' && c != '\t' && c != '\v' && c != '\r' && c != '\n')
 1191                     {
 1192                         if(ParamName.size())
 1193                             ParamName += ' ';
 1194                         state = stTagSGML;
 1195                         goto Statechange;
 1196                     }
 1197                 }
 1198                 
 1199                 if(c == '/')
 1200                 {
 1201                     CLEARSTR(ParamSpace);
 1202                     Tag.terminating = true;
 1203                     break;
 1204                 }
 1205                 
 1206                 if((c >= 'A' && c <= 'Z')
 1207                 || (c >= 'a' && c <= 'z'))
 1208                 {
 1209                     state = stTagParam;
 1210                     goto Statechange;
 1211                 }
 1212                 
 1213                 ParamSpace += c;
 1214                 break;
 1215             
 1216             case stTagSGML:
 1217                 if(c == '-')
 1218                 {
 1219                     state = stMaybeComment;
 1220                     break;
 1221                 }
 1222                 if(c == '>')
 1223                 {
 1224                     state = stTagSpace;
 1225                     goto Statechange;
 1226                 }
 1227                 ParamName += c;
 1228                 if(c == '"')
 1229                     state = stTagSGMLquoted;
 1230                 else if(c == '\'')
 1231                     state = stTagSGMLquoted2;
 1232                 break;
 1233             
 1234             case stTagSGMLquoted:
 1235                 ParamName += c;
 1236                 if(c == '"')
 1237                     state = stTagSGML;
 1238                 break;
 1239 
 1240             case stTagSGMLquoted2:
 1241                 ParamName += c;
 1242                 if(c == '\'')
 1243                     state = stTagSGML;
 1244                 break;
 1245 
 1246             case stTagParam:
 1247                 if((c >= 'A' && c <= 'Z')
 1248                 || (c >= 'a' && c <= 'z')
 1249                 || c == ':'
 1250                 || c == '.'
 1251                 || c == '_'
 1252                 || c == '-'
 1253                   )
 1254                 {
 1255                     ParamName += c;
 1256                     break;
 1257                 }
 1258 
 1259                 if(c == '=')
 1260                 {
 1261                     CLEARSTR(ParamValue);
 1262                     state = stTagParamValue;
 1263                     break;
 1264                 }
 1265 
 1266                 Tag.SetKey(ParamName);
 1267                 CLEARSTR(ParamName);
 1268                 state = stTagSpace;
 1269                 goto Statechange;
 1270             
 1271             case stTagParamValue:
 1272                 if(!ParamValue.size())
 1273                 {
 1274                     if(c == '"')
 1275                     {
 1276                         state = stTagParamValueQuoted;
 1277                         break;
 1278                     }
 1279                     if(c == '\'')
 1280                     {
 1281                         state = stTagParamValueQuoted2;
 1282                         break;
 1283                     }
 1284                 }
 1285                 
 1286                 if((c >= 'A' && c <= 'Z')
 1287                 || (c >= 'a' && c <= 'z')
 1288                 || (c >= '0' && c <= '9')
 1289                 || c == '.' || c == '-' /* .-_: are allowed by sgml */
 1290                 || c == '_' || c == ':')
 1291                 {
 1292                     ParamValue += c;
 1293                     break;
 1294                 }
 1295                 else if(!strict
 1296                     && (c!=' ' && c!='\t' && c!='\n' && c!='\r' && c!='\v' && c!='>')
 1297                        ) /* generated by many programs :-/ */
 1298                 {
 1299                     ParamViolations++;
 1300                     ParamValue += c;
 1301                     break;
 1302                 }
 1303                 Tag.SetParam(ParamName, htmldecode(ParamValue));
 1304                 CLEARSTR(ParamName);
 1305                 CLEARSTR(ParamValue);
 1306                 state = stTagSpace;
 1307                 goto Statechange;
 1308             
 1309             case stTagParamValueQuoted:
 1310                 if(c == '"')
 1311                 {
 1312                     Tag.SetParam(ParamName, htmldecode(ParamValue));
 1313                     CLEARSTR(ParamName);
 1314                     CLEARSTR(ParamValue);
 1315                     state = stTagSpace;
 1316                     break;
 1317                 }
 1318                 ParamValue += c;
 1319                 break;
 1320 
 1321             case stTagParamValueQuoted2:
 1322                 if(c == '\'')
 1323                 {
 1324                     Tag.SetParam(ParamName, htmldecode(ParamValue));
 1325                     CLEARSTR(ParamName);
 1326                     CLEARSTR(ParamValue);
 1327                     state = stTagSpace;
 1328                     break;
 1329                 }
 1330                 ParamValue += c;
 1331                 break;
 1332 
 1333             case stMaybeComment:
 1334                 if(c == '-')
 1335                 {
 1336                     state = stComment;
 1337                     CLEARSTR(comment);
 1338                     break;
 1339                 }
 1340                 
 1341                 ParamName += '-';
 1342                 
 1343                 state = stTagSpace;
 1344                 goto Statechange;
 1345             
 1346             case stComment:
 1347                 if(c == '-')
 1348                 {
 1349                     state = stMaybeEndComment;
 1350                     break;
 1351                 }
 1352                 comment += c;
 1353                 break;
 1354             
 1355             case stMaybeEndComment:
 1356                 if(c == '-')
 1357                 {
 1358                     if(comment.size())
 1359                     {
 1360                         Tag.AddComment(comment);
 1361                         CLEARSTR(comment);
 1362                     }
 1363                     
 1364                     state = stTagSpace;
 1365                     break;
 1366                 }
 1367                 comment += '-';
 1368                 state = stComment;
 1369                 goto Statechange;
 1370 #if 1
 1371             case stRawContent:
 1372                 // end with </
 1373                 if(c == '<')
 1374                 {
 1375                     state = stMaybeEndRawContent;
 1376                     break;
 1377                 }
 1378                 rawcontent += c;
 1379                 break;
 1380             
 1381             case stXMLpiname:
 1382                 if(c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\v')
 1383                 {
 1384                     CLEARSTR(ParamContent);
 1385                     state = stXMLpi;
 1386                     break;
 1387                 }
 1388                 ParamName += c;
 1389                 break;
 1390 
 1391             case stXMLpi:
 1392                 if(c == '?')
 1393                 {
 1394                     state = stXMLpi2;
 1395                     break;
 1396                 }
 1397                 ParamContent += c;
 1398                 break;
 1399                 
 1400             case stXMLpi2:
 1401                 if(c != '>')
 1402                 {
 1403                     ParamContent += '?';
 1404                     // not end of tag
 1405                     state = stXMLpi;
 1406                     goto Statechange;
 1407                 }
 1408                 
 1409                 push_back(new ElemPI(PI(ParamName, ParamContent)));
 1410                 
 1411                 Tag.clear();
 1412                 CLEARSTR(ParamContent);
 1413                 
 1414                 // tag ends here
 1415                 xmlmode = true;
 1416                 state = stBody;
 1417                 break;
 1418 
 1419             case stMaybeEndRawContent:
 1420                 if(c == '/')
 1421                 {
 1422                     CLEARSTR(Tag.Name);
 1423                     state = stRawContentTagName;
 1424                     goto Statechange;
 1425                 }
 1426                 rawcontent += '<';
 1427                 state = stRawContent;
 1428                 goto Statechange;
 1429             
 1430             case stRawContentTagName:
 1431                 // Note: this code is duplicate from stTagName
 1432                 if((c == '!' && !Tag.Name.size())
 1433                 || (c == '/' && !Tag.Name.size())
 1434                 || (c >= 'A' && c <= 'Z')
 1435                 || (c >= 'a' && c <= 'z')
 1436                 || (c >= '0' && c <= '9' && Tag.Name.size())
 1437                 || (c == '_' || c == '.'
 1438                  || c == ':') /* No - here, it breaks comments */
 1439                   )
 1440                 {
 1441                     Tag.Name += c;
 1442                     break;
 1443                 }
 1444                 
 1445                 if(Tag.Is(RawCurrentTag))
 1446                 {
 1447                     if(rawcontent.size())
 1448                     {
 1449                         push_back(new ElemRaw(rawcontent));
 1450                         CLEARSTR(rawcontent);
 1451                     }
 1452                     
 1453                     CLEARSTR(ParamName);
 1454                     state = stTagSpace;
 1455                     goto Statechange;
 1456                 }
 1457                 rawcontent += '<';
 1458                 rawcontent += Tag.Name;
 1459                 state = stRawContent;
 1460                 goto Statechange;
 1461 #endif
 1462         }
 1463     }
 1464     if(body.size())
 1465     {
 1466         push_back(new ElemBody(htmldecode(body)));
 1467         CLEARSTR(body);
 1468     }
 1469     if(rawcontent.size())
 1470     {
 1471         push_back(new ElemRaw(rawcontent));
 1472         CLEARSTR(rawcontent);
 1473     }
 1474     /* Don't add a broken tag to the end. */
 1475 }
 1476 
 1477 #ifdef GXBLT
 1478 /* This is a support for altering the text content */
 1479 static wstring testifiltteri(const wstring &var)
 1480 {
 1481     wstring res;
 1482     bool found = false;
 1483     for(unsigned a=0; a<var.size(); ++a)
 1484         if(var[a] != ' '
 1485         && var[a] != '\n'
 1486         && var[a] != '\r'
 1487         && var[a] != '\t')
 1488         {
 1489             found = true;
 1490             break;
 1491         }
 1492     if(!found) return var;
 1493     
 1494     for(unsigned a=0; a<5; ++a) res += "gxblt"[a];
 1495 /*
 1496     // res += '[';
 1497     res += var;
 1498     // res += ']';
 1499 */
 1500     return res;
 1501 }
 1502 #endif
 1503 
 1504 #include <argh.hh>
 1505 
 1506 int main(int argc, const char *const *argv)
 1507 {
 1508     string outset = "iso-8859-1";
 1509     string inset = "iso-8859-1";
 1510     
 1511     ParamHandler Argh;
 1512     Argh.AddLong("inset",   'I').SetString().SetDesc("Assumed input character set (default: "+inset+")", "setname");
 1513     Argh.AddLong("outset",  'O').SetString().SetDesc("Wanted output character set (default: "+outset+")", "setname");
 1514     Argh.AddLong("help",    'h').SetBool().SetDesc("This help.");
 1515     Argh.AddLong("lossy",   'l').SetBool().SetDesc("Disable lossless conversion.");
 1516     Argh.AddLong("usehex",  'e').SetBool().SetDesc("Use hexadecimal escapes.");
 1517     Argh.AddLong("version", 'V').SetBool().SetDesc("Displays version information.");
 1518     Argh.AddLong("strict",  's').SetBool().SetDesc("Turn off support for slightly broken HTML.");
 1519     Argh.AddLong("verbose", 'v').SetBool().SetDesc("Be less quiet.");
 1520     Argh.AddLong("quiet",   'q').SetBool().SetDesc("Be less verbose.");
 1521     Argh.AddLong("xmlmode", 'x').SetBool().SetDesc("XML mode: all tag param values quoted.");
 1522     Argh.AddLong("signature",'g').SetBool().SetDesc("Prefix the file with an unicode signature.");
 1523 
 1524     Argh.StartParse(argc, argv);
 1525     for(;;)
 1526     {
 1527         long c = Argh.GetParam();
 1528         if(c == -1)break;
 1529         switch(c)
 1530         {
 1531             case 'V': printf("%s\n", VERSION); return 0;
 1532             case 'I': inset = Argh.GetString(); break;
 1533             case 'O': outset = Argh.GetString(); break;
 1534             case 'l': lossless = !Argh.GetBool(); break;
 1535             case 'e': usehex = Argh.GetBool(); break;
 1536             case 's': strict = Argh.GetBool(); break;
 1537             case 'v': verbose += Argh.GetBool() ? 1 : -1; break;
 1538             case 'q': verbose -= Argh.GetBool() ? 1 : -1; break;
 1539             case 'x': xmlmode = Argh.GetBool(); break;
 1540             case 'g': signature = Argh.GetBool(); break;
 1541             case 'h':
 1542                 printf(
 1543                     "htmlrecode " VERSION " - Copyright (C) 1992,2003 Bisqwit (http://iki.fi/bisqwit/)\n"
 1544                     "\n"
 1545                     "Usage: htmlrecode [<option> [<...>]]\n"
 1546                     "\n"
 1547                     "Reads stdin, writes stdout.\n"
 1548                     "\nOptions:\n");
 1549                 Argh.ListOptions();
 1550                 printf("\n"
 1551                     "Pipe in the html file and pipe the output to result file.\n");
 1552                 return 0;
 1553             default:
 1554                 // TODO
 1555                 break;
 1556         }
 1557     }
 1558     if(!Argh.ok())return -1;
 1559 
 1560     Page p;
 1561 
 1562     rewind(stdin);
 1563     p.Parse(stdin, inset.c_str());
 1564     fclose(stdin);
 1565     
 1566     /* This is a support for altering the text content */
 1567 #ifdef GXBLT
 1568     p.FilterText(testifiltteri);
 1569 #endif
 1570     
 1571     p.SetOut(outset.c_str());
 1572     p.Dump();
 1573     
 1574     if(FixedStyleScript && verbose >= 1)
 1575     {
 1576         fprintf(stderr,
 1577             "Warning: Fixed %u SCRIPT/STYLE block%s that %sn't properly hidden with HTML comments. Be grateful.\n",
 1578                 FixedStyleScript,
 1579                 FixedStyleScript==1 ? "" : "s",
 1580                 FixedStyleScript==1 ? "was" : "were");
 1581     }
 1582     if(ParamViolations && verbose >= 1)
 1583     {
 1584         fprintf(stderr,
 1585             "Warning: Fixed %u broken (incorrectly unquoted) tag parameter%s. Be grateful.\n",
 1586                 ParamViolations,
 1587                 ParamViolations==1 ? "" : "s"
 1588                );
 1589     }
 1590     
 1591     return 0;
 1592 }