"Fossies" - the Fresh Open Source Software Archive

Member "htmlrecode-1.3.1/htmlparser2.cc" (21 Jul 2009, 3633 Bytes) of package /linux/www/old/htmlrecode-1.3.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "htmlparser2.cc", see the Fossies "Dox" file reference documentation.

    1 /* 26.7.2000 - This program does not do anything useful.
    2  *             Ignore this source file. // Bisqwit
    3  */
    4 #include <map>
    5 #include <string>
    6 #include <iostream>
    7 
    8 using namespace std;
    9 
   10 static class tagi
   11 {
   12 private:
   13     string tagname;
   14 public:
   15     tagi()
   16     {
   17     }
   18     tagi operator() (const string &s, unsigned &a, const unsigned b)
   19     {
   20         tagi tmp;
   21         
   22         string w;
   23         ++a;
   24         if(a < b && s[a] == '/') w += s[a++];
   25         while(a < b)
   26         {
   27             if(w == "!--")
   28             {
   29                 while(a < b)
   30                 {
   31                     if(s.substr(a, 3)=="-->")
   32                     {
   33                         a += 3;
   34                         break;
   35                     }
   36                     ++a;
   37                 }
   38                 return tmp;
   39             }
   40             unsigned char c = s[a];
   41             if(c >= 'A' && c <= 'Z') { w += c;         ++a; continue; }
   42             if(c >= 'a' && c <= 'z') { w += c+'A'-'a'; ++a; continue; }
   43             break;
   44         }
   45         while(a < b)
   46         {
   47             unsigned char c = s[a];
   48             if(c == '>')
   49             {
   50                 ++a;
   51                 break;
   52             }
   53             if(c == '"')
   54             {
   55                 ++a;
   56                 while(a < b)
   57                 {
   58                     c = s[a];
   59                     if(c == '"')break;
   60                     ++a;
   61                 }
   62             }
   63             ++a;
   64         }
   65         return w;
   66     }
   67 } tagi;
   68 
   69 static void sana(const string &s, unsigned &a, const unsigned b)
   70 {
   71     string w;
   72     while(a < b)
   73     {
   74         unsigned char c = s[a];
   75         if((c >= 'A' && c <= 'Z')
   76         || (c >= 'a' && c <= 'z')
   77         || (c >= '0' && c <= '9')
   78         || (c >= 160 && c <= 250))
   79         {
   80         ok: w += c;
   81             a++;
   82             continue;
   83         }
   84         if(c == '&')
   85         {
   86             string esc;
   87             while(++a < b)
   88             {
   89                 c = s[a];
   90                 if(c == ';')break;
   91                 esc += c;
   92             }
   93             esc += ' '; // to prevent overflow at esc[0]
   94             bool uml   = (esc.substr(1) == "uml ");
   95             bool ring  = (esc.substr(1) == "ring ");
   96             bool acute = (esc.substr(1) == "acute ");
   97             bool grave = (esc.substr(1) == "grave ");
   98             c = ' ';
   99             switch(esc[0])
  100             {
  101                 case 'a': c = " "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  102                 case 'e': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  103                 case 'i': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  104                 case 'o': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  105                 case 'u': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  106                 case 'y': c = " y "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  107                 case 'A': c = " "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  108                 case 'E': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  109                 case 'I': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  110                 case 'O': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  111                 case 'U': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  112                 case 'Y': c = " YY "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  113             }
  114             if(c != ' ')goto ok;
  115             ++a; // skip ';'
  116         }
  117         // At this point, char was illegal.
  118         if(!w.size())++a;
  119         break;
  120     }
  121     // At this point, word has been found, and a points to next char after it (or after &esc; if found).
  122     if(w.size() && (w[0]<'0' || w[0]>'9'))
  123         ++sanat[w];
  124 }
  125 static void parse(const string &s)
  126 {
  127     unsigned a=0, b=s.size();
  128     while(a < b)
  129     {
  130         char c = s[a];
  131         if(c == '<')
  132         {
  133             tagi(s,a,b);
  134         //  cout << "TAG: " << tagi(s,a,b) << endl;
  135         }
  136         else
  137         {
  138             if(c==' ' || c=='\n' || c=='\t' || c=='\r')
  139             {
  140                 ++a;
  141                 continue;
  142             }
  143             sana(s,a,b);
  144         }
  145     }
  146 }
  147 int main()
  148 {
  149     cerr << "Usage: cat *.html | htmlparser1 | sort -rn | head -n 100\n";
  150     string s;
  151     while(cin.good()) { string t; getline(cin, t); s += t; s += '\n'; }
  152     parse(s);
  153     for(stype::iterator i=sanat.begin(); i!=sanat.end(); ++i)
  154         cout << i->second << ':' << i->first << endl;
  155     return 0;
  156 }