"Fossies" - the Fresh Open Source Software Archive

Member "htmlrecode-1.3.1/htmlparser1.cc" (21 Jul 2009, 3695 Bytes) of package /linux/www/htmlrecode-1.3.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "htmlparser1.cc", see the Fossies "Dox" file reference documentation.

    1 /* 27.7.2000 - This program does not do anything useful.
    2  *             Ignore this source file. // Bisqwit
    3  */
    4 #include <map>
    5 #include <string>
    6 #include <iostream>
    7 #include <cctype>
    8 
    9 using namespace std;
   /* Thin wrapper around an unsigned counter.
    *
    * A default-constructed intti holds 0. It converts implicitly from and to
    * unsigned and supports pre-increment, so it can be used as the mapped type
    * of a word-count map via ++table[key].
    */
   class intti
   {
   private:
       unsigned i; // current counter value
   public:
       // Implicit on purpose: existing code may initialise an intti from a plain unsigned.
       intti(unsigned ii) : i(ii) { }
       intti() : i(0) { }
       intti &operator= (unsigned ii) { i = ii; return *this; }
       // const-qualified so that const objects / const references can be read too.
       operator unsigned() const { return i; }
       intti &operator++() { ++i; return *this; }
   };
   21 typedef map<string, intti> stype;
   22 static stype sanat;
   /* Consumes one markup tag (or comment) from s.
    *
    * On entry a must index the opening '<'; it is stepped over unconditionally.
    * On return a indexes the first character after the tag and never exceeds b.
    *
    *   s  text being scanned
    *   a  in/out cursor into s
    *   b  end of the scan range (callers pass s.size())
    *
    * Returns the tag name upper-cased, with a leading '/' for closing tags.
    * Only ASCII letters are collected into the name. Returns an empty string
    * for comments, which are skipped up to and including the closing "-->"
    * (or up to b when the comment is unterminated).
    *
    * Double-quoted attribute values may contain '>' without ending the tag.
    */
   static std::string tagi(const std::string &s, unsigned &a, const unsigned b)
   {
       ++a; // step over '<'

       // Comments must be recognised before name collection: '!' and '-' are
       // never part of a collected name, so the name can never equal "!--".
       if(a < b && s.compare(a, 3, "!--") == 0)
       {
           const std::string::size_type close = s.find("-->", a + 3);
           if(close == std::string::npos || close + 3 > b)
               a = b; // unterminated comment: swallow the rest of the range
           else
               a = static_cast<unsigned>(close + 3);
           return std::string();
       }

       std::string w;
       if(a < b && s[a] == '/') w += s[a++];

       // Collect the tag name, folding a-z to A-Z.
       for(; a < b; ++a)
       {
           const unsigned char c = s[a];
           if(c >= 'A' && c <= 'Z')      w += static_cast<char>(c);
           else if(c >= 'a' && c <= 'z') w += static_cast<char>(c - 'a' + 'A');
           else break;
       }

       // Skip the attributes up to and including the closing '>'.
       // Each character is consumed exactly once, so a cannot run past b even
       // when a quoted value is left unterminated.
       bool quoted = false;
       while(a < b)
       {
           const char c = s[a++];
           if(c == '"')
               quoted = !quoted;
           else if(c == '>' && !quoted)
               break;
       }
       return w;
   }
   70 static void sana(const string &s, unsigned &a, const unsigned b)
   71 {
   72     string w;
   73     while(a < b)
   74     {
   75         unsigned char c = s[a];
   76         if((c >= 'A' && c <= 'Z')
   77         || (c >= 'a' && c <= 'z')
   78         || (c >= '0' && c <= '9')
   79         || (c >= 160 && c <= 250))
   80         {
   81         ok: w += toupper(c);
   82             a++;
   83             continue;
   84         }
   85         if(c == '&')
   86         {
   87             string esc;
   88             while(++a < b)
   89             {
   90                 c = s[a];
   91                 if(c == ';')break;
   92                 esc += c;
   93             }
   94             esc += ' '; // to prevent overflow at esc[0]
   95             bool uml   = (esc.substr(1) == "uml ");
   96             bool ring  = (esc.substr(1) == "ring ");
   97             bool acute = (esc.substr(1) == "acute ");
   98             bool grave = (esc.substr(1) == "grave ");
   99             c = ' ';
  100             switch(toupper(esc[0]))
  101             {
  102                 case 'a': c = " "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  103                 case 'e': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  104                 case 'i': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  105                 case 'o': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  106                 case 'u': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  107                 case 'y': c = " y "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  108                 case 'A': c = " "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  109                 case 'E': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  110                 case 'I': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  111                 case 'O': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  112                 case 'U': c = "  "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  113                 case 'Y': c = " YY "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
  114             }
  115             if(c != ' ')goto ok;
  116             ++a; // skip ';'
  117             break;
  118         }
  119         // At this point, char was illegal.
  120         if(!w.size())++a;
  121         break;
  122     }
  123     // At this point, word has been found, and a points to next char after it (or after &esc; if found).
  124     if(w.size() && (w[0]<'0' || w[0]>'9'))
  125         ++sanat[w];
  126 }
  127 static void parse(const string &s)
  128 {
  129     unsigned a=0, b=s.size();
  130     while(a < b)
  131     {
  132         char c = s[a];
  133         if(c == '<')
  134         {
  135             tagi(s,a,b);
  136         //  cout << "TAG: " << tagi(s,a,b) << endl;
  137         }
  138         else
  139         {
  140             if(c==' ' || c=='\n' || c=='\t' || c=='\r')
  141             {
  142                 ++a;
  143                 continue;
  144             }
  145             sana(s,a,b);
  146         }
  147     }
  148 }
  149 int main()
  150 {
  151     cerr << "Usage: cat *.html | htmlparser1 | sort -rn | head -n 100\n";
  152     string s;
  153     while(cin.good()) { string t; getline(cin, t); s += t; s += '\n'; }
  154     parse(s);
  155     for(stype::iterator i=sanat.begin(); i!=sanat.end(); ++i)
  156         cout << i->second << ':' << i->first << endl;
  157     return 0;
  158 }