"Fossies" - the Fresh Open Source Software Archive 
Member "htmlrecode-1.3.1/htmlparser2.cc" (21 Jul 2009, 3633 Bytes) of package /linux/www/old/htmlrecode-1.3.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "htmlparser2.cc" see the
Fossies "Dox" file reference documentation.
1 /* 26.7.2000 - This program does not do anything useful.
2 * Ignore this source file. // Bisqwit
3 */
4 #include <map>
5 #include <string>
6 #include <iostream>
7
8 using namespace std;
9
10 static class tagi
11 {
12 private:
13 string tagname;
14 public:
15 tagi()
16 {
17 }
18 tagi operator() (const string &s, unsigned &a, const unsigned b)
19 {
20 tagi tmp;
21
22 string w;
23 ++a;
24 if(a < b && s[a] == '/') w += s[a++];
25 while(a < b)
26 {
27 if(w == "!--")
28 {
29 while(a < b)
30 {
31 if(s.substr(a, 3)=="-->")
32 {
33 a += 3;
34 break;
35 }
36 ++a;
37 }
38 return tmp;
39 }
40 unsigned char c = s[a];
41 if(c >= 'A' && c <= 'Z') { w += c; ++a; continue; }
42 if(c >= 'a' && c <= 'z') { w += c+'A'-'a'; ++a; continue; }
43 break;
44 }
45 while(a < b)
46 {
47 unsigned char c = s[a];
48 if(c == '>')
49 {
50 ++a;
51 break;
52 }
53 if(c == '"')
54 {
55 ++a;
56 while(a < b)
57 {
58 c = s[a];
59 if(c == '"')break;
60 ++a;
61 }
62 }
63 ++a;
64 }
65 return w;
66 }
67 } tagi;
68
69 static void sana(const string &s, unsigned &a, const unsigned b)
70 {
71 string w;
72 while(a < b)
73 {
74 unsigned char c = s[a];
75 if((c >= 'A' && c <= 'Z')
76 || (c >= 'a' && c <= 'z')
77 || (c >= '0' && c <= '9')
78 || (c >= 160 && c <= 250))
79 {
80 ok: w += c;
81 a++;
82 continue;
83 }
84 if(c == '&')
85 {
86 string esc;
87 while(++a < b)
88 {
89 c = s[a];
90 if(c == ';')break;
91 esc += c;
92 }
93 esc += ' '; // to prevent overflow at esc[0]
94 bool uml = (esc.substr(1) == "uml ");
95 bool ring = (esc.substr(1) == "ring ");
96 bool acute = (esc.substr(1) == "acute ");
97 bool grave = (esc.substr(1) == "grave ");
98 c = ' ';
99 switch(esc[0])
100 {
101 case 'a': c = " äáàå"[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
102 case 'e': c = " ëéè "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
103 case 'i': c = " ïíì "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
104 case 'o': c = " öóò "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
105 case 'u': c = " üúù "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
106 case 'y': c = " ÿýy "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
107 case 'A': c = " ÄÁÁÅ"[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
108 case 'E': c = " ËÉÈ "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
109 case 'I': c = " ÏÍÌ "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
110 case 'O': c = " ÖÓÒ "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
111 case 'U': c = " ÜÚÙ "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
112 case 'Y': c = " YÝY "[uml+acute*2+grave*3+ring*4]; if(c!=' ')break;
113 }
114 if(c != ' ')goto ok;
115 ++a; // skip ';'
116 }
117 // At this point, char was illegal.
118 if(!w.size())++a;
119 break;
120 }
121 // At this point, word has been found, and a points to next char after it (or after &esc; if found).
122 if(w.size() && (w[0]<'0' || w[0]>'9'))
123 ++sanat[w];
124 }
125 static void parse(const string &s)
126 {
127 unsigned a=0, b=s.size();
128 while(a < b)
129 {
130 char c = s[a];
131 if(c == '<')
132 {
133 tagi(s,a,b);
134 // cout << "TAG: " << tagi(s,a,b) << endl;
135 }
136 else
137 {
138 if(c==' ' || c=='\n' || c=='\t' || c=='\r')
139 {
140 ++a;
141 continue;
142 }
143 sana(s,a,b);
144 }
145 }
146 }
147 int main()
148 {
149 cerr << "Usage: cat *.html | htmlparser1 | sort -rn | head -n 100\n";
150 string s;
151 while(cin.good()) { string t; getline(cin, t); s += t; s += '\n'; }
152 parse(s);
153 for(stype::iterator i=sanat.begin(); i!=sanat.end(); ++i)
154 cout << i->second << ':' << i->first << endl;
155 return 0;
156 }