"Fossies" - the Fresh Open Source Software Archive 
Member "anonlog-1.0.1/anonlog.pl" (24 Oct 2002, 25491 Bytes) of package /linux/www/old/anonlog-1.0.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
1 #!/usr/bin/perl
2 #
3 ### anonlog 1.0.1 http://anonlog.sourceforge.net/
4 ### This program is copyright (c) Stephen R. E. Turner 2000-2002.
5 ### It is free software; you can redistribute it and/or modify
6 ### it under the terms of version 2 of the GNU General Public License as
7 ### published by the Free Software Foundation.
8 ###
9 ### This program is distributed in the hope that it will be useful,
10 ### but without any warranty; without even the implied warranty of
11 ### merchantability or fitness for a particular purpose. See the
12 ### GNU General Public License for more details.
13 ###
14 ### You should have received a copy of the GNU General Public License
15 ### along with this program; if not, see http://www.gnu.org/copyleft/gpl.html
16 ### or write to the Free Software Foundation, Inc., 59 Temple Place,
17 ### Suite 330, Boston, MA 02111-1307, USA
18
19 require 5.004; # for rand()
20 use strict;
21 my ($conffile, $logfile, $logformat, $newlog, $dictionary, $translations,
22 $servernames, $unchfiles, $matchlength,
23 $case_sensitive, $usercase_sensitive);
24
25 # ======== User-settable parameters start here ======== #
26 #
27 # NB All parameters can also be set in the configuration file, $conffile,
28 # normally anonlog.cfg. This is usually more convenient. The variables here
29 # have the same names as those in anonlog.cfg (with the addition of a $ at
30 # the front). So see Readme.html for documentation on the various options.
31 #
32 # On Unix, you can edit the top line of this program to give the location of
33 # Perl on your system. (Try 'which perl' to find out).
34 #
35 # The configuration file to override all these options.
36 # $conffile = ''; for none.
37 $conffile = 'anonlog.cfg';
38
39 $logfile = 'logfile.log';
40 $newlog = '';
41 $servernames = '';
42 $logformat = '';
43 $dictionary = 'dictionary';
44 $translations = '';
45 $unchfiles = 'index.html';
46 $matchlength = 0;
47 $case_sensitive = 1;
48 $usercase_sensitive = 0;
49
50 # ======== User-settable parameters end here ======== #
51
52 my $progname = $0 || 'anonlog';
53 my $version = '1.0.1'; # version of this program
54 my $progurl = 'http://anonlog.sourceforge.net/';
55
56 # All legal domain names
57 my @domains = ("ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq",
58 "ar", "as", "at", "au", "aw", "az", "ba", "bb", "bd", "be",
59 "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs",
60 "bt", "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg",
61 "ch", "ci", "ck", "cl", "cm", "cn", "co", "com", "cr", "cs",
62 "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do",
63 "dz", "ec", "edu", "ee", "eg", "eh", "er", "es", "et", "fi",
64 "fj", "fk", "fm", "fo", "fr", "fx", "ga", "gb", "gd", "ge",
65 "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gov", "gp", "gq",
66 "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr",
67 "ht", "hu", "id", "ie", "il", "im", "in", "int", "io", "iq",
68 "ir", "is", "it", "je", "jm", "jo", "jp", "ke", "kg", "kh",
69 "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz", "la", "lb",
70 "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma",
71 "mc", "md", "mg", "mh", "mil", "mk", "ml", "mm", "mn", "mo",
72 "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my",
73 "mz", "na", "nc", "ne", "net", "nf", "ng", "ni", "nl", "no",
74 "np", "nr", "nu", "nz", "om", "org", "pa", "pe", "pf", "pg",
75 "ph", "pk", "pl", "pm", "pn", "pr", "pt", "pw", "py", "qa",
76 "re", "ro", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg",
77 "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "st",
78 "su", "sv", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj",
79 "tk", "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz",
80 "ua", "ug", "uk", "um", "us", "uy", "uz", "va", "vc", "ve",
81 "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "yu", "za",
82 "zm", "zr", "zw");
83 my $no_tries = 100; # See random_entry() and random_string()
84 my $no_tries2 = 2000;
85 my $width = 35; # See writeout
86 my $firstline;
87
88 my @data;
89 # @data has the folloing components:
90 # 0 = host; 1 = user; 2 = date/time; 3 = HTTP method; 4 = filename;
91 # 5 = HTTP version / W3SVC string; 6 = HTTP status code; 7 = bytes sent;
92 # 8 = referrer; 9 = browser; 10 = virtual hostname; 11 = processing time;
93 # 12 = bytes received, 13 = IIS status; 14 = search args;
94 # 15 = time separate from date.
95 my (%hosttree, %filetree, %reftree, %usertree, %vhosttree);
96 my (@servernames, @unchfiles, @dict);
97
98 # Now all log formats
99
100 my ($format, @tokens, $outstr);
101
102 my $commonfmt = <<'HERE';
103 (\S*)\ # host
104 \S+\ # (unused)
105 (\S+)\ # user
106 \[([^\]]+)\]\ # date and time
107 \"\s*([A-Za-z]+)\s+(.+?)(?:\s+(HTTP\/\d.\d))?\s*\"\ # request line
108 (\d{3})\ # status code
109 (\d+|-) # bytes
110 HERE
111 my @commontokens = (0..7);
112 my $commonout = "%s - %s [%s] \"%s %s %s\" %s %s";
113
114 my $combfmt = <<'HERE';
115 (\S*)\ # host
116 \S+\ # (unused)
117 (\S+)\ # user
118 \[([^\]]+)\]\ # date and time
119 \"\s*([A-Za-z]+)\s+(.+?)(?:\s+(HTTP\/\d.\d))?\s*\"\ # request line
120 (\d{3})\ # status code
121 (\d+|-)\ # bytes
122 \"(.*)\"\ # referrer
123 \"([^\"]*)\" # browser
124 HERE
125 my @combtokens = (0..9);
126 my $combout = "%s - %s [%s] \"%s %s %s\" %s %s \"%s\" \"%s\"";
127
128 my $iisfmt = <<'HERE';
129 ([^,]*),\ # host
130 ([^,]*),\ # user
131 ([^,]*,\ [^,]*),\ # date and time
132 (W3SVC[^,]*),\ # W3SVC line
133 ([^,]*),\ # server name
134 [^,]*,\ # server address
135 (\d+|-),\ # processing time
136 (\d+|-),\ # bytes received
137 (\d+|-),\ # bytes sent
138 (\d{3}|-),\ # HTTP status code
139 ([^,]*),\ # IIS status
140 ([^,]*),\ # Operation
141 ([^,]*),\ # Filename
142 ([^,]*),\ ? # Search args
143 HERE
144 my @iistokens = (0..2, 5, 10..12, 7, 6, 13, 3, 4, 14);
145 my $iisout = "%s, %s, %s, %s, %s, -, %s, %s, %s, %s, %s, %s, %s, %s,";
146
147 my $msext = 0; # Whether extended format is genuine or Microsoft version
148
149 # ======== End of global declarations, start of main program ======== #
150
151 if ($conffile) {
152 open(CONFFILE, $conffile) ||
153 die "$progname: Cannot open configuration file $conffile: $!\n";
154 parse_config();
155 }
156
157 open(LOGFILE, $logfile) || die "$progname: Cannot open logfile $logfile: $!\n";
158 if ($newlog eq '') { $newlog = '-'; }
159 open(NEWLOG, ">$newlog") ||
160 die "$progname: Cannot write to new logfile $newlog: $!\n";
161
162 @servernames = split(/,\s*/, $servernames);
163 @unchfiles = split(/,\s*/, $unchfiles);
164
165 if ($dictionary) {
166 unless (open(DICT, $dictionary)) {
167 warn "$progname: Cannot open dictionary $dictionary: $!\n";
168 } else { construct_dict(); }
169 }
170
171 # == End of initialisation, now process logfile == #
172
173 $firstline = 1;
174 while (<LOGFILE>) {
175 if ($firstline) {
176 detect_format();
177 $firstline = 0;
178 }
179 if ($logformat eq 'extended' && /^\#/) {
180 # special case: extended format, line beginning with #
181 if (/^\#Fields:\s/) { parse_extfmt(); }
182 print NEWLOG;
183 }
184 else {
185 @data[@tokens] = /^$format$/x;
186 unless (defined($data[$tokens[0]])) {
187 print STDERR "$progname: Unparseable line: ";
188 print STDERR;
189 }
190 else {
191 $data[0] = anon_host($data[0]);
192 $data[4] = anon_file($data[4]);
193 $data[8] = anon_referrer($data[8]);
194 $data[1] = anon_user($data[1]);
195 $data[10] = anon_vhost($data[10]);
196 if ($data[14] ne '' && $data[14] ne '-') { $data[14] = 'args'; }
197 printf NEWLOG "$outstr\n", @data[@tokens];
198 }
199 }
200 }
201
202 # == Finished processing logfile, finally output translations == #
203
204 if ($translations) {
205 unless (open(TRANS, ">$translations")) {
206 warn "$progname: Cannot write to translation file $translations: $!\n";
207 } else {
208 if (%filetree) {
209 print TRANS "** FILES **\n\n";
210 writeout(\%filetree, 1, '/', 1, 1);
211 }
212 if (%hosttree) {
213 print TRANS "\n** HOSTS **\n\n";
214 writeout(\%hosttree, 2, '.', 0, 0);
215 }
216 if (%reftree) {
217 print TRANS "\n** REFERRERS **\n\n";
218 writeout(\%reftree, 1, '/', 0, 0);
219 }
220 if (%usertree) {
221 print TRANS "\n** USERS **\n\n";
222 writeout(\%usertree, 1, '', 0, 0);
223 }
224 if (%vhosttree) {
225 print TRANS "\n** VIRTUAL HOSTS **\n\n";
226 writeout(\%vhosttree, 1, '', 0, 0);
227 }
228 }
229 }
230
231 # ======== End of main program. Rest is subroutines. ======== #
232
233 # Parse the configuration file.
234 sub parse_config {
235 my ($name, $value);
236
237 while (<CONFFILE>) {
238 chomp;
239 s/\#.*$//; # Remove comments
240 if (/\S/) { # If any non-space character left on line
241 ($name, $value) = /^\s*(.*?)\s*=\s*(.*?)\s*$/;
242 $name =~ tr/A-Z/a-z/;
243 if (!defined($name)) {
244 warn "$progname: Can't parse configuration line: $_\n";
245 }
246 elsif ($name eq 'logfile' && $value ne '') { $logfile = $value }
247 elsif ($name eq 'logformat') { $logformat = $value }
248 elsif ($name eq 'newlog') { $newlog = $value }
249 elsif ($name eq 'dictionary') { $dictionary = $value }
250 elsif ($name eq 'translations') { $translations = $value }
251 elsif ($name eq 'servernames') { $servernames = $value }
252 elsif ($name eq 'unchfiles') { $unchfiles = $value }
253 elsif ($name eq 'matchlength' &&
254 ($value eq '0' || $value == '1')) {
255 $matchlength = $value;
256 }
257 elsif ($name eq 'case_sensitive' &&
258 ($value eq '0' || $value == '1')) {
259 $case_sensitive = $value;
260 }
261 elsif ($name eq 'usercase_sensitive' &&
262 ($value eq '0' || $value == '1')) {
263 $usercase_sensitive = $value;
264 }
265 else {
266 warn "$progname: Can't understand configuration line: $_\n";
267 }
268 }
269 }
270 }
271
272 # Construct the dictionary.
273 sub construct_dict {
274 local $_;
275 my ($w, $d, $i, $tmp, @words, @ignore, %h);
276
277 while (<DICT>) { $w .= $_ }
278 @words = split(/\s+/, $w);
279 @ignore = map(/^([^.]*)/, @unchfiles);
280 # @ignore contains the 'index' in index.html (or index.html.gz or index)
281 # We delete them from the dictionary below (could instead be careful in
282 # lookup_or_create_filename, but this is easier and faster).
283 foreach (@ignore) { tr/A-Z/a-z/; }
284 # Put words of length l into the array at $dict[l].
285 foreach (@words) {
286 tr/A-Z/a-z/;
287 $tmp = $_;
288 $i = $matchlength?length():0;
289 push(@{$dict[$i]}, $_) # Take only words, and not in @ignore
290 unless (/[^a-z]/ || grep($tmp eq $_, @ignore));
291 }
292 foreach $d (@dict) { if ($d == undef) { @$d = (); }}
293 }
294
295 # Detect logfile format from first line. (NB Line is already in (global) $_ ).
296 sub detect_format {
297 my $i;
298
299 unless ($logformat) {
300 if ((split /,\s*/) == 15) { $logformat = 'iis'; }
301 elsif (($i = index($_, '[')) >= 6 && substr($_, $i + 27, 1) eq ']' &&
302 index($_, '"') == $i + 29) {
303 if (($i = split(/\"/)) == 3) { $logformat = 'common'; }
304 elsif ($i == 7) { $logformat = 'combined'; }
305 }
306 elsif (/^\#/) { $logformat = 'extended'; }
307 unless ($logformat) { die "$progname: Can't detect format of logfile $logfile from first line: specify it in $conffile\n"; }
308 }
309
310 $logformat =~ tr/A-Z/a-z/;
311 if ($logformat eq 'common') {
312 print STDERR "$progname: Reading $logfile in common format\n";
313 $format = $commonfmt;
314 @tokens = @commontokens;
315 $outstr = $commonout;
316 }
317 elsif ($logformat eq 'combined') {
318 print STDERR "$progname: Reading $logfile in combined format\n";
319 $format = $combfmt;
320 @tokens = @combtokens;
321 $outstr = $combout;
322 }
323 elsif ($logformat eq 'iis') {
324 print STDERR "$progname: Reading $logfile in IIS format\n";
325 $format = $iisfmt;
326 @tokens = @iistokens;
327 $outstr = $iisout;
328 }
329 elsif ($logformat eq 'extended' || $logformat eq 'ms-extended') {
330 # In this case, have to construct the log format from the #Fields line
331 if ($logformat eq 'ms-extended') { $msext = 1; }
332 $logformat = 'extended';
333 while (/^\#/) { # process all # lines before handing back
334 if (/^\#Software: Microsoft Internet Information Serv/) {
335 $msext = 1;
336 }
337 elsif (/^\#Fields:\s/) { parse_extfmt(); }
338 print NEWLOG;
339 $_ = <LOGFILE>;
340 }
341 print NEWLOG "#Remark: Logfile anonymized by anonlog $version, $progurl\n";
342 if ($msext) { print STDERR "$progname: Reading $logfile in Microsoft extended format\n"; }
343 else { print STDERR "$progname: Reading $logfile in W3C extended format\n"; }
344 }
345 else { die "$progname: Don't know what you mean by 'logformat = $logformat' in $conffile\n"; }
346 }
347
348 # Parse the #Fields: line from an extended format logfile. The #Fields: line
349 # is already in (global) $_ .
350 sub parse_extfmt {
351 my ($i, $first);
352
353 $format = '';
354 @tokens = ();
355 $outstr = '';
356 $first = 1;
357
358 foreach $i (split(' ', substr($_, 9))) { # substr skips "#Fields: " itself
359 if ($first) { $first = 0; }
360 else { $format .= '\s+'; $outstr .= "\t"; }
361 $i =~ tr/A-Z/a-z/;
362 if ($i eq 'date') {
363 $format .= '(\d{4}-\d{2}-\d{2})';
364 push(@tokens, 2);
365 $outstr .= '%s';
366 }
367 elsif ($i eq 'time') {
368 $format .= '(\d{2}:\d{2}(?::\d{2}(?:\.\d*)?)?)';
369 push(@tokens, 15);
370 $outstr .= '%s';
371 }
372 elsif ($i eq 'bytes' || $i eq 'sc-bytes') {
373 $format .= '(\d+|-)';
374 push(@tokens, 7);
375 $outstr .= '%d';
376 }
377 elsif ($i eq 'cs-bytes') {
378 $format .= '(\d+|-)';
379 push(@tokens, 12);
380 $outstr .= '%d';
381 }
382 elsif ($i eq 'sc-status') {
383 $format .= '(\d{3})';
384 push(@tokens, 6);
385 $outstr .= '%d';
386 }
387 elsif ($i eq 'c-dns' || $i eq 'cs-dns' ||
388 $i eq 'c-ip' || $i eq 'cs-ip') {
389 $format .= '(\S+)';
390 push(@tokens, 0);
391 $outstr .= '%s';
392 }
393 elsif ($i eq 'cs-uri' || $i eq 'cs-uri-stem') {
394 $format .= '(\S+)';
395 push(@tokens, 4);
396 $outstr .= '%s';
397 }
398 elsif ($i eq 'cs(referer)') {
399 if ($msext) { $format .= '(\S+)'; $outstr .= '%s'; }
400 else { $format .= '\"(.*?)\"'; $outstr .= '"%s"'; }
401 push(@tokens, 8);
402 }
403 elsif ($i eq 'cs(user-agent)') {
404 if ($msext) { $format .= '(\S+)'; $outstr .= '%s'; }
405 else { $format .= '\"(.*?)\"'; $outstr .= '"%s"'; }
406 push(@tokens, 9);
407 }
408 elsif ($i eq 'cs-host' || $i eq 's-ip' || $i eq 's-dns' ||
409 $i eq 'cs-sip' || $i eq 's-sitename' ||
410 $i eq 's-computername') {
411 $format .= '(\S+)';
412 push(@tokens, 10);
413 $outstr .= '%s';
414 }
415 elsif ($i eq 'cs(host)') {
416 $format .= '\"(.*?)\"';
417 push(@tokens, 10);
418 $outstr .= '"%s"';
419 }
420 elsif ($i eq 'cs-uri-query') {
421 $format .= '(\S+)';
422 push(@tokens, 14);
423 $outstr .= '%s';
424 }
425 elsif ($i eq 'cs-username') {
426 $format .= '(\S+)';
427 push(@tokens, 1);
428 $outstr .= '%s';
429 }
430 elsif ($i eq 'cs(from)') {
431 $format .= '\"(.*?)\"';
432 push(@tokens, 1);
433 $outstr .= '"%s"';
434 }
435 elsif ($i eq 'time-taken') {
436 $format .= '([\d\.]+|-)';
437 push(@tokens, 11);
438 $outstr .= '%s';
439 }
440 elsif ($i eq 'cs-method') {
441 $format .= '([A-Za-z]+)';
442 push(@tokens, 3);
443 $outstr .= '%s';
444 }
445 else { # unknown token
446 $format .= '\S+';
447 $outstr .= '-';
448 }
449 }
450 }
451
452 # The anonymizing functions
453 #
454 # The translations are looked up in a tree. Each node of the tree is a hash
455 # as follows:
456 # Keys: The part of the name being translated
457 # Values: A 2-element \array (translation, \hash of the same type recursively)
458 #
459 # All these functions work the same way.
460 # They are called with one argument from the main part of the
461 # program, a second argument (\sub-hash) when called recursively.
462 # The name is split into two components, $b to be translated immediately
463 # and $a, the rest. $b's (translation, \subhash) is assigned to @b.
464 sub anon_host {
465 local $_ = $_[0];
466 my @b;
467 my $numhost = 0;
468 if (!defined($_[1])) { tr/A-Z/a-z/; }
469 if (/\.$/) { $_ = substr($_, 0, length($_) - 1); } # strip trailing dot
470 my ($a, $b) = /^(.*)\.(.*)$/;
471 if (!defined($b)) { $a = ''; $b = $_; } # if no dot in (sub-)name
472
473 if (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}); }
474 else {
475 if ($_ eq '' || $_ eq '-') { return '-'; }
476 my($n1, $n2, $n3, $n4) =
477 /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;
478 if (defined($n1) && $n1 <= 255 && $n2 <= 255 && $n3 <= 255 &&
479 $n4 <= 255) { return(anon_numhost($_, \%hosttree)); }
480 if (!/\./ && !grep($b eq $_, @domains)) { # no dot in whole name
481 @b = lookup_or_create($b, \%hosttree);
482 } else { @b = lookup_or_create($b, \%hosttree, \@domains); }
483 }
484
485 if ($a eq '') { return($b[0]); }
486 else { return(anon_host($a, $b[1]) . '.' . $b[0]); }
487 }
488
489 sub anon_numhost { # Numerical hostnames
490 local $_ = $_[0];
491 my @b;
492 my ($b, $a) = /^(.*?)\.(.*)$/;
493 if (!defined($b)) { $a = ''; $b = $_; }
494 my @newnumber = (rand255($_[1]));
495 @b = lookup_or_create($b, $_[1], \@newnumber);
496 if ($a eq '') { return($b[0]); }
497 else { return($b[0] . '.' . anon_numhost($a, $b[1])); }
498 }
499
500 # anon_file also takes an optional argument number 2, overriding global
501 # $case_sensitive. If it is present, initial stripping of anchors and search
502 # arguments is also performed (because it is also used only on the first call
503 # for that data, in this case a referrer).
504 sub anon_file {
505 local $_ = $_[0];
506 my $args = '';
507 my $case = $case_sensitive;
508 my (@b, @tmp, $ans);
509 if (!defined($_[1]) || defined($_[2])) {
510 s/\#.*//; # remove anchors
511 if (s/\?.*//) { $args = '?args'; }
512 s/%([\da-fA-F]{2})/pack("C", hex($1))/ge; # change %7E to ~ etc.
513 if (defined($_[2])) { $case = $_[2]; }
514 if (!$case) { tr/A-Z/a-z/; }
515 }
516
517 my ($b, $a) = m[^/(.*?)(/.*)$];
518 if (!defined($b)) { # not two slashes in name
519 if (!m[^/]) { return '-'; }
520 # top level should always begin with slash, and lower levels forced to
521 $b = substr($_, 1);
522 if ($b eq '') { return "/$args"; }
523 if (grep($b eq $_, @unchfiles)) { return "/$b$args"; }
524 if (defined($_[1])) { @b = lookup_or_create_filename($b, \%{$_[1]}); }
525 else { @b = lookup_or_create_filename($b, \%filetree); }
526 return("/$b[0]$args");
527 }
528 # rest only reached if there were two slashes in name
529 if ($b eq '.' || $b eq '..') { # special case: leave these alone
530 @tmp = ($b);
531 if (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}, \@tmp); }
532 else { @b = lookup_or_create($b, \%filetree, \@tmp); }
533 }
534 elsif (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}); }
535 else { @b = lookup_or_create($b, \%filetree); }
536 return("/$b[0]" . anon_file($a, $b[1]) . $args);
537 }
538
539 # Referrers are a bit different because they're split into 3 parts:
540 # a scheme, a hostname and a filename. We only allow through http: and
541 # ftp: URLs. If the hostname is in the list of known $servernames, we preserve
542 # the hostname and use the existing local translations for the filename part.
543 # Otherwise we translate the hostname according to the existing hosttree, and
544 # use this as the root for the referrer tree.
545 sub anon_referrer {
546 local $_ = $_[0];
547 my @b;
548 my ($scheme, $hostname, $port, $path) = m[^(.*?)://(.*?)(:.*?)?(/.*)$];
549
550 if (!defined($scheme) || $scheme !~ /^(ht|f)tp/i) { return "-"; }
551 $scheme =~ tr/A-Z/a-z/;
552 if ($hostname =~ /\.$/) { # strip trailing dot
553 $hostname = substr($hostname, 0, length($hostname) - 1);
554 }
555 if (grep($_ eq $hostname, @servernames)) {
556 return($scheme . '://' . $hostname . anon_file($path));
557 }
558 else {
559 my @newhost = (anon_host($hostname));
560 @b = lookup_or_create($hostname, \%reftree, \@newhost);
561 return("$scheme://$b[0]$port" . anon_file($path, $b[1], 1));
562 }
563 }
564
565 sub anon_user { # users and virtual hosts aren't hierarchical
566 local $_ = $_[0];
567 my @b;
568
569 if ($_ eq '' || $_ eq '-') { return '-'; }
570 if (!$usercase_sensitive) { tr/A-Z/a-z/; }
571 @b = lookup_or_create($_, \%usertree);
572 return($b[0]);
573 }
574
575 sub anon_vhost {
576 local $_ = $_[0];
577 my @b;
578
579 if ($_ eq '' || $_ eq '-') { return '-'; }
580 tr/A-Z/a-z/;
581 @b = lookup_or_create($_, \%vhosttree);
582 return($b[0]);
583 }
584
585 # Look up an item (arg 0) in a tree node (arg 1), or create a new entry if
586 # necessary. The entry is selected from array arg 2 if present, else from
587 # dictionary entry of correct length, else a random string of correct length.
588 # See also lookup_or_create_filename below.
589 sub lookup_or_create {
590 my $n = $_[0];
591 my (@ans, %h);
592
593 unless (defined(${$_[1]}{$n})) {
594 if (defined($_[2])) { $ans[0] = random_entry($_[2], $_[1]); }
595 else {
596 $ans[0] = random_entry($dict[$matchlength?length($n):0], $_[1]);
597 }
598 unless (defined($ans[0])) {
599 $ans[0] = random_string(length($n), $_[1]);
600 }
601 # Start hash table so it isn't undef later
602 $h{''} = undef;
603 $ans[1] = \%h;
604 ${$_[1]}{$n} = \@ans;
605 }
606 return(@{${$_[1]}{$n}});
607 }
608
609 # The same as lookup_or_create above, but preserves the extension of filenames.
610 # (Actually, this is never called with 3 args, but we leave it in for possible
611 # future use, and to keep it parallel with the previous function).
612 sub lookup_or_create_filename {
613 local $_ = $_[0];
614 my (@ans, %h);
615
616 unless (defined(${$_[1]}{$_})) {
617 my ($name, $ext) = m[^(.*)(\..*)$];
618 if (!defined($name)) { $ext = ''; $name = $_; } # no extension
619 if (defined($_[2])) { $ans[0] = random_entry($_[2], $_[1], $ext); }
620 else {
621 $ans[0] = random_entry($dict[$matchlength?length($name):0], $_[1],
622 $ext);
623 }
624 unless (defined($ans[0])) {
625 $ans[0] = random_string(length($name), $_[1], $ext);
626 }
627 $h{''} = undef;
628 $ans[1] = \%h;
629 ${$_[1]}{$_} = \@ans;
630 }
631 return(@{${$_[1]}{$_}});
632 }
633
634 # Select a random entry from array arg 0, but must not occur as value in hash
635 # arg 1. If failed after $no_tries, give up and return undef.
636 # If arg 2 exists, then the random entry is "arg0_element . arg2" instead.
637 sub random_entry {
638 if ($_[0] == undef) { return undef; }
639 my @l = @{$_[0]};
640 my @v = values(%{$_[1]});
641 my ($ans, $k);
642
643 if (@l == ()) { return undef; }
644 for ($k = 0;
645 (!defined($ans) || grep {$ans eq ${$_}[0]} @v) && $k < $no_tries;
646 $k++) { $ans = $l[rand($#l + 0.9999999)] . $_[2]; }
647 if ($k < $no_tries) { return($ans); }
648 else { return undef; }
649 }
650
651 # Create random string, length given by arg 0 (unless global $matchlength is
652 # false), again must not occur as value in hash arg 1. Same arg2 as in
653 # random_entry. This time if failed after $no_tries2, return any answer.
654 sub random_string {
655 my $l = $_[0];
656 my @v = values(%{$_[1]});
657 my ($ans, $i, $j, $k);
658
659 if ($l == 0) { return(''); }
660 unless ($matchlength) { $l = int(5 + rand(6)); } # i.e. 5 to 10
661 for ($k = 0;
662 (!defined($ans) || grep {$ans eq ${$_}[0]} @v) && $k < $no_tries2;
663 $k++) {
664 $ans = '';
665 for ($i = 0; $i < $l; $i++) {
666 $j = 65 + rand(52);
667 if ($j >= 91) { $j += 6; }
668 $ans .= chr($j);
669 }
670 $ans .= $_[2];
671 }
672 return($ans);
673 }
674
675 # Select a random number from 0 to 255, but again not already occurring as
676 # value in hash (arg 0). This should never fail in the context of this
677 # program, but we use $no_tries2 again just in case.
678 sub rand255 {
679 my @v = values(%{$_[0]});
680 my ($ans, $k);
681
682 for ($k = 0;
683 (!defined($ans) || grep {$ans == ${$_}[0]} @v) && $k < $no_tries2;
684 $k++) { $ans = int(rand(255.9999999)); }
685 return($ans);
686 }
687
688 # Write out the translations to file TRANS.
689 # The 0th argument is the \hash to be interpreted;
690 # The 1st argument is 0 if the name-parts are collated backwards, 1 if
691 # forwards, 2 if they are hostnames (backwards unless numerical);
692 # The 2nd argument is the delimiter between name-parts, or empty string if they
693 # are not hierarchical.
694 # The 3rd argument says whether the delimiter should also occur at the start of
695 # the string.
696 # The 4th argument says whether the entry should still be printed if it is
697 # not (known to be) a leaf.
698 # The 5th and 6th arguments, if present, are the up-tree name-parts for the
699 # original and translated names respectively.
700 sub writeout {
701 my %hash = %{$_[0]};
702 my ($colorder, $delim, $initial, $printall) = ($_[1], $_[2], $_[3], $_[4]);
703 my ($partname_old, $partname_new) = ($_[5], $_[6]);
704 my ($name, @value, $name_new, $name_old, $newcolord, $fieldwidth);
705
706 foreach $name (sort { # Declare sort order inline so we can use %hash
707 # The first case is binned immediately below, but must catch here too
708 (!defined($hash{$a}) || !defined($hash{$b}))?(lc($a) cmp lc($b)):
709 (((${$hash{$a}}[0] + 0) <=> (${$hash{$b}}[0] + 0)) ||
710 # If they start with (or are) numbers, sort them that way
711 (lc(${$hash{$a}}[0]) cmp lc(${$hash{$b}}[0])) || # Usual ordering
712 (($a + 0) <=> ($b + 0)) || # Fallback to untranslated names: but by
713 (lc($a) cmp lc($b))) # construction this should (all but) never occur
714 } keys(%hash)) { # End of sort order. Phew.
715 if (defined($hash{$name})) {
716 @value = @{$hash{$name}};
717 if ($colorder == 2) { $newcolord = (($name =~ /^\d{1,3}$/)?1:0); }
718 else { $newcolord = $colorder; }
719 # NB Incomplete test for numerical hostname (unlike in anon_host)
720 if ($newcolord == 1) {
721 if (!$partname_old && !$initial) {
722 $name_new = "$value[0]";
723 $name_old = "$name";
724 } else {
725 $name_new = "$partname_new$delim$value[0]";
726 $name_old = "$partname_old$delim$name";
727 }
728 $fieldwidth = -$width;
729 }
730 else {
731 if (!$partname_old && !$initial) {
732 $name_new = "$value[0]";
733 $name_old = "$name";
734 } else {
735 $name_new = "$value[0]$delim$partname_new";
736 $name_old = "$name$delim$partname_old";
737 }
738 $fieldwidth = $width;
739 }
740 printf TRANS ("%*s = %*s\n",
741 $fieldwidth, $name_new, $fieldwidth, $name_old)
742 if ($printall || keys(%{$value[1]}) <= 1);
743 writeout($value[1], $newcolord, $delim, $initial, $printall,
744 $name_old, $name_new);
745 }
746 }
747 }