"Fossies" - the Fresh Open Source Software Archive 
Member "anonlog-1.0.1/anonlog.pl" (24 Oct 2002, 25491 Bytes) of package /linux/www/old/anonlog-1.0.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "anonlog.pl" see the
Fossies "Dox" file reference documentation.
1 #!/usr/bin/perl
2 #
3 ### anonlog 1.0.1 http://anonlog.sourceforge.net/
4 ### This program is copyright (c) Stephen R. E. Turner 2000-2002.
5 ### It is free software; you can redistribute it and/or modify
6 ### it under the terms of version 2 of the GNU General Public License as
7 ### published by the Free Software Foundation.
8 ###
9 ### This program is distributed in the hope that it will be useful,
10 ### but without any warranty; without even the implied warranty of
11 ### merchantability or fitness for a particular purpose. See the
12 ### GNU General Public License for more details.
13 ###
14 ### You should have received a copy of the GNU General Public License
15 ### along with this program; if not, see http://www.gnu.org/copyleft/gpl.html
16 ### or write to the Free Software Foundation, Inc., 59 Temple Place,
17 ### Suite 330, Boston, MA 02111-1307, USA
18
19 require 5.004; # for rand()
20 use strict;
21 my ($conffile, $logfile, $logformat, $newlog, $dictionary, $translations,
22 $servernames, $unchfiles, $matchlength,
23 $case_sensitive, $usercase_sensitive);
24
25 # ======== User-settable parameters start here ======== #
26 #
27 # NB All parameters can also be set in the configuration file, $conffile,
28 # normally anonlog.cfg. This is usually more convenient. The variables here
29 # have the same names as those in anonlog.cfg (with the addition of a $ at
30 # the front). So see Readme.html for documentation on the various options.
31 #
32 # On Unix, you can edit the top line of this program to give the location of
33 # Perl on your system. (Try 'which perl' to find out).
34 #
35 # The configuration file to override all these options.
36 # $conffile = ''; for none.
37 $conffile = 'anonlog.cfg';
38
39 $logfile = 'logfile.log';
40 $newlog = '';
41 $servernames = '';
42 $logformat = '';
43 $dictionary = 'dictionary';
44 $translations = '';
45 $unchfiles = 'index.html';
46 $matchlength = 0;
47 $case_sensitive = 1;
48 $usercase_sensitive = 0;
49
50 # ======== User-settable parameters end here ======== #
51
52 my $progname = $0 || 'anonlog';
53 my $version = '1.0.1'; # version of this program
54 my $progurl = 'http://anonlog.sourceforge.net/';
55
56 # All legal domain names
57 my @domains = ("ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq",
58 "ar", "as", "at", "au", "aw", "az", "ba", "bb", "bd", "be",
59 "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs",
60 "bt", "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg",
61 "ch", "ci", "ck", "cl", "cm", "cn", "co", "com", "cr", "cs",
62 "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do",
63 "dz", "ec", "edu", "ee", "eg", "eh", "er", "es", "et", "fi",
64 "fj", "fk", "fm", "fo", "fr", "fx", "ga", "gb", "gd", "ge",
65 "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gov", "gp", "gq",
66 "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr",
67 "ht", "hu", "id", "ie", "il", "im", "in", "int", "io", "iq",
68 "ir", "is", "it", "je", "jm", "jo", "jp", "ke", "kg", "kh",
69 "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz", "la", "lb",
70 "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma",
71 "mc", "md", "mg", "mh", "mil", "mk", "ml", "mm", "mn", "mo",
72 "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my",
73 "mz", "na", "nc", "ne", "net", "nf", "ng", "ni", "nl", "no",
74 "np", "nr", "nu", "nz", "om", "org", "pa", "pe", "pf", "pg",
75 "ph", "pk", "pl", "pm", "pn", "pr", "pt", "pw", "py", "qa",
76 "re", "ro", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg",
77 "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "st",
78 "su", "sv", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj",
79 "tk", "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz",
80 "ua", "ug", "uk", "um", "us", "uy", "uz", "va", "vc", "ve",
81 "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "yu", "za",
82 "zm", "zr", "zw");
83 my $no_tries = 100; # See random_entry() and random_string()
84 my $no_tries2 = 2000;
85 my $width = 35; # See writeout
86 my $firstline;
87
88 my @data;
89 # @data has the folloing components:
90 # 0 = host; 1 = user; 2 = date/time; 3 = HTTP method; 4 = filename;
91 # 5 = HTTP version / W3SVC string; 6 = HTTP status code; 7 = bytes sent;
92 # 8 = referrer; 9 = browser; 10 = virtual hostname; 11 = processing time;
93 # 12 = bytes received, 13 = IIS status; 14 = search args;
94 # 15 = time separate from date.
95 my (%hosttree, %filetree, %reftree, %usertree, %vhosttree);
96 my (@servernames, @unchfiles, @dict);
97
98 # Now all log formats
99
100 my ($format, @tokens, $outstr);
101
102 my $commonfmt = <<'HERE';
103 (\S*)\ # host
104 \S+\ # (unused)
105 (\S+)\ # user
106 \[([^\]]+)\]\ # date and time
107 \"\s*([A-Za-z]+)\s+(.+?)(?:\s+(HTTP\/\d.\d))?\s*\"\ # request line
108 (\d{3})\ # status code
109 (\d+|-) # bytes
110 HERE
111 my @commontokens = (0..7);
112 my $commonout = "%s - %s [%s] \"%s %s %s\" %s %s";
113
114 my $combfmt = <<'HERE';
115 (\S*)\ # host
116 \S+\ # (unused)
117 (\S+)\ # user
118 \[([^\]]+)\]\ # date and time
119 \"\s*([A-Za-z]+)\s+(.+?)(?:\s+(HTTP\/\d.\d))?\s*\"\ # request line
120 (\d{3})\ # status code
121 (\d+|-)\ # bytes
122 \"(.*)\"\ # referrer
123 \"([^\"]*)\" # browser
124 HERE
125 my @combtokens = (0..9);
126 my $combout = "%s - %s [%s] \"%s %s %s\" %s %s \"%s\" \"%s\"";
127
128 my $iisfmt = <<'HERE';
129 ([^,]*),\ # host
130 ([^,]*),\ # user
131 ([^,]*,\ [^,]*),\ # date and time
132 (W3SVC[^,]*),\ # W3SVC line
133 ([^,]*),\ # server name
134 [^,]*,\ # server address
135 (\d+|-),\ # processing time
136 (\d+|-),\ # bytes received
137 (\d+|-),\ # bytes sent
138 (\d{3}|-),\ # HTTP status code
139 ([^,]*),\ # IIS status
140 ([^,]*),\ # Operation
141 ([^,]*),\ # Filename
142 ([^,]*),\ ? # Search args
143 HERE
144 my @iistokens = (0..2, 5, 10..12, 7, 6, 13, 3, 4, 14);
145 my $iisout = "%s, %s, %s, %s, %s, -, %s, %s, %s, %s, %s, %s, %s, %s,";
146
147 my $msext = 0; # Whether extended format is genuine or Microsoft version
148
149 # ======== End of global declarations, start of main program ======== #
150
151 if ($conffile) {
152 open(CONFFILE, $conffile) ||
153 die "$progname: Cannot open configuration file $conffile: $!\n";
154 parse_config();
155 }
156
157 open(LOGFILE, $logfile) || die "$progname: Cannot open logfile $logfile: $!\n";
158 if ($newlog eq '') { $newlog = '-'; }
159 open(NEWLOG, ">$newlog") ||
160 die "$progname: Cannot write to new logfile $newlog: $!\n";
161
162 @servernames = split(/,\s*/, $servernames);
163 @unchfiles = split(/,\s*/, $unchfiles);
164
165 if ($dictionary) {
166 unless (open(DICT, $dictionary)) {
167 warn "$progname: Cannot open dictionary $dictionary: $!\n";
168 } else { construct_dict(); }
169 }
170
171 # == End of initialisation, now process logfile == #
172
173 $firstline = 1;
174 while (<LOGFILE>) {
175 if ($firstline) {
176 detect_format();
177 $firstline = 0;
178 }
179 if ($logformat eq 'extended' && /^\#/) {
180 # special case: extended format, line beginning with #
181 if (/^\#Fields:\s/) { parse_extfmt(); }
182 print NEWLOG;
183 }
184 else {
185 @data[@tokens] = /^$format$/x;
186 unless (defined($data[$tokens[0]])) {
187 print STDERR "$progname: Unparseable line: ";
188 print STDERR;
189 }
190 else {
191 $data[0] = anon_host($data[0]);
192 $data[4] = anon_file($data[4]);
193 $data[8] = anon_referrer($data[8]);
194 $data[1] = anon_user($data[1]);
195 $data[10] = anon_vhost($data[10]);
196 if ($data[14] ne '' && $data[14] ne '-') { $data[14] = 'args'; }
197 printf NEWLOG "$outstr\n", @data[@tokens];
198 }
199 }
200 }
201
202 # == Finished processing logfile, finally output translations == #
203
204 if ($translations) {
205 unless (open(TRANS, ">$translations")) {
206 warn "$progname: Cannot write to translation file $translations: $!\n";
207 } else {
208 if (%filetree) {
209 print TRANS "** FILES **\n\n";
210 writeout(\%filetree, 1, '/', 1, 1);
211 }
212 if (%hosttree) {
213 print TRANS "\n** HOSTS **\n\n";
214 writeout(\%hosttree, 2, '.', 0, 0);
215 }
216 if (%reftree) {
217 print TRANS "\n** REFERRERS **\n\n";
218 writeout(\%reftree, 1, '/', 0, 0);
219 }
220 if (%usertree) {
221 print TRANS "\n** USERS **\n\n";
222 writeout(\%usertree, 1, '', 0, 0);
223 }
224 if (%vhosttree) {
225 print TRANS "\n** VIRTUAL HOSTS **\n\n";
226 writeout(\%vhosttree, 1, '', 0, 0);
227 }
228 }
229 }
230
231 # ======== End of main program. Rest is subroutines. ======== #
232
233 # Parse the configuration file.
234 sub parse_config {
235 my ($name, $value);
236
237 while (<CONFFILE>) {
238 chomp;
239 s/\#.*$//; # Remove comments
240 if (/\S/) { # If any non-space character left on line
241 ($name, $value) = /^\s*(.*?)\s*=\s*(.*?)\s*$/;
242 $name =~ tr/A-Z/a-z/;
243 if (!defined($name)) {
244 warn "$progname: Can't parse configuration line: $_\n";
245 }
246 elsif ($name eq 'logfile' && $value ne '') { $logfile = $value }
247 elsif ($name eq 'logformat') { $logformat = $value }
248 elsif ($name eq 'newlog') { $newlog = $value }
249 elsif ($name eq 'dictionary') { $dictionary = $value }
250 elsif ($name eq 'translations') { $translations = $value }
251 elsif ($name eq 'servernames') { $servernames = $value }
252 elsif ($name eq 'unchfiles') { $unchfiles = $value }
253 elsif ($name eq 'matchlength' &&
254 ($value eq '0' || $value == '1')) {
255 $matchlength = $value;
256 }
257 elsif ($name eq 'case_sensitive' &&
258 ($value eq '0' || $value == '1')) {
259 $case_sensitive = $value;
260 }
261 elsif ($name eq 'usercase_sensitive' &&
262 ($value eq '0' || $value == '1')) {
263 $usercase_sensitive = $value;
264 }
265 else {
266 warn "$progname: Can't understand configuration line: $_\n";
267 }
268 }
269 }
270 }
271
272 # Construct the dictionary.
273 sub construct_dict {
274 local $_;
275 my ($w, $d, $i, $tmp, @words, @ignore, %h);
276
277 while (<DICT>) { $w .= $_ }
278 @words = split(/\s+/, $w);
279 @ignore = map(/^([^.]*)/, @unchfiles);
280 # @ignore contains the 'index' in index.html (or index.html.gz or index)
281 # We delete them from the dictionary below (could instead be careful in
282 # lookup_or_create_filename, but this is easier and faster).
283 foreach (@ignore) { tr/A-Z/a-z/; }
284 # Put words of length l into the array at $dict[l].
285 foreach (@words) {
286 tr/A-Z/a-z/;
287 $tmp = $_;
288 $i = $matchlength?length():0;
289 push(@{$dict[$i]}, $_) # Take only words, and not in @ignore
290 unless (/[^a-z]/ || grep($tmp eq $_, @ignore));
291 }
292 foreach $d (@dict) { if ($d == undef) { @$d = (); }}
293 }
294
295 # Detect logfile format from first line. (NB Line is already in (global) $_ ).
296 sub detect_format {
297 my $i;
298
299 unless ($logformat) {
300 if ((split /,\s*/) == 15) { $logformat = 'iis'; }
301 elsif (($i = index($_, '[')) >= 6 && substr($_, $i + 27, 1) eq ']' &&
302 index($_, '"') == $i + 29) {
303 if (($i = split(/\"/)) == 3) { $logformat = 'common'; }
304 elsif ($i == 7) { $logformat = 'combined'; }
305 }
306 elsif (/^\#/) { $logformat = 'extended'; }
307 unless ($logformat) { die "$progname: Can't detect format of logfile $logfile from first line: specify it in $conffile\n"; }
308 }
309
310 $logformat =~ tr/A-Z/a-z/;
311 if ($logformat eq 'common') {
312 print STDERR "$progname: Reading $logfile in common format\n";
313 $format = $commonfmt;
314 @tokens = @commontokens;
315 $outstr = $commonout;
316 }
317 elsif ($logformat eq 'combined') {
318 print STDERR "$progname: Reading $logfile in combined format\n";
319 $format = $combfmt;
320 @tokens = @combtokens;
321 $outstr = $combout;
322 }
323 elsif ($logformat eq 'iis') {
324 print STDERR "$progname: Reading $logfile in IIS format\n";
325 $format = $iisfmt;
326 @tokens = @iistokens;
327 $outstr = $iisout;
328 }
329 elsif ($logformat eq 'extended' || $logformat eq 'ms-extended') {
330 # In this case, have to construct the log format from the #Fields line
331 if ($logformat eq 'ms-extended') { $msext = 1; }
332 $logformat = 'extended';
333 while (/^\#/) { # process all # lines before handing back
334 if (/^\#Software: Microsoft Internet Information Serv/) {
335 $msext = 1;
336 }
337 elsif (/^\#Fields:\s/) { parse_extfmt(); }
338 print NEWLOG;
339 $_ = <LOGFILE>;
340 }
341 print NEWLOG "#Remark: Logfile anonymized by anonlog $version, $progurl\n";
342 if ($msext) { print STDERR "$progname: Reading $logfile in Microsoft extended format\n"; }
343 else { print STDERR "$progname: Reading $logfile in W3C extended format\n"; }
344 }
345 else { die "$progname: Don't know what you mean by 'logformat = $logformat' in $conffile\n"; }
346 }
347
348 # Parse the #Fields: line from an extended format logfile. The #Fields: line
349 # is already in (global) $_ .
350 sub parse_extfmt {
351 my ($i, $first);
352
353 $format = '';
354 @tokens = ();
355 $outstr = '';
356 $first = 1;
357
358 foreach $i (split(' ', substr($_, 9))) { # substr skips "#Fields: " itself
359 if ($first) { $first = 0; }
360 else { $format .= '\s+'; $outstr .= "\t"; }
361 $i =~ tr/A-Z/a-z/;
362 if ($i eq 'date') {
363 $format .= '(\d{4}-\d{2}-\d{2})';
364 push(@tokens, 2);
365 $outstr .= '%s';
366 }
367 elsif ($i eq 'time') {
368 $format .= '(\d{2}:\d{2}(?::\d{2}(?:\.\d*)?)?)';
369 push(@tokens, 15);
370 $outstr .= '%s';
371 }
372 elsif ($i eq 'bytes' || $i eq 'sc-bytes') {
373 $format .= '(\d+|-)';
374 push(@tokens, 7);
375 $outstr .= '%d';
376 }
377 elsif ($i eq 'cs-bytes') {
378 $format .= '(\d+|-)';
379 push(@tokens, 12);
380 $outstr .= '%d';
381 }
382 elsif ($i eq 'sc-status') {
383 $format .= '(\d{3})';
384 push(@tokens, 6);
385 $outstr .= '%d';
386 }
387 elsif ($i eq 'c-dns' || $i eq 'cs-dns' ||
388 $i eq 'c-ip' || $i eq 'cs-ip') {
389 $format .= '(\S+)';
390 push(@tokens, 0);
391 $outstr .= '%s';
392 }
393 elsif ($i eq 'cs-uri' || $i eq 'cs-uri-stem') {
394 $format .= '(\S+)';
395 push(@tokens, 4);
396 $outstr .= '%s';
397 }
398 elsif ($i eq 'cs(referer)') {
399 if ($msext) { $format .= '(\S+)'; $outstr .= '%s'; }
400 else { $format .= '\"(.*?)\"'; $outstr .= '"%s"'; }
401 push(@tokens, 8);
402 }
403 elsif ($i eq 'cs(user-agent)') {
404 if ($msext) { $format .= '(\S+)'; $outstr .= '%s'; }
405 else { $format .= '\"(.*?)\"'; $outstr .= '"%s"'; }
406 push(@tokens, 9);
407 }
408 elsif ($i eq 'cs-host' || $i eq 's-ip' || $i eq 's-dns' ||
409 $i eq 'cs-sip' || $i eq 's-sitename' ||
410 $i eq 's-computername') {
411 $format .= '(\S+)';
412 push(@tokens, 10);
413 $outstr .= '%s';
414 }
415 elsif ($i eq 'cs(host)') {
416 $format .= '\"(.*?)\"';
417 push(@tokens, 10);
418 $outstr .= '"%s"';
419 }
420 elsif ($i eq 'cs-uri-query') {
421 $format .= '(\S+)';
422 push(@tokens, 14);
423 $outstr .= '%s';
424 }
425 elsif ($i eq 'cs-username') {
426 $format .= '(\S+)';
427 push(@tokens, 1);
428 $outstr .= '%s';
429 }
430 elsif ($i eq 'cs(from)') {
431 $format .= '\"(.*?)\"';
432 push(@tokens, 1);
433 $outstr .= '"%s"';
434 }
435 elsif ($i eq 'time-taken') {
436 $format .= '([\d\.]+|-)';
437 push(@tokens, 11);
438 $outstr .= '%s';
439 }
440 elsif ($i eq 'cs-method') {
441 $format .= '([A-Za-z]+)';
442 push(@tokens, 3);
443 $outstr .= '%s';
444 }
445 else { # unknown token
446 $format .= '\S+';
447 $outstr .= '-';
448 }
449 }
450 }
451
452 # The anonymizing functions
453 #
454 # The translations are looked up in a tree. Each node of the tree is a hash
455 # as follows:
456 # Keys: The part of the name being translated
457 # Values: A 2-element \array (translation, \hash of the same type recursively)
458 #
459 # All these functions work the same way.
460 # They are called with one argument from the main part of the
461 # program, a second argument (\sub-hash) when called recursively.
462 # The name is split into two components, $b to be translated immediately
463 # and $a, the rest. $b's (translation, \subhash) is assigned to @b.
464 sub anon_host {
465 local $_ = $_[0];
466 my @b;
467 my $numhost = 0;
468 if (!defined($_[1])) { tr/A-Z/a-z/; }
469 if (/\.$/) { $_ = substr($_, 0, length($_) - 1); } # strip trailing dot
470 my ($a, $b) = /^(.*)\.(.*)$/;
471 if (!defined($b)) { $a = ''; $b = $_; } # if no dot in (sub-)name
472
473 if (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}); }
474 else {
475 if ($_ eq '' || $_ eq '-') { return '-'; }
476 my($n1, $n2, $n3, $n4) =
477 /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;
478 if (defined($n1) && $n1 <= 255 && $n2 <= 255 && $n3 <= 255 &&
479 $n4 <= 255) { return(anon_numhost($_, \%hosttree)); }
480 if (!/\./ && !grep($b eq $_, @domains)) { # no dot in whole name
481 @b = lookup_or_create($b, \%hosttree);
482 } else { @b = lookup_or_create($b, \%hosttree, \@domains); }
483 }
484
485 if ($a eq '') { return($b[0]); }
486 else { return(anon_host($a, $b[1]) . '.' . $b[0]); }
487 }
488
489 sub anon_numhost { # Numerical hostnames
490 local $_ = $_[0];
491 my @b;
492 my ($b, $a) = /^(.*?)\.(.*)$/;
493 if (!defined($b)) { $a = ''; $b = $_; }
494 my @newnumber = (rand255($_[1]));
495 @b = lookup_or_create($b, $_[1], \@newnumber);
496 if ($a eq '') { return($b[0]); }
497 else { return($b[0] . '.' . anon_numhost($a, $b[1])); }
498 }
499
500 # anon_file also takes an optional argument number 2, overriding global
501 # $case_sensitive. If it is present, initial stripping of anchors and search
502 # arguments is also performed (because it is also used only on the first call
503 # for that data, in this case a referrer).
504 sub anon_file {
505 local $_ = $_[0];
506 my $args = '';
507 my $case = $case_sensitive;
508 my (@b, @tmp, $ans);
509 if (!defined($_[1]) || defined($_[2])) {
510 s/\#.*//; # remove anchors
511 if (s/\?.*//) { $args = '?args'; }
512 s/%([\da-fA-F]{2})/pack("C", hex($1))/ge; # change %7E to ~ etc.
513 if (defined($_[2])) { $case = $_[2]; }
514 if (!$case) { tr/A-Z/a-z/; }
515 }
516
517 my ($b, $a) = m[^/(.*?)(/.*)$];
518 if (!defined($b)) { # not two slashes in name
519 if (!m[^/]) { return '-'; }
520 # top level should always begin with slash, and lower levels forced to
521 $b = substr($_, 1);
522 if ($b eq '') { return "/$args"; }
523 if (grep($b eq $_, @unchfiles)) { return "/$b$args"; }
524 if (defined($_[1])) { @b = lookup_or_create_filename($b, \%{$_[1]}); }
525 else { @b = lookup_or_create_filename($b, \%filetree); }
526 return("/$b[0]$args");
527 }
528 # rest only reached if there were two slashes in name
529 if ($b eq '.' || $b eq '..') { # special case: leave these alone
530 @tmp = ($b);
531 if (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}, \@tmp); }
532 else { @b = lookup_or_create($b, \%filetree, \@tmp); }
533 }
534 elsif (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}); }
535 else { @b = lookup_or_create($b, \%filetree); }
536 return("/$b[0]" . anon_file($a, $b[1]) . $args);
537 }
538
539 # Referrers are a bit different because they're split into 3 parts:
540 # a scheme, a hostname and a filename. We only allow through http: and
541 # ftp: URLs. If the hostname is in the list of known $servernames, we preserve
542 # the hostname and use the existing local translations for the filename part.
543 # Otherwise we translate the hostname according to the existing hosttree, and
544 # use this as the root for the referrer tree.
545 sub anon_referrer {
546 local $_ = $_[0];
547 my @b;
548 my ($scheme, $hostname, $port, $path) = m[^(.*?)://(.*?)(:.*?)?(/.*)$];
549
550 if (!defined($scheme) || $scheme !~ /^(ht|f)tp/i) { return "-"; }
551 $scheme =~ tr/A-Z/a-z/;
552 if ($hostname =~ /\.$/) { # strip trailing dot
553 $hostname = substr($hostname, 0, length($hostname) - 1);
554 }
555 if (grep($_ eq $hostname, @servernames)) {
556 return($scheme . '://' . $hostname . anon_file($path));
557 }
558 else {
559 my @newhost = (anon_host($hostname));
560 @b = lookup_or_create($hostname, \%reftree, \@newhost);
561 return("$scheme://$b[0]$port" . anon_file($path, $b[1], 1));
562 }
563 }
564
565 sub anon_user { # users and virtual hosts aren't hierarchical
566 local $_ = $_[0];
567 my @b;
568
569 if ($_ eq '' || $_ eq '-') { return '-'; }
570 if (!$usercase_sensitive) { tr/A-Z/a-z/; }
571 @b = lookup_or_create($_, \%usertree);
572 return($b[0]);
573 }
574
575 sub anon_vhost {
576 local $_ = $_[0];
577 my @b;
578
579 if ($_ eq '' || $_ eq '-') { return '-'; }
580 tr/A-Z/a-z/;
581 @b = lookup_or_create($_, \%vhosttree);
582 return($b[0]);
583 }
584
585 # Look up an item (arg 0) in a tree node (arg 1), or create a new entry if
586 # necessary. The entry is selected from array arg 2 if present, else from
587 # dictionary entry of correct length, else a random string of correct length.
588 # See also lookup_or_create_filename below.
589 sub lookup_or_create {
590 my $n = $_[0];
591 my (@ans, %h);
592
593 unless (defined(${$_[1]}{$n})) {
594 if (defined($_[2])) { $ans[0] = random_entry($_[2], $_[1]); }
595 else {
596 $ans[0] = random_entry($dict[$matchlength?length($n):0], $_[1]);
597 }
598 unless (defined($ans[0])) {
599 $ans[0] = random_string(length($n), $_[1]);
600 }
601 # Start hash table so it isn't undef later
602 $h{''} = undef;
603 $ans[1] = \%h;
604 ${$_[1]}{$n} = \@ans;
605 }
606 return(@{${$_[1]}{$n}});
607 }
608
609 # The same as lookup_or_create above, but preserves the extension of filenames.
610 # (Actually, this is never called with 3 args, but we leave it in for possible
611 # future use, and to keep it parallel with the previous function).
612 sub lookup_or_create_filename {
613 local $_ = $_[0];
614 my (@ans, %h);
615
616 unless (defined(${$_[1]}{$_})) {
617 my ($name, $ext) = m[^(.*)(\..*)$];
618 if (!defined($name)) { $ext = ''; $name = $_; } # no extension
619 if (defined($_[2])) { $ans[0] = random_entry($_[2], $_[1], $ext); }
620 else {
621 $ans[0] = random_entry($dict[$matchlength?length($name):0], $_[1],
622 $ext);
623 }
624 unless (defined($ans[0])) {
625 $ans[0] = random_string(length($name), $_[1], $ext);
626 }
627 $h{''} = undef;
628 $ans[1] = \%h;
629 ${$_[1]}{$_} = \@ans;
630 }
631 return(@{${$_[1]}{$_}});
632 }
633
634 # Select a random entry from array arg 0, but must not occur as value in hash
635 # arg 1. If failed after $no_tries, give up and return undef.
636 # If arg 2 exists, then the random entry is "arg0_element . arg2" instead.
637 sub random_entry {
638 if ($_[0] == undef) { return undef; }
639 my @l = @{$_[0]};
640 my @v = values(%{$_[1]});
641 my ($ans, $k);
642
643 if (@l == ()) { return undef; }
644 for ($k = 0;
645 (!defined($ans) || grep {$ans eq ${$_}[0]} @v) && $k < $no_tries;
646 $k++) { $ans = $l[rand($#l + 0.9999999)] . $_[2]; }
647 if ($k < $no_tries) { return($ans); }
648 else { return undef; }
649 }
650
651 # Create random string, length given by arg 0 (unless global $matchlength is
652 # false), again must not occur as value in hash arg 1. Same arg2 as in
653 # random_entry. This time if failed after $no_tries2, return any answer.
654 sub random_string {
655 my $l = $_[0];
656 my @v = values(%{$_[1]});
657 my ($ans, $i, $j, $k);
658
659 if ($l == 0) { return(''); }
660 unless ($matchlength) { $l = int(5 + rand(6)); } # i.e. 5 to 10
661 for ($k = 0;
662 (!defined($ans) || grep {$ans eq ${$_}[0]} @v) && $k < $no_tries2;
663 $k++) {
664 $ans = '';
665 for ($i = 0; $i < $l; $i++) {
666 $j = 65 + rand(52);
667 if ($j >= 91) { $j += 6; }
668 $ans .= chr($j);
669 }
670 $ans .= $_[2];
671 }
672 return($ans);
673 }
674
675 # Select a random number from 0 to 255, but again not already occurring as
676 # value in hash (arg 0). This should never fail in the context of this
677 # program, but we use $no_tries2 again just in case.
678 sub rand255 {
679 my @v = values(%{$_[0]});
680 my ($ans, $k);
681
682 for ($k = 0;
683 (!defined($ans) || grep {$ans == ${$_}[0]} @v) && $k < $no_tries2;
684 $k++) { $ans = int(rand(255.9999999)); }
685 return($ans);
686 }
687
688 # Write out the translations to file TRANS.
689 # The 0th argument is the \hash to be interpreted;
690 # The 1st argument is 0 if the name-parts are collated backwards, 1 if
691 # forwards, 2 if they are hostnames (backwards unless numerical);
692 # The 2nd argument is the delimiter between name-parts, or empty string if they
693 # are not hierarchical.
694 # The 3rd argument says whether the delimiter should also occur at the start of
695 # the string.
696 # The 4th argument says whether the entry should still be printed if it is
697 # not (known to be) a leaf.
698 # The 5th and 6th arguments, if present, are the up-tree name-parts for the
699 # original and translated names respectively.
700 sub writeout {
701 my %hash = %{$_[0]};
702 my ($colorder, $delim, $initial, $printall) = ($_[1], $_[2], $_[3], $_[4]);
703 my ($partname_old, $partname_new) = ($_[5], $_[6]);
704 my ($name, @value, $name_new, $name_old, $newcolord, $fieldwidth);
705
706 foreach $name (sort { # Declare sort order inline so we can use %hash
707 # The first case is binned immediately below, but must catch here too
708 (!defined($hash{$a}) || !defined($hash{$b}))?(lc($a) cmp lc($b)):
709 (((${$hash{$a}}[0] + 0) <=> (${$hash{$b}}[0] + 0)) ||
710 # If they start with (or are) numbers, sort them that way
711 (lc(${$hash{$a}}[0]) cmp lc(${$hash{$b}}[0])) || # Usual ordering
712 (($a + 0) <=> ($b + 0)) || # Fallback to untranslated names: but by
713 (lc($a) cmp lc($b))) # construction this should (all but) never occur
714 } keys(%hash)) { # End of sort order. Phew.
715 if (defined($hash{$name})) {
716 @value = @{$hash{$name}};
717 if ($colorder == 2) { $newcolord = (($name =~ /^\d{1,3}$/)?1:0); }
718 else { $newcolord = $colorder; }
719 # NB Incomplete test for numerical hostname (unlike in anon_host)
720 if ($newcolord == 1) {
721 if (!$partname_old && !$initial) {
722 $name_new = "$value[0]";
723 $name_old = "$name";
724 } else {
725 $name_new = "$partname_new$delim$value[0]";
726 $name_old = "$partname_old$delim$name";
727 }
728 $fieldwidth = -$width;
729 }
730 else {
731 if (!$partname_old && !$initial) {
732 $name_new = "$value[0]";
733 $name_old = "$name";
734 } else {
735 $name_new = "$value[0]$delim$partname_new";
736 $name_old = "$name$delim$partname_old";
737 }
738 $fieldwidth = $width;
739 }
740 printf TRANS ("%*s = %*s\n",
741 $fieldwidth, $name_new, $fieldwidth, $name_old)
742 if ($printall || keys(%{$value[1]}) <= 1);
743 writeout($value[1], $newcolord, $delim, $initial, $printall,
744 $name_old, $name_new);
745 }
746 }
747 }