"Fossies" - the Fresh Open Source Software Archive

Member "anonlog-1.0.1/anonlog.pl" (24 Oct 2002, 25491 Bytes) of package /linux/www/old/anonlog-1.0.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

    1 #!/usr/bin/perl
    2 #
    3 ###        anonlog 1.0.1             http://anonlog.sourceforge.net/
    4 ### This program is copyright (c) Stephen R. E. Turner 2000-2002.
    5 ### It is free software; you can redistribute it and/or modify
    6 ### it under the terms of version 2 of the GNU General Public License as
    7 ### published by the Free Software Foundation.
    8 ###
    9 ### This program is distributed in the hope that it will be useful,
   10 ### but without any warranty; without even the implied warranty of
   11 ### merchantability or fitness for a particular purpose.  See the
   12 ### GNU General Public License for more details.
   13 ###
   14 ### You should have received a copy of the GNU General Public License
   15 ### along with this program; if not, see http://www.gnu.org/copyleft/gpl.html
   16 ### or write to the Free Software Foundation, Inc., 59 Temple Place,
   17 ### Suite 330, Boston, MA 02111-1307, USA
   18 
   19 require 5.004;  # for rand()
   20 use strict;
   21 my ($conffile, $logfile, $logformat, $newlog, $dictionary, $translations,
   22     $servernames, $unchfiles, $matchlength,
   23     $case_sensitive, $usercase_sensitive);
   24 
   25 # ======== User-settable parameters start here ======== #
   26 #
   27 # NB All parameters can also be set in the configuration file, $conffile,
   28 # normally anonlog.cfg. This is usually more convenient. The variables here
   29 # have the same names as those in anonlog.cfg (with the addition of a $ at
   30 # the front). So see Readme.html for documentation on the various options.
   31 #
   32 # On Unix, you can edit the top line of this program to give the location of
   33 # Perl on your system. (Try 'which perl' to find out).
   34 #
   35 # The configuration file to override all these options.
   36 # $conffile = ''; for none.
   37 $conffile = 'anonlog.cfg';
   38 
   39 $logfile = 'logfile.log';
   40 $newlog = '';
   41 $servernames = '';
   42 $logformat = '';
   43 $dictionary = 'dictionary';
   44 $translations = '';
   45 $unchfiles = 'index.html';
   46 $matchlength = 0;
   47 $case_sensitive = 1;
   48 $usercase_sensitive = 0;
   49 
   50 # ======== User-settable parameters end here ======== #
   51 
   52 my $progname = $0 || 'anonlog';
   53 my $version = '1.0.1';  # version of this program
   54 my $progurl = 'http://anonlog.sourceforge.net/';
   55 
   56 # All legal domain names
   57 my @domains = ("ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq",
   58            "ar", "as", "at", "au", "aw", "az", "ba", "bb", "bd", "be",
   59            "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs",
   60            "bt", "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg",
   61            "ch", "ci", "ck", "cl", "cm", "cn", "co", "com", "cr", "cs",
   62            "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do",
   63            "dz", "ec", "edu", "ee", "eg", "eh", "er", "es", "et", "fi",
   64            "fj", "fk", "fm", "fo", "fr", "fx", "ga", "gb", "gd", "ge",
   65            "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gov", "gp", "gq",
   66            "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr",
   67            "ht", "hu", "id", "ie", "il", "im", "in", "int", "io", "iq",
   68            "ir", "is", "it", "je", "jm", "jo", "jp", "ke", "kg", "kh",
   69            "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz", "la", "lb",
   70            "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma",
   71            "mc", "md", "mg", "mh", "mil", "mk", "ml", "mm", "mn", "mo",
   72            "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my",
   73            "mz", "na", "nc", "ne", "net", "nf", "ng", "ni", "nl", "no",
   74            "np", "nr", "nu", "nz", "om", "org", "pa", "pe", "pf", "pg",
   75            "ph", "pk", "pl", "pm", "pn", "pr", "pt", "pw", "py", "qa",
   76            "re", "ro", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg",
   77            "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "st",
   78            "su", "sv", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj",
   79            "tk", "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz",
   80            "ua", "ug", "uk", "um", "us", "uy", "uz", "va", "vc", "ve",
   81            "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "yu", "za",
   82            "zm", "zr", "zw");
   83 my $no_tries = 100;    # See random_entry() and random_string()
   84 my $no_tries2 = 2000;
   85 my $width = 35;        # See writeout
   86 my $firstline;
   87 
   88 my @data;
   89 # @data has the folloing components:
   90 # 0 = host; 1 = user; 2 = date/time; 3 = HTTP method; 4 = filename;
   91 # 5 = HTTP version / W3SVC string; 6 = HTTP status code; 7 = bytes sent;
   92 # 8 = referrer; 9 = browser; 10 = virtual hostname; 11 = processing time;
   93 # 12 = bytes received, 13 = IIS status; 14 = search args;
   94 # 15 = time separate from date.
   95 my (%hosttree, %filetree, %reftree, %usertree, %vhosttree);
   96 my (@servernames, @unchfiles, @dict);
   97 
   98 # Now all log formats
   99 
  100 my ($format, @tokens, $outstr);
  101 
  102 my $commonfmt = <<'HERE';
  103     (\S*)\            # host
  104     \S+\              # (unused)
  105     (\S+)\            # user
  106     \[([^\]]+)\]\     # date and time
  107     \"\s*([A-Za-z]+)\s+(.+?)(?:\s+(HTTP\/\d.\d))?\s*\"\   # request line
  108     (\d{3})\          # status code
  109     (\d+|-)           # bytes
  110 HERE
  111 my @commontokens = (0..7);
  112 my $commonout = "%s - %s [%s] \"%s %s %s\" %s %s";
  113 
  114 my $combfmt = <<'HERE';
  115     (\S*)\            # host
  116     \S+\              # (unused)
  117     (\S+)\            # user
  118     \[([^\]]+)\]\     # date and time
  119     \"\s*([A-Za-z]+)\s+(.+?)(?:\s+(HTTP\/\d.\d))?\s*\"\   # request line
  120     (\d{3})\          # status code
  121     (\d+|-)\          # bytes
  122     \"(.*)\"\         # referrer
  123     \"([^\"]*)\"      # browser
  124 HERE
  125 my @combtokens = (0..9);
  126 my $combout = "%s - %s [%s] \"%s %s %s\" %s %s \"%s\" \"%s\"";
  127 
  128 my $iisfmt = <<'HERE';
  129     ([^,]*),\             # host
  130     ([^,]*),\             # user
  131     ([^,]*,\ [^,]*),\     # date and time
  132     (W3SVC[^,]*),\        # W3SVC line
  133     ([^,]*),\             # server name
  134     [^,]*,\               # server address
  135     (\d+|-),\             # processing time
  136     (\d+|-),\             # bytes received
  137     (\d+|-),\             # bytes sent
  138     (\d{3}|-),\           # HTTP status code
  139     ([^,]*),\             # IIS status
  140     ([^,]*),\             # Operation
  141     ([^,]*),\             # Filename
  142     ([^,]*),\ ?           # Search args
  143 HERE
  144 my @iistokens = (0..2, 5, 10..12, 7, 6, 13, 3, 4, 14);
  145 my $iisout = "%s, %s, %s, %s, %s, -, %s, %s, %s, %s, %s, %s, %s, %s,";
  146 
  147 my $msext = 0;   # Whether extended format is genuine or Microsoft version
  148 
  149 # ======== End of global declarations, start of main program ======== #
  150 
  151 if ($conffile) {
  152     open(CONFFILE, $conffile) ||
  153     die "$progname: Cannot open configuration file $conffile: $!\n";
  154     parse_config();
  155 }
  156 
  157 open(LOGFILE, $logfile) || die "$progname: Cannot open logfile $logfile: $!\n";
  158 if ($newlog eq '') { $newlog = '-'; }
  159 open(NEWLOG, ">$newlog") ||
  160     die "$progname: Cannot write to new logfile $newlog: $!\n";
  161 
  162 @servernames = split(/,\s*/, $servernames);
  163 @unchfiles = split(/,\s*/, $unchfiles);
  164 
  165 if ($dictionary) {
  166     unless (open(DICT, $dictionary)) {
  167     warn "$progname: Cannot open dictionary $dictionary: $!\n";
  168     } else { construct_dict(); }
  169 }
  170 
  171 # == End of initialisation, now process logfile == #
  172 
  173 $firstline = 1;
  174 while (<LOGFILE>) {
  175     if ($firstline) {
  176     detect_format();
  177     $firstline = 0;
  178     }
  179     if ($logformat eq 'extended' && /^\#/) {
  180     # special case: extended format, line beginning with #
  181     if (/^\#Fields:\s/) { parse_extfmt(); }
  182     print NEWLOG;
  183     }
  184     else {
  185     @data[@tokens] = /^$format$/x;
  186     unless (defined($data[$tokens[0]])) {
  187         print STDERR "$progname: Unparseable line: ";
  188         print STDERR;
  189     }
  190     else {
  191         $data[0] = anon_host($data[0]);
  192         $data[4] = anon_file($data[4]);
  193         $data[8] = anon_referrer($data[8]);
  194         $data[1] = anon_user($data[1]);
  195         $data[10] = anon_vhost($data[10]);
  196         if ($data[14] ne '' && $data[14] ne '-') { $data[14] = 'args'; }
  197         printf NEWLOG "$outstr\n", @data[@tokens];
  198     }
  199     }
  200 }
  201 
  202 # == Finished processing logfile, finally output translations == #
  203 
  204 if ($translations) {
  205     unless (open(TRANS, ">$translations")) {
  206     warn "$progname: Cannot write to translation file $translations: $!\n";
  207     } else {
  208     if (%filetree) {
  209         print TRANS "** FILES **\n\n";
  210         writeout(\%filetree, 1, '/', 1, 1);
  211     }
  212     if (%hosttree) {
  213         print TRANS "\n** HOSTS **\n\n";
  214         writeout(\%hosttree, 2, '.', 0, 0);
  215     }
  216     if (%reftree) {
  217         print TRANS "\n** REFERRERS **\n\n";
  218         writeout(\%reftree, 1, '/', 0, 0);
  219     }
  220     if (%usertree) {
  221         print TRANS "\n** USERS **\n\n";
  222         writeout(\%usertree, 1, '', 0, 0);
  223     }
  224     if (%vhosttree) {
  225         print TRANS "\n** VIRTUAL HOSTS **\n\n";
  226         writeout(\%vhosttree, 1, '', 0, 0);
  227     }
  228     }
  229 }
  230 
  231 # ======== End of main program. Rest is subroutines. ======== #
  232 
  233 # Parse the configuration file.
  234 sub parse_config {
  235     my ($name, $value);
  236 
  237     while (<CONFFILE>) {
  238     chomp;
  239     s/\#.*$//;    # Remove comments
  240     if (/\S/) {   # If any non-space character left on line
  241         ($name, $value) = /^\s*(.*?)\s*=\s*(.*?)\s*$/;
  242         $name =~ tr/A-Z/a-z/;
  243         if (!defined($name)) {
  244         warn "$progname: Can't parse configuration line: $_\n";
  245         }
  246         elsif ($name eq 'logfile' && $value ne '') { $logfile = $value }
  247         elsif ($name eq 'logformat') { $logformat = $value }
  248         elsif ($name eq 'newlog') { $newlog = $value }
  249         elsif ($name eq 'dictionary') { $dictionary = $value }
  250         elsif ($name eq 'translations') { $translations = $value }
  251         elsif ($name eq 'servernames') { $servernames = $value }
  252         elsif ($name eq 'unchfiles') { $unchfiles = $value }
  253         elsif ($name eq 'matchlength' &&
  254            ($value eq '0' || $value == '1')) {
  255         $matchlength = $value;
  256         }
  257         elsif ($name eq 'case_sensitive' &&
  258            ($value eq '0' || $value == '1')) {
  259         $case_sensitive = $value;
  260         }
  261         elsif ($name eq 'usercase_sensitive' &&
  262            ($value eq '0' || $value == '1')) {
  263         $usercase_sensitive = $value;
  264         }
  265         else {
  266         warn "$progname: Can't understand configuration line: $_\n";
  267         }
  268     }
  269     }
  270 }
  271 
  272 # Construct the dictionary.
  273 sub construct_dict {
  274     local $_;
  275     my ($w, $d, $i, $tmp, @words, @ignore, %h);
  276 
  277     while (<DICT>) { $w .= $_ }
  278     @words = split(/\s+/, $w);
  279     @ignore = map(/^([^.]*)/, @unchfiles);
  280     # @ignore contains the 'index' in index.html (or index.html.gz or index)
  281     # We delete them from the dictionary below (could instead be careful in
  282     # lookup_or_create_filename, but this is easier and faster).
  283     foreach (@ignore) { tr/A-Z/a-z/; }
  284     # Put words of length l into the array at $dict[l].
  285     foreach (@words) {
  286     tr/A-Z/a-z/;
  287     $tmp = $_;
  288     $i = $matchlength?length():0;
  289     push(@{$dict[$i]}, $_)  # Take only words, and not in @ignore
  290         unless (/[^a-z]/ || grep($tmp eq $_, @ignore));
  291     }
  292     foreach $d (@dict) { if ($d == undef) { @$d = (); }}
  293 }
  294 
  295 # Detect logfile format from first line. (NB Line is already in (global) $_ ).
  296 sub detect_format {
  297     my $i;
  298 
  299     unless ($logformat) {
  300     if ((split /,\s*/) == 15) { $logformat = 'iis'; }
  301     elsif (($i = index($_, '[')) >= 6 && substr($_, $i + 27, 1) eq ']' &&
  302            index($_, '"') == $i + 29) {
  303         if (($i = split(/\"/)) == 3) { $logformat = 'common'; }
  304         elsif ($i == 7) { $logformat = 'combined'; }
  305     }
  306     elsif (/^\#/) { $logformat = 'extended'; }
  307     unless ($logformat) { die "$progname: Can't detect format of logfile $logfile from first line: specify it in $conffile\n"; }
  308     }
  309 
  310     $logformat =~ tr/A-Z/a-z/;
  311     if ($logformat eq 'common') {
  312     print STDERR "$progname: Reading $logfile in common format\n";
  313     $format = $commonfmt;
  314     @tokens = @commontokens;
  315     $outstr = $commonout;
  316     }
  317     elsif ($logformat eq 'combined') {
  318     print STDERR "$progname: Reading $logfile in combined format\n";
  319     $format = $combfmt;
  320     @tokens = @combtokens;
  321     $outstr = $combout;
  322     }
  323     elsif ($logformat eq 'iis') {
  324     print STDERR "$progname: Reading $logfile in IIS format\n";
  325     $format = $iisfmt;
  326     @tokens = @iistokens;
  327     $outstr = $iisout;
  328     }
  329     elsif ($logformat eq 'extended' || $logformat eq 'ms-extended') {
  330     # In this case, have to construct the log format from the #Fields line
  331     if ($logformat eq 'ms-extended') { $msext = 1; }
  332     $logformat = 'extended';
  333     while (/^\#/) {   # process all # lines before handing back
  334         if (/^\#Software: Microsoft Internet Information Serv/) {
  335         $msext = 1;
  336         }
  337         elsif (/^\#Fields:\s/) { parse_extfmt(); }
  338         print NEWLOG;
  339         $_ = <LOGFILE>;
  340     }
  341     print NEWLOG "#Remark: Logfile anonymized by anonlog $version, $progurl\n";
  342     if ($msext) { print STDERR "$progname: Reading $logfile in Microsoft extended format\n"; }
  343     else { print STDERR "$progname: Reading $logfile in W3C extended format\n"; }
  344     }
  345     else { die "$progname: Don't know what you mean by 'logformat = $logformat' in $conffile\n"; }
  346 }
  347 
  348 # Parse the #Fields: line from an extended format logfile. The #Fields: line
  349 # is already in (global) $_ .
  350 sub parse_extfmt { 
  351     my ($i, $first);
  352 
  353     $format = '';
  354     @tokens = ();
  355     $outstr = '';
  356     $first = 1;
  357 
  358     foreach $i (split(' ', substr($_, 9))) {  # substr skips "#Fields: " itself
  359     if ($first) { $first = 0; }
  360     else { $format .= '\s+'; $outstr .= "\t"; }
  361     $i =~ tr/A-Z/a-z/;
  362     if ($i eq 'date') {
  363         $format .= '(\d{4}-\d{2}-\d{2})';
  364         push(@tokens, 2);
  365         $outstr .= '%s';
  366     }
  367     elsif ($i eq 'time') {
  368         $format .= '(\d{2}:\d{2}(?::\d{2}(?:\.\d*)?)?)';
  369         push(@tokens, 15);
  370         $outstr .= '%s';
  371     }
  372     elsif ($i eq 'bytes' || $i eq 'sc-bytes') {
  373         $format .= '(\d+|-)';
  374         push(@tokens, 7);
  375         $outstr .= '%d';
  376     }
  377     elsif ($i eq 'cs-bytes') {
  378         $format .= '(\d+|-)';
  379         push(@tokens, 12);
  380         $outstr .= '%d';
  381     }
  382     elsif ($i eq 'sc-status') {
  383         $format .= '(\d{3})';
  384         push(@tokens, 6);
  385         $outstr .= '%d';
  386     }
  387     elsif ($i eq 'c-dns' || $i eq 'cs-dns' ||
  388            $i eq 'c-ip' || $i eq 'cs-ip') {
  389         $format .= '(\S+)';
  390         push(@tokens, 0);
  391         $outstr .= '%s';
  392     }
  393     elsif ($i eq 'cs-uri' || $i eq 'cs-uri-stem') {
  394         $format .= '(\S+)';
  395         push(@tokens, 4);
  396         $outstr .= '%s';
  397     }
  398     elsif ($i eq 'cs(referer)') {
  399         if ($msext) { $format .= '(\S+)'; $outstr .= '%s'; }
  400         else { $format .= '\"(.*?)\"'; $outstr .= '"%s"'; }
  401         push(@tokens, 8);
  402     }
  403     elsif ($i eq 'cs(user-agent)') {
  404         if ($msext) { $format .= '(\S+)'; $outstr .= '%s'; }
  405         else { $format .= '\"(.*?)\"'; $outstr .= '"%s"'; }
  406         push(@tokens, 9);
  407     }
  408     elsif ($i eq 'cs-host' || $i eq 's-ip' || $i eq 's-dns' ||
  409            $i eq 'cs-sip' || $i eq 's-sitename' ||
  410            $i eq 's-computername') {
  411         $format .= '(\S+)';
  412         push(@tokens, 10);
  413         $outstr .= '%s';
  414     }
  415     elsif ($i eq 'cs(host)') {
  416         $format .= '\"(.*?)\"';
  417         push(@tokens, 10);
  418         $outstr .= '"%s"';
  419     }
  420     elsif ($i eq 'cs-uri-query') {
  421         $format .= '(\S+)';
  422         push(@tokens, 14);
  423         $outstr .= '%s';
  424     }
  425     elsif ($i eq 'cs-username') {
  426         $format .= '(\S+)';
  427         push(@tokens, 1);
  428         $outstr .= '%s';
  429     }
  430     elsif ($i eq 'cs(from)') {
  431         $format .= '\"(.*?)\"';
  432         push(@tokens, 1);
  433         $outstr .= '"%s"';
  434     }
  435     elsif ($i eq 'time-taken') {
  436         $format .= '([\d\.]+|-)';
  437         push(@tokens, 11);
  438         $outstr .= '%s';
  439     }
  440     elsif ($i eq 'cs-method') {
  441         $format .= '([A-Za-z]+)';
  442         push(@tokens, 3);
  443         $outstr .= '%s';
  444     }
  445     else {  # unknown token
  446         $format .= '\S+';
  447         $outstr .= '-';
  448     }
  449     }
  450 }
  451 
  452 # The anonymizing functions
  453 #
  454 # The translations are looked up in a tree. Each node of the tree is a hash
  455 # as follows:
  456 # Keys: The part of the name being translated
  457 # Values: A 2-element \array (translation, \hash of the same type recursively)
  458 #
  459 # All these functions work the same way.
  460 # They are called with one argument from the main part of the
  461 # program, a second argument (\sub-hash) when called recursively.
  462 # The name is split into two components, $b to be translated immediately
  463 # and $a, the rest. $b's (translation, \subhash) is assigned to @b.
  464 sub anon_host {
  465     local $_ = $_[0];
  466     my @b;
  467     my $numhost = 0;
  468     if (!defined($_[1])) { tr/A-Z/a-z/; }
  469     if (/\.$/) { $_ = substr($_, 0, length($_) - 1); } # strip trailing dot
  470     my ($a, $b) = /^(.*)\.(.*)$/;
  471     if (!defined($b)) { $a = ''; $b = $_; }  # if no dot in (sub-)name
  472 
  473     if (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}); }
  474     else {
  475     if ($_ eq '' || $_ eq '-') { return '-'; }
  476     my($n1, $n2, $n3, $n4) =
  477         /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;
  478     if (defined($n1) && $n1 <= 255 && $n2 <= 255 && $n3 <= 255 &&
  479         $n4 <= 255) { return(anon_numhost($_, \%hosttree)); }
  480     if (!/\./ && !grep($b eq $_, @domains)) {  # no dot in whole name
  481         @b = lookup_or_create($b, \%hosttree);
  482     } else { @b = lookup_or_create($b, \%hosttree, \@domains); }
  483     }
  484 
  485     if ($a eq '') { return($b[0]); }
  486     else { return(anon_host($a, $b[1]) . '.' . $b[0]); }
  487 }
  488 
  489 sub anon_numhost {     # Numerical hostnames
  490     local $_ = $_[0];
  491     my @b;
  492     my ($b, $a) = /^(.*?)\.(.*)$/;
  493     if (!defined($b)) { $a = ''; $b = $_; }
  494     my @newnumber = (rand255($_[1]));
  495     @b = lookup_or_create($b, $_[1], \@newnumber);
  496     if ($a eq '') { return($b[0]); }
  497     else { return($b[0] . '.' . anon_numhost($a, $b[1])); }
  498 }
  499 
  500 # anon_file also takes an optional argument number 2, overriding global
  501 # $case_sensitive. If it is present, initial stripping of anchors and search
  502 # arguments is also performed (because it is also used only on the first call
  503 # for that data, in this case a referrer).
  504 sub anon_file {
  505     local $_ = $_[0];
  506     my $args = '';
  507     my $case = $case_sensitive;
  508     my (@b, @tmp, $ans);
  509     if (!defined($_[1]) || defined($_[2])) {
  510     s/\#.*//;  # remove anchors
  511     if (s/\?.*//) { $args = '?args'; }
  512     s/%([\da-fA-F]{2})/pack("C", hex($1))/ge;  # change %7E to ~ etc.
  513     if (defined($_[2])) { $case = $_[2]; }
  514     if (!$case) { tr/A-Z/a-z/; }
  515     }
  516 
  517     my ($b, $a) = m[^/(.*?)(/.*)$];
  518     if (!defined($b)) {  # not two slashes in name
  519     if (!m[^/]) { return '-'; }
  520     # top level should always begin with slash, and lower levels forced to
  521     $b = substr($_, 1);
  522     if ($b eq '') { return "/$args"; }
  523     if (grep($b eq $_, @unchfiles)) { return "/$b$args"; }
  524     if (defined($_[1])) { @b = lookup_or_create_filename($b, \%{$_[1]}); }
  525     else { @b = lookup_or_create_filename($b, \%filetree); }
  526     return("/$b[0]$args");
  527     }
  528     # rest only reached if there were two slashes in name
  529     if ($b eq '.' || $b eq '..') {  # special case: leave these alone
  530     @tmp = ($b);
  531     if (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}, \@tmp); }
  532     else { @b = lookup_or_create($b, \%filetree, \@tmp); }
  533     }
  534     elsif (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}); }
  535     else { @b = lookup_or_create($b, \%filetree); }
  536     return("/$b[0]" . anon_file($a, $b[1]) . $args);
  537 }
  538 
  539 # Referrers are a bit different because they're split into 3 parts:
  540 # a scheme, a hostname and a filename. We only allow through http: and
  541 # ftp: URLs. If the hostname is in the list of known $servernames, we preserve
  542 # the hostname and use the existing local translations for the filename part.
  543 # Otherwise we translate the hostname according to the existing hosttree, and
  544 # use this as the root for the referrer tree.
  545 sub anon_referrer {
  546     local $_ = $_[0];
  547     my @b;
  548     my ($scheme, $hostname, $port, $path) = m[^(.*?)://(.*?)(:.*?)?(/.*)$];
  549 
  550     if (!defined($scheme) || $scheme !~ /^(ht|f)tp/i) { return "-"; }
  551     $scheme =~ tr/A-Z/a-z/;
  552     if ($hostname =~ /\.$/) {  # strip trailing dot
  553     $hostname = substr($hostname, 0, length($hostname) - 1);
  554     }
  555     if (grep($_ eq $hostname, @servernames)) {
  556     return($scheme . '://' . $hostname . anon_file($path));
  557     }
  558     else {
  559     my @newhost = (anon_host($hostname));
  560     @b = lookup_or_create($hostname, \%reftree, \@newhost);
  561     return("$scheme://$b[0]$port" . anon_file($path, $b[1], 1));
  562     }
  563 }
  564 
  565 sub anon_user {   # users and virtual hosts aren't hierarchical
  566     local $_ = $_[0];
  567     my @b;
  568 
  569     if ($_ eq '' || $_ eq '-') { return '-'; }
  570     if (!$usercase_sensitive) { tr/A-Z/a-z/; }
  571     @b = lookup_or_create($_, \%usertree);
  572     return($b[0]);
  573 }
  574 
  575 sub anon_vhost {
  576     local $_ = $_[0];
  577     my @b;
  578 
  579     if ($_ eq '' || $_ eq '-') { return '-'; }
  580     tr/A-Z/a-z/;
  581     @b = lookup_or_create($_, \%vhosttree);
  582     return($b[0]);
  583 }
  584 
  585 # Look up an item (arg 0) in a tree node (arg 1), or create a new entry if
  586 # necessary. The entry is selected from array arg 2 if present, else from
  587 # dictionary entry of correct length, else a random string of correct length.
  588 # See also lookup_or_create_filename below.
  589 sub lookup_or_create {
  590     my $n = $_[0];
  591     my (@ans, %h);
  592 
  593     unless (defined(${$_[1]}{$n})) {
  594     if (defined($_[2])) { $ans[0] = random_entry($_[2], $_[1]); }
  595     else {
  596         $ans[0] = random_entry($dict[$matchlength?length($n):0], $_[1]);
  597     }
  598     unless (defined($ans[0])) {
  599         $ans[0] = random_string(length($n), $_[1]);
  600     }
  601     # Start hash table so it isn't undef later
  602     $h{''} = undef;
  603     $ans[1] = \%h;
  604     ${$_[1]}{$n} = \@ans;
  605     }
  606     return(@{${$_[1]}{$n}});
  607 }
  608 
  609 # The same as lookup_or_create above, but preserves the extension of filenames.
  610 # (Actually, this is never called with 3 args, but we leave it in for possible
  611 # future use, and to keep it parallel with the previous function).
  612 sub lookup_or_create_filename {
  613     local $_ = $_[0];
  614     my (@ans, %h);
  615 
  616     unless (defined(${$_[1]}{$_})) {
  617         my ($name, $ext) = m[^(.*)(\..*)$];
  618     if (!defined($name)) { $ext = ''; $name = $_; }  # no extension
  619     if (defined($_[2])) { $ans[0] = random_entry($_[2], $_[1], $ext); }
  620     else {
  621         $ans[0] = random_entry($dict[$matchlength?length($name):0], $_[1],
  622                    $ext);
  623     }
  624     unless (defined($ans[0])) {
  625         $ans[0] = random_string(length($name), $_[1], $ext);
  626     }
  627     $h{''} = undef;
  628     $ans[1] = \%h;
  629     ${$_[1]}{$_} = \@ans;
  630     }
  631     return(@{${$_[1]}{$_}});
  632 }
  633 
  634 # Select a random entry from array arg 0, but must not occur as value in hash
  635 # arg 1. If failed after $no_tries, give up and return undef.
  636 # If arg 2 exists, then the random entry is "arg0_element . arg2" instead.
  637 sub random_entry {
  638     if ($_[0] == undef) { return undef; }
  639     my @l = @{$_[0]};
  640     my @v = values(%{$_[1]});
  641     my ($ans, $k);
  642 
  643     if (@l == ()) { return undef; }
  644     for ($k = 0;
  645      (!defined($ans) || grep {$ans eq ${$_}[0]} @v) && $k < $no_tries;
  646      $k++) { $ans = $l[rand($#l + 0.9999999)] . $_[2]; }
  647     if ($k < $no_tries) { return($ans); }
  648     else { return undef; }
  649 }
  650 
  651 # Create random string, length given by arg 0 (unless global $matchlength is
  652 # false), again must not occur as value in hash arg 1. Same arg2 as in
  653 # random_entry. This time if failed after $no_tries2, return any answer.
  654 sub random_string {
  655     my $l = $_[0];
  656     my @v = values(%{$_[1]});
  657     my ($ans, $i, $j, $k);
  658 
  659     if ($l == 0) { return(''); }
  660     unless ($matchlength) { $l = int(5 + rand(6)); }  # i.e. 5 to 10
  661     for ($k = 0;
  662      (!defined($ans) || grep {$ans eq ${$_}[0]} @v) && $k < $no_tries2;
  663      $k++) {
  664     $ans = '';
  665     for ($i = 0; $i < $l; $i++) {
  666         $j = 65 + rand(52);
  667         if ($j >= 91) { $j += 6; }
  668         $ans .= chr($j);
  669     }
  670     $ans .= $_[2];
  671     }
  672     return($ans);
  673 }
  674 
  675 # Select a random number from 0 to 255, but again not already occurring as
  676 # value in hash (arg 0). This should never fail in the context of this
  677 # program, but we use $no_tries2 again just in case.
  678 sub rand255 {
  679     my @v = values(%{$_[0]});
  680     my ($ans, $k);
  681 
  682     for ($k = 0;
  683      (!defined($ans) || grep {$ans == ${$_}[0]} @v) && $k < $no_tries2;
  684      $k++) { $ans = int(rand(255.9999999)); }
  685     return($ans);
  686 }
  687 
  688 # Write out the translations to file TRANS.
  689 # The 0th argument is the \hash to be interpreted;
  690 # The 1st argument is 0 if the name-parts are collated backwards, 1 if
  691 # forwards, 2 if they are hostnames (backwards unless numerical);
  692 # The 2nd argument is the delimiter between name-parts, or empty string if they
  693 # are not hierarchical.
  694 # The 3rd argument says whether the delimiter should also occur at the start of
  695 # the string.
  696 # The 4th argument says whether the entry should still be printed if it is
  697 # not (known to be) a leaf.
  698 # The 5th and 6th arguments, if present, are the up-tree name-parts for the
  699 # original and translated names respectively.
  700 sub writeout {
  701     my %hash = %{$_[0]};
  702     my ($colorder, $delim, $initial, $printall) = ($_[1], $_[2], $_[3], $_[4]);
  703     my ($partname_old, $partname_new) = ($_[5], $_[6]);
  704     my ($name, @value, $name_new, $name_old, $newcolord, $fieldwidth);
  705 
  706     foreach $name (sort {  # Declare sort order inline so we can use %hash
  707     # The first case is binned immediately below, but must catch here too
  708     (!defined($hash{$a}) || !defined($hash{$b}))?(lc($a) cmp lc($b)):
  709         (((${$hash{$a}}[0] + 0) <=> (${$hash{$b}}[0] + 0)) ||
  710     # If they start with (or are) numbers, sort them that way
  711          (lc(${$hash{$a}}[0]) cmp lc(${$hash{$b}}[0])) || # Usual ordering
  712          (($a + 0) <=> ($b + 0)) ||    # Fallback to untranslated names: but by
  713          (lc($a) cmp lc($b)))  # construction this should (all but) never occur
  714     } keys(%hash)) {       # End of sort order. Phew.
  715     if (defined($hash{$name})) {
  716         @value = @{$hash{$name}};
  717         if ($colorder == 2) { $newcolord = (($name =~ /^\d{1,3}$/)?1:0); }
  718         else { $newcolord = $colorder; }
  719         # NB Incomplete test for numerical hostname (unlike in anon_host)
  720         if ($newcolord == 1) {
  721         if (!$partname_old && !$initial) {
  722             $name_new = "$value[0]";
  723             $name_old = "$name";
  724         } else {
  725             $name_new = "$partname_new$delim$value[0]";
  726             $name_old = "$partname_old$delim$name";
  727         }
  728         $fieldwidth = -$width;
  729         }
  730         else {
  731         if (!$partname_old && !$initial) {
  732             $name_new = "$value[0]";
  733             $name_old = "$name";
  734         } else {
  735             $name_new = "$value[0]$delim$partname_new";
  736             $name_old = "$name$delim$partname_old";
  737         }
  738         $fieldwidth = $width;
  739         }
  740             printf TRANS ("%*s  =  %*s\n",
  741                           $fieldwidth, $name_new, $fieldwidth, $name_old)
  742                 if ($printall || keys(%{$value[1]}) <= 1);
  743         writeout($value[1], $newcolord, $delim, $initial, $printall,
  744              $name_old, $name_new);
  745     }
  746     }
  747 }