"Fossies" - the Fresh Open Source Software Archive

Member "anonlog-1.0.1/anonlog.pl" (24 Oct 2002, 25491 Bytes) of package /linux/www/old/anonlog-1.0.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "anonlog.pl" see the Fossies "Dox" file reference documentation.

    1 #!/usr/bin/perl
    2 #
    3 ###        anonlog 1.0.1             http://anonlog.sourceforge.net/
    4 ### This program is copyright (c) Stephen R. E. Turner 2000-2002.
    5 ### It is free software; you can redistribute it and/or modify
    6 ### it under the terms of version 2 of the GNU General Public License as
    7 ### published by the Free Software Foundation.
    8 ###
    9 ### This program is distributed in the hope that it will be useful,
   10 ### but without any warranty; without even the implied warranty of
   11 ### merchantability or fitness for a particular purpose.  See the
   12 ### GNU General Public License for more details.
   13 ###
   14 ### You should have received a copy of the GNU General Public License
   15 ### along with this program; if not, see http://www.gnu.org/copyleft/gpl.html
   16 ### or write to the Free Software Foundation, Inc., 59 Temple Place,
   17 ### Suite 330, Boston, MA 02111-1307, USA
   18 
   19 require 5.004;  # for rand()
   20 use strict;
   21 my ($conffile, $logfile, $logformat, $newlog, $dictionary, $translations,
   22     $servernames, $unchfiles, $matchlength,
   23     $case_sensitive, $usercase_sensitive);
   24 
   25 # ======== User-settable parameters start here ======== #
   26 #
   27 # NB All parameters can also be set in the configuration file, $conffile,
   28 # normally anonlog.cfg. This is usually more convenient. The variables here
   29 # have the same names as those in anonlog.cfg (with the addition of a $ at
   30 # the front). So see Readme.html for documentation on the various options.
   31 #
   32 # On Unix, you can edit the top line of this program to give the location of
   33 # Perl on your system. (Try 'which perl' to find out).
   34 #
   35 # The configuration file to override all these options.
   36 # $conffile = ''; for none.
   37 $conffile = 'anonlog.cfg';
   38 
   39 $logfile = 'logfile.log';
   40 $newlog = '';
   41 $servernames = '';
   42 $logformat = '';
   43 $dictionary = 'dictionary';
   44 $translations = '';
   45 $unchfiles = 'index.html';
   46 $matchlength = 0;
   47 $case_sensitive = 1;
   48 $usercase_sensitive = 0;
   49 
   50 # ======== User-settable parameters end here ======== #
   51 
   52 my $progname = $0 || 'anonlog';
   53 my $version = '1.0.1';  # version of this program
   54 my $progurl = 'http://anonlog.sourceforge.net/';
   55 
   56 # All legal domain names
   57 my @domains = ("ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq",
   58            "ar", "as", "at", "au", "aw", "az", "ba", "bb", "bd", "be",
   59            "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs",
   60            "bt", "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg",
   61            "ch", "ci", "ck", "cl", "cm", "cn", "co", "com", "cr", "cs",
   62            "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do",
   63            "dz", "ec", "edu", "ee", "eg", "eh", "er", "es", "et", "fi",
   64            "fj", "fk", "fm", "fo", "fr", "fx", "ga", "gb", "gd", "ge",
   65            "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gov", "gp", "gq",
   66            "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr",
   67            "ht", "hu", "id", "ie", "il", "im", "in", "int", "io", "iq",
   68            "ir", "is", "it", "je", "jm", "jo", "jp", "ke", "kg", "kh",
   69            "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz", "la", "lb",
   70            "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma",
   71            "mc", "md", "mg", "mh", "mil", "mk", "ml", "mm", "mn", "mo",
   72            "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my",
   73            "mz", "na", "nc", "ne", "net", "nf", "ng", "ni", "nl", "no",
   74            "np", "nr", "nu", "nz", "om", "org", "pa", "pe", "pf", "pg",
   75            "ph", "pk", "pl", "pm", "pn", "pr", "pt", "pw", "py", "qa",
   76            "re", "ro", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg",
   77            "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "st",
   78            "su", "sv", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj",
   79            "tk", "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz",
   80            "ua", "ug", "uk", "um", "us", "uy", "uz", "va", "vc", "ve",
   81            "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "yu", "za",
   82            "zm", "zr", "zw");
   83 my $no_tries = 100;    # See random_entry() and random_string()
   84 my $no_tries2 = 2000;
   85 my $width = 35;        # See writeout
   86 my $firstline;
   87 
   88 my @data;
   89 # @data has the folloing components:
   90 # 0 = host; 1 = user; 2 = date/time; 3 = HTTP method; 4 = filename;
   91 # 5 = HTTP version / W3SVC string; 6 = HTTP status code; 7 = bytes sent;
   92 # 8 = referrer; 9 = browser; 10 = virtual hostname; 11 = processing time;
   93 # 12 = bytes received, 13 = IIS status; 14 = search args;
   94 # 15 = time separate from date.
   95 my (%hosttree, %filetree, %reftree, %usertree, %vhosttree);
   96 my (@servernames, @unchfiles, @dict);
   97 
   98 # Now all log formats
   99 
  100 my ($format, @tokens, $outstr);
  101 
  102 my $commonfmt = <<'HERE';
  103     (\S*)\            # host
  104     \S+\              # (unused)
  105     (\S+)\            # user
  106     \[([^\]]+)\]\     # date and time
  107     \"\s*([A-Za-z]+)\s+(.+?)(?:\s+(HTTP\/\d.\d))?\s*\"\   # request line
  108     (\d{3})\          # status code
  109     (\d+|-)           # bytes
  110 HERE
  111 my @commontokens = (0..7);
  112 my $commonout = "%s - %s [%s] \"%s %s %s\" %s %s";
  113 
  114 my $combfmt = <<'HERE';
  115     (\S*)\            # host
  116     \S+\              # (unused)
  117     (\S+)\            # user
  118     \[([^\]]+)\]\     # date and time
  119     \"\s*([A-Za-z]+)\s+(.+?)(?:\s+(HTTP\/\d.\d))?\s*\"\   # request line
  120     (\d{3})\          # status code
  121     (\d+|-)\          # bytes
  122     \"(.*)\"\         # referrer
  123     \"([^\"]*)\"      # browser
  124 HERE
  125 my @combtokens = (0..9);
  126 my $combout = "%s - %s [%s] \"%s %s %s\" %s %s \"%s\" \"%s\"";
  127 
  128 my $iisfmt = <<'HERE';
  129     ([^,]*),\             # host
  130     ([^,]*),\             # user
  131     ([^,]*,\ [^,]*),\     # date and time
  132     (W3SVC[^,]*),\        # W3SVC line
  133     ([^,]*),\             # server name
  134     [^,]*,\               # server address
  135     (\d+|-),\             # processing time
  136     (\d+|-),\             # bytes received
  137     (\d+|-),\             # bytes sent
  138     (\d{3}|-),\           # HTTP status code
  139     ([^,]*),\             # IIS status
  140     ([^,]*),\             # Operation
  141     ([^,]*),\             # Filename
  142     ([^,]*),\ ?           # Search args
  143 HERE
  144 my @iistokens = (0..2, 5, 10..12, 7, 6, 13, 3, 4, 14);
  145 my $iisout = "%s, %s, %s, %s, %s, -, %s, %s, %s, %s, %s, %s, %s, %s,";
  146 
  147 my $msext = 0;   # Whether extended format is genuine or Microsoft version
  148 
  149 # ======== End of global declarations, start of main program ======== #
  150 
  151 if ($conffile) {
  152     open(CONFFILE, $conffile) ||
  153     die "$progname: Cannot open configuration file $conffile: $!\n";
  154     parse_config();
  155 }
  156 
  157 open(LOGFILE, $logfile) || die "$progname: Cannot open logfile $logfile: $!\n";
  158 if ($newlog eq '') { $newlog = '-'; }
  159 open(NEWLOG, ">$newlog") ||
  160     die "$progname: Cannot write to new logfile $newlog: $!\n";
  161 
  162 @servernames = split(/,\s*/, $servernames);
  163 @unchfiles = split(/,\s*/, $unchfiles);
  164 
  165 if ($dictionary) {
  166     unless (open(DICT, $dictionary)) {
  167     warn "$progname: Cannot open dictionary $dictionary: $!\n";
  168     } else { construct_dict(); }
  169 }
  170 
  171 # == End of initialisation, now process logfile == #
  172 
  173 $firstline = 1;
  174 while (<LOGFILE>) {
  175     if ($firstline) {
  176     detect_format();
  177     $firstline = 0;
  178     }
  179     if ($logformat eq 'extended' && /^\#/) {
  180     # special case: extended format, line beginning with #
  181     if (/^\#Fields:\s/) { parse_extfmt(); }
  182     print NEWLOG;
  183     }
  184     else {
  185     @data[@tokens] = /^$format$/x;
  186     unless (defined($data[$tokens[0]])) {
  187         print STDERR "$progname: Unparseable line: ";
  188         print STDERR;
  189     }
  190     else {
  191         $data[0] = anon_host($data[0]);
  192         $data[4] = anon_file($data[4]);
  193         $data[8] = anon_referrer($data[8]);
  194         $data[1] = anon_user($data[1]);
  195         $data[10] = anon_vhost($data[10]);
  196         if ($data[14] ne '' && $data[14] ne '-') { $data[14] = 'args'; }
  197         printf NEWLOG "$outstr\n", @data[@tokens];
  198     }
  199     }
  200 }
  201 
  202 # == Finished processing logfile, finally output translations == #
  203 
  204 if ($translations) {
  205     unless (open(TRANS, ">$translations")) {
  206     warn "$progname: Cannot write to translation file $translations: $!\n";
  207     } else {
  208     if (%filetree) {
  209         print TRANS "** FILES **\n\n";
  210         writeout(\%filetree, 1, '/', 1, 1);
  211     }
  212     if (%hosttree) {
  213         print TRANS "\n** HOSTS **\n\n";
  214         writeout(\%hosttree, 2, '.', 0, 0);
  215     }
  216     if (%reftree) {
  217         print TRANS "\n** REFERRERS **\n\n";
  218         writeout(\%reftree, 1, '/', 0, 0);
  219     }
  220     if (%usertree) {
  221         print TRANS "\n** USERS **\n\n";
  222         writeout(\%usertree, 1, '', 0, 0);
  223     }
  224     if (%vhosttree) {
  225         print TRANS "\n** VIRTUAL HOSTS **\n\n";
  226         writeout(\%vhosttree, 1, '', 0, 0);
  227     }
  228     }
  229 }
  230 
  231 # ======== End of main program. Rest is subroutines. ======== #
  232 
  233 # Parse the configuration file.
  234 sub parse_config {
  235     my ($name, $value);
  236 
  237     while (<CONFFILE>) {
  238     chomp;
  239     s/\#.*$//;    # Remove comments
  240     if (/\S/) {   # If any non-space character left on line
  241         ($name, $value) = /^\s*(.*?)\s*=\s*(.*?)\s*$/;
  242         $name =~ tr/A-Z/a-z/;
  243         if (!defined($name)) {
  244         warn "$progname: Can't parse configuration line: $_\n";
  245         }
  246         elsif ($name eq 'logfile' && $value ne '') { $logfile = $value }
  247         elsif ($name eq 'logformat') { $logformat = $value }
  248         elsif ($name eq 'newlog') { $newlog = $value }
  249         elsif ($name eq 'dictionary') { $dictionary = $value }
  250         elsif ($name eq 'translations') { $translations = $value }
  251         elsif ($name eq 'servernames') { $servernames = $value }
  252         elsif ($name eq 'unchfiles') { $unchfiles = $value }
  253         elsif ($name eq 'matchlength' &&
  254            ($value eq '0' || $value == '1')) {
  255         $matchlength = $value;
  256         }
  257         elsif ($name eq 'case_sensitive' &&
  258            ($value eq '0' || $value == '1')) {
  259         $case_sensitive = $value;
  260         }
  261         elsif ($name eq 'usercase_sensitive' &&
  262            ($value eq '0' || $value == '1')) {
  263         $usercase_sensitive = $value;
  264         }
  265         else {
  266         warn "$progname: Can't understand configuration line: $_\n";
  267         }
  268     }
  269     }
  270 }
  271 
  272 # Construct the dictionary.
  273 sub construct_dict {
  274     local $_;
  275     my ($w, $d, $i, $tmp, @words, @ignore, %h);
  276 
  277     while (<DICT>) { $w .= $_ }
  278     @words = split(/\s+/, $w);
  279     @ignore = map(/^([^.]*)/, @unchfiles);
  280     # @ignore contains the 'index' in index.html (or index.html.gz or index)
  281     # We delete them from the dictionary below (could instead be careful in
  282     # lookup_or_create_filename, but this is easier and faster).
  283     foreach (@ignore) { tr/A-Z/a-z/; }
  284     # Put words of length l into the array at $dict[l].
  285     foreach (@words) {
  286     tr/A-Z/a-z/;
  287     $tmp = $_;
  288     $i = $matchlength?length():0;
  289     push(@{$dict[$i]}, $_)  # Take only words, and not in @ignore
  290         unless (/[^a-z]/ || grep($tmp eq $_, @ignore));
  291     }
  292     foreach $d (@dict) { if ($d == undef) { @$d = (); }}
  293 }
  294 
  295 # Detect logfile format from first line. (NB Line is already in (global) $_ ).
  296 sub detect_format {
  297     my $i;
  298 
  299     unless ($logformat) {
  300     if ((split /,\s*/) == 15) { $logformat = 'iis'; }
  301     elsif (($i = index($_, '[')) >= 6 && substr($_, $i + 27, 1) eq ']' &&
  302            index($_, '"') == $i + 29) {
  303         if (($i = split(/\"/)) == 3) { $logformat = 'common'; }
  304         elsif ($i == 7) { $logformat = 'combined'; }
  305     }
  306     elsif (/^\#/) { $logformat = 'extended'; }
  307     unless ($logformat) { die "$progname: Can't detect format of logfile $logfile from first line: specify it in $conffile\n"; }
  308     }
  309 
  310     $logformat =~ tr/A-Z/a-z/;
  311     if ($logformat eq 'common') {
  312     print STDERR "$progname: Reading $logfile in common format\n";
  313     $format = $commonfmt;
  314     @tokens = @commontokens;
  315     $outstr = $commonout;
  316     }
  317     elsif ($logformat eq 'combined') {
  318     print STDERR "$progname: Reading $logfile in combined format\n";
  319     $format = $combfmt;
  320     @tokens = @combtokens;
  321     $outstr = $combout;
  322     }
  323     elsif ($logformat eq 'iis') {
  324     print STDERR "$progname: Reading $logfile in IIS format\n";
  325     $format = $iisfmt;
  326     @tokens = @iistokens;
  327     $outstr = $iisout;
  328     }
  329     elsif ($logformat eq 'extended' || $logformat eq 'ms-extended') {
  330     # In this case, have to construct the log format from the #Fields line
  331     if ($logformat eq 'ms-extended') { $msext = 1; }
  332     $logformat = 'extended';
  333     while (/^\#/) {   # process all # lines before handing back
  334         if (/^\#Software: Microsoft Internet Information Serv/) {
  335         $msext = 1;
  336         }
  337         elsif (/^\#Fields:\s/) { parse_extfmt(); }
  338         print NEWLOG;
  339         $_ = <LOGFILE>;
  340     }
  341     print NEWLOG "#Remark: Logfile anonymized by anonlog $version, $progurl\n";
  342     if ($msext) { print STDERR "$progname: Reading $logfile in Microsoft extended format\n"; }
  343     else { print STDERR "$progname: Reading $logfile in W3C extended format\n"; }
  344     }
  345     else { die "$progname: Don't know what you mean by 'logformat = $logformat' in $conffile\n"; }
  346 }
  347 
  348 # Parse the #Fields: line from an extended format logfile. The #Fields: line
  349 # is already in (global) $_ .
  350 sub parse_extfmt { 
  351     my ($i, $first);
  352 
  353     $format = '';
  354     @tokens = ();
  355     $outstr = '';
  356     $first = 1;
  357 
  358     foreach $i (split(' ', substr($_, 9))) {  # substr skips "#Fields: " itself
  359     if ($first) { $first = 0; }
  360     else { $format .= '\s+'; $outstr .= "\t"; }
  361     $i =~ tr/A-Z/a-z/;
  362     if ($i eq 'date') {
  363         $format .= '(\d{4}-\d{2}-\d{2})';
  364         push(@tokens, 2);
  365         $outstr .= '%s';
  366     }
  367     elsif ($i eq 'time') {
  368         $format .= '(\d{2}:\d{2}(?::\d{2}(?:\.\d*)?)?)';
  369         push(@tokens, 15);
  370         $outstr .= '%s';
  371     }
  372     elsif ($i eq 'bytes' || $i eq 'sc-bytes') {
  373         $format .= '(\d+|-)';
  374         push(@tokens, 7);
  375         $outstr .= '%d';
  376     }
  377     elsif ($i eq 'cs-bytes') {
  378         $format .= '(\d+|-)';
  379         push(@tokens, 12);
  380         $outstr .= '%d';
  381     }
  382     elsif ($i eq 'sc-status') {
  383         $format .= '(\d{3})';
  384         push(@tokens, 6);
  385         $outstr .= '%d';
  386     }
  387     elsif ($i eq 'c-dns' || $i eq 'cs-dns' ||
  388            $i eq 'c-ip' || $i eq 'cs-ip') {
  389         $format .= '(\S+)';
  390         push(@tokens, 0);
  391         $outstr .= '%s';
  392     }
  393     elsif ($i eq 'cs-uri' || $i eq 'cs-uri-stem') {
  394         $format .= '(\S+)';
  395         push(@tokens, 4);
  396         $outstr .= '%s';
  397     }
  398     elsif ($i eq 'cs(referer)') {
  399         if ($msext) { $format .= '(\S+)'; $outstr .= '%s'; }
  400         else { $format .= '\"(.*?)\"'; $outstr .= '"%s"'; }
  401         push(@tokens, 8);
  402     }
  403     elsif ($i eq 'cs(user-agent)') {
  404         if ($msext) { $format .= '(\S+)'; $outstr .= '%s'; }
  405         else { $format .= '\"(.*?)\"'; $outstr .= '"%s"'; }
  406         push(@tokens, 9);
  407     }
  408     elsif ($i eq 'cs-host' || $i eq 's-ip' || $i eq 's-dns' ||
  409            $i eq 'cs-sip' || $i eq 's-sitename' ||
  410            $i eq 's-computername') {
  411         $format .= '(\S+)';
  412         push(@tokens, 10);
  413         $outstr .= '%s';
  414     }
  415     elsif ($i eq 'cs(host)') {
  416         $format .= '\"(.*?)\"';
  417         push(@tokens, 10);
  418         $outstr .= '"%s"';
  419     }
  420     elsif ($i eq 'cs-uri-query') {
  421         $format .= '(\S+)';
  422         push(@tokens, 14);
  423         $outstr .= '%s';
  424     }
  425     elsif ($i eq 'cs-username') {
  426         $format .= '(\S+)';
  427         push(@tokens, 1);
  428         $outstr .= '%s';
  429     }
  430     elsif ($i eq 'cs(from)') {
  431         $format .= '\"(.*?)\"';
  432         push(@tokens, 1);
  433         $outstr .= '"%s"';
  434     }
  435     elsif ($i eq 'time-taken') {
  436         $format .= '([\d\.]+|-)';
  437         push(@tokens, 11);
  438         $outstr .= '%s';
  439     }
  440     elsif ($i eq 'cs-method') {
  441         $format .= '([A-Za-z]+)';
  442         push(@tokens, 3);
  443         $outstr .= '%s';
  444     }
  445     else {  # unknown token
  446         $format .= '\S+';
  447         $outstr .= '-';
  448     }
  449     }
  450 }
  451 
  452 # The anonymizing functions
  453 #
  454 # The translations are looked up in a tree. Each node of the tree is a hash
  455 # as follows:
  456 # Keys: The part of the name being translated
  457 # Values: A 2-element \array (translation, \hash of the same type recursively)
  458 #
  459 # All these functions work the same way.
  460 # They are called with one argument from the main part of the
  461 # program, a second argument (\sub-hash) when called recursively.
  462 # The name is split into two components, $b to be translated immediately
  463 # and $a, the rest. $b's (translation, \subhash) is assigned to @b.
  464 sub anon_host {
  465     local $_ = $_[0];
  466     my @b;
  467     my $numhost = 0;
  468     if (!defined($_[1])) { tr/A-Z/a-z/; }
  469     if (/\.$/) { $_ = substr($_, 0, length($_) - 1); } # strip trailing dot
  470     my ($a, $b) = /^(.*)\.(.*)$/;
  471     if (!defined($b)) { $a = ''; $b = $_; }  # if no dot in (sub-)name
  472 
  473     if (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}); }
  474     else {
  475     if ($_ eq '' || $_ eq '-') { return '-'; }
  476     my($n1, $n2, $n3, $n4) =
  477         /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;
  478     if (defined($n1) && $n1 <= 255 && $n2 <= 255 && $n3 <= 255 &&
  479         $n4 <= 255) { return(anon_numhost($_, \%hosttree)); }
  480     if (!/\./ && !grep($b eq $_, @domains)) {  # no dot in whole name
  481         @b = lookup_or_create($b, \%hosttree);
  482     } else { @b = lookup_or_create($b, \%hosttree, \@domains); }
  483     }
  484 
  485     if ($a eq '') { return($b[0]); }
  486     else { return(anon_host($a, $b[1]) . '.' . $b[0]); }
  487 }
  488 
  489 sub anon_numhost {     # Numerical hostnames
  490     local $_ = $_[0];
  491     my @b;
  492     my ($b, $a) = /^(.*?)\.(.*)$/;
  493     if (!defined($b)) { $a = ''; $b = $_; }
  494     my @newnumber = (rand255($_[1]));
  495     @b = lookup_or_create($b, $_[1], \@newnumber);
  496     if ($a eq '') { return($b[0]); }
  497     else { return($b[0] . '.' . anon_numhost($a, $b[1])); }
  498 }
  499 
  500 # anon_file also takes an optional argument number 2, overriding global
  501 # $case_sensitive. If it is present, initial stripping of anchors and search
  502 # arguments is also performed (because it is also used only on the first call
  503 # for that data, in this case a referrer).
  504 sub anon_file {
  505     local $_ = $_[0];
  506     my $args = '';
  507     my $case = $case_sensitive;
  508     my (@b, @tmp, $ans);
  509     if (!defined($_[1]) || defined($_[2])) {
  510     s/\#.*//;  # remove anchors
  511     if (s/\?.*//) { $args = '?args'; }
  512     s/%([\da-fA-F]{2})/pack("C", hex($1))/ge;  # change %7E to ~ etc.
  513     if (defined($_[2])) { $case = $_[2]; }
  514     if (!$case) { tr/A-Z/a-z/; }
  515     }
  516 
  517     my ($b, $a) = m[^/(.*?)(/.*)$];
  518     if (!defined($b)) {  # not two slashes in name
  519     if (!m[^/]) { return '-'; }
  520     # top level should always begin with slash, and lower levels forced to
  521     $b = substr($_, 1);
  522     if ($b eq '') { return "/$args"; }
  523     if (grep($b eq $_, @unchfiles)) { return "/$b$args"; }
  524     if (defined($_[1])) { @b = lookup_or_create_filename($b, \%{$_[1]}); }
  525     else { @b = lookup_or_create_filename($b, \%filetree); }
  526     return("/$b[0]$args");
  527     }
  528     # rest only reached if there were two slashes in name
  529     if ($b eq '.' || $b eq '..') {  # special case: leave these alone
  530     @tmp = ($b);
  531     if (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}, \@tmp); }
  532     else { @b = lookup_or_create($b, \%filetree, \@tmp); }
  533     }
  534     elsif (defined($_[1])) { @b = lookup_or_create($b, \%{$_[1]}); }
  535     else { @b = lookup_or_create($b, \%filetree); }
  536     return("/$b[0]" . anon_file($a, $b[1]) . $args);
  537 }
  538 
  539 # Referrers are a bit different because they're split into 3 parts:
  540 # a scheme, a hostname and a filename. We only allow through http: and
  541 # ftp: URLs. If the hostname is in the list of known $servernames, we preserve
  542 # the hostname and use the existing local translations for the filename part.
  543 # Otherwise we translate the hostname according to the existing hosttree, and
  544 # use this as the root for the referrer tree.
  545 sub anon_referrer {
  546     local $_ = $_[0];
  547     my @b;
  548     my ($scheme, $hostname, $port, $path) = m[^(.*?)://(.*?)(:.*?)?(/.*)$];
  549 
  550     if (!defined($scheme) || $scheme !~ /^(ht|f)tp/i) { return "-"; }
  551     $scheme =~ tr/A-Z/a-z/;
  552     if ($hostname =~ /\.$/) {  # strip trailing dot
  553     $hostname = substr($hostname, 0, length($hostname) - 1);
  554     }
  555     if (grep($_ eq $hostname, @servernames)) {
  556     return($scheme . '://' . $hostname . anon_file($path));
  557     }
  558     else {
  559     my @newhost = (anon_host($hostname));
  560     @b = lookup_or_create($hostname, \%reftree, \@newhost);
  561     return("$scheme://$b[0]$port" . anon_file($path, $b[1], 1));
  562     }
  563 }
  564 
  565 sub anon_user {   # users and virtual hosts aren't hierarchical
  566     local $_ = $_[0];
  567     my @b;
  568 
  569     if ($_ eq '' || $_ eq '-') { return '-'; }
  570     if (!$usercase_sensitive) { tr/A-Z/a-z/; }
  571     @b = lookup_or_create($_, \%usertree);
  572     return($b[0]);
  573 }
  574 
  575 sub anon_vhost {
  576     local $_ = $_[0];
  577     my @b;
  578 
  579     if ($_ eq '' || $_ eq '-') { return '-'; }
  580     tr/A-Z/a-z/;
  581     @b = lookup_or_create($_, \%vhosttree);
  582     return($b[0]);
  583 }
  584 
  585 # Look up an item (arg 0) in a tree node (arg 1), or create a new entry if
  586 # necessary. The entry is selected from array arg 2 if present, else from
  587 # dictionary entry of correct length, else a random string of correct length.
  588 # See also lookup_or_create_filename below.
  589 sub lookup_or_create {
  590     my $n = $_[0];
  591     my (@ans, %h);
  592 
  593     unless (defined(${$_[1]}{$n})) {
  594     if (defined($_[2])) { $ans[0] = random_entry($_[2], $_[1]); }
  595     else {
  596         $ans[0] = random_entry($dict[$matchlength?length($n):0], $_[1]);
  597     }
  598     unless (defined($ans[0])) {
  599         $ans[0] = random_string(length($n), $_[1]);
  600     }
  601     # Start hash table so it isn't undef later
  602     $h{''} = undef;
  603     $ans[1] = \%h;
  604     ${$_[1]}{$n} = \@ans;
  605     }
  606     return(@{${$_[1]}{$n}});
  607 }
  608 
  609 # The same as lookup_or_create above, but preserves the extension of filenames.
  610 # (Actually, this is never called with 3 args, but we leave it in for possible
  611 # future use, and to keep it parallel with the previous function).
  612 sub lookup_or_create_filename {
  613     local $_ = $_[0];
  614     my (@ans, %h);
  615 
  616     unless (defined(${$_[1]}{$_})) {
  617         my ($name, $ext) = m[^(.*)(\..*)$];
  618     if (!defined($name)) { $ext = ''; $name = $_; }  # no extension
  619     if (defined($_[2])) { $ans[0] = random_entry($_[2], $_[1], $ext); }
  620     else {
  621         $ans[0] = random_entry($dict[$matchlength?length($name):0], $_[1],
  622                    $ext);
  623     }
  624     unless (defined($ans[0])) {
  625         $ans[0] = random_string(length($name), $_[1], $ext);
  626     }
  627     $h{''} = undef;
  628     $ans[1] = \%h;
  629     ${$_[1]}{$_} = \@ans;
  630     }
  631     return(@{${$_[1]}{$_}});
  632 }
  633 
  634 # Select a random entry from array arg 0, but must not occur as value in hash
  635 # arg 1. If failed after $no_tries, give up and return undef.
  636 # If arg 2 exists, then the random entry is "arg0_element . arg2" instead.
  637 sub random_entry {
  638     if ($_[0] == undef) { return undef; }
  639     my @l = @{$_[0]};
  640     my @v = values(%{$_[1]});
  641     my ($ans, $k);
  642 
  643     if (@l == ()) { return undef; }
  644     for ($k = 0;
  645      (!defined($ans) || grep {$ans eq ${$_}[0]} @v) && $k < $no_tries;
  646      $k++) { $ans = $l[rand($#l + 0.9999999)] . $_[2]; }
  647     if ($k < $no_tries) { return($ans); }
  648     else { return undef; }
  649 }
  650 
  651 # Create random string, length given by arg 0 (unless global $matchlength is
  652 # false), again must not occur as value in hash arg 1. Same arg2 as in
  653 # random_entry. This time if failed after $no_tries2, return any answer.
  654 sub random_string {
  655     my $l = $_[0];
  656     my @v = values(%{$_[1]});
  657     my ($ans, $i, $j, $k);
  658 
  659     if ($l == 0) { return(''); }
  660     unless ($matchlength) { $l = int(5 + rand(6)); }  # i.e. 5 to 10
  661     for ($k = 0;
  662      (!defined($ans) || grep {$ans eq ${$_}[0]} @v) && $k < $no_tries2;
  663      $k++) {
  664     $ans = '';
  665     for ($i = 0; $i < $l; $i++) {
  666         $j = 65 + rand(52);
  667         if ($j >= 91) { $j += 6; }
  668         $ans .= chr($j);
  669     }
  670     $ans .= $_[2];
  671     }
  672     return($ans);
  673 }
  674 
  675 # Select a random number from 0 to 255, but again not already occurring as
  676 # value in hash (arg 0). This should never fail in the context of this
  677 # program, but we use $no_tries2 again just in case.
  678 sub rand255 {
  679     my @v = values(%{$_[0]});
  680     my ($ans, $k);
  681 
  682     for ($k = 0;
  683      (!defined($ans) || grep {$ans == ${$_}[0]} @v) && $k < $no_tries2;
  684      $k++) { $ans = int(rand(255.9999999)); }
  685     return($ans);
  686 }
  687 
  688 # Write out the translations to file TRANS.
  689 # The 0th argument is the \hash to be interpreted;
  690 # The 1st argument is 0 if the name-parts are collated backwards, 1 if
  691 # forwards, 2 if they are hostnames (backwards unless numerical);
  692 # The 2nd argument is the delimiter between name-parts, or empty string if they
  693 # are not hierarchical.
  694 # The 3rd argument says whether the delimiter should also occur at the start of
  695 # the string.
  696 # The 4th argument says whether the entry should still be printed if it is
  697 # not (known to be) a leaf.
  698 # The 5th and 6th arguments, if present, are the up-tree name-parts for the
  699 # original and translated names respectively.
  700 sub writeout {
  701     my %hash = %{$_[0]};
  702     my ($colorder, $delim, $initial, $printall) = ($_[1], $_[2], $_[3], $_[4]);
  703     my ($partname_old, $partname_new) = ($_[5], $_[6]);
  704     my ($name, @value, $name_new, $name_old, $newcolord, $fieldwidth);
  705 
  706     foreach $name (sort {  # Declare sort order inline so we can use %hash
  707     # The first case is binned immediately below, but must catch here too
  708     (!defined($hash{$a}) || !defined($hash{$b}))?(lc($a) cmp lc($b)):
  709         (((${$hash{$a}}[0] + 0) <=> (${$hash{$b}}[0] + 0)) ||
  710     # If they start with (or are) numbers, sort them that way
  711          (lc(${$hash{$a}}[0]) cmp lc(${$hash{$b}}[0])) || # Usual ordering
  712          (($a + 0) <=> ($b + 0)) ||    # Fallback to untranslated names: but by
  713          (lc($a) cmp lc($b)))  # construction this should (all but) never occur
  714     } keys(%hash)) {       # End of sort order. Phew.
  715     if (defined($hash{$name})) {
  716         @value = @{$hash{$name}};
  717         if ($colorder == 2) { $newcolord = (($name =~ /^\d{1,3}$/)?1:0); }
  718         else { $newcolord = $colorder; }
  719         # NB Incomplete test for numerical hostname (unlike in anon_host)
  720         if ($newcolord == 1) {
  721         if (!$partname_old && !$initial) {
  722             $name_new = "$value[0]";
  723             $name_old = "$name";
  724         } else {
  725             $name_new = "$partname_new$delim$value[0]";
  726             $name_old = "$partname_old$delim$name";
  727         }
  728         $fieldwidth = -$width;
  729         }
  730         else {
  731         if (!$partname_old && !$initial) {
  732             $name_new = "$value[0]";
  733             $name_old = "$name";
  734         } else {
  735             $name_new = "$value[0]$delim$partname_new";
  736             $name_old = "$name$delim$partname_old";
  737         }
  738         $fieldwidth = $width;
  739         }
  740             printf TRANS ("%*s  =  %*s\n",
  741                           $fieldwidth, $name_new, $fieldwidth, $name_old)
  742                 if ($printall || keys(%{$value[1]}) <= 1);
  743         writeout($value[1], $newcolord, $delim, $initial, $printall,
  744              $name_old, $name_new);
  745     }
  746     }
  747 }