"Fossies" - the Fresh Open Source Software Archive

Member "littleutils-1.2.5/repeats/repeats.pl.in" (29 Oct 2021, 7826 Bytes) of package /linux/privat/littleutils-1.2.5.tar.lz:


As a special service, "Fossies" has tried to format the requested text file into HTML format (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "repeats.pl.in": 1.2.4_vs_1.2.5.

    1 #! PROGPERL
    2 # vim: set filetype=perl:
    3 
    4 # repeats.pl: Searches for duplicate files in the specified directories (just
    5 # like repeats)
    6 
    7 # Copyright (C) 2020 by Brian Lindholm.  This file is part of the littleutils
    8 # utility set.
    9 #
   10 # The repeats.pl utility is free software; you can redistribute it and/or
   11 # modify it under the terms of the GNU General Public License as published by
   12 # the Free Software Foundation; either version 3, or (at your option) any later
   13 # version.
   14 #
   15 # The repeats.pl utility is distributed in the hope that it will be useful, but
   16 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   17 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   18 # more details.
   19 #
   20 # You should have received a copy of the GNU General Public License along with
   21 # the littleutils.  If not, see <https://www.gnu.org/licenses/>.
   22 
   23 ### MODULES ###
   24 use strict;
   25 use warnings;
   26 use Getopt::Std;
   27 use Crypt::Digest::MD5;  # all digests require the CryptX module/package
   28 use Crypt::Digest::SHA1;
   29 use Crypt::Digest::SHA224;
   30 use Crypt::Digest::SHA256;
   31 use Crypt::Digest::SHA384;
   32 use Crypt::Digest::SHA512;
   33 use Crypt::Digest::BLAKE2b_256;
   34 use Crypt::Digest::BLAKE2b_512;
   35 
   ### INPUT ARGUMENTS ###
   # Option defaults.  getopts() stores every recognised switch in the package
   # variable $opt_X, which is why these are declared with "our" instead of "my".
   our $opt_1 = '';    # -1: put each set of matching files on a single output line
   our $opt_a = 8;     # -a: hash algorithm number (see the table in the help text)
   our $opt_h = '';    # -h: print the help text and quit
   our $opt_l = '';    # -l: keep hard links in the candidate list
   our $opt_m = 4096;  # -m: number of bytes hashed in the first round
   our $opt_r = 4;     # -r: factor applied to the byte count after every round
   our $opt_v = '';    # -v: report progress on STDERR
   our $opt_z = '';    # -z: also report zero-length files as duplicates
   my $goodopt = getopts('1a:hlm:r:vz');
   # print help if requested or if bad options used, then quit
   if ((not $goodopt) or $opt_h) {
     print "repeats.pl LU_VERSION\n";
     print "usage: repeats.pl [-1(line)] [-a hash_algorithm] [-h(elp)] [-l(inks_hard)]\n";
     print "         [-m(idsize) bytecount] [-r ramp_factor] [-v(erbose)] [-z(eros)]\n";
     print "         [directory ...]\n";
     # the "(default)" marker must follow the initial value of $opt_a above
     print "algorithms:  1 = MD5, 2 = SHA1, 3 = SHA224, 4 = SHA256, 5 = SHA384,\n";
     print "             6 = SHA512, 7 = BLAKE2B-256, 8 = BLAKE2B-512 (default)\n";
     exit(0);
   }
   # fall back to the defaults when a byte count or ramp factor below 1 is given
   $opt_m = 4096 if ($opt_m < 1);
   $opt_r = 4 if ($opt_r < 1);
   52 
   ### GLOBAL VARIABLES ###
   # largest single read() request used while hashing, in bytes
   my $BUFSIZE = 1024 * 1024;
   # running total of files added to the results buffer (shown by -v)
   my $match_count = 0;
   # per-file data keyed by path: device number, saved hash state, inode number, size
   my %filedev = ();
   my %filehash = ();
   my %filenode = ();
   my %filesize = ();
   # counters: digest keys seen in the current hashing round, and file sizes seen
   my %digest_seen = ();
   my %size_seen = ();
   # output lines, printed in sorted order at the end of the run
   my @results = ();
   58 
   59 ### LIST FUNCTIONS ###
   # Return the arguments in their original order, keeping only the first
   # occurrence of each value (values are compared as strings).
   sub uniq {
     my %already = ();
     my @kept = ();
     foreach my $item (@_) {
       next if ($already{$item}++);
       push(@kept, $item);
     }
     return @kept;
   }
   # list with subsequent hardlinks (matching inode number & dev number) removed
   #
   # Arguments: file names that have entries in %filedev and %filenode.
   # Returns the names in their original order, keeping only the first name seen
   # for each (device, inode) pair.
   sub uniq_inode {
     my %seen = ();
     # Build the key from the full decimal values.  pack('L2', ...) would keep only
     # the low 32 bits of each number, so files on a filesystem with 64-bit inode
     # numbers could be mistaken for hard links of each other and dropped.
     return grep { ! $seen{$filedev{$_} . ':' . $filenode{$_}}++ } @_;
   }
   # Append one group of matching file names to @results and add the group size
   # to $match_count.  With -1 the whole group becomes a single tab-separated
   # line; otherwise one "left<TAB>right" line is stored per neighbouring pair.
   sub push_to_results {
     my $group_size = scalar(@_);
     if (not $opt_1) {
       # pair every name with its predecessor in the argument list
       push(@results, map { $_[$_ - 1] . "\t" . $_[$_] } (1 .. $#_));
     }
     else {
       push(@results, join("\t", @_));
     }
     $match_count += $group_size;
   }
   82 
   ### FIND FUNCTION ###
   # find files (excluding symlinks) and fetch stats
   #
   # Arguments: list of directory names; each one is scanned recursively.
   # Side effects: for every regular file found, stores its device number, inode
   # number and size in %filedev, %filenode and %filesize (keyed by path) and
   # counts the size in %size_seen.
   # Dies if a directory cannot be opened.
   sub find_files {
     foreach my $arg (@_) {
       # Strip trailing slashes from a copy: the loop variable aliases the caller's
       # list elements, which must not be modified.
       (my $dir = $arg) =~ s/\/+$//;
       # a name made only of slashes is the root directory, not the empty string
       $dir = '/' if (($dir eq '') and ($arg ne ''));
       my $prefix = ($dir eq '/') ? '/' : $dir . '/';
       opendir(my $DIR, $dir) or die "repeats.pl error: cannot open $dir\n";
       my @subdirs = ();
       while (defined(my $entry = readdir($DIR))) {
         next if (($entry eq '.') || ($entry eq '..'));
         # with no paths on the command line, names are reported without a "./" prefix
         my $fullname = (($#ARGV == -1) && ($dir eq '.')) ? $entry : $prefix . $entry;
         # lstat (not stat), so a symlink is seen as a link and matches neither test below;
         # only the device, inode and size fields are needed
         my ($dev, $ino, $size) = (lstat($fullname))[0, 1, 7];
         if ( -f _ ) {
           $filedev{$fullname} = $dev;
           $filenode{$fullname} = $ino;
           $filesize{$fullname} = $size;
           $size_seen{$size}++;
         }
         elsif ( -d _ ) {
           push(@subdirs, $fullname);
         }
       }
       # directory handles are released with closedir(); close() does not apply to them
       closedir($DIR);
       find_files(@subdirs) if ($#subdirs > -1);
     }
   }
  108 
  ### DIGEST FUNCTION ###
  # grab partial file hash, skipping already-read bytes
  #
  # Arguments:
  #   $file          - path of the file to hash; must have an entry in %filesize
  #   $bytes_read    - number of leading bytes already hashed by earlier calls
  #   $bytes_to_read - maximum number of further bytes to hash in this call
  # Returns the raw digest of everything hashed so far, with the packed file size
  # appended.
  # Side effects: saves the running hash state in %filehash so a later call can
  # resume from it, and counts the returned key in %digest_seen -- upwards when
  # the whole file has been hashed, downwards when only a prefix has been.  If the
  # file ends before its recorded size, %filesize is corrected to the real size.
  # Dies if the file cannot be opened, positioned or read, or if the -a value is
  # not one of 1..8.
  sub grab_digest {
    my ($file, $bytes_read, $bytes_to_read) = @_;
    my $tmp;
    open(my $FILE, "<:raw", $file) or die "repeats.pl error: unable to open $file\n";
    if ($bytes_read > 0) {
      # resume: skip what was hashed before and continue from the saved state
      seek($FILE, $bytes_read, 0) or die "repeats.pl error: unable to seek in $file\n";
      $tmp = $filehash{$file}->clone();  # clone required for BLAKE2b
    }
    else {
      my %class_for = (
        1 => 'Crypt::Digest::MD5',
        2 => 'Crypt::Digest::SHA1',
        3 => 'Crypt::Digest::SHA224',
        4 => 'Crypt::Digest::SHA256',
        5 => 'Crypt::Digest::SHA384',
        6 => 'Crypt::Digest::SHA512',
        7 => 'Crypt::Digest::BLAKE2b_256',
        8 => 'Crypt::Digest::BLAKE2b_512',
      );
      # "+ 0" makes the lookup numeric, so "08" or "8.0" still select algorithm 8
      my $class = $class_for{$opt_a + 0};
      die "repeats.pl error: unsupported algorithm selected\n" unless (defined($class));
      $tmp = $class->new;
    }
    while (1) {
      my $chunk = ($bytes_to_read > $BUFSIZE) ? $BUFSIZE : $bytes_to_read;
      my $rc = read($FILE, my $data, $chunk);
      die "repeats.pl error: unable to read data from $file\n" unless (defined($rc));
      if ($rc == 0) {
        # End of file before the recorded size was reached (the file shrank after
        # it was scanned, or the filesystem over-reports sizes).  Record the real
        # size so the file counts as completely hashed; retrying would never end.
        $filesize{$file} = $bytes_read if (($chunk > 0) and ($bytes_read < $filesize{$file}));
        last;
      }
      $tmp->add($data);
      $bytes_to_read -= $rc;
      $bytes_read += $rc;
      last unless (($bytes_read < $filesize{$file}) and ($bytes_to_read > 0));
    }
    close($FILE);
    $filehash{$file} = $tmp->clone();  # clone required for BLAKE2b
    # the size is part of the key, so only files of equal size can share one
    my $digest = $tmp->digest . pack('Q', $filesize{$file});
    # partial digests count downwards and complete digests upwards, which lets the
    # caller tell the two kinds apart by the sign of the counter
    ($bytes_read < $filesize{$file}) ? $digest_seen{$digest}-- : $digest_seen{$digest}++;
    return $digest;
  }
  138 
  ### BEGIN MAIN PROGRAM ###
  # Scan the directories named on the command line (duplicate names and names
  # that are not directories are dropped), or the current directory when no
  # names are given.  Every regular file found becomes a candidate.
  my @search_paths = ('.');
  if (scalar(@ARGV) > 0) {
    @search_paths = grep { -d $_ } uniq(@ARGV);
  }
  find_files(@search_paths);
  my @candidates = keys(%filesize);
  if ($opt_v) {
    printf STDERR "repeats.pl stage 0: total number of files = %d\n", scalar(@candidates);
  }
  145 
  ### Optional STAGE 1 ###
  # Set zero-length files aside (reporting them when -z is given) and, unless -l
  # is given, reduce every set of hard links to its first name.
  if ($#candidates > 0) {
    # all zero-length files are the same; sort them so that the reported lines do
    # not depend on hash ordering and are identical from run to run
    my @zeros = sort(grep { $filesize{$_} == 0 } @candidates);
    printf STDERR "repeats.pl stage 1: num files with zero length = %d\n", ($#zeros + 1) if ($opt_v);
    push_to_results(@zeros) if (($opt_z) and ($#zeros > 0));
    # remove zero-length files from the list and re-sort
    @candidates = sort(grep { $filesize{$_} > 0 } @candidates);
  }
  if ((not $opt_l) and ($#candidates > 0)) {
    # remove files that use the same inode number as a previous file, leaving only the first
    @candidates = uniq_inode(@candidates);
    printf STDERR "repeats.pl stage 1: num files excluding hardlinks = %d\n", ($#candidates + 1) if ($opt_v);
  }
  160 
  ### STAGE 2 ###
  # remove files with a unique filesize
  if ($#candidates > 0) {
    # Count sizes over the files that are still candidates.  The counts gathered
    # while scanning include names that have since been removed as hard links, so
    # a file could appear to have a same-size partner when the only "partner" was
    # another name for the same file, and would then be hashed for nothing.
    my %size_count = ();
    $size_count{$filesize{$_}}++ foreach (@candidates);
    @candidates = grep { $size_count{$filesize{$_}} > 1 } @candidates;
    printf STDERR "repeats.pl stage 2: num files with non-unique filesize = %d\n", ($#candidates + 1) if ($opt_v);
  }
  167 
  ### STAGE 3 ###
  # Hash the remaining candidates round by round.  Each round extends every
  # file's digest by up to $bytes_to_read bytes; files whose complete digest is
  # shared go to the results buffer, files whose partial digest is shared stay
  # for the next round, and all others are dropped.
  my $bytes_read = 0;
  my $bytes_to_read = int($opt_m);
  # process files in device/inode order
  @candidates = sort { $filedev{$a} <=> $filedev{$b} or $filenode{$a} <=> $filenode{$b} } @candidates;
  while (scalar(@candidates) > 1) {
    # grab digests for all candidates
    %digest_seen = ();
    my %digest = ();
    foreach my $file (@candidates) {
      $digest{$file} = grab_digest($file, $bytes_read, $bytes_to_read);
    }
    $bytes_read += $bytes_to_read;
    # collect the completely hashed files whose digest occurred at least twice,
    # grouped by that digest (complete digests have a positive counter)
    my %group_for = ();
    foreach my $file (@candidates) {
      my $key = $digest{$file};
      push(@{$group_for{$key}}, $file) if ($digest_seen{$key} > 1);
    }
    # write finished files to results buffer, one sorted group per digest
    foreach my $key (keys(%group_for)) {
      push_to_results(sort(@{$group_for{$key}}));
    }
    # reduce list to unfinished files: partial digests (negative counter) seen at least twice
    @candidates = grep { $digest_seen{$digest{$_}} < -1 } @candidates;
    if ($opt_v) {
      printf STDERR "repeats.pl stage 3: num files with matching digest after %d bytes = %d (%d remaining)\n",
        $bytes_read, $match_count, scalar(@candidates);
    }
    # read a larger chunk in the next round
    $bytes_to_read = int($bytes_to_read * $opt_r);
  }
  198 
  ### FINAL RESULTS ###
  # print the buffered result lines in sorted order, one per line
  for my $line (sort(@results)) {
    print $line, "\n";
  }