"Fossies" - the Fresh Open Source Software Archive

Member "littleutils-1.2.4/repeats/repeats.in" (28 Mar 2021, 5250 Bytes) of package /linux/privat/littleutils-1.2.4.tar.lz:


As a special service, "Fossies" has tried to format the requested text file into HTML format (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "repeats.in": 1.2.3_vs_1.2.4.

    1 #! PROGBASH
    2 # set vim: syntax=bash:
    3 
    4 # repeats: Searches for duplicate files in the specified directories
    5 
    6 # Copyright (C) 2004-2021 by Brian Lindholm.  This file is part of the
    7 # littleutils utility set.
    8 #
    9 # The repeats utility is free software; you can redistribute it and/or modify
   10 # it under the terms of the GNU General Public License as published by the Free
   11 # Software Foundation; either version 3, or (at your option) any later version.
   12 #
   13 # The repeats utility is distributed in the hope that it will be useful, but
   14 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   15 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   16 # more details.
   17 #
   18 # You should have received a copy of the GNU General Public License along with
   19 # the littleutils.  If not, see <https://www.gnu.org/licenses/>.
   20 
   21 # get command-line options
   22 declare -i ALGORITHM=8
   23 HARDLINKS='n'
   24 declare -i MIDSIZE=65536
   25 PARANOID='n'
   26 VERBOSE='n'
   27 ZEROS='n'
   28 while getopts a:hlm:pvz opts
   29 do
   30   case $opts in
   31     a) ALGORITHM=$OPTARG ;;
   32     h) echo 'repeats LU_VERSION'
   33        echo 'usage: repeats [-a hash_algorithm] [-h(elp)] [-l(inks_hard)]'
   34        echo '         [-m(idsize) bytecount] [-p(aranoid)] [-v(erbose)] [-z(eros)]'
   35        echo '         [directory ...]'
   36        echo 'algorithms:  1 = MD5, 2 = SHA1, 3 = SHA224, 4 = SHA256, 5 = SHA384,'
   37        echo '             6 = SHA512, 7 = BLAKE2B-256, 8 = BLAKE2B-512 (default)'
   38        exit 0 ;;
   39     l) HARDLINKS='y' ;;
   40     m) MIDSIZE=$OPTARG ;;
   41     p) PARANOID='y' ;;
   42     v) VERBOSE='y' ;;
   43     z) ZEROS='y' ;;
   44     *) echo 'repeats LU_VERSION'
   45        echo 'usage: repeats [-a hash_algorithm] [-h(elp)] [-l(inks_hard)]'
   46        echo '         [-m(idsize) bytecount] [-p(aranoid)] [-v(erbose)] [-z(eros)]'
   47        echo '         [directory ...]'
   48        echo 'algorithms:  1 = MD5, 2 = SHA1, 3 = SHA224, 4 = SHA256, 5 = SHA384,'
   49        echo '             6 = SHA512, 7 = BLAKE2B-256, 8 = BLAKE2B-512 (default)'
   50        exit 1 ;;
   51   esac
   52 done
   53 shift $((${OPTIND}-1))
   54 
   55 # set up traps
   56 trap 'rm -f $TMPFILE0 $TMPFILE1 $TMPFILE2 $TMPFILE3 $TMPFILE4 $TMPFILE5 ; exit 1' 1 2 3 13 15
   57 
   58 # generate the initial list of files
   59 TMPFILE0=$(tempname repeats_$$_0) || exit 99
   60 if [ $# -eq 0 ]; then
   61   find . -type f -readable -print | sed -e 's/^\.\///' > $TMPFILE0
   62 elif [ $# -eq 1 ]; then
   63   if [ -d "$1" -a -r "$1" -a -x "$1" ]; then
   64     find "$1" -type f -readable -print > $TMPFILE0
   65   else
   66     echo "repeats error: $1 is not a readable directory"
   67     rm -f $TMPFILE0
   68     exit 1
   69   fi
   70 else
   71   while [ $# -gt 0 ]; do
   72     if [ -d "$1" -a -r "$1" -a -x "$1" ]; then
   73       find "$1" -type f -readable -print >> $TMPFILE0
   74     else
   75       echo "repeats error: $1 is not a readable directory"
   76       rm -f $TMPFILE0
   77       exit 1
   78     fi
   79     shift
   80   done
   81 fi
   82 if [ "$VERBOSE" = 'y' ]; then
   83   echo "repeats stage 0: total number of files = $(wc -l $TMPFILE0)" | sed -e "s: ${TMPFILE0}::" 1>&2
   84 fi
   85 
   86 # grab filesizes and eliminated zero-length files if requested
   87 TMPFILE1=$(tempname repeats_$$_1) || exit 99
   88 if [ "$ZEROS" = 'n' ]; then
   89   sort -u $TMPFILE0 | filesize -p | grep -v '	0$' | sort -t '	' -k2n,2n | REPSIZE > $TMPFILE1
   90   if [ "$VERBOSE" = 'y' ]; then
   91     echo "repeats stage 1: num files with non-unique and non-zero filesize = $(wc -l $TMPFILE1)" | sed -e "s: ${TMPFILE1}::" 1>&2
   92   fi
   93 else
   94   sort -u $TMPFILE0 | filesize -p | sort -t '	' -k2n,2n | REPSIZE > $TMPFILE1
   95   if [ "$VERBOSE" = 'y' ]; then
   96     echo "repeats stage 1: num files with non-unique filesize = $(wc -l $TMPFILE1)" | sed -e "s: ${TMPFILE1}::" 1>&2
   97   fi
   98 fi
   99 
  100 # search for duplicates based on node numbers (eliminate hardlinks)
  101 TMPFILE2=$(tempname repeats_$$_2) || exit 99
  102 if [ "$HARDLINKS" = 'n' ]; then
  103   sort $TMPFILE1 | REPHARD -p | sort -t '	' -k4n,4n -k3n,3n -k2n,2n -k1,1 | REPNODE > $TMPFILE2
  104   if [ "$VERBOSE" = 'y' ]; then
  105     echo "repeats stage 2: num files excluding hardlinks = $(wc -l $TMPFILE2)" | sed -e "s: ${TMPFILE2}::" 1>&2
  106   fi
  107 else
  108   mv $TMPFILE1 $TMPFILE2
  109 fi
  110 
  111 # search for duplicates based on a partial filehash
  112 TMPFILE3=$(tempname repeats_$$_3) || exit 99
  113 sort $TMPFILE2 | filehash -v -s -$ALGORITHM -p -n $MIDSIZE | sort -t '	' -k2n,2n -k3,3 -k1,1 | REPHASH > $TMPFILE3
  114 if [ "$VERBOSE" = 'y' ]; then
  115   echo "repeats stage 3: num file pairs with matching digest after $MIDSIZE bytes = $(wc -l $TMPFILE3)" | sed -e "s: ${TMPFILE3}::" 1>&2
  116 fi
  117 
  118 # search for duplicates based on a complete filehash
  119 TMPFILE4=$(tempname repeats_$$_4) || exit 99
  120 sed -e 's/\t/\n/' $TMPFILE3 | sort -u | filehash -v -s -$ALGORITHM -p | sort -t '	' -k2n,2n -k3,3 -k1,1 | REPHASH > $TMPFILE4
  121 if [ "$VERBOSE" = 'y' ]; then
  122   echo "repeats stage 4: num file pairs with matching complete digest = $(wc -l $TMPFILE4)" | sed -e "s: ${TMPFILE4}::" 1>&2
  123 fi
  124 
  125 # do final paranoia check if requested
  126 if [ "$PARANOID" = 'n' ]; then
  127   # make it final: print results
  128   sort $TMPFILE4
  129 else
  130   TMPFILE5=$(tempname repeats_$$_5) || exit 99
  131   sort $TMPFILE4 | REPCMP > $TMPFILE5
  132   if [ "$VERBOSE" = 'y' ]; then
  133     echo "repeats stage 5: num file pairs based on cmp results = $(wc -l $TMPFILE5)" | sed -e "s: ${TMPFILE5}::" 1>&2
  134   fi
  135   # make it final: print results
  136   cat $TMPFILE5
  137 fi
  138 
  139 # clean up temp files
  140 rm -f $TMPFILE0 $TMPFILE1 $TMPFILE2 $TMPFILE3 $TMPFILE4 $TMPFILE5