"Fossies" - the Fresh Open Source Software Archive

Member "fslint-2.46/fslint/findup" (2 Feb 2017, 8898 Bytes) of package /linux/privat/fslint-2.46.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Bash source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. See also the latest Fossies "Diffs" side-by-side code changes report for "findup": 2.44_vs_2.46.

#!/bin/bash

# findup - find duplicate files
# Copyright © 2000-2017 by Pádraig Brady <P@draigBrady.com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details,
# which is available at www.gnu.org


# Description
#
#   will show duplicate files in the specified directories
#   (and their subdirectories), in the format:
#
#       file1
#       file2
#
#       file3
#       file4
#       file5
#
#   or if the --summary option is specified:
#
#       2 * 2048    file1 file2
#       3 * 1024    file3 file4 file5
#
#   Where the number is the disk usage in bytes of each of the
#   duplicate files on that line, and all duplicate files are
#   shown on the same line.
#       Output is ordered by largest disk usage first and
#   then by the number of duplicate files.
#
# Caveats/Notes:
#   I compared this to all the equivalent utils I could find (as of Nov 2000)
#   and it's (by far) the fastest, has the most functionality (thanks to
#   find) and has no (known) bugs. In my opinion fdupes is the next best but
#   is slower (even though written in C), and has a bug where hard links
#   in different directories are sometimes reported as duplicates.
#
#   This script requires uniq > V2.0.21 (part of GNU textutils|coreutils)
#   dir/file names containing \n are ignored
#   undefined operation for dir/file names containing \1
#   sparse files are not treated differently.
#   Don't specify params to find that affect output (e.g. -printf etc.)
#   symbolic links are ignored.
#   path1 & path2 can be files and/or directories

script_dir=$(dirname "$0")              #directory of this script
script_dir=$(readlink -f "$script_dir") #Make sure absolute path

. "$script_dir"/supprt/fslver

Usage() {
    ProgName=$(basename "$0")
    echo "find dUPlicate files.
Usage: $ProgName [[[-t [-m|-d|-s]] | [--summary]] [-r] [-f] path(s) ...]

If no path(s) are specified then the current directory is assumed.


When -m is specified any found duplicates will be merged (using hardlinks).
When -s is specified any found duplicates will be replaced (using symlinks).
When -d is specified any found duplicates will be deleted (leaving just 1).
When -t is specified, only report what -m or -d would do.

When --summary is specified, change the output format to include file sizes.
You can also pipe this summary format to "$script_dir"/fstool/dupwaste
to get a total of the wastage due to duplicates.

Examples:

search for duplicates in current directory and below
    findup or findup .
search for duplicates in all linux source directories and merge using hardlinks
    findup -m /usr/src/linux*
search for duplicates in the current directory but not its subdirectories
    findup -r .
search for duplicates in /usr/bin
    findup /usr/bin
search in multiple directories but not their subdirectories
    findup -r /usr/bin /bin /usr/sbin /sbin
search for duplicates in \$PATH
    findup \$("$script_dir"/supprt/getffp)
exclude zero length files from the search
    findup / -size +0c
search system for duplicate files over 100K in size
    findup / -size +100k
search only my files (that I own and are in my home dir)
    findup ~ -user \$(id -u)
search system for duplicate files belonging to roger
    findup / -user \$(id -u roger)"
    exit
}

cleanup_sum() {

    sed '
    # md5sum, sha1sum et al. from coreutils at least,
    # to deal with \n in filenames, convert any \ and \n chars
    # to \\ and \\n respectively. Currently we ignore files with \n
    # so just undo this problematic escaping
    /^\\/{s/.//; s/\\\\/\\/g};

    # These utils also add a "*" flag character for normal files
    # on platforms where O_BINARY is significant (like CYGWIN).
    # We always process in binary mode and so remove that flag here
    s/^\([^ ]*\) \*/\1  /;
    '
}
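
# For reference, a hedged illustration of the coreutils escaping that
# cleanup_sum undoes (exact behaviour may vary by coreutils version).
# For an empty file named 'back\slash':
#
#   $ md5sum 'back\slash'
#   \d41d8cd98f00b204e9800998ecf8427e  back\\slash
#
# i.e. the line gets a leading "\" marker and the "\" in the name is
# doubled; cleanup_sum strips that marker and collapses "\\" back to "\"
# so the name again matches what find printed earlier in the pipeline.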

for arg
do
    case "$arg" in
    -h|--help|-help)
        Usage ;;
    -v|--version)
        Version ;;
    --summary)
        mode="summary" ;;
    --gui)
        # Undocumented option to avoid extra
        # hardlink merging already done in GUI
        gui=1 ;;
    -m)
        mode="merge" ;;
    -d)
        mode="del" ;;
    -s)
        mode="symlink" ;;
    -t)
        t="t" ;;
    *)
        argsToPassOn="$argsToPassOn $(shell_quote "$arg")" ;;
    esac
done

sep_mode="separate"

if [ "$mode" = "summary" ]; then
    #Don't do extra hardlink processing.
    #This speeds things up, and also removes the python dependency
    merge_early="-u"
fi

. "$script_dir"/supprt/getfpf "$argsToPassOn"

check_uniq

dev_id="$(find /bin/sh -printf '%D' 2>/dev/null)"
if [ "$dev_id" = "D" ] || [ ! "$dev_id" ]; then
    devFmt="\060" #0
else
    devFmt=%D #This is new in findutils-4.2 and will help find more duplicates
fi
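
# A hedged illustration of the probe above: with findutils >= 4.2
#
#   $ find /bin/sh -printf '%D'
#   2049            (the device number; the value varies per system)
#
# while a find that doesn't understand %D yields the literal "D" or
# nothing at all (hence the two checks above), in which case a constant
# 0 is printed for the device field instead.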

                                    #print name, dev, inode & size.
find "$@" -type f ! -name "*$LF*" -printf "$FPF\0$devFmt\0%i\0%s\n" |
sort -u |            #merge files (indirectly) specified multiple times
tr ' \t\0' '\0\1 ' | #encode spaces & tabs in file names (undone below)
sort -k4,4nr -k2,2n -k3,3 $merge_early |#group [and merge] size,dev & inodes
if [ -z "$merge_early" ]; then
    "$script_dir"/supprt/rmlint/merge_hardlinks
else
    uniq -3 -D       #pick just duplicate filesizes
fi |
sort -k2,2n -k3,3n | #NB sort inodes so md5sum does less seeking all over disk
cut -f1 -d' ' -s |   #get filenames to work on
tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0
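
# A sketch of the record format flowing through the stages above
# (values are illustrative only):
#
#   find prints one record per file:  name\0dev\0inode\0size\n
#   after the first tr:  spaces/tabs inside names become \0/\1 and the \0
#     field separators become spaces, so sort/uniq/cut can treat each
#     record as four space separated columns (name dev inode size)
#   after the second tr: names get their spaces/tabs back and each
#     surviving name is terminated with \0, ready for xargs -r0 below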

# The following optional block md5sums a small sample of each file,
# which can help when there are many files of the same size,
# even more so if they are large. This usually adds a small amount of
# runtime, but it can save a large amount of time in certain situations.
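# For intuition, a minimal sketch of the sampling idea (not necessarily
# what the bundled md5sum_approx helper does; its sample size and offsets
# may differ): hash only a small prefix of each file, e.g.
#
#   head -c 16384 -- "$file" | md5sum
#
# which is usually enough to split groups of same-sized files cheaply
# before the full md5sum pass below.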
  182 if "$script_dir"/supprt/md5sum_approx </dev/null 2>/dev/null; then
  183     xargs -r0 "$script_dir"/supprt/md5sum_approx |
  184     sort |                     #group duplicate files together
  185     uniq --all-repeated -w32 | #pick just duplicates
  186     cut -d' ' -f3- |           #get filenames
  187     sort |                     #sort by paths to try to minimise disk seeks
  188     tr '\n' '\0'               #delimit names with \0
  189 else
  190     cat
  191 fi |
  192 
  193 # This block selects duplicates using md5sum of whole file
  194 xargs -r0 md5sum -- |      #calculate md5sums for possible duplicates
  195 cleanup_sum |              #undo any backslash escaping
  196 sort |                     #group duplicate files together
  197 uniq --all-repeated=$sep_mode -w32 | #pick just duplicates
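
# A hedged example of what sort | uniq --all-repeated=separate -w32 emits
# here (hashes and paths are illustrative only): groups of lines sharing
# the same 32-char md5 field, with a blank line between groups:
#
#   d41d8cd98f00b204e9800998ecf8427e  dir1/empty_a
#   d41d8cd98f00b204e9800998ecf8427e  dir2/empty_b
#
#   9e107d9d372bb6826bd81d3542a419d6  docs/fox.txt
#   9e107d9d372bb6826bd81d3542a419d6  backup/fox.txt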

# The following optional block checks duplicates again using sha1
# Note for data sets that don't totally fit in cache this will
# probably read duplicate files off the disk again.
cut -s -d' ' -f3- |        #get filenames
sort |                     #sort by paths to try to minimise disk seeks
tr '\n' '\0' |             #delimit names with \0
xargs -r0 sha1sum -- |     #to be sure to be sure
cleanup_sum |              #undo any backslash escaping
sort |                     #group duplicate files together
uniq --all-repeated=$sep_mode -w40 | #pick just duplicates

cut -d' ' -f3- |           #get filenames (and leave separating lines)

if [ "$gui" ]; then
  # GUI already does similar processing for accurate disk usage reporting
  cat
elif [ "$mode" ]; then

  # exclude already fully hardlinked duplicate groups
  "$script_dir"/supprt/rmlint/merge_hardlinks --non-gui |

  if [ "$mode" != "summary" ]; then # external call to python as this is faster
    if "$script_dir"/supprt/rmlint/fixdup </dev/null 2>/dev/null; then
        "$script_dir"/supprt/rmlint/fixdup $t$mode
    elif "$script_dir"/supprt/rmlint/fixdup.sh </dev/null 2>/dev/null; then
        "$script_dir"/supprt/rmlint/fixdup.sh $t$mode
    else
        echo "Error, couldn't execute merge util" >&2
        exit 1
    fi
  else
    (
    line=''
    declare -i counter #Use bash arithmetic, not expr (for speed)
    counter=0
    while read; do
        # note we don't specify "file" to `read`
        # as otherwise trailing IFS will be stripped
        file="$REPLY"
        if [ ! "$file" ]; then
            if [ ! -z "$line" ]; then
                echo "$counter * $line"
            fi
            counter=0
        else
            if [ $counter -eq 0 ]; then
                line=$(du -B1 "$file")
            else
                line="$line $file"
            fi
            counter=counter+1
        fi
    done

    if [ ! -z "$line" ]; then
        echo "$counter * $line"
    fi
    ) |
    sort -k3,3 -k1,1 -brn
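    # A hedged example of a summary line built above: du -B1 prints
    # "SIZE<TAB>FILE", so a group of three 1024 byte duplicates becomes
    #
    #   3 * 1024    file3 file4 file5
    #
    # and the sort orders groups by per-file disk usage (field 3), then
    # by duplicate count (field 1), largest first.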
  fi
else
  # exclude already fully hardlinked duplicate groups
  "$script_dir"/supprt/rmlint/merge_hardlinks --non-gui
fi