"Fossies" - the Fresh Open Source Software Archive 
Member "fslint-2.46/fslint/findup" (2 Feb 2017, 8898 Bytes) of package /linux/privat/fslint-2.46.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Bash source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
See also the latest
Fossies "Diffs" side-by-side code changes report for "findup":
2.44_vs_2.46.
#!/bin/bash

# findup - find duplicate files
# Copyright © 2000-2017 by Pádraig Brady <P@draigBrady.com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details,
# which is available at www.gnu.org


# Description
#
# Shows duplicate files in the specified directories
# (and their subdirectories), in the format:
#
# file1
# file2
#
# file3
# file4
# file5
#
# or if the --summary option is specified:
#
# 2 * 2048 file1 file2
# 3 * 1024 file3 file4 file5
#
# Where the number is the disk usage in bytes of each of the
# duplicate files on that line, and all duplicate files are
# shown on the same line.
# Output is ordered by largest disk usage first and
# then by the number of duplicate files.
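#
# As a worked illustration (figures made up): the two summary lines above
# represent (2-1)*2048 + (3-1)*1024 = 4096 bytes reclaimable by
# de-duplication, which is the kind of total that fstool/dupwaste
# computes from this --summary output.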
#
# Caveats/Notes:
# I compared this to all the equivalent utils I could find (as of Nov 2000)
# and it's (by far) the fastest, has the most functionality (thanks to
# find) and has no (known) bugs. In my opinion fdupes is the next best but
# is slower (even though written in C), and has a bug where hard links
# in different directories are sometimes reported as duplicates.
#
# This script requires uniq > V2.0.21 (part of GNU textutils|coreutils);
# a quick way to verify this is shown after these notes.
# dir/file names containing \n are ignored
# undefined operation for dir/file names containing \1
# sparse files are not treated differently.
# Don't specify params to find that affect its output (e.g. -printf)
# symbolic links are ignored.
# path1 & path2 can be files and/or directories
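#
# A quick way to verify the uniq requirement noted above (check_uniq,
# called further down, is expected to catch this too):
#
#   uniq --all-repeated=separate </dev/null || echo "uniq too old for findup"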

script_dir=$(dirname "$0")              #directory of this script
script_dir=$(readlink -f "$script_dir") #Make sure absolute path

. "$script_dir"/supprt/fslver

Usage() {
    ProgName=$(basename "$0")
    echo "find dUPlicate files.
Usage: $ProgName [[[-t [-m|-d|-s]] | [--summary]] [-r] [-f] path(s) ...]

If no path(s) are specified then the current directory is assumed.

When -m is specified any found duplicates will be merged (using hardlinks).
When -s is specified any found duplicates will be replaced (using symlinks).
When -d is specified any found duplicates will be deleted (leaving just 1).
When -t is specified, only report what -m or -d would do.

When --summary is specified the output format is changed to include file sizes.
You can also pipe this summary format to "$script_dir"/fstool/dupwaste
to get a total of the wastage due to duplicates.

Examples:

search for duplicates in current directory and below
    findup or findup .
search for duplicates in all linux source directories and merge using hardlinks
    findup -m /usr/src/linux*
search for duplicates in the current directory but not in subdirectories
    findup -r .
search for duplicates in /usr/bin
    findup /usr/bin
search in multiple directories but not their subdirectories
    findup -r /usr/bin /bin /usr/sbin /sbin
search for duplicates in \$PATH
    findup \$("$script_dir"/supprt/getffp)
exclude zero length files from the search
    findup / -size +0c
search system for duplicate files over 100K in size
    findup / -size +100k
search only my files (that I own and are in my home dir)
    findup ~ -user \$(id -u)
search system for duplicate files belonging to roger
    findup / -user \$(id -u roger)"
    exit
}

cleanup_sum() {

    sed '
# md5sum and sha1sum et al. from coreutils at least,
# to deal with \n in filenames, convert any \ and \n chars
# to \\ and \\n respectively. Currently we ignore files with \n
# so just undo this problematic escaping
/^\\/{s/.//; s/\\\\/\\/g};

# These utils also add a "*" flag character for normal files
# on platforms where O_BINARY is significant (like CYGWIN).
# We always process in binary mode and so remove that flag here
s/^\([^ ]*\) \*/\1 /;
'
}
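
# Illustration (not executed): GNU md5sum escapes problematic characters in
# file names and flags such lines with a leading backslash; e.g. for an empty
# file literally named 'a\b' it prints
#   \d41d8cd98f00b204e9800998ecf8427e  a\\b
# and cleanup_sum strips that leading flag and un-escapes, giving
#   d41d8cd98f00b204e9800998ecf8427e  a\b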

for arg
do
    case "$arg" in
        -h|--help|-help)
            Usage ;;
        -v|--version)
            Version ;;
        --summary)
            mode="summary" ;;
        --gui)
            # Undocumented option to avoid extra
            # hardlink merging already done in GUI
            gui=1 ;;
        -m)
            mode="merge" ;;
        -d)
            mode="del" ;;
        -s)
            mode="symlink" ;;
        -t)
            t="t" ;;
        *)
            argsToPassOn="$argsToPassOn $(shell_quote "$arg")" ;;
    esac
done
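
# Anything not matched above (paths, plus extra find predicates such as the
# -size/-user examples in Usage) is shell-quoted and accumulated here;
# supprt/getfpf, sourced below, is expected to turn this back into the
# arguments that the later find invocation operates on.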

sep_mode="separate"

if [ "$mode" = "summary" ]; then
    #Don't do extra hardlink processing.
    #This speeds things up, and also removes the python dependency
    merge_early="-u"
fi

. "$script_dir"/supprt/getfpf "$argsToPassOn"

check_uniq

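# Probe whether find supports the %D (device number) printf directive.
# Inode numbers are only unique per device, so recording the device lets the
# later dev/inode grouping distinguish genuine hardlinks from unrelated files
# that merely share an inode number on different filesystems; without %D
# support a constant 0 is used for every file instead.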
dev_id="$(find /bin/sh -printf '%D' 2>/dev/null)"
if [ "$dev_id" = "D" ] || [ ! "$dev_id" ]; then
    devFmt="\060" #0
else
    devFmt=%D #This is new in findutils-4.2 and will help find more duplicates
fi

#print name, dev, inode & size.
find "$@" -type f ! -name "*$LF*" -printf "$FPF\0$devFmt\0%i\0%s\n" |
sort -u | #merge files (indirectly) specified multiple times
tr ' \t\0' '\0\1 ' | #encode spaces & tabs within file names; \0 field separators become spaces
sort -k4,4nr -k2,2n -k3,3 $merge_early | #group [and merge] size,dev & inodes
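# Illustration of the encoding above (hypothetical record): a file named
# "./my file.txt" on device 2049, inode 1234, of size 2048 arrives from find as
#   ./my file.txt<NUL>2049<NUL>1234<NUL>2048
# and leaves the tr stage as
#   ./my<NUL>file.txt 2049 1234 2048
# i.e. whitespace-free, space-separated fields: name, dev, inode, size.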
if [ -z "$merge_early" ]; then
    "$script_dir"/supprt/rmlint/merge_hardlinks
else
    uniq -3 -D #pick just duplicate filesizes
fi |
sort -k2,2n -k3,3n | #NB sort inodes so md5sum does less seeking all over disk
cut -f1 -d' ' -s |   #get filenames to work on
tr '\0\1\n' ' \t\0' | #restore spaces & tabs in names and delimit names with \0

# The following optional block md5sums a small sample of each file,
# which can help when there are many files of the same size,
# even more so if they are large. This usually adds a small amount of
# runtime; however it can save a large amount of time in certain situations.
if "$script_dir"/supprt/md5sum_approx </dev/null 2>/dev/null; then
    xargs -r0 "$script_dir"/supprt/md5sum_approx |
    sort | #group duplicate files together
    uniq --all-repeated -w32 | #pick just duplicates
    cut -d' ' -f3- | #get filenames
    sort | #sort by paths to try to minimise disk seeks
    tr '\n' '\0' #delimit names with \0
else
    cat
fi |

# This block selects duplicates using md5sum of the whole file
xargs -r0 md5sum -- | #calculate md5sums for possible duplicates
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w32 | #pick just duplicates

# The following optional block checks duplicates again using sha1.
# Note that for data sets that don't fit entirely in cache this will
# probably read the duplicate files off the disk again.
cut -s -d' ' -f3- | #get filenames
sort | #sort by paths to try to minimise disk seeks
tr '\n' '\0' | #delimit names with \0
xargs -r0 sha1sum -- | #to be sure to be sure
cleanup_sum | #undo any backslash escaping
sort | #group duplicate files together
uniq --all-repeated=$sep_mode -w40 | #pick just duplicates

cut -d' ' -f3- | #get filenames (and leave separating lines)

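# At this point the stream carries one file name per line, with duplicates
# grouped together and groups separated by blank lines, e.g. (illustrative):
#   /tmp/a
#   /tmp/copy_of_a
#
#   /tmp/b
#   /tmp/b.bak
#   /tmp/b.old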
212 if [ "$gui" ]; then
213 # GUI already does similar processing for accurate disk usage reporting
214 cat
215 elif [ "$mode" ]; then
216
217 # exclude already fully hardlinked duplicate groups
218 "$script_dir"/supprt/rmlint/merge_hardlinks --non-gui |
219
220 if [ ! $mode = "summary" ]; then # external call to python as this is faster
221 if "$script_dir"/supprt/rmlint/fixdup </dev/null 2>/dev/null; then
222 "$script_dir"/supprt/rmlint/fixdup $t$mode
223 elif "$script_dir"/supprt/rmlint/fixdup.sh </dev/null 2>/dev/null; then
224 "$script_dir"/supprt/rmlint/fixdup.sh $t$mode
225 else
226 echo "Error, couldn't execute merge util" >&2
227 exit 1
228 fi
229 else
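        # Build the "N * SIZE file1 file2 ..." summary lines in the shell:
        # for each blank-line separated group of duplicates, `du -B1` supplies
        # the per-file disk usage of the first member, the remaining names are
        # appended to that line, and the group's file count is prefixed when
        # the group is printed; the final sort orders groups by disk usage and
        # then by file count.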
        (
            line=''
            declare -i counter #Use bash arithmetic, not expr (for speed)
            counter=0
            while read; do
                # note we don't specify "file" to `read`
                # as otherwise trailing IFS would be stripped
                file="$REPLY"
                if [ ! "$file" ]; then
                    if [ ! -z "$line" ]; then
                        echo "$counter * $line"
                    fi
                    counter=0
                else
                    if [ $counter -eq 0 ]; then
                        line=$(du -B1 "$file")
                    else
                        line="$line $file"
                    fi
                    counter=counter+1
                fi
            done

            if [ ! -z "$line" ]; then
                echo "$counter * $line"
            fi
        ) |
        sort -k3,3 -k1,1 -brn
    fi
else
    # exclude already fully hardlinked duplicate groups
    "$script_dir"/supprt/rmlint/merge_hardlinks --non-gui
fi