findup (fslint-2.44) | : | findup (fslint-2.46) | ||
---|---|---|---|---|
#!/bin/bash | #!/bin/bash | |||
# findup - find duplicate files | # findup - find duplicate files | |||
# Copyright © 2000-2009 by Pádraig Brady <P@draigBrady.com>. | # Copyright © 2000-2017 by Pádraig Brady <P@draigBrady.com>. | |||
# | # | |||
# This program is free software; you can redistribute it and/or modify | # This program is free software; you can redistribute it and/or modify | |||
# it under the terms of the GNU General Public License as published by | # it under the terms of the GNU General Public License as published by | |||
# the Free Software Foundation; either version 2 of the License, or | # the Free Software Foundation; either version 2 of the License, or | |||
# any later version. | # any later version. | |||
# | # | |||
# This program is distributed in the hope that it will be useful, | # This program is distributed in the hope that it will be useful, | |||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |||
# See the GNU General Public License for more details, | # See the GNU General Public License for more details, | |||
skipping to change at line 52 | skipping to change at line 52 | |||
# and it's (by far) the fastest, has the most functionality (thanks to | # and it's (by far) the fastest, has the most functionality (thanks to | |||
# find) and has no (known) bugs. In my opinion fdupes is the next best but | # find) and has no (known) bugs. In my opinion fdupes is the next best but | |||
# is slower (even though written in C), and has a bug where hard links | # is slower (even though written in C), and has a bug where hard links | |||
# in different directories are reported as duplicates sometimes. | # in different directories are reported as duplicates sometimes. | |||
# | # | |||
# This script requires uniq > V2.0.21 (part of GNU textutils|coreutils) | # This script requires uniq > V2.0.21 (part of GNU textutils|coreutils) | |||
# dir/file names containing \n are ignored | # dir/file names containing \n are ignored | |||
# undefined operation for dir/file names containing \1 | # undefined operation for dir/file names containing \1 | |||
# sparse files are not treated differently. | # sparse files are not treated differently. | |||
# Don't specify params to find that affect output etc. (e.g -printf etc.) | # Don't specify params to find that affect output etc. (e.g -printf etc.) | |||
# zero length files are ignored. | ||||
# symbolic links are ignored. | # symbolic links are ignored. | |||
# path1 & path2 can be files &/or directories | # path1 & path2 can be files and/or directories | |||
script_dir=$(dirname "$0") #directory of this script | script_dir=$(dirname "$0") #directory of this script | |||
script_dir=$(readlink -f "$script_dir") #Make sure absolute path | script_dir=$(readlink -f "$script_dir") #Make sure absolute path | |||
. "$script_dir"/supprt/fslver | . "$script_dir"/supprt/fslver | |||
Usage() { | Usage() { | |||
ProgName=$(basename "$0") | ProgName=$(basename "$0") | |||
echo "find dUPlicate files. | echo "find dUPlicate files. | |||
Usage: $ProgName [[[-t [-m|-d]] | [--summary]] [-r] [-f] paths(s) ...] | Usage: $ProgName [[[-t [-m|-d]] | [--summary]] [-r] [-f] paths(s) ...] | |||
If no path(s) specified then the current directory is assumed. | If no path(s) specified then the current directory is assumed. | |||
When -m is specified any found duplicates will be merged (using hardlinks). | When -m is specified any found duplicates will be merged (using hardlinks). | |||
When -s is specified any found duplicates will be replaced (using symlinks). | ||||
When -d is specified any found duplicates will be deleted (leaving just 1). | When -d is specified any found duplicates will be deleted (leaving just 1). | |||
When -t is specified, only report what -m or -d would do. | When -t is specified, only report what -m or -d would do. | |||
When --summary is specified change output format to include file sizes. | When --summary is specified change output format to include file sizes. | |||
You can also pipe this summary format to "$script_dir"/fstool/dupwaste | You can also pipe this summary format to "$script_dir"/fstool/dupwaste | |||
to get a total of the wastage due to duplicates. | to get a total of the wastage due to duplicates. | |||
Examples: | Examples: | |||
search for duplicates in current directory and below | search for duplicates in current directory and below | |||
skipping to change at line 90 | skipping to change at line 90 | |||
search for duplicates in all linux source directories and merge using hardlinks | search for duplicates in all linux source directories and merge using hardlinks | |||
findup -m /usr/src/linux* | findup -m /usr/src/linux* | |||
same as above but don't look in subdirectories | same as above but don't look in subdirectories | |||
findup -r . | findup -r . | |||
search for duplicates in /usr/bin | search for duplicates in /usr/bin | |||
findup /usr/bin | findup /usr/bin | |||
search in multiple directories but not their subdirectories | search in multiple directories but not their subdirectories | |||
findup -r /usr/bin /bin /usr/sbin /sbin | findup -r /usr/bin /bin /usr/sbin /sbin | |||
search for duplicates in \$PATH | search for duplicates in \$PATH | |||
findup \$("$script_dir"/supprt/getffp) | findup \$("$script_dir"/supprt/getffp) | |||
exclude zero length files from the search | ||||
findup / -size +0c | ||||
search system for duplicate files over 100K in size | search system for duplicate files over 100K in size | |||
findup / -size +100k | findup / -size +100k | |||
search only my files (that I own and are in my home dir) | search only my files (that I own and are in my home dir) | |||
findup ~ -user \$(id -u) | findup ~ -user \$(id -u) | |||
search system for duplicate files belonging to roger | search system for duplicate files belonging to roger | |||
findup / -user \$(id -u roger)" | findup / -user \$(id -u roger)" | |||
exit | exit | |||
} | } | |||
cleanup_sum() { | cleanup_sum() { | |||
skipping to change at line 124 | skipping to change at line 126 | |||
for arg | for arg | |||
do | do | |||
case "$arg" in | case "$arg" in | |||
-h|--help|-help) | -h|--help|-help) | |||
Usage ;; | Usage ;; | |||
-v|--version) | -v|--version) | |||
Version ;; | Version ;; | |||
--summary) | --summary) | |||
mode="summary" ;; | mode="summary" ;; | |||
--gui) | ||||
# Undocumented option to avoid extra | ||||
# hardlink merging already done in GUI | ||||
gui=1 ;; | ||||
-m) | -m) | |||
mode="merge" ;; | mode="merge" ;; | |||
-d) | -d) | |||
mode="del" ;; | mode="del" ;; | |||
-s) | ||||
mode="symlink" ;; | ||||
-t) | -t) | |||
t="t" ;; | t="t" ;; | |||
*) | *) | |||
argsToPassOn="$argsToPassOn $(shell_quote "$arg")" ;; | argsToPassOn="$argsToPassOn $(shell_quote "$arg")" ;; | |||
esac | esac | |||
done | done | |||
sep_mode="separate" | sep_mode="separate" | |||
if [ "$mode" = "summary" ]; then | if [ "$mode" = "summary" ]; then | |||
skipping to change at line 154 | skipping to change at line 162 | |||
check_uniq | check_uniq | |||
dev_id="$(find /bin/sh -printf '%D' 2>/dev/null)" | dev_id="$(find /bin/sh -printf '%D' 2>/dev/null)" | |||
if [ "$dev_id" = "D" ] || [ ! "$dev_id" ]; then | if [ "$dev_id" = "D" ] || [ ! "$dev_id" ]; then | |||
devFmt="\060" #0 | devFmt="\060" #0 | |||
else | else | |||
devFmt=%D #This is new in findutils-4.2 and will help find more duplicates | devFmt=%D #This is new in findutils-4.2 and will help find more duplicates | |||
fi | fi | |||
#print name, dev, inode & size. | #print name, dev, inode & size. | |||
find "$@" -size +0c -type f ! -name "*$LF*" -printf "$FPF\0$devFmt\0%i\0%s\n" | | find "$@" -type f ! -name "*$LF*" -printf "$FPF\0$devFmt\0%i\0%s\n" | | |||
sort -u | #merge files (indirectly) specified multiple times | sort -u | #merge files (indirectly) specified multiple times | |||
tr ' \t\0' '\0\1 ' | #remove spaces, tabs in file names | tr ' \t\0' '\0\1 ' | #remove spaces, tabs in file names | |||
sort -k4,4nr -k2,2n -k3,3 $merge_early |#group [and merge] size,dev & inodes | sort -k4,4nr -k2,2n -k3,3 $merge_early |#group [and merge] size,dev & inodes | |||
if [ -z "$merge_early" ]; then | if [ -z "$merge_early" ]; then | |||
"$script_dir"/supprt/rmlint/merge_hardlinks | "$script_dir"/supprt/rmlint/merge_hardlinks | |||
else | else | |||
uniq -3 -D #pick just duplicate filesizes | uniq -3 -D #pick just duplicate filesizes | |||
fi | | fi | | |||
sort -k2,2n -k3,3n | #NB sort inodes so md5sum does less seeking all over disk | sort -k2,2n -k3,3n | #NB sort inodes so md5sum does less seeking all over disk | |||
cut -f1 -d' ' -s | #get filenames to work on | cut -f1 -d' ' -s | #get filenames to work on | |||
skipping to change at line 202 | skipping to change at line 210 | |||
cut -s -d' ' -f3- | #get filenames | cut -s -d' ' -f3- | #get filenames | |||
sort | #sort by paths to try to minimise disk seeks | sort | #sort by paths to try to minimise disk seeks | |||
tr '\n' '\0' | #delimit names with \0 | tr '\n' '\0' | #delimit names with \0 | |||
xargs -r0 sha1sum -- | #to be sure to be sure | xargs -r0 sha1sum -- | #to be sure to be sure | |||
cleanup_sum | #undo any backslash escaping | cleanup_sum | #undo any backslash escaping | |||
sort | #group duplicate files together | sort | #group duplicate files together | |||
uniq --all-repeated=$sep_mode -w40 | #pick just duplicates | uniq --all-repeated=$sep_mode -w40 | #pick just duplicates | |||
cut -d' ' -f3- | #get filenames (and leave separating lines) | cut -d' ' -f3- | #get filenames (and leave separating lines) | |||
if [ "$mode" ]; then | if [ "$gui" ]; then | |||
# GUI already does similar processing for accurate disk usage reporting | ||||
cat | ||||
elif [ "$mode" ]; then | ||||
# exclude already fully hardlinked duplicate groups | ||||
"$script_dir"/supprt/rmlint/merge_hardlinks --non-gui | | ||||
if [ ! $mode = "summary" ]; then # external call to python as this is faster | if [ ! $mode = "summary" ]; then # external call to python as this is faster | |||
if "$script_dir"/supprt/rmlint/fixdup </dev/null 2>/dev/null; then | if "$script_dir"/supprt/rmlint/fixdup </dev/null 2>/dev/null; then | |||
"$script_dir"/supprt/rmlint/fixdup $t$mode | "$script_dir"/supprt/rmlint/fixdup $t$mode | |||
elif "$script_dir"/supprt/rmlint/fixdup.sh </dev/null 2>/dev/null; then | elif "$script_dir"/supprt/rmlint/fixdup.sh </dev/null 2>/dev/null; then | |||
"$script_dir"/supprt/rmlint/fixdup.sh $t$mode | "$script_dir"/supprt/rmlint/fixdup.sh $t$mode | |||
else | else | |||
echo "Error, couldn't execute merge util" >&2 | echo "Error, couldn't execute merge util" >&2 | |||
exit 1 | exit 1 | |||
fi | fi | |||
else | else | |||
( | ( | |||
line='' | line='' | |||
declare -i counter #Use bash arithmetic, not expr (for speed) | declare -i counter #Use bash arithmetic, not expr (for speed) | |||
counter=0 | counter=0 | |||
while read; do | while read; do | |||
# note we don't specify "file" to `read` | # note we dont specify "file" to `read` | |||
# as otherwise trailing IFS will be stripped | # as otherwise trailing IFS will be stripped | |||
file="$REPLY" | file="$REPLY" | |||
if [ ! "$file" ]; then | if [ ! "$file" ]; then | |||
if [ ! -z "$line" ]; then | if [ ! -z "$line" ]; then | |||
echo "$counter * $line" | echo "$counter * $line" | |||
fi | fi | |||
counter=0 | counter=0 | |||
else | else | |||
if [ $counter -eq 0 ]; then | if [ $counter -eq 0 ]; then | |||
line=$(du -B1 "$file") | line=$(du -B1 "$file") | |||
skipping to change at line 243 | skipping to change at line 258 | |||
fi | fi | |||
done | done | |||
if [ ! -z "$line" ]; then | if [ ! -z "$line" ]; then | |||
echo "$counter * $line" | echo "$counter * $line" | |||
fi | fi | |||
) | | ) | | |||
sort -k3,3 -k1,1 -brn | sort -k3,3 -k1,1 -brn | |||
fi | fi | |||
else | else | |||
cat | # exclude already fully hardlinked duplicate groups | |||
"$script_dir"/supprt/rmlint/merge_hardlinks --non-gui | ||||
fi | fi | |||
End of changes. 11 change blocks. | ||||
8 lines changed or deleted | 24 lines changed or added |