]> git.somenet.org - pub/jan/scripts.git/blob - rmdupe
remove duplicate files (crapscript)
[pub/jan/scripts.git] / rmdupe
1 #!/bin/bash
2 # Script Name: rmdupe    http://igurublog.wordpress.com/downloads/script-rmdupe/ 
3 # Requires: 
4 # License: GNU GENERAL PUBLIC LICENSE Version 3 http://www.gnu.org/licenses/gpl-3.0.txt
5
6 argsneeded=1
7
# Print the usage text on stdout and terminate the script with status 1.
# Invoked for --help / -help and when no folder was given on the command line.
# The heredoc delimiter is quoted so the text is emitted literally.
help ()
{
	cat << 'EOF'
rmdupe version 1.0.4
Usage: rmdupe [OPTIONS] FOLDER [...]
Removes duplicate files in specified folders.  By default, newest duplicates
 are removed.
Options:
-R, -r              search specified folders recursively
--ref FOLDER        also search FOLDER recursively for copies but don't
                    remove any files from here (multiple --ref allowed)
                    Note: files may be removed from a ref folder if that
                    folder is also a specified folder
--trash FOLDER      copy duplicate files to FOLDER instead of removing
--sim               simulate and report duplicates only - no removal
--quiet             minimize output (disabled if used with --sim)
--verbose           detailed output
--old               remove oldest duplicates instead of newest
--minsize SIZE      limit search to duplicate files SIZE MB and larger
--maxsize SIZE      limit search to duplicate files SIZE MB and smaller
--rmcmd "RMCMD"     execute RMCMD instead of rm to remove copies
                    (may contain arguments, eg: "rm -f" or "shred -u")
--xdev              don't descend to other filesystems when recursing
                    specified or ref folders
Notes: do not use wildcards; symlinks are not followed except on the
       command line; zero-length files are ignored

Instructions and updates:
http://igurublog.wordpress.com/downloads/script-rmdupe/

EOF
	exit 1
}
41
# Abort with status 1 unless an option is followed by a usable value.
# $1 = option name, $2 = candidate value (empty or starting with "-" is rejected)
needarg ()
{
	if [ "$2" == "" ] || [ "${2:0:1}" = "-" ]; then
		echo "Option $1 requires argument"
		exit 1
	fi
}

# Parse the command line into the script's global settings.
# Arguments: the script's own arguments ("$@")
# Globals written:
#   optrecurse optsim optold optquiet optverbose  - 0/1 flags
#   xdev                - "" or "-xdev" (passed on to find)
#   rmcmd               - removal command, default "rm"
#   trash               - trash folder or ""
#   minsize maxsize     - size limits in bytes, 0 = no limit
#   rfolder[] rcount    - reference folders
#   dfolder[] dcount    - folders duplicates are removed from
# Exits with status 1 on any invalid option or folder; calls help (which
# exits) when no folder was given.
parseargs ()
{
	local numre='^[0-9]+$'

	# Every setting gets an explicit default so that a same-named variable
	# inherited from the environment cannot change the behaviour.
	optrecurse=0
	optsim=0
	optold=0
	optquiet=0
	optverbose=0
	trash=""
	minsize=0
	maxsize=0
	rfolder=()
	dfolder=()
	rcount=0
	dcount=0
	xdev=""
	rmcmd="rm"

	# parsing stops at the first empty argument, like an exhausted list
	while [ "$1" != "" ];
	do
		if [ "${1:0:1}" = "-" ]; then
			case "$1" in
				--help | -help )
					help
					;;
				-r | -R )
					optrecurse=1
					;;
				--xdev | -xdev )
					xdev="-xdev"
					;;
				--sim )
					optsim=1
					;;
				--old )
					optold=1
					;;
				--quiet )
					optquiet=1
					;;
				--verbose )
					optverbose=1
					;;
				--ref )
					needarg "$1" "$2"
					if [ ! -d "$2" ]; then
						echo "rmdupe: Error: ref folder \"$2\" does not exist"
						exit 1
					fi
					rfolder[$rcount]="$2"
					(( rcount += 1 ))
					shift
					;;
				--trash )
					needarg "$1" "$2"
					if [ "$trash" != "" ]; then
						echo "rmdupe: Error: only one trash folder allowed"
						exit 1
					fi
					trash="$2"
					shift
					;;
				--rmcmd )
					needarg "$1" "$2"
					if [ "$rmcmd" != "rm" ]; then
						echo "rmdupe: Error: only one rmcmd accepted"
						exit 1
					fi
					rmcmd="$2"
					shift
					;;
				--minsize )
					needarg "$1" "$2"
					# digits only; 10# forces decimal so "010" is ten, not octal
					if ! [[ "$2" =~ $numre ]] || (( 10#$2 < 1 )); then
						echo "rmdupe: Error: invalid minsize"
						exit 1
					fi
					(( minsize = 10#$2 * 1024 * 1024 ))
					shift
					;;
				--maxsize )
					needarg "$1" "$2"
					if ! [[ "$2" =~ $numre ]] || (( 10#$2 < 1 )); then
						echo "rmdupe: Error: invalid maxsize"
						exit 1
					fi
					(( maxsize = 10#$2 * 1024 * 1024 ))
					shift
					;;
				* )
					echo "Unknown option $1"
					exit 1
					;;
			esac
		else
			if [ "$1" = "/" ] || [ "$1" = "/." ]; then
				echo "rmdupe: Error: use on folder / not permitted for safety"
				exit 1
			fi
			if [ "${1:0:1}" != "/" ]; then
				echo "rmdupe: Error: relative folder spec not permitted for safety"
				exit 1
			fi
			if [ ! -d "$1" ]; then
				echo "rmdupe: Error: folder \"$1\" does not exist"
				exit 1
			fi
			dfolder[$dcount]="$1"
			(( dcount += 1 ))
		fi
		shift
	done
	if (( dcount == 0 )); then
		help
	fi
	if [ "$trash" != "" ] && [ "$rmcmd" != "rm" ]; then
		echo "rmdupe: Error: can't specify both --trash and --rmcmd"
		exit 1
	fi
}

parseargs "$@"
163
# Dispose of one duplicate: delete it, move it to the trash folder, or (in
# --sim mode) only record it as removed. Counters are updated on success.
# Arguments:       $1 - path of the duplicate
# Globals read:    trash rmcmd rmmsg optsim optquiet optverbose
# Globals written: removecount totalsize sremoved[] sremovedcount
# Output:          progress and error messages on stdout
# The caller's IFS is left untouched.
removefile () {  # $1=file
	local target="$1"
	local removesize base trashfile ext num
	local -a rmargv

	if (( optquiet != 1 )); then
		echo "    $rmmsg $target"
	fi

	# Size must be read before the file disappears; fall back to 0 so the
	# counter arithmetic stays valid if stat fails.
	removesize=$(stat -c%s "$target")
	removesize="${removesize:-0}"

	if [ "$trash" = "" ] && (( optverbose == 1 )) && [ "$rmcmd" != "rm" ]; then
		echo "      > $rmcmd \"$target\""
	fi

	if (( optsim == 1 )); then
		# simulation: remember the path so later checks treat it as gone
		sremoved[$sremovedcount]="$target"
		(( sremovedcount += 1 ))
		(( removecount += 1 ))
		(( totalsize += removesize ))
		return
	fi

	if [ "$trash" != "" ]; then
		# Pick a name that is still free in the trash folder:
		# name.ext -> name.copyN.ext, name -> name-copyN
		base="$(basename "$target")"
		trashfile="$base"
		ext="${base##*.}"
		if [ "$ext" = "$base" ]; then
			ext=""
		fi
		num=0
		while [ -e "$trash/$trashfile" ]; do
			(( num += 1 ))
			if [ "$ext" = "" ]; then
				trashfile="$base-copy$num"
			else
				trashfile="$(basename "$target" "$ext")copy$num.$ext"
			fi
		done
		if ! mv -- "$target" "$trash/$trashfile" || [ -e "$target" ]; then
			echo "      rmdupe: Error: move to trash failed for copy $target"
			return
		fi
	else
		# rmcmd may carry arguments ("rm -f", "shred -u"): split it on
		# spaces into an argv array instead of changing the global IFS.
		IFS=' ' read -r -a rmargv <<< "$rmcmd"
		"${rmargv[@]}" "$target"
		if [ -e "$target" ]; then
			echo "      rmdupe: Error: removal failed of copy $target"
			return
		fi
	fi

	(( removecount += 1 ))
	(( totalsize += removesize ))
}
223
# Look a path up in the list of files removed during simulation.
# Arguments: $1 - path to look up
# Globals:   sremoved[] (read), sremovedcount (read)
# Returns:   1 if the path is among the first $sremovedcount entries of
#            sremoved[], 0 if it is not
simcheck () {   # $1=file
	local wanted="$1"
	local i
	for (( i = 0; i < sremovedcount; i++ )); do
		[ "$wanted" != "${sremoved[$i]}" ] || return 1
	done
	return 0
}
235
# Compare $curfile with another file and remove one of them if they are
# byte-identical.
# Arguments: $1 - file to compare against $curfile
#            $2 - "ref" for reference mode (optional)
# Globals read:    curfile optsim optold optverbose
# Globals written: curfile (set to "" when $curfile was handed to
#                  removefile), compared[] comparedcount
# Non-reference mode: either $1 or $curfile may be removed. With equal or
# newer status-change time (stat %Z) $curfile counts as the newer copy; the
# newer copy is removed unless --old is set.
# Reference mode: only $curfile may be removed.
checkdupe () {  # $1=file   $2="ref"(optional - reference mode);  uses $curfile
	local other="$1" mode="$2"
	local refmsg="" pair idx seen=0
	local curage otherage currc otherrc err=0

	if [ "$other" = "$curfile" ]; then
		return
	fi
	# in simulation nothing is really deleted, so skip files recorded as removed
	if (( optsim == 1 )); then
		if ! simcheck "$other" || ! simcheck "$curfile"; then
			return
		fi
	fi
	if [ ! -f "$other" ] || [ ! -f "$curfile" ]; then
		return
	fi
	if [ "$mode" == "ref" ]; then
		refmsg=" (REF)"
	fi

	# already compared (in either order) and found to differ?
	for (( idx = 0; idx < comparedcount; idx++ )); do
		pair="${compared[$idx]}"
		if [ "$pair" = "[$curfile][$other]" ] || [ "$pair" = "[$other][$curfile]" ]; then
			seen=1
			break
		fi
	done
	if (( seen == 1 )); then
		if (( optverbose == 1 )); then
			echo "  Already compared to$refmsg: $other"
		fi
		return
	fi

	# compare
	if (( optverbose == 1 )); then
		echo "  Comparing to$refmsg: $other"
	fi
	if ! cmp -s "$curfile" "$other"; then
		# not a copy, remember compare
		compared[$comparedcount]="[$curfile][$other]"
		(( comparedcount += 1 ))
		return
	fi

	# it's a copy
	if [ "$mode" = "ref" ]; then
		removefile "$curfile"
		curfile=""
		return
	fi

	# non-ref mode - determine which is older.  stat's exit status is
	# captured right after the call, before any other command replaces $?.
	curage=$(stat -c%Z "$curfile")
	currc=$?
	curage="${curage%.*}"  # corrects .000000000 bug in stat
	if (( currc != 0 )) || [ "$curage" = "" ]; then
		echo "    rmdupe: Error: get timestamp failed on $curfile"
		err=1
	fi
	otherage=$(stat -c%Z "$other")
	otherrc=$?
	otherage="${otherage%.*}"  # corrects .000000000 bug in stat
	if (( otherrc != 0 )) || [ "$otherage" = "" ]; then
		echo "    rmdupe: Error: get timestamp failed on $other"
		err=1
	fi
	if (( err != 0 )); then
		return
	fi

	if (( curage < otherage )); then
		# curfile older
		if (( optold == 1 )); then
			removefile "$curfile"
			curfile=""
		else
			removefile "$other"
		fi
	else
		# $1 older (or same age)
		if (( optold == 1 )); then
			removefile "$other"
		else
			removefile "$curfile"
			curfile=""
		fi
	fi
}
332
333
# init
sremovedcount=0
comparedcount=0
removecount=0
totalsize=0
if [ "$trash" != "" ]; then
	rmmsg="trashing"
	if (( optsim != 1 )); then
		mkdir -p "$trash"
		if [ ! -d "$trash" ]; then
			echo "rmdupe: Error: trash folder $trash could not be created"
			exit 1
		fi
	fi
else
	rmmsg="removing"
fi
# --sim and --verbose both override --quiet
if (( optsim == 1 )); then
	rmmsg="sim-$rmmsg"
	optquiet=0
fi
if (( optverbose == 1 )); then
	optquiet=0
fi
if (( minsize != 0 )) && (( maxsize != 0 )) && (( minsize > maxsize )); then
	echo "rmdupe: minsize greater than maxsize - nothing to do"
	exit 0
fi

# Run find with the given arguments and collect the matching paths in the
# global array "found".  Paths are transferred NUL-delimited, so names with
# spaces, newlines or glob characters arrive unchanged.
findfiles () {
	local path
	found=()
	while IFS= read -r -d '' path; do
		found+=("$path")
	done < <(find "$@" -print0)
}

# find options shared by all searches; folders are passed as array elements,
# so each name stays one argument and is never re-parsed by the shell
findopts=()
if [ "$xdev" != "" ]; then
	findopts+=("$xdev")
fi
depthopts=()
if (( optrecurse != 1 )); then
	depthopts=(-maxdepth 1)
fi

# find all files in dfolders
flist=()
if (( ${#dfolder[@]} > 0 )); then  # safety: find without a path scans the current directory
	findfiles -H "${dfolder[@]}" "${findopts[@]}" "${depthopts[@]}" -type f
	flist=("${found[@]}")
fi
if (( ${#flist[@]} == 0 )); then
	if (( optquiet != 1 )); then
		echo "rmdupe: no files found in specified folders - nothing to do"
	fi
	exit 0
fi

# check for dupes of files
for f in "${flist[@]}"; do
	# already sim-removed or no longer a regular file (removed earlier)?
	if (( optsim == 1 )) && ! simcheck "$f"; then
		continue
	fi
	if [ ! -f "$f" ]; then
		continue
	fi

	fsize=$(stat -c%s "$f")
	statrc=$?
	if (( statrc != 0 )) || [ "$fsize" = "" ]; then
		echo "rmdupe: Error: get filesize failed on $f"
		continue
	fi
	if (( fsize == 0 )); then
		if (( optverbose == 1 )); then
			echo "Ignoring zero-length file: $f"
		fi
		continue
	fi
	if { (( minsize > 0 )) && (( fsize < minsize )); } ||
	   { (( maxsize > 0 )) && (( fsize > maxsize )); }; then
		if (( optverbose == 1 )); then
			echo "Skipping $fsize-byte file $f"
		fi
		continue
	fi

	if (( optverbose == 1 )); then
		echo "Searching for copies of: $f"
	fi
	curfile="$f"

	# search dfolders: only files of exactly the same size can be copies
	findfiles -H "${dfolder[@]}" "${findopts[@]}" "${depthopts[@]}" -type f -size "${fsize}c"
	dlist=("${found[@]}")
	for df in "${dlist[@]}"; do
		checkdupe "$df"  # may remove $curfile
		if [ "$curfile" = "" ]; then
			# curfile was removed; carry on with the surviving copy
			curfile="$df"
			if (( optverbose == 1 )); then
				echo "XSearching for copies of: $curfile"
			fi
		fi
	done

	# search rfolders (always recursive)
	if (( ${#rfolder[@]} > 0 )); then
		findfiles -H "${rfolder[@]}" "${findopts[@]}" -type f -size "${fsize}c"
		rlist=("${found[@]}")
		for rf in "${rlist[@]}"; do
			checkdupe "$rf" "ref"  # may remove $curfile
			if [ "$curfile" = "" ]; then
				break
			fi
		done
	fi
done

# summary
if (( optquiet != 1 )); then
	if (( removecount == 0 )); then
		echo "No duplicates were found"
		exit 0
	elif (( removecount == 1 )); then
		msg="duplicate was"
	else
		msg="duplicates were"
	fi
	if [ "$trash" != "" ]; then
		rmmsg="trashed"
	else
		rmmsg="removed"
	fi
	(( totalsize = totalsize / 1024 / 1024 ))
	echo
	if (( optsim == 1 )); then
		echo "$removecount $msg sim-$rmmsg (approx $totalsize MB)"
	else
		echo "$removecount $msg $rmmsg (approx $totalsize MB)"
	fi
fi

exit 0
479
480 # CHANGELOG
481 # 1.0.4:        fixed cannot remove filenames with special characters #2 #3
482 # 1.0.3:        corrected for stat %Z .000000000 bug
483