123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548 |
- #!/usr/bin/env python3
- # Copyright (c) 2016, Antonio SJ Musumeci <trapexit@spawn.link>
- # Permission to use, copy, modify, and/or distribute this software for any
- # purpose with or without fee is hereby granted, provided that the above
- # copyright notice and this permission notice appear in all copies.
- # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- import argparse
- import ctypes
- import errno
- import fnmatch
- import hashlib
- import io
- import os
- import random
- import shlex
- import sys
# Bind lgetxattr(2) directly from glibc so extended attributes can be read
# without following symlinks.  use_errno=True makes ctypes keep a private
# copy of errno that ctypes.get_errno() can read after a failed call.
_libc = ctypes.CDLL("libc.so.6", use_errno=True)
_lgetxattr = _libc.lgetxattr
_lgetxattr.argtypes = [ctypes.c_char_p, ctypes.c_char_p,
                       ctypes.c_void_p, ctypes.c_size_t]
# lgetxattr(2) returns ssize_t; without an explicit restype ctypes would
# treat the result as a C int.
_lgetxattr.restype = ctypes.c_ssize_t


def lgetxattr(path, name):
    """Return the raw bytes of extended attribute *name* on *path*.

    Args:
        path: File path as ``str`` or ``bytes``.
        name: Attribute name as ``str`` or ``bytes``.

    Returns:
        The attribute value as ``bytes``, or ``None`` when the attribute
        does not exist (``ENODATA``).

    Raises:
        IOError: For any other errno reported by the C call; the exception
            carries the errno, its message and the (bytes) path.
    """
    if isinstance(path, str):
        # os.fsencode round-trips undecodable filename bytes (surrogate
        # escapes produced by os.walk & co.) back to the bytes on disk,
        # whereas 'backslashreplace' would invent a different file name.
        path = os.fsencode(path)
    if isinstance(name, str):
        name = name.encode(errors='backslashreplace')

    length = 64
    while True:
        buf = ctypes.create_string_buffer(length)
        res = _lgetxattr(path, name, buf, ctypes.c_size_t(length))
        if res >= 0:
            return buf.raw[:res]

        err = ctypes.get_errno()
        if err == errno.ERANGE:
            # Buffer too small for the value: retry with twice the room.
            length *= 2
        elif err == errno.ENODATA:
            return None
        else:
            raise IOError(err, os.strerror(err), path)
def ismergerfs(path):
    """Return True when *path* exposes the ``user.mergerfs.fullpath`` xattr.

    Any IOError raised while reading the attribute is treated as "not
    mergerfs" and yields False.
    """
    try:
        # lgetxattr() signals a missing attribute (ENODATA) by returning
        # None rather than raising, so the value itself has to be checked;
        # otherwise every xattr-capable filesystem would look like mergerfs.
        return lgetxattr(path, b'user.mergerfs.fullpath') is not None
    except IOError:
        return False
def hash_file(filepath, hasher=None, blocksize=65536):
    """Return the hex digest of the entire contents of *filepath*.

    Args:
        filepath: Path of the file to read (opened in binary mode).
        hasher: A hashlib-style object; a fresh MD5 is used when omitted.
        blocksize: Number of bytes requested per read.
    """
    digest = hasher if hasher else hashlib.md5()
    with open(filepath, 'rb') as stream:
        # iter() with a sentinel keeps reading until read() returns b''.
        for chunk in iter(lambda: stream.read(blocksize), b''):
            digest.update(chunk)
    return digest.hexdigest()
def short_hash_file(filepath, hasher=None, blocksize=65536, blocks=16):
    """Return a cheap, sampled hex digest of *filepath*.

    Up to *blocks* chunks of *blocksize* bytes are read at pseudo-random
    offsets and fed to the hasher.  The offsets are derived solely from the
    file size, so two files of equal size are sampled at the same places.
    Files no larger than one block are hashed from offset 0 in one read.

    Args:
        filepath: Path of the file to sample (opened in binary mode).
        hasher: A hashlib-style object; a fresh MD5 is used when omitted.
        blocksize: Number of bytes requested per sample.
        blocks: Maximum number of samples taken.
    """
    if hasher is None:
        hasher = hashlib.md5()

    with open(filepath, 'rb') as f:
        size = os.fstat(f.fileno()).st_size
        if size <= blocksize:
            # Whole file fits in one read: force the single offset 0.
            size = 1
            blocks = 1

        # A private generator seeded with the size yields the same offsets
        # as seeding the module-level RNG would, without clobbering the
        # global random state for the rest of the process.
        rng = random.Random(size)
        for _ in range(blocks):
            f.seek(rng.randrange(size))
            buf = f.read(blocksize)
            if not buf:
                break
            hasher.update(buf)

    return hasher.hexdigest()
def sizeof_fmt(num):
    """Format a byte count as a short string using 1024-based prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 and
    is rendered with one decimal place, e.g. ``1536`` -> ``"1.5KB"``.
    Anything beyond the 'Z' prefix is reported in 'Y'.
    """
    prefixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z')
    value = num
    for prefix in prefixes:
        if abs(value) < 1024.0:
            return "%3.1f%sB" % (value, prefix)
        value = value / 1024.0
    return "%.1f%sB" % (value, 'Y')
def stat_files(paths):
    """Return ``(path, os.stat_result)`` pairs for every path that can be stat'ed.

    Paths for which ``os.stat`` fails are silently left out; the order of
    the remaining entries follows *paths*.
    """
    rv = []
    for path in paths:
        try:
            rv.append((path, os.stat(path)))
        except (OSError, ValueError):
            # Best effort: unreadable/missing entries (OSError) and
            # malformed names such as embedded NULs (ValueError) are
            # skipped.  A bare except here would also have swallowed
            # KeyboardInterrupt and SystemExit.
            continue
    return rv
def remove(files, execute, verbose):
    """Print an ``rm`` command for each entry and optionally delete it.

    Args:
        files: Iterable of ``(path, stat)`` pairs; only the path is used.
        execute: When true the file is actually removed with ``os.remove``.
        verbose: Accepted but not consulted by this function.

    Any exception raised while handling one entry is printed and the loop
    carries on with the next entry.
    """
    for (target, _ignored) in files:
        try:
            print('rm -vf', shlex.quote(target))
            if not execute:
                continue
            os.remove(target)
        except Exception as err:
            print("%s" % err)
def print_stats(stats):
    """Print a numbered, two-line summary for every ``(path, stat)`` entry.

    The first line shows the 1-based index and the path; the second shows
    uid, gid, mode (octal), human-readable size and mtime.
    """
    template = ("# - uid: {0:5}; gid: {1:5}; mode: {2:6o}; "
                "size: {3}; mtime: {4}")
    for number, entry in enumerate(stats, 1):
        path = entry[0]
        st = entry[1]
        print("# %i: %s" % (number, path))
        line = template.format(st.st_uid,
                               st.st_gid,
                               st.st_mode,
                               sizeof_fmt(st.st_size),
                               st.st_mtime)
        print(line)
def total_size(stats):
    """Return the sum of ``st_size`` over all ``(name, stat)`` entries."""
    return sum(st.st_size for (_name, st) in stats)
def manual_dedup(fullpath, stats):
    """Ask the user which entry of *stats* to keep.

    The chosen entry is removed from *stats* in place, leaving the entries
    that were not chosen.  Answering ``s`` (any case) empties *stats*
    instead.  The prompt repeats until a valid answer is given.

    Args:
        fullpath: Unused here; present so all dedup functions share one
            signature.
        stats: List of ``(path, stat)`` entries, modified in place.
    """
    while True:
        value = input("# Which to keep? ('s' to skip):").strip()
        if value.lower() == 's':
            stats.clear()
            return

        try:
            index = int(value) - 1
            if not 0 <= index < len(stats):
                raise ValueError
        except ValueError:
            # Covers both non-numeric input and out-of-range numbers.
            print("Input error: enter a value [1-{0}] or skip by entering 's'".format(len(stats)))
            continue

        del stats[index]
        return
def mtime_all(stats):
    """Return True when every ``(path, stat)`` entry has the same ``st_mtime``.

    An empty list is vacuously uniform and yields True instead of raising
    IndexError.
    """
    if not stats:
        return True
    reference = stats[0][1].st_mtime
    return all(entry[1].st_mtime == reference for entry in stats)
def mtime_any(mtime, stats):
    """Return True when at least one ``(path, stat)`` entry has ``st_mtime == mtime``."""
    for (_path, st) in stats:
        if st.st_mtime == mtime:
            return True
    return False
def size_all(stats):
    """Return True when every ``(path, stat)`` entry has the same ``st_size``.

    An empty list is vacuously uniform and yields True instead of raising
    IndexError.
    """
    if not stats:
        return True
    reference = stats[0][1].st_size
    return all(entry[1].st_size == reference for entry in stats)
def size_any(size, stats):
    """Return True when at least one ``(path, stat)`` entry has ``st_size == size``."""
    for (_path, st) in stats:
        if st.st_size == size:
            return True
    return False
def md5sums_all(stats):
    """Return True when all entries have equal size and equal full-file hash.

    Hashing is only attempted when the sizes already agree; the first
    entry's hash is the reference the others are compared against.
    """
    if not size_all(stats):
        return False

    reference = hash_file(stats[0][0])
    for (path, _st) in stats[1:]:
        if hash_file(path) != reference:
            return False
    return True
def short_md5sums_all(stats):
    """Return True when all entries have equal size and equal sampled hash.

    Hashing is only attempted when the sizes already agree; the first
    entry's short hash is the reference the others are compared against.
    """
    if not size_all(stats):
        return False

    reference = short_hash_file(stats[0][0])
    for (path, _st) in stats[1:]:
        if short_hash_file(path) != reference:
            return False
    return True
def oldest_dedup(fullpath, stats):
    """Drop the entry with the smallest ``st_mtime`` from *stats* (in place).

    When all entries share both size and mtime, the choice is delegated to
    ``drive_with_most_space_dedup`` instead.
    """
    indistinguishable = size_all(stats) and mtime_all(stats)
    if indistinguishable:
        drive_with_most_space_dedup(fullpath, stats)
    else:
        stats.sort(key=lambda entry: entry[1].st_mtime)
        # After the ascending sort the oldest entry sits at the front.
        del stats[0]
def strict_oldest_dedup(fullpath, stats):
    """Drop the entry with the smallest ``st_mtime``; clear *stats* on a tie.

    If any remaining entry has the same mtime as the dropped one, *stats*
    is emptied.
    """
    stats.sort(key=lambda entry: entry[1].st_mtime, reverse=False)
    kept = stats.pop(0)
    tie = mtime_any(kept[1].st_mtime, stats)
    if tie:
        stats.clear()
def newest_dedup(fullpath, stats):
    """Drop the entry with the largest ``st_mtime`` from *stats* (in place).

    When all entries share both size and mtime, the choice is delegated to
    ``drive_with_most_space_dedup`` instead.
    """
    indistinguishable = size_all(stats) and mtime_all(stats)
    if indistinguishable:
        drive_with_most_space_dedup(fullpath, stats)
    else:
        stats.sort(key=lambda entry: entry[1].st_mtime, reverse=True)
        # After the descending sort the newest entry sits at the front.
        del stats[0]
def strict_newest_dedup(fullpath, stats):
    """Drop the entry with the largest ``st_mtime``; clear *stats* on a tie.

    If any remaining entry has the same mtime as the dropped one, *stats*
    is emptied.
    """
    stats.sort(key=lambda entry: entry[1].st_mtime, reverse=True)
    kept = stats.pop(0)
    tie = mtime_any(kept[1].st_mtime, stats)
    if tie:
        stats.clear()
def largest_dedup(fullpath, stats):
    """Drop the entry with the largest ``st_size`` from *stats* (in place).

    When all entries share both size and mtime, the choice is delegated to
    ``drive_with_most_space_dedup`` instead.
    """
    indistinguishable = size_all(stats) and mtime_all(stats)
    if indistinguishable:
        drive_with_most_space_dedup(fullpath, stats)
    else:
        stats.sort(key=lambda entry: entry[1].st_size, reverse=True)
        # After the descending sort the largest entry sits at the front.
        del stats[0]
def strict_largest_dedup(fullpath, stats):
    """Drop the entry with the largest ``st_size``; clear *stats* on a tie.

    If any remaining entry has the same size as the dropped one, *stats*
    is emptied.
    """
    stats.sort(key=lambda entry: entry[1].st_size, reverse=True)
    kept = stats.pop(0)
    tie = size_any(kept[1].st_size, stats)
    if tie:
        stats.clear()
def smallest_dedup(fullpath, stats):
    """Drop the entry with the smallest ``st_size`` from *stats* (in place).

    When all entries share both size and mtime, the choice is delegated to
    ``drive_with_most_space_dedup`` instead.
    """
    indistinguishable = size_all(stats) and mtime_all(stats)
    if indistinguishable:
        drive_with_most_space_dedup(fullpath, stats)
    else:
        stats.sort(key=lambda entry: entry[1].st_size)
        # After the ascending sort the smallest entry sits at the front.
        del stats[0]
def strict_smallest_dedup(fullpath, stats):
    """Drop the entry with the smallest ``st_size``; clear *stats* on a tie.

    If any remaining entry has the same size as the dropped one, *stats*
    is emptied.
    """
    stats.sort(key=lambda entry: entry[1].st_size, reverse=False)
    kept = stats.pop(0)
    tie = size_any(kept[1].st_size, stats)
    if tie:
        stats.clear()
def calc_space_free(stat):
    """Return ``f_frsize * f_bfree`` for the filesystem holding ``stat[0]``.

    *stat* is a ``(path, ...)`` entry; only the path is used, and it is
    passed to ``os.statvfs``.
    """
    path = stat[0]
    vfs = os.statvfs(path)
    return vfs.f_frsize * vfs.f_bfree
def drive_with_most_space_dedup(fullpath, stats):
    """Drop the entry whose filesystem reports the most free space.

    *stats* is sorted in place by ``calc_space_free`` (descending) and the
    first entry is then removed.
    """
    stats.sort(key=calc_space_free, reverse=True)
    del stats[0]
def mergerfs_getattr_dedup(origpath, stats):
    """Drop from *stats* the entry whose path equals the ``user.mergerfs.fullpath`` xattr.

    Args:
        origpath: Path whose ``user.mergerfs.fullpath`` attribute is read.
        stats: List of ``(path, stat)`` entries, modified in place.

    If no entry matches the attribute value (for example when it is empty
    or could not be decoded), *stats* is cleared, as the strict policies
    and the manual skip do.  Leaving the list untouched would mean that
    no copy at all is excluded from it.
    """
    fullpath = getxattr(origpath, b'user.mergerfs.fullpath')
    for index, (path, _stat) in enumerate(stats):
        if path == fullpath:
            del stats[index]
            return
    # The copy to keep could not be identified among the candidates.
    stats.clear()
def get_dedupfun(name, strict):
    """Look up the dedup function registered under *name*.

    When *strict* is true the lookup key is prefixed with ``'strict-'``.
    Raises KeyError for an unknown key.
    """
    key = ('strict-' + name) if strict else name
    dispatch = {
        'manual':               manual_dedup,
        'strict-manual':        manual_dedup,
        'mostfreespace':        drive_with_most_space_dedup,
        'strict-mostfreespace': drive_with_most_space_dedup,
        'newest':               newest_dedup,
        'strict-newest':        strict_newest_dedup,
        'oldest':               oldest_dedup,
        'strict-oldest':        strict_oldest_dedup,
        'largest':              largest_dedup,
        'strict-largest':       strict_largest_dedup,
        'smallest':             smallest_dedup,
        'strict-smallest':      strict_smallest_dedup,
        'mergerfs':             mergerfs_getattr_dedup,
        'strict-mergerfs':      mergerfs_getattr_dedup,
    }
    return dispatch[key]
def get_ignorefun(name):
    """Return the predicate registered under *name*.

    Each predicate takes the list of ``(path, stat)`` entries.  ``None``
    maps to a predicate that always returns None; every ``diff-*`` entry
    is the logical negation of its ``same-*`` counterpart.  Raises
    KeyError for an unknown name.
    """
    def negate(check):
        # Wrap *check* so the result is inverted.
        return lambda stats: not check(stats)

    predicates = {
        None:              lambda stats: None,
        'same-time':       mtime_all,
        'diff-time':       negate(mtime_all),
        'same-size':       size_all,
        'diff-size':       negate(size_all),
        'same-hash':       md5sums_all,
        'diff-hash':       negate(md5sums_all),
        'same-short-hash': short_md5sums_all,
        'diff-short-hash': negate(short_md5sums_all),
    }
    return predicates[name]
def getxattr(path, key):
    """Return extended attribute *key* of *path* decoded as UTF-8 text.

    Returns an empty string when the attribute is missing or empty, when
    the lookup fails with ENODATA, or when the value is not valid UTF-8
    (in which case the error and the raw value are printed).  Any other
    IOError is re-raised.
    """
    try:
        raw = lgetxattr(path, key)
        return raw.decode('utf-8') if raw else ''
    except IOError as err:
        if err.errno != errno.ENODATA:
            raise
        return ''
    except UnicodeDecodeError as err:
        print(err)
        print(raw)
        return ''
def match(filename, matches):
    """Return True when *filename* matches at least one fnmatch pattern in *matches*."""
    return any(fnmatch.fnmatch(filename, pattern) for pattern in matches)
def dedup(fullpath, verbose, ignorefun, execute, dedupfun):
    """Deduplicate one pool file across its branches.

    Args:
        fullpath: Path of the file inside the mergerfs mount.
        verbose: Verbosity level (1: print ``rm`` commands, 2: status
            lines, 3: per-file stat details).
        ignorefun: Predicate over the ``(path, stat)`` list; a true result
            means the file is left alone.
        execute: When false nothing is deleted (dry run).
        dedupfun: Policy that removes the entry to *keep* from the list, or
            empties the list to skip the file.

    Returns:
        Number of bytes accounted as saved: the sizes of the copies that
        were removed, or that would be removed in a dry run.  Copies whose
        removal failed are not counted.
    """
    paths = getxattr(fullpath, b'user.mergerfs.allpaths').split('\0')
    if len(paths) <= 1:
        return 0

    stats = stat_files(paths)
    if len(stats) <= 1:
        # stat_files() drops copies it cannot stat.  With fewer than two
        # visible copies there is nothing to compare, and a policy could
        # otherwise end up deleting the only copy that is still visible.
        if verbose >= 2:
            print('# skipped:', fullpath)
        return 0

    if ignorefun(stats):
        if verbose >= 2:
            print('# ignored:', fullpath)
        return 0

    if dedupfun == manual_dedup:
        # The user needs to see the candidates before being prompted.
        print('#', fullpath)
        print_stats(stats)

    try:
        dedupfun(fullpath, stats)
        if not stats:
            if verbose >= 2:
                print('# skipped:', fullpath)
            return 0

        if dedupfun != manual_dedup:
            if verbose >= 2:
                print('#', fullpath)
            if verbose >= 3:
                print_stats(stats)

        # Whatever the policy left in the list is to be removed.
        freed = 0
        for (path, stat) in stats:
            if verbose:
                print('rm -vf', shlex.quote(path))
            if execute:
                try:
                    os.remove(path)
                except OSError as e:
                    print('#', e)
                    continue
            freed += stat.st_size
        return freed
    except Exception as e:
        # Per-file boundary: report the problem and move on to the next file.
        print(e)

    return 0
def print_help():
    """Print the hand-written usage text for the command line tool."""
    # Named help_text so the builtin help() is not shadowed.
    help_text = '''
usage: mergerfs.dedup [<options>] <dir>

Remove duplicate files across branches of a mergerfs pool. Provides
multiple algos for determining which file to keep and what to skip.

positional arguments:
  dir                    Starting directory

optional arguments:
  -v, --verbose          Once to print `rm` commands
                         Twice for status info
                         Three for file info
  -i, --ignore=          Ignore files if... (default: none)
                         * same-size       : have the same size
                         * diff-size       : have different sizes
                         * same-time       : have the same mtime
                         * diff-time       : have different mtimes
                         * same-hash       : have the same md5sum
                         * diff-hash       : have different md5sums
                         * same-short-hash : have the same short md5sums
                         * diff-short-hash : have different short md5sums
                         'hash' is expensive. 'short-hash' far less
                         expensive, not as safe, but pretty good.
  -d, --dedup=           What file to *keep* (default: mergerfs)
                         * manual        : ask user
                         * oldest        : file with smallest mtime
                         * newest        : file with largest mtime
                         * largest       : file with largest size
                         * smallest      : file with smallest size
                         * mostfreespace : file on drive with most free space
                         * mergerfs      : file selected by the mergerfs
                                           getattr policy
  -s, --strict           Skip dedup if all files have same (mtime,size) value.
                         Only applies to oldest, newest, largest, smallest.
  -e, --execute          Will not perform file removal without this.
  -I, --include=         fnmatch compatible filter to include files.
                         Can be used multiple times.
  -E, --exclude=         fnmatch compatible filter to exclude files.
                         Can be used multiple times.
'''
    print(help_text)
def buildargparser():
    """Build the command line parser.

    Automatic ``-h`` handling is disabled (``add_help=False``); ``-h`` /
    ``--help`` is registered as an ordinary flag stored in ``args.help``.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    desc = 'dedup files across branches in a mergerfs pool'
    usage = 'mergerfs.dedup [<options>] <dir>'
    # desc/usage were previously built but never handed to the parser;
    # passing them makes argparse's own error output show this usage line.
    parser = argparse.ArgumentParser(description=desc,
                                     usage=usage,
                                     add_help=False)
    parser.add_argument('dir',
                        type=str,
                        nargs='?',
                        default=None,
                        help='starting directory')
    parser.add_argument('-v', '--verbose',
                        action='count',
                        default=0)
    parser.add_argument('-i', '--ignore',
                        choices=['same-size', 'diff-size',
                                 'same-time', 'diff-time',
                                 'same-hash', 'diff-hash',
                                 'same-short-hash',
                                 'diff-short-hash'])
    parser.add_argument('-d', '--dedup',
                        choices=['manual',
                                 'oldest', 'newest',
                                 'smallest', 'largest',
                                 'mostfreespace',
                                 'mergerfs'],
                        default='mergerfs')
    parser.add_argument('-s', '--strict',
                        action='store_true')
    parser.add_argument('-e', '--execute',
                        action='store_true')
    parser.add_argument('-I', '--include',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-E', '--exclude',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-h', '--help',
                        action='store_true')
    return parser
def main():
    """Command line entry point: walk the given directory and dedup each file.

    Exits with status 0 after printing help or finishing the walk, and
    with status 1 when the target directory is not on mergerfs.
    """
    # Re-wrap the standard streams so names that cannot be encoded are
    # printed with backslash escapes instead of raising, and so every line
    # is flushed as soon as it is written.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer,
                                  encoding='utf8',
                                  errors='backslashreplace',
                                  line_buffering=True)
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer,
                                  encoding='utf8',
                                  errors='backslashreplace',
                                  line_buffering=True)

    parser = buildargparser()
    args = parser.parse_args()

    if args.help or not args.dir:
        print_help()
        sys.exit(0)

    args.dir = os.path.realpath(args.dir)
    if not ismergerfs(args.dir):
        print("%s is not a mergerfs directory" % args.dir)
        sys.exit(1)

    dedupfun = get_dedupfun(args.dedup, args.strict)
    ignorefun = get_ignorefun(args.ignore)
    verbose = args.verbose
    execute = args.execute
    includes = args.include if args.include else ['*']
    excludes = args.exclude

    # Named 'saved' so the module-level total_size() is not shadowed.
    saved = 0
    try:
        for (dirname, _dirnames, filenames) in os.walk(args.dir):
            for filename in filenames:
                if match(filename, excludes):
                    continue
                if not match(filename, includes):
                    continue
                fullpath = os.path.join(dirname, filename)
                saved += dedup(fullpath, verbose, ignorefun, execute, dedupfun)
    except KeyboardInterrupt:
        print("# exiting: CTRL-C pressed")
    except IOError as e:
        if e.errno != errno.EPIPE:
            raise
        # The reader of our output went away.  Any further write, including
        # the summary line and the implicit flush at interpreter exit,
        # would raise BrokenPipeError again, so point stdout at /dev/null
        # and leave quietly.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(0)

    print('# Total savings:', sizeof_fmt(saved))
    sys.exit(0)


if __name__ == "__main__":
    main()
|