|  | @@ -0,0 +1,548 @@
 | 
	
		
			
				|  |  | +#!/usr/bin/env python3
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +# Copyright (c) 2016, Antonio SJ Musumeci <trapexit@spawn.link>
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +# Permission to use, copy, modify, and/or distribute this software for any
 | 
	
		
			
				|  |  | +# purpose with or without fee is hereby granted, provided that the above
 | 
	
		
			
				|  |  | +# copyright notice and this permission notice appear in all copies.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 | 
	
		
			
				|  |  | +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 | 
	
		
			
				|  |  | +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 | 
	
		
			
				|  |  | +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 | 
	
		
			
				|  |  | +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 | 
	
		
			
				|  |  | +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 | 
	
		
			
				|  |  | +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +import argparse
 | 
	
		
			
				|  |  | +import ctypes
 | 
	
		
			
				|  |  | +import errno
 | 
	
		
			
				|  |  | +import fnmatch
 | 
	
		
			
				|  |  | +import hashlib
 | 
	
		
			
				|  |  | +import io
 | 
	
		
			
				|  |  | +import os
 | 
	
		
			
				|  |  | +import random
 | 
	
		
			
				|  |  | +import shlex
 | 
	
		
			
				|  |  | +import sys
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +_libc = ctypes.CDLL("libc.so.6",use_errno=True)
 | 
	
		
			
				|  |  | +_lgetxattr = _libc.lgetxattr
 | 
	
		
			
				|  |  | +_lgetxattr.argtypes = [ctypes.c_char_p,ctypes.c_char_p,ctypes.c_void_p,ctypes.c_size_t]
 | 
	
		
			
				|  |  | +def lgetxattr(path,name):
 | 
	
		
			
				|  |  | +    if type(path) == str:
 | 
	
		
			
				|  |  | +        path = path.encode(errors='backslashreplace')
 | 
	
		
			
				|  |  | +    if type(name) == str:
 | 
	
		
			
				|  |  | +        name = name.encode(errors='backslashreplace')
 | 
	
		
			
				|  |  | +    length = 64
 | 
	
		
			
				|  |  | +    while True:
 | 
	
		
			
				|  |  | +        buf = ctypes.create_string_buffer(length)
 | 
	
		
			
				|  |  | +        res = _lgetxattr(path,name,buf,ctypes.c_size_t(length))
 | 
	
		
			
				|  |  | +        if res >= 0:
 | 
	
		
			
				|  |  | +            return buf.raw[0:res]
 | 
	
		
			
				|  |  | +        else:
 | 
	
		
			
				|  |  | +            err = ctypes.get_errno()
 | 
	
		
			
				|  |  | +            if err == errno.ERANGE:
 | 
	
		
			
				|  |  | +                length *= 2
 | 
	
		
			
				|  |  | +            elif err == errno.ENODATA:
 | 
	
		
			
				|  |  | +                return None
 | 
	
		
			
				|  |  | +            else:
 | 
	
		
			
				|  |  | +                raise IOError(err,os.strerror(err),path)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def ismergerfs(path):
 | 
	
		
			
				|  |  | +    try:
 | 
	
		
			
				|  |  | +        lgetxattr(path,b'user.mergerfs.fullpath')
 | 
	
		
			
				|  |  | +        return True
 | 
	
		
			
				|  |  | +    except IOError as e:
 | 
	
		
			
				|  |  | +        return False
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def hash_file(filepath, hasher=None, blocksize=65536):
 | 
	
		
			
				|  |  | +    if not hasher:
 | 
	
		
			
				|  |  | +        hasher = hashlib.md5()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    with open(filepath,'rb') as afile:
 | 
	
		
			
				|  |  | +        buf = afile.read(blocksize)
 | 
	
		
			
				|  |  | +        while buf:
 | 
	
		
			
				|  |  | +            hasher.update(buf)
 | 
	
		
			
				|  |  | +            buf = afile.read(blocksize)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    return hasher.hexdigest()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def short_hash_file(filepath, hasher=None, blocksize=65536, blocks=16):
 | 
	
		
			
				|  |  | +    if not hasher:
 | 
	
		
			
				|  |  | +        hasher = hashlib.md5()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    with open(filepath,'rb') as f:
 | 
	
		
			
				|  |  | +        size = os.fstat(f.fileno()).st_size
 | 
	
		
			
				|  |  | +        if size <= blocksize:
 | 
	
		
			
				|  |  | +            size = 1
 | 
	
		
			
				|  |  | +            blocks = 1
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        random.seed(size,version=2)
 | 
	
		
			
				|  |  | +        for _ in range(blocks):
 | 
	
		
			
				|  |  | +            offset = random.randrange(size)
 | 
	
		
			
				|  |  | +            f.seek(offset)
 | 
	
		
			
				|  |  | +            buf = f.read(blocksize)
 | 
	
		
			
				|  |  | +            if buf:
 | 
	
		
			
				|  |  | +                hasher.update(buf)
 | 
	
		
			
				|  |  | +            else:
 | 
	
		
			
				|  |  | +                break
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    return hasher.hexdigest()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def sizeof_fmt(num):
 | 
	
		
			
				|  |  | +    for unit in ['','K','M','G','T','P','E','Z']:
 | 
	
		
			
				|  |  | +        if abs(num) < 1024.0:
 | 
	
		
			
				|  |  | +            return "%3.1f%sB" % (num,unit)
 | 
	
		
			
				|  |  | +        num /= 1024.0
 | 
	
		
			
				|  |  | +    return "%.1f%sB" % (num,'Y')
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def stat_files(paths):
 | 
	
		
			
				|  |  | +    rv = []
 | 
	
		
			
				|  |  | +    for path in paths:
 | 
	
		
			
				|  |  | +        try:
 | 
	
		
			
				|  |  | +            st = os.stat(path)
 | 
	
		
			
				|  |  | +            rv.append((path,st))
 | 
	
		
			
				|  |  | +        except:
 | 
	
		
			
				|  |  | +            pass
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    return rv
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def remove(files,execute,verbose):
 | 
	
		
			
				|  |  | +    for (path,stat) in files:
 | 
	
		
			
				|  |  | +        try:
 | 
	
		
			
				|  |  | +            print('rm -vf',shlex.quote(path))
 | 
	
		
			
				|  |  | +            if execute:
 | 
	
		
			
				|  |  | +                os.remove(path)
 | 
	
		
			
				|  |  | +        except Exception as e:
 | 
	
		
			
				|  |  | +            print("%s" % e)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def print_stats(stats):
 | 
	
		
			
				|  |  | +    for i in range(0,len(stats)):
 | 
	
		
			
				|  |  | +        print("#  %i: %s" % (i+1,stats[i][0]))
 | 
	
		
			
				|  |  | +        data = ("#   - uid: {0:5}; gid: {1:5}; mode: {2:6o}; "
 | 
	
		
			
				|  |  | +                "size: {3}; mtime: {4}").format(
 | 
	
		
			
				|  |  | +            stats[i][1].st_uid,
 | 
	
		
			
				|  |  | +            stats[i][1].st_gid,
 | 
	
		
			
				|  |  | +            stats[i][1].st_mode,
 | 
	
		
			
				|  |  | +            sizeof_fmt(stats[i][1].st_size),
 | 
	
		
			
				|  |  | +            stats[i][1].st_mtime)
 | 
	
		
			
				|  |  | +        print(data)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def total_size(stats):
 | 
	
		
			
				|  |  | +    total = 0
 | 
	
		
			
				|  |  | +    for (name,stat) in stats:
 | 
	
		
			
				|  |  | +        total = total + stat.st_size
 | 
	
		
			
				|  |  | +    return total
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def manual_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    done = False
 | 
	
		
			
				|  |  | +    while not done:
 | 
	
		
			
				|  |  | +        value = input("# Which to keep? ('s' to skip):")
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if value.lower() == 's':
 | 
	
		
			
				|  |  | +            stats.clear()
 | 
	
		
			
				|  |  | +            done = True
 | 
	
		
			
				|  |  | +            continue
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        try:
 | 
	
		
			
				|  |  | +            value = int(value) - 1
 | 
	
		
			
				|  |  | +            if value < 0 or value >= len(stats):
 | 
	
		
			
				|  |  | +                raise ValueError
 | 
	
		
			
				|  |  | +            stats.remove(stats[value])
 | 
	
		
			
				|  |  | +            done = True
 | 
	
		
			
				|  |  | +        except NameError:
 | 
	
		
			
				|  |  | +            print("Input error: enter a value [1-{0}] or skip by entering 's'".format(len(stats)))
 | 
	
		
			
				|  |  | +        except ValueError:
 | 
	
		
			
				|  |  | +            print("Input error: enter a value [1-{0}] or skip by entering 's'".format(len(stats)))
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def mtime_all(stats):
 | 
	
		
			
				|  |  | +    mtime = stats[0][1].st_mtime
 | 
	
		
			
				|  |  | +    return all(x[1].st_mtime == mtime for x in stats)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def mtime_any(mtime,stats):
 | 
	
		
			
				|  |  | +    return any([st.st_mtime == mtime for (path,st) in stats])
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def size_all(stats):
 | 
	
		
			
				|  |  | +    size = stats[0][1].st_size
 | 
	
		
			
				|  |  | +    return all(x[1].st_size == size for x in stats)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def size_any(size,stats):
 | 
	
		
			
				|  |  | +    return any([st.st_size == size for (path,st) in stats])
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def md5sums_all(stats):
 | 
	
		
			
				|  |  | +    if size_all(stats):
 | 
	
		
			
				|  |  | +        hashval = hash_file(stats[0][0])
 | 
	
		
			
				|  |  | +        return all(hash_file(path) == hashval for (path,st) in stats[1:])
 | 
	
		
			
				|  |  | +    return False
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def short_md5sums_all(stats):
 | 
	
		
			
				|  |  | +    if size_all(stats):
 | 
	
		
			
				|  |  | +        hashval = short_hash_file(stats[0][0])
 | 
	
		
			
				|  |  | +        return all(short_hash_file(path) == hashval for (path,st) in stats[1:])
 | 
	
		
			
				|  |  | +    return False
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def oldest_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    if size_all(stats) and mtime_all(stats):
 | 
	
		
			
				|  |  | +        drive_with_most_space_dedup(fullpath,stats)
 | 
	
		
			
				|  |  | +        return
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    stats.sort(key=lambda st: st[1].st_mtime)
 | 
	
		
			
				|  |  | +    oldest = stats[0]
 | 
	
		
			
				|  |  | +    stats.remove(oldest)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def strict_oldest_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    stats.sort(key=lambda st: st[1].st_mtime,reverse=False)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    oldest = stats[0]
 | 
	
		
			
				|  |  | +    stats.remove(oldest)
 | 
	
		
			
				|  |  | +    if mtime_any(oldest[1].st_mtime,stats):
 | 
	
		
			
				|  |  | +        stats.clear()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def newest_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    if size_all(stats) and mtime_all(stats):
 | 
	
		
			
				|  |  | +        drive_with_most_space_dedup(fullpath,stats)
 | 
	
		
			
				|  |  | +        return
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    stats.sort(key=lambda st: st[1].st_mtime,reverse=True)
 | 
	
		
			
				|  |  | +    newest = stats[0]
 | 
	
		
			
				|  |  | +    stats.remove(newest)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def strict_newest_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    stats.sort(key=lambda st: st[1].st_mtime,reverse=True)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    newest = stats[0]
 | 
	
		
			
				|  |  | +    stats.remove(newest)
 | 
	
		
			
				|  |  | +    if mtime_any(newest[1].st_mtime,stats):
 | 
	
		
			
				|  |  | +        stats.clear()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def largest_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    if size_all(stats) and mtime_all(stats):
 | 
	
		
			
				|  |  | +        drive_with_most_space_dedup(fullpath,stats)
 | 
	
		
			
				|  |  | +        return
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    stats.sort(key=lambda st: st[1].st_size,reverse=True)
 | 
	
		
			
				|  |  | +    largest = stats[0]
 | 
	
		
			
				|  |  | +    stats.remove(largest)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def strict_largest_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    stats.sort(key=lambda st: st[1].st_size,reverse=True)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    largest = stats[0]
 | 
	
		
			
				|  |  | +    stats.remove(largest)
 | 
	
		
			
				|  |  | +    if size_any(largest[1].st_size,stats):
 | 
	
		
			
				|  |  | +        stats.clear()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def smallest_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    if size_all(stats) and mtime_all(stats):
 | 
	
		
			
				|  |  | +        drive_with_most_space_dedup(fullpath,stats)
 | 
	
		
			
				|  |  | +        return
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    stats.sort(key=lambda st: st[1].st_size)
 | 
	
		
			
				|  |  | +    smallest = stats[0]
 | 
	
		
			
				|  |  | +    stats.remove(smallest)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def strict_smallest_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    stats.sort(key=lambda st: st[1].st_size,reverse=False)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    smallest = stats[0]
 | 
	
		
			
				|  |  | +    stats.remove(smallest)
 | 
	
		
			
				|  |  | +    if size_any(smallest[1].st_size,stats):
 | 
	
		
			
				|  |  | +        stats.clear()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def calc_space_free(stat):
 | 
	
		
			
				|  |  | +    st = os.statvfs(stat[0])
 | 
	
		
			
				|  |  | +    return st.f_frsize * st.f_bfree
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def drive_with_most_space_dedup(fullpath,stats):
 | 
	
		
			
				|  |  | +    stats.sort(key=calc_space_free,reverse=True)
 | 
	
		
			
				|  |  | +    largest = stats[0]
 | 
	
		
			
				|  |  | +    stats.remove(largest)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def mergerfs_getattr_dedup(origpath,stats):
 | 
	
		
			
				|  |  | +    fullpath = getxattr(origpath,b'user.mergerfs.fullpath')
 | 
	
		
			
				|  |  | +    for (path,stat) in stats:
 | 
	
		
			
				|  |  | +        if path != fullpath:
 | 
	
		
			
				|  |  | +            continue
 | 
	
		
			
				|  |  | +        stats.remove((path,stat))
 | 
	
		
			
				|  |  | +        break
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def get_dedupfun(name,strict):
 | 
	
		
			
				|  |  | +    if strict:
 | 
	
		
			
				|  |  | +        name = 'strict-' + name
 | 
	
		
			
				|  |  | +    funs = {
 | 
	
		
			
				|  |  | +        'manual': manual_dedup,
 | 
	
		
			
				|  |  | +        'strict-manual': manual_dedup,
 | 
	
		
			
				|  |  | +        'mostfreespace': drive_with_most_space_dedup,
 | 
	
		
			
				|  |  | +        'strict-mostfreespace': drive_with_most_space_dedup,
 | 
	
		
			
				|  |  | +        'newest': newest_dedup,
 | 
	
		
			
				|  |  | +        'strict-newest': strict_newest_dedup,
 | 
	
		
			
				|  |  | +        'oldest': oldest_dedup,
 | 
	
		
			
				|  |  | +        'strict-oldest': strict_oldest_dedup,
 | 
	
		
			
				|  |  | +        'largest': largest_dedup,
 | 
	
		
			
				|  |  | +        'strict-largest': strict_largest_dedup,
 | 
	
		
			
				|  |  | +        'smallest': smallest_dedup,
 | 
	
		
			
				|  |  | +        'strict-smallest': strict_smallest_dedup,
 | 
	
		
			
				|  |  | +        'mergerfs': mergerfs_getattr_dedup,
 | 
	
		
			
				|  |  | +        'strict-mergerfs': mergerfs_getattr_dedup
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +    return funs[name]
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def get_ignorefun(name):
 | 
	
		
			
				|  |  | +    funs = {
 | 
	
		
			
				|  |  | +        None: lambda x: None,
 | 
	
		
			
				|  |  | +        'same-time': mtime_all,
 | 
	
		
			
				|  |  | +        'diff-time': lambda x: not mtime_all(x),
 | 
	
		
			
				|  |  | +        'same-size': size_all,
 | 
	
		
			
				|  |  | +        'diff-size': lambda x: not size_all(x),
 | 
	
		
			
				|  |  | +        'same-hash': md5sums_all,
 | 
	
		
			
				|  |  | +        'diff-hash': lambda x: not md5sums_all(x),
 | 
	
		
			
				|  |  | +        'same-short-hash': short_md5sums_all,
 | 
	
		
			
				|  |  | +        'diff-short-hash': lambda x: not short_md5sums_all(x)
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    return funs[name]
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def getxattr(path,key):
 | 
	
		
			
				|  |  | +    try:
 | 
	
		
			
				|  |  | +        attr = lgetxattr(path,key)
 | 
	
		
			
				|  |  | +        if attr:
 | 
	
		
			
				|  |  | +            return attr.decode('utf-8')
 | 
	
		
			
				|  |  | +        return ''
 | 
	
		
			
				|  |  | +    except IOError as e:
 | 
	
		
			
				|  |  | +        if e.errno == errno.ENODATA:
 | 
	
		
			
				|  |  | +            return ''
 | 
	
		
			
				|  |  | +        raise
 | 
	
		
			
				|  |  | +    except UnicodeDecodeError as e:
 | 
	
		
			
				|  |  | +        print(e)
 | 
	
		
			
				|  |  | +        print(attr)
 | 
	
		
			
				|  |  | +    return ''
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def match(filename,matches):
 | 
	
		
			
				|  |  | +    for match in matches:
 | 
	
		
			
				|  |  | +        if fnmatch.fnmatch(filename,match):
 | 
	
		
			
				|  |  | +            return True
 | 
	
		
			
				|  |  | +    return False
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def dedup(fullpath,verbose,ignorefun,execute,dedupfun):
 | 
	
		
			
				|  |  | +    paths = getxattr(fullpath,b'user.mergerfs.allpaths').split('\0')
 | 
	
		
			
				|  |  | +    if len(paths) <= 1:
 | 
	
		
			
				|  |  | +        return 0
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    stats = stat_files(paths)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    if ignorefun(stats):
 | 
	
		
			
				|  |  | +        if verbose >= 2:
 | 
	
		
			
				|  |  | +            print('# ignored:',fullpath)
 | 
	
		
			
				|  |  | +        return 0
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    if (dedupfun == manual_dedup):
 | 
	
		
			
				|  |  | +        print('#',fullpath)
 | 
	
		
			
				|  |  | +        print_stats(stats)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    try:
 | 
	
		
			
				|  |  | +        dedupfun(fullpath,stats)
 | 
	
		
			
				|  |  | +        if not stats:
 | 
	
		
			
				|  |  | +            if verbose >= 2:
 | 
	
		
			
				|  |  | +                print('# skipped:',fullpath)
 | 
	
		
			
				|  |  | +            return 0
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (dedupfun != manual_dedup):
 | 
	
		
			
				|  |  | +            if verbose >= 2:
 | 
	
		
			
				|  |  | +                print('#',fullpath)
 | 
	
		
			
				|  |  | +            if verbose >= 3:
 | 
	
		
			
				|  |  | +                print_stats(stats)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        for (path,stat) in stats:
 | 
	
		
			
				|  |  | +            try:
 | 
	
		
			
				|  |  | +                if verbose:
 | 
	
		
			
				|  |  | +                    print('rm -vf',shlex.quote(path))
 | 
	
		
			
				|  |  | +                if execute:
 | 
	
		
			
				|  |  | +                    os.remove(path)
 | 
	
		
			
				|  |  | +            except Exception as e:
 | 
	
		
			
				|  |  | +                print('#',e)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        return total_size(stats)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    except Exception as e:
 | 
	
		
			
				|  |  | +        print(e)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    return 0
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def print_help():
 | 
	
		
			
				|  |  | +    help = \
 | 
	
		
			
				|  |  | +'''
 | 
	
		
			
				|  |  | +usage: mergerfs.dedup [<options>] <dir>
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +Remove duplicate files across branches of a mergerfs pool. Provides
 | 
	
		
			
				|  |  | +multiple algos for determining which file to keep and what to skip.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +positional arguments:
 | 
	
		
			
				|  |  | +  dir                    Starting directory
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +optional arguments:
 | 
	
		
			
				|  |  | +  -v, --verbose          Once to print `rm` commands
 | 
	
		
			
				|  |  | +                         Twice for status info
 | 
	
		
			
				|  |  | +                         Three for file info
 | 
	
		
			
				|  |  | +  -i, --ignore=          Ignore files if... (default: none)
 | 
	
		
			
				|  |  | +                         * same-size       : have the same size
 | 
	
		
			
				|  |  | +                         * diff-size       : have different sizes
 | 
	
		
			
				|  |  | +                         * same-time       : have the same mtime
 | 
	
		
			
				|  |  | +                         * diff-time       : have different mtimes
 | 
	
		
			
				|  |  | +                         * same-hash       : have the same md5sum
 | 
	
		
			
				|  |  | +                         * diff-hash       : have different md5sums
 | 
	
		
			
				|  |  | +                         * same-short-hash : have the same short md5sums
 | 
	
		
			
				|  |  | +                         * diff-short-hash : have different short md5sums
 | 
	
		
			
				|  |  | +                         'hash' is expensive. 'short-hash' far less
 | 
	
		
			
				|  |  | +                         expensive, not as safe, but pretty good.
 | 
	
		
			
				|  |  | +  -d, --dedup=           What file to *keep* (default: mergerfs)
 | 
	
		
			
				|  |  | +                         * manual        : ask user
 | 
	
		
			
				|  |  | +                         * oldest        : file with smallest mtime
 | 
	
		
			
				|  |  | +                         * newest        : file with largest mtime
 | 
	
		
			
				|  |  | +                         * largest       : file with largest size
 | 
	
		
			
				|  |  | +                         * smallest      : file with smallest size
 | 
	
		
			
				|  |  | +                         * mostfreespace : file on drive with most free space
 | 
	
		
			
				|  |  | +                         * mergerfs      : file selected by the mergerfs
 | 
	
		
			
				|  |  | +                                           getattr policy
 | 
	
		
			
				|  |  | +  -s, --strict           Skip dedup if all files have same (mtime,size) value.
 | 
	
		
			
				|  |  | +                         Only applies to oldest, newest, largest, smallest.
 | 
	
		
			
				|  |  | +  -e, --execute          Will not perform file removal without this.
 | 
	
		
			
				|  |  | +  -I, --include=         fnmatch compatible filter to include files.
 | 
	
		
			
				|  |  | +                         Can be used multiple times.
 | 
	
		
			
				|  |  | +  -E, --exclude=         fnmatch compatible filter to exclude files.
 | 
	
		
			
				|  |  | +                         Can be used multiple times.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +'''
 | 
	
		
			
				|  |  | +    print(help)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def buildargparser():
 | 
	
		
			
				|  |  | +    desc = 'dedup files across branches in a mergerfs pool'
 | 
	
		
			
				|  |  | +    usage = 'mergerfs.dedup [<options>] <dir>'
 | 
	
		
			
				|  |  | +    parser = argparse.ArgumentParser(add_help=False)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    parser.add_argument('dir',
 | 
	
		
			
				|  |  | +                        type=str,
 | 
	
		
			
				|  |  | +                        nargs='?',
 | 
	
		
			
				|  |  | +                        default=None,
 | 
	
		
			
				|  |  | +                        help='starting directory')
 | 
	
		
			
				|  |  | +    parser.add_argument('-v','--verbose',
 | 
	
		
			
				|  |  | +                        action='count',
 | 
	
		
			
				|  |  | +                        default=0)
 | 
	
		
			
				|  |  | +    parser.add_argument('-i','--ignore',
 | 
	
		
			
				|  |  | +                        choices=['same-size','diff-size',
 | 
	
		
			
				|  |  | +                                 'same-time','diff-time',
 | 
	
		
			
				|  |  | +                                 'same-hash','diff-hash',
 | 
	
		
			
				|  |  | +                                 'same-short-hash',
 | 
	
		
			
				|  |  | +                                 'diff-short-hash'])
 | 
	
		
			
				|  |  | +    parser.add_argument('-d','--dedup',
 | 
	
		
			
				|  |  | +                        choices=['manual',
 | 
	
		
			
				|  |  | +                                 'oldest','newest',
 | 
	
		
			
				|  |  | +                                 'smallest','largest',
 | 
	
		
			
				|  |  | +                                 'mostfreespace',
 | 
	
		
			
				|  |  | +                                 'mergerfs'],
 | 
	
		
			
				|  |  | +                        default='mergerfs')
 | 
	
		
			
				|  |  | +    parser.add_argument('-s','--strict',
 | 
	
		
			
				|  |  | +                        action='store_true')
 | 
	
		
			
				|  |  | +    parser.add_argument('-e','--execute',
 | 
	
		
			
				|  |  | +                        action='store_true')
 | 
	
		
			
				|  |  | +    parser.add_argument('-I','--include',
 | 
	
		
			
				|  |  | +                        type=str,
 | 
	
		
			
				|  |  | +                        action='append',
 | 
	
		
			
				|  |  | +                        default=[])
 | 
	
		
			
				|  |  | +    parser.add_argument('-E','--exclude',
 | 
	
		
			
				|  |  | +                        type=str,
 | 
	
		
			
				|  |  | +                        action='append',
 | 
	
		
			
				|  |  | +                        default=[])
 | 
	
		
			
				|  |  | +    parser.add_argument('-h','--help',
 | 
	
		
			
				|  |  | +                        action='store_true')
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    return parser
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +def main():
 | 
	
		
			
				|  |  | +    sys.stdout = io.TextIOWrapper(sys.stdout.buffer,
 | 
	
		
			
				|  |  | +                                  encoding='utf8',
 | 
	
		
			
				|  |  | +                                  errors='backslashreplace',
 | 
	
		
			
				|  |  | +                                  line_buffering=True)
 | 
	
		
			
				|  |  | +    sys.stderr = io.TextIOWrapper(sys.stderr.buffer,
 | 
	
		
			
				|  |  | +                                  encoding='utf8',
 | 
	
		
			
				|  |  | +                                  errors='backslashreplace',
 | 
	
		
			
				|  |  | +                                  line_buffering=True)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    parser = buildargparser()
 | 
	
		
			
				|  |  | +    args   = parser.parse_args()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    if args.help or not args.dir:
 | 
	
		
			
				|  |  | +        print_help()
 | 
	
		
			
				|  |  | +        sys.exit(0)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    args.dir = os.path.realpath(args.dir)
 | 
	
		
			
				|  |  | +    if not ismergerfs(args.dir):
 | 
	
		
			
				|  |  | +        print("%s is not a mergerfs directory" % args.dir)
 | 
	
		
			
				|  |  | +        sys.exit(1)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    dedupfun  = get_dedupfun(args.dedup,args.strict)
 | 
	
		
			
				|  |  | +    ignorefun = get_ignorefun(args.ignore)
 | 
	
		
			
				|  |  | +    verbose   = args.verbose
 | 
	
		
			
				|  |  | +    execute   = args.execute
 | 
	
		
			
				|  |  | +    includes  = ['*'] if not args.include else args.include
 | 
	
		
			
				|  |  | +    excludes  = args.exclude
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    total_size = 0
 | 
	
		
			
				|  |  | +    try:
 | 
	
		
			
				|  |  | +        for (dirname,dirnames,filenames) in os.walk(args.dir):
 | 
	
		
			
				|  |  | +            for filename in filenames:
 | 
	
		
			
				|  |  | +                if match(filename,excludes):
 | 
	
		
			
				|  |  | +                    continue
 | 
	
		
			
				|  |  | +                if not match(filename,includes):
 | 
	
		
			
				|  |  | +                    continue
 | 
	
		
			
				|  |  | +                fullpath    = os.path.join(dirname,filename)
 | 
	
		
			
				|  |  | +                total_size += dedup(fullpath,verbose,ignorefun,execute,dedupfun)
 | 
	
		
			
				|  |  | +    except KeyboardInterrupt:
 | 
	
		
			
				|  |  | +        print("# exiting: CTRL-C pressed")
 | 
	
		
			
				|  |  | +    except IOError as e:
 | 
	
		
			
				|  |  | +        if e.errno == errno.EPIPE:
 | 
	
		
			
				|  |  | +            pass
 | 
	
		
			
				|  |  | +        else:
 | 
	
		
			
				|  |  | +            raise
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    print('# Total savings:',sizeof_fmt(total_size))
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    sys.exit(0)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +if __name__ == "__main__":
 | 
	
		
			
				|  |  | +    main()
 |