#!/usr/bin/env python3

# Copyright (c) 2016, Antonio SJ Musumeci <trapexit@spawn.link>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
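# mergerfs.dedup: remove duplicate files across the branches of a mergerfs
# pool.  Duplicates are discovered via the user.mergerfs.allpaths xattr,
# which lists every branch path backing a given file in the pool.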
import argparse
import ctypes
import errno
import fnmatch
import hashlib
import io
import os
import random
import shlex
import sys


_libc = ctypes.CDLL("libc.so.6",use_errno=True)
_lgetxattr = _libc.lgetxattr
_lgetxattr.argtypes = [ctypes.c_char_p,ctypes.c_char_p,ctypes.c_void_p,ctypes.c_size_t]
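# Thin wrapper over libc's lgetxattr(2): retries with a doubled buffer on
# ERANGE, returns None when the attribute does not exist (ENODATA), and
# raises IOError for any other failure.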
def lgetxattr(path,name):
    if type(path) == str:
        path = path.encode(errors='backslashreplace')
    if type(name) == str:
        name = name.encode(errors='backslashreplace')
    length = 64
    while True:
        buf = ctypes.create_string_buffer(length)
        res = _lgetxattr(path,name,buf,ctypes.c_size_t(length))
        if res >= 0:
            return buf.raw[0:res]
        else:
            err = ctypes.get_errno()
            if err == errno.ERANGE:
                length *= 2
            elif err == errno.ENODATA:
                return None
            else:
                raise IOError(err,os.strerror(err),path)


def ismergerfs(path):
    try:
        lgetxattr(path,b'user.mergerfs.fullpath')
        return True
    except IOError as e:
        return False

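# Hashing helpers: hash_file() digests the entire file (md5 by default);
# short_hash_file() samples up to `blocks` random chunks, seeding the RNG
# with the file size so files of equal size are sampled at the same offsets.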
def hash_file(filepath, hasher=None, blocksize=65536):
    if not hasher:
        hasher = hashlib.md5()
    with open(filepath,'rb') as afile:
        buf = afile.read(blocksize)
        while buf:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()


def short_hash_file(filepath, hasher=None, blocksize=65536, blocks=16):
    if not hasher:
        hasher = hashlib.md5()
    with open(filepath,'rb') as f:
        size = os.fstat(f.fileno()).st_size
        if size <= blocksize:
            size = 1
            blocks = 1
        random.seed(size,version=2)
        for _ in range(blocks):
            offset = random.randrange(size)
            f.seek(offset)
            buf = f.read(blocksize)
            if buf:
                hasher.update(buf)
            else:
                break
    return hasher.hexdigest()


def sizeof_fmt(num):
    for unit in ['','K','M','G','T','P','E','Z']:
        if abs(num) < 1024.0:
            return "%3.1f%sB" % (num,unit)
        num /= 1024.0
    return "%.1f%sB" % (num,'Y')


def stat_files(paths):
    rv = []
    for path in paths:
        try:
            st = os.stat(path)
            rv.append((path,st))
        except OSError:
            pass
    return rv

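# Standalone removal helper; the same print-then-optionally-remove logic is
# inlined in dedup() below and this function is not called elsewhere in the
# script.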
def remove(files,execute,verbose):
    for (path,stat) in files:
        try:
            print('rm -vf',shlex.quote(path))
            if execute:
                os.remove(path)
        except Exception as e:
            print("%s" % e)


def print_stats(stats):
    for i in range(0,len(stats)):
        print("# %i: %s" % (i+1,stats[i][0]))
        data = ("# - uid: {0:5}; gid: {1:5}; mode: {2:6o}; "
                "size: {3}; mtime: {4}").format(
                    stats[i][1].st_uid,
                    stats[i][1].st_gid,
                    stats[i][1].st_mode,
                    sizeof_fmt(stats[i][1].st_size),
                    stats[i][1].st_mtime)
        print(data)


def total_size(stats):
    total = 0
    for (name,stat) in stats:
        total = total + stat.st_size
    return total

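# Dedup policy functions.  Each receives the list of (path,stat) tuples for
# every copy of a file and removes the single entry that should be KEPT;
# whatever remains in the list is deleted by the caller.  Clearing the list
# entirely means "skip this file".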
def manual_dedup(fullpath,stats):
    done = False
    while not done:
        value = input("# Which to keep? ('s' to skip):")
        if value.lower() == 's':
            stats.clear()
            done = True
            continue
        try:
            value = int(value) - 1
            if value < 0 or value >= len(stats):
                raise ValueError
            stats.remove(stats[value])
            done = True
        except (NameError,ValueError):
            print("Input error: enter a value [1-{0}] or skip by entering 's'".format(len(stats)))


def mtime_all(stats):
    mtime = stats[0][1].st_mtime
    return all(x[1].st_mtime == mtime for x in stats)


def mtime_any(mtime,stats):
    return any([st.st_mtime == mtime for (path,st) in stats])


def size_all(stats):
    size = stats[0][1].st_size
    return all(x[1].st_size == size for x in stats)


def size_any(size,stats):
    return any([st.st_size == size for (path,st) in stats])


def md5sums_all(stats):
    if size_all(stats):
        hashval = hash_file(stats[0][0])
        return all(hash_file(path) == hashval for (path,st) in stats[1:])
    return False


def short_md5sums_all(stats):
    if size_all(stats):
        hashval = short_hash_file(stats[0][0])
        return all(short_hash_file(path) == hashval for (path,st) in stats[1:])
    return False

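# Non-strict policies fall back to keeping the copy on the drive with the
# most free space when every copy shares the same size and mtime.  The
# strict variants instead clear the list (skip the file) whenever the chosen
# copy ties with another copy on the sort key.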
def oldest_dedup(fullpath,stats):
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath,stats)
        return
    stats.sort(key=lambda st: st[1].st_mtime)
    oldest = stats[0]
    stats.remove(oldest)


def strict_oldest_dedup(fullpath,stats):
    stats.sort(key=lambda st: st[1].st_mtime,reverse=False)
    oldest = stats[0]
    stats.remove(oldest)
    if mtime_any(oldest[1].st_mtime,stats):
        stats.clear()


def newest_dedup(fullpath,stats):
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath,stats)
        return
    stats.sort(key=lambda st: st[1].st_mtime,reverse=True)
    newest = stats[0]
    stats.remove(newest)


def strict_newest_dedup(fullpath,stats):
    stats.sort(key=lambda st: st[1].st_mtime,reverse=True)
    newest = stats[0]
    stats.remove(newest)
    if mtime_any(newest[1].st_mtime,stats):
        stats.clear()


def largest_dedup(fullpath,stats):
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath,stats)
        return
    stats.sort(key=lambda st: st[1].st_size,reverse=True)
    largest = stats[0]
    stats.remove(largest)


def strict_largest_dedup(fullpath,stats):
    stats.sort(key=lambda st: st[1].st_size,reverse=True)
    largest = stats[0]
    stats.remove(largest)
    if size_any(largest[1].st_size,stats):
        stats.clear()


def smallest_dedup(fullpath,stats):
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath,stats)
        return
    stats.sort(key=lambda st: st[1].st_size)
    smallest = stats[0]
    stats.remove(smallest)


def strict_smallest_dedup(fullpath,stats):
    stats.sort(key=lambda st: st[1].st_size,reverse=False)
    smallest = stats[0]
    stats.remove(smallest)
    if size_any(smallest[1].st_size,stats):
        stats.clear()


def calc_space_free(stat):
    st = os.statvfs(stat[0])
    return st.f_frsize * st.f_bfree


def drive_with_most_space_dedup(fullpath,stats):
    stats.sort(key=calc_space_free,reverse=True)
    largest = stats[0]
    stats.remove(largest)


def mergerfs_getattr_dedup(origpath,stats):
    fullpath = getxattr(origpath,b'user.mergerfs.fullpath')
    for (path,stat) in stats:
        if path != fullpath:
            continue
        stats.remove((path,stat))
        break

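# Map the --dedup choice (and --strict flag) to a policy function.  Strict
# variants only change behavior for oldest/newest/largest/smallest; manual,
# mostfreespace, and mergerfs behave the same either way.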
def get_dedupfun(name,strict):
    if strict:
        name = 'strict-' + name
    funs = {
        'manual': manual_dedup,
        'strict-manual': manual_dedup,
        'mostfreespace': drive_with_most_space_dedup,
        'strict-mostfreespace': drive_with_most_space_dedup,
        'newest': newest_dedup,
        'strict-newest': strict_newest_dedup,
        'oldest': oldest_dedup,
        'strict-oldest': strict_oldest_dedup,
        'largest': largest_dedup,
        'strict-largest': strict_largest_dedup,
        'smallest': smallest_dedup,
        'strict-smallest': strict_smallest_dedup,
        'mergerfs': mergerfs_getattr_dedup,
        'strict-mergerfs': mergerfs_getattr_dedup
    }
    return funs[name]


def get_ignorefun(name):
    funs = {
        None: lambda x: None,
        'same-time': mtime_all,
        'diff-time': lambda x: not mtime_all(x),
        'same-size': size_all,
        'diff-size': lambda x: not size_all(x),
        'same-hash': md5sums_all,
        'diff-hash': lambda x: not md5sums_all(x),
        'same-short-hash': short_md5sums_all,
        'diff-short-hash': lambda x: not short_md5sums_all(x)
    }
    return funs[name]

def getxattr(path,key):
    try:
        attr = lgetxattr(path,key)
        if attr:
            return attr.decode('utf-8')
        return ''
    except IOError as e:
        if e.errno == errno.ENODATA:
            return ''
        raise
    except UnicodeDecodeError as e:
        print(e)
        print(attr)
        return ''

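# Filename filtering for --include/--exclude: returns True if the filename
# matches any of the given fnmatch patterns.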
def match(filename,matches):
    for pattern in matches:
        if fnmatch.fnmatch(filename,pattern):
            return True
    return False

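# Core per-file routine: read the NUL-separated user.mergerfs.allpaths xattr
# to find every branch copy, stat them, apply the --ignore filter and then
# the chosen dedup policy, and remove every copy left in the list (removal
# only happens with --execute; the equivalent rm commands are printed when
# verbose).  Returns the number of bytes the removals would reclaim.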
def dedup(fullpath,verbose,ignorefun,execute,dedupfun):
    paths = getxattr(fullpath,b'user.mergerfs.allpaths').split('\0')
    if len(paths) <= 1:
        return 0

    stats = stat_files(paths)
    if ignorefun(stats):
        if verbose >= 2:
            print('# ignored:',fullpath)
        return 0

    if (dedupfun == manual_dedup):
        print('#',fullpath)
        print_stats(stats)

    try:
        dedupfun(fullpath,stats)
        if not stats:
            if verbose >= 2:
                print('# skipped:',fullpath)
            return 0

        if (dedupfun != manual_dedup):
            if verbose >= 2:
                print('#',fullpath)
            if verbose >= 3:
                print_stats(stats)

        for (path,stat) in stats:
            try:
                if verbose:
                    print('rm -vf',shlex.quote(path))
                if execute:
                    os.remove(path)
            except Exception as e:
                print('#',e)

        return total_size(stats)
    except Exception as e:
        print(e)

    return 0

def print_help():
    help = \
'''
usage: mergerfs.dedup [<options>] <dir>

Remove duplicate files across branches of a mergerfs pool. Provides
multiple algorithms for determining which file to keep and what to skip.

positional arguments:
  dir                    Starting directory

optional arguments:
  -v, --verbose          Once to print `rm` commands
                         Twice for status info
                         Three times for file info
  -i, --ignore=          Ignore files if... (default: none)
                         * same-size : have the same size
                         * diff-size : have different sizes
                         * same-time : have the same mtime
                         * diff-time : have different mtimes
                         * same-hash : have the same md5sum
                         * diff-hash : have different md5sums
                         * same-short-hash : have the same short md5sum
                         * diff-short-hash : have different short md5sums
                         'hash' is expensive. 'short-hash' is far less
                         expensive and, while not as safe, still pretty good.
  -d, --dedup=           Which file to *keep* (default: mergerfs)
                         * manual : ask user
                         * oldest : file with smallest mtime
                         * newest : file with largest mtime
                         * largest : file with largest size
                         * smallest : file with smallest size
                         * mostfreespace : file on drive with most free space
                         * mergerfs : file selected by the mergerfs
                           getattr policy
  -s, --strict           Skip dedup when the file chosen to keep ties with
                         another copy (same mtime or size).
                         Only applies to oldest, newest, largest, smallest.
  -e, --execute          Will not perform file removal without this.
  -I, --include=         fnmatch compatible filter to include files.
                         Can be used multiple times.
  -E, --exclude=         fnmatch compatible filter to exclude files.
                         Can be used multiple times.
'''
    print(help)

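# Example invocations (nothing is removed unless -e is given; the /mnt/pool
# mountpoint below is only illustrative):
#
#   mergerfs.dedup -v /mnt/pool
#       print the rm commands for duplicates, keeping the copy selected by
#       the mergerfs getattr policy
#
#   mergerfs.dedup -v -e -d newest -i diff-size /mnt/pool
#       actually delete, keeping the newest copy and skipping any file whose
#       copies differ in size
#
#   mergerfs.dedup -v -e -d mergerfs -i diff-hash -E '*.tmp' /mnt/pool
#       only dedup files whose copies have identical md5sums, excluding
#       files matching *.tmp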
def buildargparser():
    desc = 'dedup files across branches in a mergerfs pool'
    usage = 'mergerfs.dedup [<options>] <dir>'
    parser = argparse.ArgumentParser(usage=usage,
                                     description=desc,
                                     add_help=False)
    parser.add_argument('dir',
                        type=str,
                        nargs='?',
                        default=None,
                        help='starting directory')
    parser.add_argument('-v','--verbose',
                        action='count',
                        default=0)
    parser.add_argument('-i','--ignore',
                        choices=['same-size','diff-size',
                                 'same-time','diff-time',
                                 'same-hash','diff-hash',
                                 'same-short-hash',
                                 'diff-short-hash'])
    parser.add_argument('-d','--dedup',
                        choices=['manual',
                                 'oldest','newest',
                                 'smallest','largest',
                                 'mostfreespace',
                                 'mergerfs'],
                        default='mergerfs')
    parser.add_argument('-s','--strict',
                        action='store_true')
    parser.add_argument('-e','--execute',
                        action='store_true')
    parser.add_argument('-I','--include',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-E','--exclude',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-h','--help',
                        action='store_true')
    return parser

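# Entry point: rewrap stdout/stderr so non-UTF-8 filenames print with
# backslash escapes instead of raising, verify the target is a mergerfs
# mount by querying the user.mergerfs.fullpath xattr, then walk the tree
# applying the include/exclude filters and dedup each matching file.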
def main():
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer,
                                  encoding='utf8',
                                  errors='backslashreplace',
                                  line_buffering=True)
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer,
                                  encoding='utf8',
                                  errors='backslashreplace',
                                  line_buffering=True)

    parser = buildargparser()
    args = parser.parse_args()

    if args.help or not args.dir:
        print_help()
        sys.exit(0)

    args.dir = os.path.realpath(args.dir)
    if not ismergerfs(args.dir):
        print("%s is not a mergerfs directory" % args.dir)
        sys.exit(1)

    dedupfun = get_dedupfun(args.dedup,args.strict)
    ignorefun = get_ignorefun(args.ignore)
    verbose = args.verbose
    execute = args.execute
    includes = ['*'] if not args.include else args.include
    excludes = args.exclude

    total_size = 0
    try:
        for (dirname,dirnames,filenames) in os.walk(args.dir):
            for filename in filenames:
                if match(filename,excludes):
                    continue
                if not match(filename,includes):
                    continue
                fullpath = os.path.join(dirname,filename)
                total_size += dedup(fullpath,verbose,ignorefun,execute,dedupfun)
    except KeyboardInterrupt:
        print("# exiting: CTRL-C pressed")
    except IOError as e:
        if e.errno == errno.EPIPE:
            pass
        else:
            raise

    print('# Total savings:',sizeof_fmt(total_size))

    sys.exit(0)


if __name__ == "__main__":
    main()