#!/usr/bin/python3
"""Find and delete duplicated pictures below the current directory.

Pictures are grouped by base name (file name without directory and
extension).  Groups whose names look like copies of each other
("IMG_1234567", "IMG_1234567 (1)", "IMG_1234567-2", ...) are merged, and every
pair inside a merged group is compared with ImageMagick's ``compare``.  When
two pictures are reported as equal, the second one of the pair is deleted.

Keys that were processed completely are stored in ``./.ddp.csv`` so that an
interrupted run can be resumed without repeating the comparisons.
"""
import subprocess
import itertools
import os
import sys
import threading
import time

GET_ALL_PICS_LIST_STRING = "find . -iname '*.jpg' -o -iname '*.png' -o -iname '*.jpeg'"
# The two shell templates below are no longer executed (file names must not be
# interpolated into a shell command line); they are kept so that code which
# refers to these module-level names keeps working.
COMPARE_PICS_CMD = "compare -metric AE -fuzz 0.1% \"{}\" \"{}\" /dev/null"
IDENTIFY_PIC_CMD = "identify \"{}\""

# Same invocation as COMPARE_PICS_CMD, as an argv prefix that needs no shell.
COMPARE_PICS_ARGS = ("compare", "-metric", "AE", "-fuzz", "0.1%")
# One processed key per line.
DONE_FILE = './.ddp.csv'
# Shorter base names are never used as the root of a squashed group.
MIN_KEY_LENGTH = 7
# Characters that may directly follow a base name in the name of a "copy".
SQUASH_MARKERS = ('-', ' ', '(')

PICS_MAP_SQUSHED = {}
PICS_MAP_SQUSHED_KEYS = []
THREADS_CTX = {
    "keys": [],       # keys still waiting to be processed (shared work queue)
    "estimated": 0,   # number of keys not yet started, for progress output
    "num": 2,         # number of worker threads
    "done": [],       # keys that were processed completely
}


def _list_pictures():
    """Return the paths printed by the ``find`` command, one per picture."""
    # stderr is intentionally not captured: diagnostics from ``find`` must not
    # be mistaken for picture paths.
    result = subprocess.run(['sh', '-c', GET_ALL_PICS_LIST_STRING],
                            stdout=subprocess.PIPE, check=False)
    output = result.stdout.decode('utf-8')
    # Drop empty entries (the output ends with a newline).
    return [line for line in output.split('\n') if line]


def _load_done_keys():
    """Append the keys stored in DONE_FILE to THREADS_CTX["done"].

    Returns the number of keys that were read.  A missing file is reported
    and treated as an empty exclude list.
    """
    count = 0
    try:
        with open(DONE_FILE, 'r', encoding='utf-8') as done_file:
            for line in done_file:
                line = line.strip('\n')
                if not line:
                    continue
                THREADS_CTX["done"].append(line)
                count += 1
    except FileNotFoundError as err:
        print(err)
    return count


def _save_done_keys():
    """Write every key in THREADS_CTX["done"] to DONE_FILE, one per line."""
    with open(DONE_FILE, 'w', encoding='utf-8') as done_file:
        done_file.writelines(key + "\n" for key in THREADS_CTX["done"])


def _group_by_basename(all_pics):
    """Map each base name (no directory, no extension) to its picture paths."""
    pics_map = {}
    for pic in all_pics:
        # Everything before the last '.' is the path without its extension.
        key = os.path.basename(pic.rpartition('.')[0])
        pics_map.setdefault(key, []).append(pic)
    return pics_map


def _squash_groups(pics_map):
    """Merge groups whose key is another key followed by a squash marker.

    Keys are visited from shortest to longest.  A key of at least
    MIN_KEY_LENGTH characters collects every later key that starts with it and
    continues with one of SQUASH_MARKERS; a collected key is not visited
    again.  Only keys that collected at least one other key are returned.

    NOTE(review): a key that holds several pictures but has no such sibling
    is dropped, so equally named files in different directories are never
    compared.  Left unchanged because it widens what gets deleted - confirm
    whether that is intended.
    """
    ordered_keys = sorted(pics_map, key=len)
    consumed = set()
    squashed = {}
    for index, key in enumerate(ordered_keys):
        if key in consumed or len(key) < MIN_KEY_LENGTH:
            continue
        prefixes = tuple(key + marker for marker in SQUASH_MARKERS)
        siblings = [other for other in ordered_keys[index + 1:]
                    if other not in consumed and other.startswith(prefixes)]
        if not siblings:
            continue
        merged = list(pics_map[key])
        for sibling in siblings:
            merged += pics_map[sibling]
        consumed.update(siblings)
        squashed[key] = merged
    return squashed


def main():
    """Collect pictures, build the duplicate candidates and run the workers."""
    all_pics = _list_pictures()
    print(f"Gathered {len(all_pics)} photos.")

    count = _load_done_keys()
    print(f"Exclude list for {count} keys")

    # I. basenames
    pics_map = _group_by_basename(all_pics)
    print(f"Hashtable for {len(pics_map)} keys.")

    # II. sort + III. squash
    pics_map_squashed = _squash_groups(pics_map)
    print(f"Squashed hashtable to {len(pics_map_squashed)} keys.")

    PICS_MAP_SQUSHED.update(pics_map_squashed)
    already_done = set(THREADS_CTX["done"])
    THREADS_CTX["keys"] = [key for key in PICS_MAP_SQUSHED
                           if key not in already_done]
    THREADS_CTX["estimated"] = len(THREADS_CTX["keys"])

    threads = [
        threading.Thread(target=thread_function, args=(f"thread_{i}",))
        for i in range(THREADS_CTX["num"])
    ]
    for thread in threads:
        thread.start()

    try:
        # Poll instead of joining right away so that Ctrl-C is noticed.  Wait
        # on the workers rather than on the queue: if every worker stopped
        # early, the queue would never drain.
        while any(thread.is_alive() for thread in threads):
            time.sleep(.1)
    except KeyboardInterrupt:
        print("Wait for threads...")
        # Workers finish the key they are working on and then find no more.
        THREADS_CTX["keys"] = []

    for thread in threads:
        thread.join()

    _save_done_keys()


def _process_key(name, key):
    """Compare all pictures stored under *key* and delete the duplicates.

    Raises whatever the lookup, the ``compare`` invocation or the deletion
    raises; the caller decides what happens to the key in that case.
    """
    pics = PICS_MAP_SQUSHED[key]
    if THREADS_CTX["estimated"] % 10 == 0:
        print(f"Estimated: {THREADS_CTX['estimated']}")
    print(f"{name}: Do {len(pics)} photos: '{pics}'")
    THREADS_CTX["estimated"] -= 1

    delete_list = []
    scheduled = set()  # same content as delete_list, for fast membership tests
    compars_count = 0
    for pic_a, pic_b in itertools.combinations(pics, 2):
        if pic_a in scheduled or pic_b in scheduled:
            continue
        if not os.path.exists(pic_a):
            print(f"pic_a doesnt exist: '{pic_a}'")
            continue
        if not os.path.exists(pic_b):
            print(f"pic_b doesnt exist: '{pic_b}'")
            continue
        compars_count += 1
        # The paths are passed as separate arguments, not through a shell, so
        # quotes or '$' in a file name cannot alter the command.
        result = subprocess.run([*COMPARE_PICS_ARGS, pic_a, pic_b, "/dev/null"],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                check=False)
        if result.returncode >= 2:
            print(f"HERE!!! stdout: {result.stdout}\nstderr: {result.stderr}\nreturn code: {result.returncode}")
            continue
        # Exit status 0 means the pictures are similar within the fuzz
        # factor; status 1 means they differ.
        if result.returncode == 0:
            delete_list.append(pic_b)
            scheduled.add(pic_b)

    print(f"{name}: {compars_count} cmps")
    if not delete_list:
        return
    print(f"Delete: {delete_list}")
    for delete_elem in delete_list:
        os.remove(delete_elem)


def thread_function(name):
    """Worker loop: take keys from THREADS_CTX["keys"] until none are left.

    A key is appended to THREADS_CTX["done"] only after it was processed
    without an error.  A key that fails is reported and left out of the done
    list so that a later run retries it; the worker then continues with the
    next key.

    name -- label used as prefix for this worker's output.
    """
    while True:
        try:
            key = THREADS_CTX["keys"].pop(0)
        except IndexError:
            # Queue drained, or cleared by main() after Ctrl-C.
            return
        try:
            _process_key(name, key)
        except Exception as err:
            print(f"{name}: {err}")
            continue
        THREADS_CTX["done"].append(key)


if __name__ == '__main__':
    main()