#!/usr/bin/env python3
# -*- coding: utf-8 -*-

""" Author: Hendrik Schutter, localhorst@mosad.xyz
    Date of creation: 2023/02/22
    Date of last modification: 2023/02/22

    Recursively scan a directory for media files (.mp4, .mkv, .m4v) and print
    every .mp4 file for which a .mkv file with the same base name exists in
    the scanned root directory.
"""

# Provenance: find_duplicates.py as introduced by commit
# b8de81a302b624865005e3c33f6c50a2944be714 ("add script to find duplicates",
# localhorst, Wed, 22 Feb 2023 16:48:57 +0100).

import os
import sys
import time
import subprocess
import datetime
from dataclasses import dataclass
import operator

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; run without it

    class tqdm:
        """No-op stand-in offering the subset of the tqdm API used below."""

        def __init__(self, *args, **kwargs):
            pass

        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            return False

        def set_description(self, desc=None, refresh=True):
            pass

        def update(self, n=1):
            pass

        def close(self):
            pass


# File name suffixes that are treated as media files (matched case-sensitively).
SUPPORTED_EXTENSIONS = ('.mp4', '.mkv', '.m4v')


@dataclass
class MediaFile:
    """One media file found by scan_files()."""

    name: str  # file name without extension
    extension: str  # as returned by os.path.splitext(), i.e. WITH leading dot
    full_path: str  # directory yielded by os.walk() joined with the file name


def supported_file_extension(filename):
    """Return True if *filename* ends with one of the supported extensions."""
    return filename.endswith(SUPPORTED_EXTENSIONS)


def get_number_of_files(path):
    """Return the number of supported media files below *path* (recursive)."""
    return sum(
        1
        for _root, _dirs, files in os.walk(path)
        for name in files
        if supported_file_extension(name)
    )


def cut_file_name(filename, max_lenght, ellipsis="..."):
    """Shorten *filename* to at most *max_lenght* characters.

    Names that already fit are returned unchanged. Longer names are cut and
    terminated with *ellipsis* so that the result is exactly *max_lenght*
    characters long. If *max_lenght* is too small to even hold the ellipsis,
    the name is cut hard without it, so the limit is never exceeded.
    """
    if len(filename) <= max_lenght:
        return filename
    if max_lenght < len(ellipsis):
        # A negative slice bound would count from the end of the string and
        # yield a result longer than the limit.
        return filename[:max(max_lenght, 0)]
    return filename[:max_lenght - len(ellipsis)] + ellipsis


def scan_files(path):
    """Walk *path* recursively and return a list of MediaFile entries.

    One entry is created per supported media file. A progress bar is shown
    while scanning; its total comes from a preceding counting pass over the
    same tree.
    """
    total_numbers_to_scan = get_number_of_files(path)

    media_files = []  # stores all found files with metadata

    # The context manager closes the progress bar even if the walk raises.
    with tqdm(total=total_numbers_to_scan) as pbar:
        for root, _dirs, files in os.walk(path, topdown=True):
            for name in filter(supported_file_extension, files):
                pbar.set_description(f"Processing {cut_file_name(name, 32):<32}")
                stem, extension = os.path.splitext(name)
                media_files.append(MediaFile(
                    name=stem,
                    extension=extension,
                    full_path=os.path.join(root, name),
                ))
                pbar.update(1)
    return media_files


def print_all(media_files, path):
    """Print the full path of every .mp4 that has a .mkv twin in *path*.

    For each .mp4 entry in *media_files*, the file ``<name>.mkv`` is looked up
    directly inside *path* (the scan root, not the sub-directory the .mp4 was
    found in). Nothing is deleted; the paths are only printed.
    """
    for media_file in media_files:
        if media_file.extension != ".mp4":
            continue
        # os.path.join inserts the separator that plain string concatenation
        # omitted whenever *path* did not end with one (e.g. the default '.').
        file_test_path = os.path.join(path, media_file.name + ".mkv")
        if os.path.isfile(file_test_path):
            print(media_file.full_path)


def main() -> None:
    """Scan the directory given as the only argument (default: '.')."""
    if len(sys.argv) != 2:
        path = '.'  # no (or more than one) argument: use current directory
    else:
        path = sys.argv[1]  # first command-line argument is the path

    media_files = scan_files(path)  # scan all media files

    print("")
    print_all(media_files, path)


if __name__ == "__main__":
    main()