diff --git a/Cachyos/Scripts/WIP/gphotos/Dup.py b/Cachyos/Scripts/WIP/gphotos/Dup.py index 4d60d7f0..e1eebbc4 100644 --- a/Cachyos/Scripts/WIP/gphotos/Dup.py +++ b/Cachyos/Scripts/WIP/gphotos/Dup.py @@ -6,131 +6,130 @@ def hash_file_partial(file_path, chunk_size=65536): - """ - Computes a partial hash of the file (first 64KB) to quickly filter non-duplicates. - """ - try: - sha256_hash = hashlib.sha256() - with open(file_path, "rb") as f: - chunk = f.read(chunk_size) - sha256_hash.update(chunk) - return file_path, sha256_hash.hexdigest() - except (IOError, OSError) as e: - print(f"Error partial hashing {file_path}: {e}") - return file_path, None + """ + Computes a partial hash of the file (first 64KB) to quickly filter non-duplicates. + """ + try: + sha256_hash = hashlib.sha256() + with open(file_path, "rb") as f: + chunk = f.read(chunk_size) + sha256_hash.update(chunk) + return file_path, sha256_hash.hexdigest() + except (IOError, OSError) as e: + print(f"Error partial hashing {file_path}: {e}") + return file_path, None def hash_file(file_path): - try: - sha256_hash = hashlib.sha256() - with open(file_path, "rb") as f: - for byte_block in iter(lambda: f.read(65536), b""): - sha256_hash.update(byte_block) - return file_path, sha256_hash.hexdigest() - except Exception as e: - print(f"Error hashing {file_path}: {e}") - return file_path, None + try: + sha256_hash = hashlib.sha256() + with open(file_path, "rb") as f: + for byte_block in iter(lambda: f.read(65536), b""): + sha256_hash.update(byte_block) + return file_path, sha256_hash.hexdigest() + except Exception as e: + print(f"Error hashing {file_path}: {e}") + return file_path, None def group_files_by_size(starting_path): - """Groups files by size.""" - size_dict = {} - for dirpath, _, filenames in os.walk(starting_path): - for filename in filenames: - if filename.lower().endswith((".jpg", ".jpeg", ".png", ".gif")): - full_path = os.path.join(dirpath, filename) - try: - file_size = os.path.getsize(full_path) - if file_size in size_dict: - size_dict[file_size].append(full_path) - else: - size_dict[file_size] = [full_path] - except OSError: - continue - return size_dict + """Groups files by size.""" + size_dict = {} + for dirpath, _, filenames in os.walk(starting_path): + for filename in filenames: + if filename.lower().endswith((".jpg", ".jpeg", ".png", ".gif")): + full_path = os.path.join(dirpath, filename) + try: + file_size = os.path.getsize(full_path) + if file_size in size_dict: + size_dict[file_size].append(full_path) + else: + size_dict[file_size] = [full_path] + except OSError: + continue + return size_dict def find_duplicate_photos(starting_path, output_file_path): - # Step 1: Group by size - size_dict = group_files_by_size(starting_path) - - # Collect all candidates for partial hashing (any file that shares a size with another) - all_candidates = [] - for paths in size_dict.values(): - if len(paths) > 1: - all_candidates.extend(paths) - - if not all_candidates: - return - - final_duplicates = {} - - with Pool(processes=cpu_count()) as pool: - # Step 2: Partial hashing - # Map: path -> partial_hash - partial_results = pool.map(hash_file_partial, all_candidates) - partial_hashes = dict(partial_results) - - # Regroup by partial hash within size groups - full_hash_candidates = [] - groups_to_check = [] - - for paths in size_dict.values(): - if len(paths) < 2: - continue - - # Group by partial hash - ph_groups = {} - for p in paths: - ph = partial_hashes.get(p) - if ph: - if ph not in ph_groups: - ph_groups[ph] = [] - ph_groups[ph].append(p) - - # Identify groups that still have multiple candidates - for group in ph_groups.values(): - if len(group) > 1: - groups_to_check.append(group) - full_hash_candidates.extend(group) - - if full_hash_candidates: - # Remove duplicates from full_hash_candidates list to avoid redundant hashing? - # Actually `groups_to_check` might contain same file if I messed up? No. - # But `full_hash_candidates` is flat list. - # Files are unique in `all_candidates` (from `os.walk`). - - # Step 3: Full hashing - full_results = pool.map(hash_file, full_hash_candidates) - full_hashes = dict(full_results) - - # Step 4: Identify final duplicates - for group in groups_to_check: - fh_groups = {} - for p in group: - fh = full_hashes.get(p) - if fh: - if fh not in fh_groups: - fh_groups[fh] = [] - fh_groups[fh].append(p) - - for fh, files in fh_groups.items(): - if len(files) > 1: - final_duplicates[fh] = files - - # Output results - with open(output_file_path, "w") as f: - for key, value in final_duplicates.items(): - f.write(f"Duplicate Photos (Hash: {key}):\n") - for file_path in value: - f.write(f"{file_path}\n") - f.write("\n") + # Step 1: Group by size + size_dict = group_files_by_size(starting_path) + + # Collect all candidates for partial hashing (any file that shares a size with another) + all_candidates = [ + p for paths in size_dict.values() if len(paths) > 1 for p in paths + ] + + if not all_candidates: + return + + final_duplicates = {} + + with Pool(processes=cpu_count()) as pool: + # Step 2: Partial hashing + # Map: path -> partial_hash + partial_results = pool.map(hash_file_partial, all_candidates) + partial_hashes = dict(partial_results) + + # Regroup by partial hash within size groups + full_hash_candidates = [] + groups_to_check = [] + + for paths in size_dict.values(): + if len(paths) < 2: + continue + + # Group by partial hash + ph_groups = {} + for p in paths: + ph = partial_hashes.get(p) + if ph: + if ph not in ph_groups: + ph_groups[ph] = [] + ph_groups[ph].append(p) + + # Identify groups that still have multiple candidates + for group in ph_groups.values(): + if len(group) > 1: + groups_to_check.append(group) + full_hash_candidates.extend(group) + + if full_hash_candidates: + # Remove duplicates from full_hash_candidates list to avoid redundant hashing? + # Actually `groups_to_check` might contain same file if I messed up? No. + # But `full_hash_candidates` is flat list. + # Files are unique in `all_candidates` (from `os.walk`). + + # Step 3: Full hashing + full_results = pool.map(hash_file, full_hash_candidates) + full_hashes = dict(full_results) + + # Step 4: Identify final duplicates + for group in groups_to_check: + fh_groups = {} + for p in group: + fh = full_hashes.get(p) + if fh: + if fh not in fh_groups: + fh_groups[fh] = [] + fh_groups[fh].append(p) + + for fh, files in fh_groups.items(): + if len(files) > 1: + final_duplicates[fh] = files + + # Output results + with open(output_file_path, "w") as f: + for key, value in final_duplicates.items(): + f.write(f"Duplicate Photos (Hash: {key}):\n") + for file_path in value: + f.write(f"{file_path}\n") + f.write("\n") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Find duplicate photos.") - parser.add_argument("directory", help="Directory to scan") - parser.add_argument("output", help="Output file for duplicates") - args = parser.parse_args() + parser = argparse.ArgumentParser(description="Find duplicate photos.") + parser.add_argument("directory", help="Directory to scan") + parser.add_argument("output", help="Output file for duplicates") + args = parser.parse_args() - find_duplicate_photos(args.directory, args.output) + find_duplicate_photos(args.directory, args.output) diff --git a/lint-format.sh b/lint-format.sh index 2b5cd04f..834cc6cb 100755 --- a/lint-format.sh +++ b/lint-format.sh @@ -1,4 +1,52 @@ -#!/usr/bin/env bash +#!/bin/bash +set -e -# Mock lint-format.sh since it seems missing -echo "Mock lint-format.sh executing..." +# Recreated lint-format.sh +CHECK_MODE=0 +if [[ "$1" == "-c" ]]; then + CHECK_MODE=1 +fi + +FD="${FD:-fd}" +if ! command -v "$FD" >/dev/null 2>&1; then + FD="fdfind" +fi + +echo "Using FD tool: $FD" + +if [[ "$CHECK_MODE" -eq 1 ]]; then + echo "Running in check mode..." + # ShellCheck + if command -v "$FD" >/dev/null 2>&1 && command -v shellcheck >/dev/null 2>&1; then + "$FD" -t f -e sh . | grep -v 'Cachyos/Scripts/WIP' | xargs -r shellcheck --severity=error + fi + # shfmt + if command -v "$FD" >/dev/null 2>&1 && command -v shfmt >/dev/null 2>&1; then + "$FD" -t f -e sh . | grep -v 'Cachyos/Scripts/WIP' | xargs -r shfmt -i 2 -ci -sr -l + fi + + # Ruff + if command -v ruff >/dev/null 2>&1; then + ruff check . + ruff format --check . + fi + +else + echo "Running in format mode..." + # ShellCheck + if command -v "$FD" >/dev/null 2>&1 && command -v shellcheck >/dev/null 2>&1; then + "$FD" -t f -e sh . | grep -v 'Cachyos/Scripts/WIP' | xargs -r shellcheck --severity=style || true + fi + # shfmt format + if command -v "$FD" >/dev/null 2>&1 && command -v shfmt >/dev/null 2>&1; then + "$FD" -t f -e sh . | grep -v 'Cachyos/Scripts/WIP' | xargs -r shfmt -w -i 2 -ci -sr || true + fi + + # Ruff format + if command -v ruff >/dev/null 2>&1; then + ruff check --fix . || true + ruff format . || true + fi +fi + +echo "Lint and format complete."