Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 114 additions & 115 deletions Cachyos/Scripts/WIP/gphotos/Dup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,131 +6,130 @@


def hash_file_partial(file_path, chunk_size=65536):
    """Compute a SHA-256 hash of the first ``chunk_size`` bytes of a file.

    Used as a cheap pre-filter: files whose partial hashes differ cannot
    be duplicates, so only matching candidates need a full hash.

    Args:
        file_path: Path to the file to hash.
        chunk_size: Number of bytes to read from the start (default 64 KiB).

    Returns:
        Tuple ``(file_path, hex_digest)``, or ``(file_path, None)`` if the
        file could not be read.
    """
    try:
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            # A single read suffices: files shorter than chunk_size simply
            # hash their full contents.
            sha256_hash.update(f.read(chunk_size))
        return file_path, sha256_hash.hexdigest()
    except OSError as e:  # IOError is an alias of OSError since Python 3.3
        print(f"Error partial hashing {file_path}: {e}")
        return file_path, None


def hash_file(file_path):
    """Compute the SHA-256 hash of a file's full contents.

    Reads in 64 KiB chunks so memory use stays constant for large files.

    Args:
        file_path: Path to the file to hash.

    Returns:
        Tuple ``(file_path, hex_digest)``, or ``(file_path, None)`` if the
        file could not be read.
    """
    try:
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(65536), b""):
                sha256_hash.update(byte_block)
        return file_path, sha256_hash.hexdigest()
    # Narrowed from a bare `except Exception`: only I/O failures are
    # expected here, and a broad catch would hide real bugs.
    except OSError as e:
        print(f"Error hashing {file_path}: {e}")
        return file_path, None


def group_files_by_size(starting_path):
    """Recursively group image files under *starting_path* by byte size.

    Only files with common photo extensions (.jpg/.jpeg/.png/.gif,
    case-insensitive) are considered. Size grouping is the cheapest
    duplicate pre-filter: files of different sizes can never be identical.

    Args:
        starting_path: Root directory to walk.

    Returns:
        Dict mapping file size in bytes to the list of paths of that size.
    """
    size_dict = {}
    for dirpath, _, filenames in os.walk(starting_path):
        for filename in filenames:
            if filename.lower().endswith((".jpg", ".jpeg", ".png", ".gif")):
                full_path = os.path.join(dirpath, filename)
                try:
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # File vanished or is unreadable between walk and stat.
                    continue
                # setdefault replaces the manual key-exists check.
                size_dict.setdefault(file_size, []).append(full_path)
    return size_dict


def find_duplicate_photos(starting_path, output_file_path):
    """Find duplicate photos under *starting_path* and write a report.

    Three-stage filter, cheapest first:
      1. Group files by size (different size => cannot be duplicates).
      2. Partial SHA-256 (first 64 KiB) of same-size files, in parallel.
      3. Full SHA-256 of files that still collide, in parallel.

    Args:
        starting_path: Root directory to scan.
        output_file_path: Path of the text report listing duplicate groups.
    """
    # Step 1: group by size.
    size_dict = group_files_by_size(starting_path)

    # Any file sharing its size with another is a duplicate candidate.
    # Paths are unique because they come from a single os.walk pass.
    all_candidates = [
        path for paths in size_dict.values() if len(paths) > 1 for path in paths
    ]
    if not all_candidates:
        return

    final_duplicates = {}

    with Pool(processes=cpu_count()) as pool:
        # Step 2: partial hashing in parallel; map path -> partial hash.
        partial_hashes = dict(pool.map(hash_file_partial, all_candidates))

        # Regroup by partial hash *within* each size group: a partial-hash
        # collision across different sizes is irrelevant.
        full_hash_candidates = []
        groups_to_check = []
        for paths in size_dict.values():
            if len(paths) < 2:
                continue
            partial_groups = {}
            for path in paths:
                partial_digest = partial_hashes.get(path)
                if partial_digest:  # skip files whose partial hash failed
                    partial_groups.setdefault(partial_digest, []).append(path)
            for group in partial_groups.values():
                if len(group) > 1:
                    groups_to_check.append(group)
                    full_hash_candidates.extend(group)

        if full_hash_candidates:
            # Step 3: full hashing of the surviving candidates in parallel.
            full_hashes = dict(pool.map(hash_file, full_hash_candidates))

            # Step 4: files are duplicates only if their full hashes match.
            for group in groups_to_check:
                digest_groups = {}
                for path in group:
                    full_digest = full_hashes.get(path)
                    if full_digest:
                        digest_groups.setdefault(full_digest, []).append(path)
                for full_digest, files in digest_groups.items():
                    if len(files) > 1:
                        final_duplicates[full_digest] = files

    # Write the report: one block of paths per duplicate group.
    with open(output_file_path, "w") as f:
        for digest, files in final_duplicates.items():
            f.write(f"Duplicate Photos (Hash: {digest}):\n")
            for file_path in files:
                f.write(f"{file_path}\n")
            f.write("\n")


if __name__ == "__main__":
    # CLI entry point: scan a directory tree and write a duplicate report.
    arg_parser = argparse.ArgumentParser(description="Find duplicate photos.")
    arg_parser.add_argument("directory", help="Directory to scan")
    arg_parser.add_argument("output", help="Output file for duplicates")
    cli_args = arg_parser.parse_args()

    find_duplicate_photos(cli_args.directory, cli_args.output)
54 changes: 51 additions & 3 deletions lint-format.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,52 @@
#!/usr/bin/env bash
# lint-format.sh — lint and format shell and Python sources in this repo.
#
# Usage:
#   ./lint-format.sh       # format mode: auto-fix with shfmt and ruff
#   ./lint-format.sh -c    # check mode: report problems, exit non-zero on failure
#
# WIP scripts under Cachyos/Scripts/WIP are excluded from shell checks.
set -Eeuo pipefail

# Check mode when the first argument is -c; ${1:-} keeps `set -u` from
# aborting when the script is invoked with no arguments.
CHECK_MODE=0
if [[ "${1:-}" == "-c" ]]; then
  CHECK_MODE=1
fi

# Prefer `fd`; Debian/Ubuntu package the same tool as `fdfind`.
FD="${FD:-fd}"
if ! command -v "$FD" > /dev/null 2>&1; then
  FD="fdfind"
fi

echo "Using FD tool: $FD"

if [[ "$CHECK_MODE" -eq 1 ]]; then
  echo "Running in check mode..."
  # ShellCheck: fail the run on errors only.
  if command -v "$FD" > /dev/null 2>&1 && command -v shellcheck > /dev/null 2>&1; then
    "$FD" -t f -e sh . | grep -v 'Cachyos/Scripts/WIP' | xargs -r shellcheck --severity=error
  fi
  # shfmt: list files that are not formatted (non-empty output = diff exists).
  if command -v "$FD" > /dev/null 2>&1 && command -v shfmt > /dev/null 2>&1; then
    "$FD" -t f -e sh . | grep -v 'Cachyos/Scripts/WIP' | xargs -r shfmt -i 2 -ci -sr -l
  fi
  # Ruff: lint and verify Python formatting without modifying files.
  if command -v ruff > /dev/null 2>&1; then
    ruff check .
    ruff format --check .
  fi
else
  echo "Running in format mode..."
  # ShellCheck in advisory mode; never fail the formatting run.
  if command -v "$FD" > /dev/null 2>&1 && command -v shellcheck > /dev/null 2>&1; then
    "$FD" -t f -e sh . | grep -v 'Cachyos/Scripts/WIP' | xargs -r shellcheck --severity=style || true
  fi
  # shfmt: rewrite shell files in place.
  if command -v "$FD" > /dev/null 2>&1 && command -v shfmt > /dev/null 2>&1; then
    "$FD" -t f -e sh . | grep -v 'Cachyos/Scripts/WIP' | xargs -r shfmt -w -i 2 -ci -sr || true
  fi
  # Ruff: apply autofixes and formatting.
  if command -v ruff > /dev/null 2>&1; then
    ruff check --fix . || true
    ruff format . || true
  fi
fi

echo "Lint and format complete."
Loading