Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-11-23 23:49:08 -06:00
Experiments with a script to de-duplicate the static files, in particular the pdf.js one
@@ -254,7 +254,8 @@ RUN set -eux \
   && chown --from root:root --changes --recursive paperless:paperless /usr/src/paperless \
   && echo "Collecting static files" \
   && s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input --link \
-  && s6-setuidgid paperless python3 manage.py compilemessages
+  && s6-setuidgid paperless python3 manage.py compilemessages \
+  && /usr/local/bin/deduplicate.py --verbose /usr/src/paperless/static/

 VOLUME ["/usr/src/paperless/data", \
         "/usr/src/paperless/media", \
docker/rootfs/usr/local/bin/deduplicate.py (new executable file, 167 lines)
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
File deduplication script that replaces identical files with symlinks.
Uses SHA256 hashing to identify duplicate files.
"""

import hashlib
from collections import defaultdict
from pathlib import Path

import click
import humanize


def calculate_sha256(filepath: Path) -> str | None:
    sha256_hash = hashlib.sha256()
    try:
        with filepath.open("rb") as f:
            # Read file in chunks to handle large files efficiently
            while chunk := f.read(65536):  # 64KB chunks
                sha256_hash.update(chunk)
        return sha256_hash.hexdigest()
    except OSError as e:
        click.echo(f"Error reading {filepath}: {e}\n", err=True)
        return None


def find_duplicate_files(directory: Path) -> dict[str, list[Path]]:
    """
    Recursively scan directory and group files by their SHA256 hash.
    Returns a dictionary mapping hash -> list of file paths.
    """
    hash_to_files: dict[str, list[Path]] = defaultdict(list)

    for filepath in directory.rglob("*"):
        # Skip symlinks
        if filepath.is_symlink():
            continue

        # Skip if not a regular file
        if not filepath.is_file():
            continue

        file_hash = calculate_sha256(filepath)
        if file_hash:
            hash_to_files[file_hash].append(filepath)

    # Filter to only return hashes with duplicates
    return {h: files for h, files in hash_to_files.items() if len(files) > 1}


def replace_with_symlinks(
    duplicate_groups: dict[str, list[Path]],
    *,
    dry_run: bool = False,
) -> tuple[int, int]:
    """
    Replace duplicate files with symlinks to the first occurrence.
    Returns (number_of_files_replaced, space_saved_in_bytes).
    """
    total_duplicates = 0
    space_saved = 0

    for file_hash, file_list in duplicate_groups.items():
        # Keep the first file as the original, replace others with symlinks
        original_file = file_list[0]
        duplicates = file_list[1:]

        click.echo(f"Found {len(duplicates)} duplicate(s) of: {original_file}\n")

        for duplicate in duplicates:
            try:
                # Get file size before deletion
                file_size = duplicate.stat().st_size

                if dry_run:
                    click.echo(f"  [DRY RUN] Would replace: {duplicate}\n")
                else:
                    # Remove the duplicate file
                    duplicate.unlink()

                    # Create relative symlink if possible, otherwise absolute
                    try:
                        # Try to create a relative symlink
                        rel_path = original_file.relative_to(duplicate.parent)
                        duplicate.symlink_to(rel_path)
                        click.echo(f"  Replaced: {duplicate} -> {rel_path}\n")
                    except ValueError:
                        # Fall back to absolute path
                        duplicate.symlink_to(original_file.resolve())
                        click.echo(f"  Replaced: {duplicate} -> {original_file}\n")

                    space_saved += file_size

                total_duplicates += 1

            except OSError as e:
                click.echo(f"  Error replacing {duplicate}: {e}\n", err=True)

    return total_duplicates, space_saved


@click.command()
@click.argument(
    "directory",
    type=click.Path(
        exists=True,
        file_okay=False,
        dir_okay=True,
        readable=True,
        path_type=Path,
    ),
)
@click.option(
    "--dry-run",
    is_flag=True,
    help="Show what would be done without making changes",
)
@click.option("--verbose", "-v", is_flag=True, help="Show verbose output")
def deduplicate(directory: Path, *, dry_run: bool, verbose: bool) -> None:
    """
    Recursively search DIRECTORY for identical files and replace them with symlinks.

    Uses SHA256 hashing to identify duplicate files. The first occurrence of each
    unique file is kept, and all duplicates are replaced with symlinks pointing to it.
    """
    directory = directory.resolve()

    click.echo(f"Scanning directory: {directory}")
    if dry_run:
        click.echo("Running in DRY RUN mode - no changes will be made\n")

    # Find all duplicate files
    click.echo("Calculating file hashes...\n")
    duplicate_groups = find_duplicate_files(directory)

    if not duplicate_groups:
        click.echo("No duplicate files found!\n")
        return

    total_files = sum(len(files) - 1 for files in duplicate_groups.values())
    click.echo(
        f"Found {len(duplicate_groups)} group(s) of duplicates "
        f"({total_files} files to deduplicate)\n",
    )

    if verbose:
        for file_hash, files in duplicate_groups.items():
            click.echo(f"Hash: {file_hash}\n")
            for f in files:
                click.echo(f"  - {f}\n")

    # Replace duplicates with symlinks
    click.echo("Processing duplicates...\n")
    num_replaced, space_saved = replace_with_symlinks(duplicate_groups, dry_run=dry_run)

    # Summary
    click.echo(
        f"{'Would replace' if dry_run else 'Replaced'} "
        f"{num_replaced} duplicate file(s)\n",
    )
    if not dry_run:
        click.echo(f"Space saved: {humanize.naturalsize(space_saved, binary=True)}")


if __name__ == "__main__":
    deduplicate()
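For local experimentation, here is a minimal sketch of driving the new command in dry-run mode through click's test runner. It is not part of this commit; it assumes click and humanize are installed and that deduplicate.py sits on sys.path so it can be imported as a module named deduplicate (hypothetical layout).

    # Hypothetical smoke test for the new script; not part of this commit.
    # Assumes click/humanize are installed and deduplicate.py is importable
    # as "deduplicate" (e.g. by placing it next to this file).
    from pathlib import Path
    from tempfile import TemporaryDirectory

    from click.testing import CliRunner

    from deduplicate import deduplicate  # assumed import path


    def main() -> None:
        with TemporaryDirectory() as tmp:
            root = Path(tmp)
            # Two identical files plus one distinct file.
            (root / "pdf.worker.js").write_bytes(b"identical payload")
            (root / "copy.worker.js").write_bytes(b"identical payload")
            (root / "other.js").write_bytes(b"different payload")

            # --dry-run only reports what would be replaced; nothing is touched.
            result = CliRunner().invoke(deduplicate, ["--dry-run", "--verbose", str(root)])
            print(result.output)
            assert result.exit_code == 0
            assert not (root / "copy.worker.js").is_symlink()


    if __name__ == "__main__":
        main()

Inside the image itself, the Dockerfile change above runs the same command directly against /usr/src/paperless/static/ after collectstatic and compilemessages.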