Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-11-23 23:49:08 -06:00
Experiments with a script to de-duplicate the static files, in particular the pdf.js one
@@ -254,7 +254,8 @@ RUN set -eux \
   && chown --from root:root --changes --recursive paperless:paperless /usr/src/paperless \
   && echo "Collecting static files" \
   && s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input --link \
-  && s6-setuidgid paperless python3 manage.py compilemessages
+  && s6-setuidgid paperless python3 manage.py compilemessages \
+  && /usr/local/bin/deduplicate.py --verbose /usr/src/paperless/static/

 VOLUME ["/usr/src/paperless/data", \
         "/usr/src/paperless/media", \
docker/rootfs/usr/local/bin/deduplicate.py (new executable file, 167 lines)
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
File deduplication script that replaces identical files with symlinks.
Uses SHA256 hashing to identify duplicate files.
"""

import hashlib
from collections import defaultdict
from pathlib import Path

import click
import humanize


def calculate_sha256(filepath: Path) -> str | None:
    sha256_hash = hashlib.sha256()
    try:
        with filepath.open("rb") as f:
            # Read file in chunks to handle large files efficiently
            while chunk := f.read(65536):  # 64KB chunks
                sha256_hash.update(chunk)
        return sha256_hash.hexdigest()
    except OSError as e:
        click.echo(f"Error reading {filepath}: {e}\n", err=True)
        return None


def find_duplicate_files(directory: Path) -> dict[str, list[Path]]:
    """
    Recursively scan directory and group files by their SHA256 hash.
    Returns a dictionary mapping hash -> list of file paths.
    """
    hash_to_files: dict[str, list[Path]] = defaultdict(list)

    for filepath in directory.rglob("*"):
        # Skip symlinks
        if filepath.is_symlink():
            continue

        # Skip if not a regular file
        if not filepath.is_file():
            continue

        file_hash = calculate_sha256(filepath)
        if file_hash:
            hash_to_files[file_hash].append(filepath)

    # Filter to only return hashes with duplicates
    return {h: files for h, files in hash_to_files.items() if len(files) > 1}


def replace_with_symlinks(
    duplicate_groups: dict[str, list[Path]],
    *,
    dry_run: bool = False,
) -> tuple[int, int]:
    """
    Replace duplicate files with symlinks to the first occurrence.
    Returns (number_of_files_replaced, space_saved_in_bytes).
    """
    total_duplicates = 0
    space_saved = 0

    for file_hash, file_list in duplicate_groups.items():
        # Keep the first file as the original, replace others with symlinks
        original_file = file_list[0]
        duplicates = file_list[1:]

        click.echo(f"Found {len(duplicates)} duplicate(s) of: {original_file}\n")

        for duplicate in duplicates:
            try:
                # Get file size before deletion
                file_size = duplicate.stat().st_size

                if dry_run:
                    click.echo(f"  [DRY RUN] Would replace: {duplicate}\n")
                else:
                    # Remove the duplicate file
                    duplicate.unlink()

                    # Create relative symlink if possible, otherwise absolute
                    try:
                        # Try to create a relative symlink
                        rel_path = original_file.relative_to(duplicate.parent)
                        duplicate.symlink_to(rel_path)
                        click.echo(f"  Replaced: {duplicate} -> {rel_path}\n")
                    except ValueError:
                        # Fall back to absolute path
                        duplicate.symlink_to(original_file.resolve())
                        click.echo(f"  Replaced: {duplicate} -> {original_file}\n")

                    space_saved += file_size

                total_duplicates += 1

            except OSError as e:
                click.echo(f"  Error replacing {duplicate}: {e}\n", err=True)

    return total_duplicates, space_saved


@click.command()
@click.argument(
    "directory",
    type=click.Path(
        exists=True,
        file_okay=False,
        dir_okay=True,
        readable=True,
        path_type=Path,
    ),
)
@click.option(
    "--dry-run",
    is_flag=True,
    help="Show what would be done without making changes",
)
@click.option("--verbose", "-v", is_flag=True, help="Show verbose output")
def deduplicate(directory: Path, *, dry_run: bool, verbose: bool) -> None:
    """
    Recursively search DIRECTORY for identical files and replace them with symlinks.

    Uses SHA256 hashing to identify duplicate files. The first occurrence of each
    unique file is kept, and all duplicates are replaced with symlinks pointing to it.
    """
    directory = directory.resolve()

    click.echo(f"Scanning directory: {directory}")
    if dry_run:
        click.echo("Running in DRY RUN mode - no changes will be made\n")

    # Find all duplicate files
    click.echo("Calculating file hashes...\n")
    duplicate_groups = find_duplicate_files(directory)

    if not duplicate_groups:
        click.echo("No duplicate files found!\n")
        return

    total_files = sum(len(files) - 1 for files in duplicate_groups.values())
    click.echo(
        f"Found {len(duplicate_groups)} group(s) of duplicates "
        f"({total_files} files to deduplicate)\n",
    )

    if verbose:
        for file_hash, files in duplicate_groups.items():
            click.echo(f"Hash: {file_hash}\n")
            for f in files:
                click.echo(f"  - {f}\n")

    # Replace duplicates with symlinks
    click.echo("Processing duplicates...\n")
    num_replaced, space_saved = replace_with_symlinks(duplicate_groups, dry_run=dry_run)

    # Summary
    click.echo(
        f"{'Would replace' if dry_run else 'Replaced'} "
        f"{num_replaced} duplicate file(s)\n",
    )
    if not dry_run:
        click.echo(f"Space saved: {humanize.naturalsize(space_saved, binary=True)}")


if __name__ == "__main__":
    deduplicate()
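For local experimentation, here is a minimal sketch of driving the new command in dry-run mode through click's test runner. It is not part of this commit; it assumes click and humanize are installed and that deduplicate.py sits on sys.path so it can be imported as a module named deduplicate (hypothetical layout).

    # Hypothetical smoke test for the new script; not part of this commit.
    # Assumes click/humanize are installed and deduplicate.py is importable
    # as "deduplicate" (e.g. by placing it next to this file).
    from pathlib import Path
    from tempfile import TemporaryDirectory

    from click.testing import CliRunner

    from deduplicate import deduplicate  # assumed import path


    def main() -> None:
        with TemporaryDirectory() as tmp:
            root = Path(tmp)
            # Two identical files plus one distinct file.
            (root / "pdf.worker.js").write_bytes(b"identical payload")
            (root / "copy.worker.js").write_bytes(b"identical payload")
            (root / "other.js").write_bytes(b"different payload")

            # --dry-run only reports what would be replaced; nothing is touched.
            result = CliRunner().invoke(deduplicate, ["--dry-run", "--verbose", str(root)])
            print(result.output)
            assert result.exit_code == 0
            assert not (root / "copy.worker.js").is_symlink()


    if __name__ == "__main__":
        main()

Inside the image itself, the Dockerfile change above runs the same command directly against /usr/src/paperless/static/ after collectstatic and compilemessages.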