#!/usr/bin/env python3
"""
File deduplication script that replaces identical files with symlinks.
Uses SHA256 hashing to identify duplicate files.
"""

import hashlib
from collections import defaultdict
from pathlib import Path

import click
import humanize


def calculate_sha256(filepath: Path) -> str | None:
    sha256_hash = hashlib.sha256()
    try:
        with filepath.open("rb") as f:
            # Read file in chunks to handle large files efficiently
            while chunk := f.read(65536):  # 64KB chunks
                sha256_hash.update(chunk)
        return sha256_hash.hexdigest()
    except OSError as e:
        click.echo(f"Error reading {filepath}: {e}", err=True)
        return None


def find_duplicate_files(directory: Path) -> dict[str, list[Path]]:
    """
    Recursively scan directory and group files by their SHA256 hash.
    Returns a dictionary mapping hash -> list of file paths.
    """
    hash_to_files: dict[str, list[Path]] = defaultdict(list)

    for filepath in directory.rglob("*"):
        # Skip symlinks
        if filepath.is_symlink():
            continue

        # Skip if not a regular file
        if not filepath.is_file():
            continue

        file_hash = calculate_sha256(filepath)
        if file_hash:
            hash_to_files[file_hash].append(filepath)

    # Filter to only return hashes with duplicates
    return {h: files for h, files in hash_to_files.items() if len(files) > 1}


def replace_with_symlinks(
    duplicate_groups: dict[str, list[Path]],
    *,
    dry_run: bool = False,
) -> tuple[int, int]:
    """
    Replace duplicate files with symlinks to the first occurrence.
    Returns (number_of_files_replaced, space_saved_in_bytes).
    """
    total_duplicates = 0
    space_saved = 0

    for file_hash, file_list in duplicate_groups.items():
        # Keep the first file as the original, replace others with symlinks
        original_file = file_list[0]
        duplicates = file_list[1:]

        click.echo(f"Found {len(duplicates)} duplicate(s) of: {original_file}")

        for duplicate in duplicates:
            try:
                # Get file size before deletion
                file_size = duplicate.stat().st_size

                if dry_run:
                    click.echo(f"  [DRY RUN] Would replace: {duplicate}")
                else:
                    # Remove the duplicate file
                    duplicate.unlink()

                    # Create relative symlink if possible, otherwise absolute
                    try:
                        # Try to create a relative symlink
                        rel_path = original_file.relative_to(duplicate.parent)
                        duplicate.symlink_to(rel_path)
                        click.echo(f"  Replaced: {duplicate} -> {rel_path}")
                    except ValueError:
                        # Fall back to absolute path
                        duplicate.symlink_to(original_file.resolve())
                        click.echo(f"  Replaced: {duplicate} -> {original_file}")

                    space_saved += file_size

                total_duplicates += 1

            except OSError as e:
                click.echo(f"  Error replacing {duplicate}: {e}", err=True)

    return total_duplicates, space_saved


@click.command()
@click.argument(
    "directory",
    type=click.Path(
        exists=True,
        file_okay=False,
        dir_okay=True,
        readable=True,
        path_type=Path,
    ),
)
@click.option(
    "--dry-run",
    is_flag=True,
    help="Show what would be done without making changes",
)
@click.option("--verbose", "-v", is_flag=True, help="Show verbose output")
def deduplicate(directory: Path, *, dry_run: bool, verbose: bool) -> None:
    """
    Recursively search DIRECTORY for identical files and replace them with symlinks.

    Uses SHA256 hashing to identify duplicate files. The first occurrence of each
    unique file is kept, and all duplicates are replaced with symlinks pointing to it.
    """
    directory = directory.resolve()

    click.echo(f"Scanning directory: {directory}")
    if dry_run:
        click.echo("Running in DRY RUN mode - no changes will be made")

    # Find all duplicate files
    click.echo("Calculating file hashes...")
    duplicate_groups = find_duplicate_files(directory)

    if not duplicate_groups:
        click.echo("No duplicate files found!")
        return

    total_files = sum(len(files) - 1 for files in duplicate_groups.values())
    click.echo(
        f"Found {len(duplicate_groups)} group(s) of duplicates "
        f"({total_files} files to deduplicate)",
    )

    if verbose:
        for file_hash, files in duplicate_groups.items():
            click.echo(f"Hash: {file_hash}")
            for f in files:
                click.echo(f"  - {f}")

    # Replace duplicates with symlinks
    click.echo("Processing duplicates...")
    num_replaced, space_saved = replace_with_symlinks(duplicate_groups, dry_run=dry_run)

    # Summary
    click.echo(
        f"{'Would replace' if dry_run else 'Replaced'} "
        f"{num_replaced} duplicate file(s)",
    )
    if not dry_run:
        click.echo(f"Space saved: {humanize.naturalsize(space_saved, binary=True)}")


if __name__ == "__main__":
    deduplicate()