Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-10-30 03:56:23 -05:00)
Compare commits: b2703b4605...feature-ex (2 commits)
	
| Author | SHA1 | Date |
|---|---|---|
|  | d5d914f74a |  |
|  | cca01a401d |  |
@@ -425,7 +425,7 @@ WHITENOISE_STATIC_PREFIX = "/static/"
 if machine().lower() == "aarch64":  # pragma: no cover
     _static_backend = "django.contrib.staticfiles.storage.StaticFilesStorage"
 else:
-    _static_backend = "whitenoise.storage.CompressedStaticFilesStorage"
+    _static_backend = "paperless.staticfiles.DeduplicatedCompressedStaticFilesStorage"
 
 STORAGES = {
     "staticfiles": {
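For context, the switch above plugs into Django's STORAGES setting (Django 4.2+); a condensed, hypothetical sketch of the surrounding wiring and how it gets exercised:

# settings.py (hypothetical condensed view around the hunk above)
STORAGES = {
    "staticfiles": {
        "BACKEND": _static_backend,  # the value switched by this change
    },
}

# `python manage.py collectstatic` then instantiates this backend and,
# after copying the files, calls its post_process() hook (new file below).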
src/paperless/staticfiles.py (new file, 385 lines)
@@ -0,0 +1,385 @@
import gzip
import hashlib
import logging
import os
import shutil
import threading
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
from dataclasses import dataclass
from pathlib import Path

import brotli
import humanize
from django.contrib.staticfiles.storage import StaticFilesStorage

logger = logging.getLogger(__name__)


@dataclass(slots=True)
class FileInfo:
    file_path_str: str
    file_path_path: Path
    checksum: str
    original_size: int
    gzip_size: int | None = None
    brotli_size: int | None = None


class DeduplicatedCompressedStaticFilesStorage(StaticFilesStorage):
    # File extensions that should be compressed
    COMPRESSIBLE_EXTENSIONS = {
        ".css",
        ".js",
        ".html",
        ".htm",
        ".xml",
        ".json",
        ".txt",
        ".svg",
        ".md",
        ".rst",
        ".csv",
        ".tsv",
        ".yaml",
        ".yml",
        ".map",
    }

    # Minimum file size to compress (bytes)
    MIN_COMPRESS_SIZE = 1024  # 1KB

    # Maximum number of threads for parallel processing
    MAX_WORKERS = min(32, (os.cpu_count() or 1) + 4)
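    # (Mirrors concurrent.futures.ThreadPoolExecutor's default max_workers heuristic.)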

    # Chunk size for file reading
    CHUNK_SIZE = 64 * 1024  # 64KB

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Track files by content hash (for dedup) and by path (for quick lookup)
        self.hash_to_files: dict[str, list[FileInfo]] = defaultdict(list)
        self.path_to_file_info: dict[str, FileInfo] = {}
        self.linked_files: set[Path] = set()
        self.compression_stats = {
            "brotli": 0,
            "gzip": 0,
            "skipped_linked": 0,
            "skipped_other": 0,
            "errors": 0,
        }
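        # Guards the shared maps and stats that worker threads mutate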
        self._lock = threading.Lock()

    def post_process(self, paths: list[str], **options):
        """
        Post-process collected files: deduplicate first, then compress.
        Matches the Django 5.2 post_process() interface; extra options
        (such as dry_run) are accepted but not acted upon.
        """
        start_time = time.time()

        # Step 1: Build hash map for deduplication (parallel)
        self._build_file_hash_map_parallel(paths)

        # Step 2: Create hard links for duplicate files
        self._create_hard_links()

        # Step 3: Compress files (parallel, skip linked duplicates)
        self._compress_files_parallel(paths)

        # Step 4: Give the user a summary of the compression
        self._log_compression_summary()

        processing_time = time.time() - start_time
        logger.info(f"Post-processing complete in {processing_time:.2f}s.")

        # Return list of processed files
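        # (collectstatic expects (original_path, processed_path, processed) tuples)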
        processed_files = []
        for path in paths:
            processed_files.append((path, path, True))
            # Add compressed variants
            file_path = self.path(path)
            if Path(file_path + ".br").exists():
                processed_files.append((path + ".br", path + ".br", True))
            if Path(file_path + ".gz").exists():
                processed_files.append((path + ".gz", path + ".gz", True))

        return processed_files

    def _build_file_hash_map_parallel(self, file_paths: list[str]):
        """Build a map of file hashes using parallel processing."""
        logger.info(
            f"Hashing {len(file_paths)} files with {self.MAX_WORKERS} workers...",
        )

        def hash_file(path: str):
            """Hash a single file."""
            try:
                file_path = Path(self.path(path))
                if not file_path.is_file():
                    return None, None, None

                file_hash = self._get_file_hash_fast(file_path)
                file_size = file_path.stat().st_size
                return path, file_hash, file_size
            except Exception as e:
                logger.warning(f"Error hashing file {path}: {e}")
                return path, None, None

        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            future_to_path = {
                executor.submit(hash_file, path): path for path in file_paths
            }

            for future in as_completed(future_to_path):
                path, file_hash, file_size = future.result()
                if path is not None and file_hash is not None and file_size is not None:
                    with self._lock:
                        file_info = FileInfo(
                            file_path_str=path,
                            file_path_path=Path(self.path(path)),
                            checksum=file_hash,
                            original_size=file_size,
                        )
                        self.hash_to_files[file_hash].append(file_info)
                        self.path_to_file_info[path] = file_info

        duplicates = sum(1 for files in self.hash_to_files.values() if len(files) > 1)
        logger.info(f"Found {duplicates} sets of duplicate files")

    def _get_file_hash_fast(self, file_path: Path):
        """Calculate SHA-256 hash of file content with optimized reading."""
        hash_sha256 = hashlib.sha256()
        try:
            with file_path.open("rb") as f:
                while chunk := f.read(self.CHUNK_SIZE):
                    hash_sha256.update(chunk)
        except OSError as e:
            logger.warning(f"Could not read file {file_path}: {e}")
            raise
        return hash_sha256.hexdigest()

    def _create_hard_links(self):
        """Create hard links for duplicate files."""
        logger.info("Creating hard links for duplicate files...")

        linked_count = 0
        for file_info_list in self.hash_to_files.values():
            if len(file_info_list) <= 1:
                continue

            # Sort by file size (desc) then path length (asc) to keep best original
            file_info_list.sort(key=lambda x: (-x.original_size, len(x.file_path_str)))
            original_file_info = file_info_list[0]
            duplicate_info = file_info_list[1:]

            for duplicate_file_info in duplicate_info:
                try:
                    # Remove duplicate file and create hard link
                    if duplicate_file_info.file_path_path.exists():
                        duplicate_file_info.file_path_path.unlink()

                    # Create hard link
                    os.link(
                        original_file_info.file_path_path,
                        duplicate_file_info.file_path_path,
                    )

                    with self._lock:
                        self.linked_files.add(duplicate_file_info.file_path_path)

                    linked_count += 1

                    logger.info(
                        f"Linked {duplicate_file_info.file_path_path} -> {original_file_info.file_path_path}",
                    )

                except OSError as e:
                    # os.link() fails across filesystems (EXDEV) or on
                    # filesystems without hard-link support
                    logger.error(
                        f"Hard link failed for {duplicate_file_info.file_path_path}, copying instead: {e}",
                    )
                    # Fall back to copying if hard linking fails
                    try:
                        shutil.copy2(
                            original_file_info.file_path_path,
                            duplicate_file_info.file_path_path,
                        )
                        logger.info(
                            f"Copied {original_file_info.file_path_path} -> {duplicate_file_info.file_path_path} (hard link failed)",
                        )
                    except Exception as copy_error:
                        logger.error(
                            f"Failed to copy to {duplicate_file_info.file_path_path}: {copy_error}",
                        )

        if linked_count > 0:
            logger.info(f"Created {linked_count} hard links")

    def _compress_files_parallel(self, file_paths: list[str]):
        """Compress files using parallel processing and update FileInfo objects."""
        # Identify files to compress, excluding hard-linked duplicates and any
        # paths that failed hashing (absent from path_to_file_info)
        compressible_files = [
            info
            for path in file_paths
            if (info := self.path_to_file_info.get(path)) is not None
            and info.file_path_path not in self.linked_files
            and self._should_compress_file(path)
        ]

        if not compressible_files:
            logger.info("No new files to compress")
            return

        logger.info(
            f"Compressing {len(compressible_files)} files with {self.MAX_WORKERS} workers...",
        )

        def compress_file(file_info: FileInfo):
            """Compress a single file and update its FileInfo by side-effect."""
            brotli_size = None
            gzip_size = None
            error = None
            try:
                brotli_size = self._compress_file_brotli(str(file_info.file_path_path))
                gzip_size = self._compress_file_gzip(str(file_info.file_path_path))
                # Store the compressed sizes
                file_info.brotli_size = brotli_size
                file_info.gzip_size = gzip_size
            except Exception as e:
                error = str(e)
                logger.warning(f"Error compressing {file_info.file_path_str}: {e}")
            return {
                "brotli": brotli_size is not None,
                "gzip": gzip_size is not None,
                "error": error,
            }

        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            future_to_info = {
                executor.submit(compress_file, info): info
                for info in compressible_files
            }

            for future in as_completed(future_to_info):
                result = future.result()
                with self._lock:
                    if result["brotli"]:
                        self.compression_stats["brotli"] += 1
                    if result["gzip"]:
                        self.compression_stats["gzip"] += 1
                    if result["error"]:
                        self.compression_stats["errors"] += 1
                    if (
                        not result["brotli"]
                        and not result["gzip"]
                        and not result["error"]
                    ):
                        self.compression_stats["skipped_other"] += 1

        self.compression_stats["skipped_linked"] = len(self.linked_files)
        logger.info(f"File count stats: {self.compression_stats}")

    def _should_compress_file(self, path: str):
        """Determine if a file should be compressed."""
        file_ext = Path(path).suffix.lower()
        if file_ext not in self.COMPRESSIBLE_EXTENSIONS:
            return False
        try:
            if Path(self.path(path)).stat().st_size < self.MIN_COMPRESS_SIZE:
                return False
        except OSError:
            return False
        return True

    def _compress_file_brotli(self, file_path: str) -> int | None:
        """Compress file using Brotli, returns compressed size or None."""
        brotli_path = Path(file_path + ".br")
        try:
            with Path(file_path).open("rb") as f_in:
                original_data = f_in.read()
            compressed_data = brotli.compress(
                original_data,
                quality=10,
                lgwin=22,  # Window size
                lgblock=0,  # Auto block size
            )
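            # Keep the .br file only when it beats the original by at least 5%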
            if len(compressed_data) < len(original_data) * 0.95:
                with brotli_path.open("wb") as f_out:
                    f_out.write(compressed_data)
                return len(compressed_data)
            return None
        except Exception as e:
            logger.warning(f"Brotli compression failed for {file_path}: {e}")
            return None

    def _compress_file_gzip(self, file_path: str) -> int | None:
        """Compress file using GZip, returns compressed size or None."""
        gzip_path = Path(file_path + ".gz")
        file_path_path = Path(file_path)
        try:
            original_size = file_path_path.stat().st_size
            with (
                file_path_path.open("rb") as f_in,
                gzip.open(
                    gzip_path,
                    "wb",
                    compresslevel=7,
                ) as f_out,
            ):
                shutil.copyfileobj(f_in, f_out, length=self.CHUNK_SIZE)
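            # (Streaming via copyfileobj keeps memory bounded, unlike the
            # whole-file read used for Brotli above.)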

            compressed_size = gzip_path.stat().st_size
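            # Keep the .gz only when it saves at least 5%; otherwise remove it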
            if compressed_size < original_size * 0.95:
                return compressed_size
            else:
                gzip_path.unlink()
                return None
        except Exception as e:
            logger.warning(f"GZip compression failed for {file_path}: {e}")
            if gzip_path.exists():
                try:
                    gzip_path.unlink()
                except OSError:
                    pass
            return None

    def _log_compression_summary(self):
        """Calculates and logs the total size savings from compression."""
        total_original_size = 0
        total_brotli_size = 0
        total_gzip_size = 0

        # Only consider the original files, not the duplicates, for size calculation
        unique_files = {
            file_list[0].checksum: file_list[0]
            for file_list in self.hash_to_files.values()
        }

        for file_info in unique_files.values():
            if self._should_compress_file(file_info.file_path_str):
                total_original_size += file_info.original_size
                if file_info.brotli_size:
                    total_brotli_size += file_info.brotli_size
                if file_info.gzip_size:
                    total_gzip_size += file_info.gzip_size

        def get_savings(original: int, compressed: int) -> str:
            if original == 0:
                return "0.00%"
            return f"{(1 - compressed / original) * 100:.2f}%"

        logger.info(
            f"Total Original Size (compressible files): {humanize.naturalsize(total_original_size)}",
        )
        if total_brotli_size > 0:
            logger.info(
                f"Total Brotli Size: {humanize.naturalsize(total_brotli_size)} "
                f"(Savings: {get_savings(total_original_size, total_brotli_size)})",
            )
        if total_gzip_size > 0:
            logger.info(
                f"Total Gzip Size:   {humanize.naturalsize(total_gzip_size)} "
                f"(Savings: {get_savings(total_original_size, total_gzip_size)})",
            )
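As a quick sanity check after collectstatic (a hypothetical snippet, not part of the change; static_root stands in for the project's STATIC_ROOT): deduplicated copies show a hard-link count above 1, and compressed variants appear as .br/.gz siblings.

from pathlib import Path

static_root = Path("static")  # hypothetical STATIC_ROOT
for f in sorted(static_root.rglob("*")):
    if not f.is_file() or f.suffix in {".br", ".gz"}:
        continue
    nlink = f.stat().st_nlink  # > 1 means the file was hard-linked to a duplicate
    variants = [ext for ext in (".br", ".gz") if f.with_name(f.name + ext).exists()]
    print(f, nlink, variants)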