From 827fcba2778de919265f5eece76dcf5586effca3 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:06:14 -0800 Subject: [PATCH] Chore: Reduce imports for a slight memory improvement (#9217) --- .ruff.toml | 2 +- src/documents/barcodes.py | 5 ++++- src/documents/bulk_download.py | 8 +++++--- src/documents/bulk_edit.py | 7 ++++++- src/documents/caching.py | 5 +++-- src/documents/classifier.py | 10 ++++++---- src/documents/filters.py | 7 ++++++- src/documents/index.py | 13 +++++++++---- src/documents/matching.py | 7 ++++++- src/documents/parsers.py | 11 ++++++++--- src/documents/serialisers.py | 8 +++++++- src/documents/signals/handlers.py | 14 ++++++++++---- 12 files changed, 71 insertions(+), 26 deletions(-) diff --git a/.ruff.toml b/.ruff.toml index ae1bed609..0fc170c96 100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -26,7 +26,7 @@ extend-select = [ "T20", # https://docs.astral.sh/ruff/rules/#flake8-print-t20 "SIM", # https://docs.astral.sh/ruff/rules/#flake8-simplify-sim "TID", # https://docs.astral.sh/ruff/rules/#flake8-tidy-imports-tid - "TCH", # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tch + "TC", # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tc "PLC", # https://docs.astral.sh/ruff/rules/#pylint-pl "PLE", # https://docs.astral.sh/ruff/rules/#pylint-pl "RUF", # https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 4fe0670af..3b0c1d33b 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import re import tempfile @@ -10,7 +12,6 @@ from pdf2image import convert_from_path from pikepdf import Page from pikepdf import PasswordError from pikepdf import Pdf -from PIL import Image from documents.converters import convert_from_tiff_to_pdf from documents.data_models import ConsumableDocument @@ -25,6 +26,8 @@ from documents.utils import maybe_override_pixel_limit if TYPE_CHECKING: from collections.abc import Callable + from PIL import Image + logger = logging.getLogger("paperless.barcodes") diff --git a/src/documents/bulk_download.py b/src/documents/bulk_download.py index 5bdc3e74a..7e87f0488 100644 --- a/src/documents/bulk_download.py +++ b/src/documents/bulk_download.py @@ -1,12 +1,14 @@ +from __future__ import annotations + from pathlib import Path from typing import TYPE_CHECKING from typing import NoReturn -from zipfile import ZipFile - -from documents.models import Document if TYPE_CHECKING: from collections.abc import Callable + from zipfile import ZipFile + + from documents.models import Document class BulkArchiveStrategy: diff --git a/src/documents/bulk_edit.py b/src/documents/bulk_edit.py index f6adfc8a9..be4608e36 100644 --- a/src/documents/bulk_edit.py +++ b/src/documents/bulk_edit.py @@ -1,8 +1,11 @@ +from __future__ import annotations + import hashlib import itertools import logging import tempfile from pathlib import Path +from typing import TYPE_CHECKING from typing import Literal from celery import chain @@ -10,7 +13,6 @@ from celery import chord from celery import group from celery import shared_task from django.conf import settings -from django.contrib.auth.models import User from django.db.models import Q from django.utils import timezone @@ -29,6 +31,9 @@ from documents.tasks import bulk_update_documents from documents.tasks import consume_file from documents.tasks import update_document_content_maybe_archive_file +if TYPE_CHECKING: + from django.contrib.auth.models import User + logger: logging.Logger = logging.getLogger("paperless.bulk_edit") diff --git a/src/documents/caching.py b/src/documents/caching.py index 6eb2b691f..1099a7a73 100644 --- a/src/documents/caching.py +++ b/src/documents/caching.py @@ -1,9 +1,10 @@ +from __future__ import annotations + import logging from binascii import hexlify from dataclasses import dataclass from typing import TYPE_CHECKING from typing import Final -from typing import Optional from django.core.cache import cache @@ -80,7 +81,7 @@ def get_suggestion_cache(document_id: int) -> SuggestionCacheData | None: def set_suggestions_cache( document_id: int, suggestions: dict, - classifier: Optional["DocumentClassifier"], + classifier: DocumentClassifier | None, *, timeout=CACHE_50_MINUTES, ) -> None: diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 5bc8be2c6..548a4e833 100644 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -1,22 +1,22 @@ +from __future__ import annotations + import logging import pickle import re import time import warnings -from collections.abc import Iterator from hashlib import sha256 from pathlib import Path from typing import TYPE_CHECKING -from typing import Optional if TYPE_CHECKING: + from collections.abc import Iterator from datetime import datetime from numpy import ndarray from django.conf import settings from django.core.cache import cache -from sklearn.exceptions import InconsistentVersionWarning from documents.caching import CACHE_50_MINUTES from documents.caching import CLASSIFIER_HASH_KEY @@ -38,7 +38,7 @@ class ClassifierModelCorruptError(Exception): pass -def load_classifier(*, raise_exception: bool = False) -> Optional["DocumentClassifier"]: +def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None: if not settings.MODEL_FILE.is_file(): logger.debug( "Document classification model does not exist (yet), not " @@ -103,6 +103,8 @@ class DocumentClassifier: self._stop_words = None def load(self) -> None: + from sklearn.exceptions import InconsistentVersionWarning + # Catch warnings for processing with warnings.catch_warnings(record=True) as w: with Path(settings.MODEL_FILE).open("rb") as f: diff --git a/src/documents/filters.py b/src/documents/filters.py index 1ce782ee6..b63da50e6 100644 --- a/src/documents/filters.py +++ b/src/documents/filters.py @@ -1,9 +1,11 @@ +from __future__ import annotations + import functools import inspect import json import operator -from collections.abc import Callable from contextlib import contextmanager +from typing import TYPE_CHECKING from django.contrib.contenttypes.models import ContentType from django.db.models import Case @@ -39,6 +41,9 @@ from documents.models import ShareLink from documents.models import StoragePath from documents.models import Tag +if TYPE_CHECKING: + from collections.abc import Callable + CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"] ID_KWARGS = ["in", "exact"] INT_KWARGS = ["exact", "gt", "gte", "lt", "lte", "isnull"] diff --git a/src/documents/index.py b/src/documents/index.py index 4b11325ff..9b3a1724c 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import math from collections import Counter @@ -5,10 +7,10 @@ from contextlib import contextmanager from datetime import datetime from datetime import timezone from shutil import rmtree +from typing import TYPE_CHECKING from typing import Literal from django.conf import settings -from django.db.models import QuerySet from django.utils import timezone as django_timezone from guardian.shortcuts import get_users_with_perms from whoosh import classify @@ -32,10 +34,7 @@ from whoosh.qparser import QueryParser from whoosh.qparser.dateparse import DateParserPlugin from whoosh.qparser.dateparse import English from whoosh.qparser.plugins import FieldsPlugin -from whoosh.reading import IndexReader from whoosh.scoring import TF_IDF -from whoosh.searching import ResultsPage -from whoosh.searching import Searcher from whoosh.util.times import timespan from whoosh.writing import AsyncWriter @@ -44,6 +43,12 @@ from documents.models import Document from documents.models import Note from documents.models import User +if TYPE_CHECKING: + from django.db.models import QuerySet + from whoosh.reading import IndexReader + from whoosh.searching import ResultsPage + from whoosh.searching import Searcher + logger = logging.getLogger("paperless.index") diff --git a/src/documents/matching.py b/src/documents/matching.py index 59c0ccfda..ab3866518 100644 --- a/src/documents/matching.py +++ b/src/documents/matching.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import logging import re from fnmatch import fnmatch +from typing import TYPE_CHECKING -from documents.classifier import DocumentClassifier from documents.data_models import ConsumableDocument from documents.data_models import DocumentSource from documents.models import Correspondent @@ -15,6 +17,9 @@ from documents.models import Workflow from documents.models import WorkflowTrigger from documents.permissions import get_objects_for_user_owner_aware +if TYPE_CHECKING: + from documents.classifier import DocumentClassifier + logger = logging.getLogger("paperless.matching") diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 28d903fdd..1465234a9 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -1,4 +1,5 @@ -import datetime +from __future__ import annotations + import logging import mimetypes import os @@ -6,10 +7,10 @@ import re import shutil import subprocess import tempfile -from collections.abc import Iterator from functools import lru_cache from pathlib import Path from re import Match +from typing import TYPE_CHECKING from django.conf import settings from django.utils import timezone @@ -19,6 +20,10 @@ from documents.signals import document_consumer_declaration from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +if TYPE_CHECKING: + import datetime + from collections.abc import Iterator + # This regular expression will try to find dates in the document at # hand and will match the following formats: # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits @@ -106,7 +111,7 @@ def get_supported_file_extensions() -> set[str]: return extensions -def get_parser_class_for_mime_type(mime_type: str) -> type["DocumentParser"] | None: +def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None: """ Returns the best parser (by weight) for the given mimetype or None if no parser exists diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index aeba5a721..a486fe241 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -1,10 +1,12 @@ +from __future__ import annotations + import datetime import logging import math import re import zoneinfo -from collections.abc import Iterable from decimal import Decimal +from typing import TYPE_CHECKING import magic from celery import states @@ -32,6 +34,7 @@ from rest_framework.fields import SerializerMethodField if settings.AUDIT_LOG_ENABLED: from auditlog.context import set_actor + from documents import bulk_edit from documents.data_models import DocumentSource from documents.models import Correspondent @@ -60,6 +63,9 @@ from documents.templating.utils import convert_format_str_to_template_format from documents.validators import uri_validator from documents.validators import url_validator +if TYPE_CHECKING: + from collections.abc import Iterable + logger = logging.getLogger("paperless.serializers") diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index 0079e5f8c..4345e04d5 100644 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import logging import os import shutil -from pathlib import Path +from typing import TYPE_CHECKING import httpx from celery import shared_task @@ -23,9 +25,6 @@ from guardian.shortcuts import remove_perm from documents import matching from documents.caching import clear_document_caches -from documents.classifier import DocumentClassifier -from documents.data_models import ConsumableDocument -from documents.data_models import DocumentMetadataOverrides from documents.file_handling import create_source_path_directory from documents.file_handling import delete_empty_directories from documents.file_handling import generate_unique_filename @@ -46,6 +45,13 @@ from documents.permissions import get_objects_for_user_owner_aware from documents.permissions import set_permissions_for_object from documents.templating.workflows import parse_w_workflow_placeholders +if TYPE_CHECKING: + from pathlib import Path + + from documents.classifier import DocumentClassifier + from documents.data_models import ConsumableDocument + from documents.data_models import DocumentMetadataOverrides + logger = logging.getLogger("paperless.handlers")