diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6848178e1..32daae6ec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -549,7 +549,7 @@ jobs: - name: Generate requirements file run: | - uv export --quiet --no-dev --format requirements-txt --output-file requirements.txt + uv export --quiet --no-dev --all-extras --format requirements-txt --output-file requirements.txt - name: Compile messages run: | diff --git a/Dockerfile b/Dockerfile index 10fbab8e0..e64f1708c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -205,9 +205,6 @@ COPY --chown=1000:1000 ["pyproject.toml", "uv.lock", "/usr/src/paperless/src/"] # dependencies ARG BUILD_PACKAGES="\ build-essential \ - git \ - # https://www.psycopg.org/docs/install.html#prerequisites - libpq-dev \ # https://github.com/PyMySQL/mysqlclient#linux default-libmysqlclient-dev \ pkg-config" @@ -219,7 +216,7 @@ RUN --mount=type=cache,target=${UV_CACHE_DIR},id=python-cache \ && apt-get update \ && apt-get install --yes --quiet --no-install-recommends ${BUILD_PACKAGES} \ && echo "Installing Python requirements" \ - && uv export --quiet --no-dev --format requirements-txt --output-file requirements.txt \ + && uv export --quiet --no-dev --all-extras --format requirements-txt --output-file requirements.txt \ && uv pip install --system --no-python-downloads --python-preference system --requirements requirements.txt \ && echo "Installing NLTK data" \ && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/share/nltk_data" snowball_data \ diff --git a/docs/setup.md b/docs/setup.md index 55b52e387..75754766e 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -380,6 +380,12 @@ are released, dependency support is confirmed, etc. dependencies. This is an alternative to the above and may require adjusting the example scripts to utilize the virtual environment paths + !!! tip + + If you use modern Python tooling, such as `uv`, installation will not include + dependencies for Postgres or Mariadb. 
You can select those extras with `--extra <EXTRA>` + or all with `--all-extras` + 9. Go to `/opt/paperless/src`, and execute the following commands: ```bash diff --git a/pyproject.toml b/pyproject.toml index 291033c13..d26d05aa3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,20 +42,15 @@ dependencies = [ "filelock~=3.17.0", "flower~=2.0.1", "gotenberg-client~=0.9.0", - "granian~=1.7.6", "httpx-oauth~=0.16", "imap-tools~=1.10.0", "inotifyrecursive~=0.3", "jinja2~=3.1.5", "langdetect~=1.0.9", - "mysqlclient~=2.2.7", "nltk~=3.9.1", "ocrmypdf~=16.9.0", "pathvalidate~=3.2.3", "pdf2image~=1.17.0", - "psycopg[c]==3.2.4", - # Direct dependency for proper resolution of the pre-build wheels - "psycopg-c==3.2.4", "python-dateutil~=2.9.0", "python-dotenv~=1.0.1", "python-gnupg~=0.5.4", @@ -74,6 +69,18 @@ dependencies = [ "zxing-cpp~=2.3.0", ] +optional-dependencies.mariadb = [ + "mysqlclient~=2.2.7", +] +optional-dependencies.postgres = [ + "psycopg[c]==3.2.4", + # Direct dependency for proper resolution of the pre-built wheels + "psycopg-c==3.2.4", +] +optional-dependencies.webserver = [ + "granian~=1.7.6", +] + [dependency-groups] dev = [ diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 81739fa7a..4bf9ab89b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -26,7 +26,6 @@ from documents.models import CustomField from documents.models import CustomFieldInstance from documents.models import Document from documents.models import DocumentType -from documents.models import FileInfo from documents.models import StoragePath from documents.models import Tag from documents.models import WorkflowTrigger @@ -705,8 +704,6 @@ class ConsumerPlugin( ) -> Document: # If someone gave us the original filename, use it instead of doc. 
- file_info = FileInfo.from_filename(self.filename) - self.log.debug("Saving record to database") if self.metadata.created is not None: @@ -714,9 +711,6 @@ class ConsumerPlugin( self.log.debug( f"Creation date from post_documents parameter: {create_date}", ) - elif file_info.created is not None: - create_date = file_info.created - self.log.debug(f"Creation date from FileInfo: {create_date}") elif date is not None: create_date = date self.log.debug(f"Creation date from parse_date: {create_date}") @@ -729,7 +723,11 @@ class ConsumerPlugin( storage_type = Document.STORAGE_TYPE_UNENCRYPTED - title = file_info.title + if self.metadata.filename: + title = Path(self.metadata.filename).stem + else: + title = self.input_doc.original_file.stem + if self.metadata.title is not None: try: title = self._parse_title_placeholders(self.metadata.title) diff --git a/src/documents/models.py b/src/documents/models.py index 57ff96df1..e40ee8115 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,11 +1,7 @@ import datetime -import os -import re -from collections import OrderedDict from pathlib import Path from typing import Final -import dateutil.parser import pathvalidate from celery import states from django.conf import settings @@ -517,91 +513,6 @@ class SavedViewFilterRule(models.Model): return f"SavedViewFilterRule: {self.rule_type} : {self.value}" -# TODO: why is this in the models file? -# TODO: how about, what is this and where is it documented? 
-# It appears to parsing JSON from an environment variable to get a title and date from -# the filename, if possible, as a higher priority than either document filename or -# content parsing -class FileInfo: - REGEXES = OrderedDict( - [ - ( - "created-title", - re.compile( - r"^(?P<created>\d{8}(\d{6})?Z) - (?P<title>.*)$", - flags=re.IGNORECASE, - ), - ), - ("title", re.compile(r"(?P<title>.*)$", flags=re.IGNORECASE)), - ], - ) - - def __init__( - self, - created=None, - correspondent=None, - title=None, - tags=(), - extension=None, - ): - self.created = created - self.title = title - self.extension = extension - self.correspondent = correspondent - self.tags = tags - - @classmethod - def _get_created(cls, created): - try: - return dateutil.parser.parse(f"{created[:-1]:0<14}Z") - except ValueError: - return None - - @classmethod - def _get_title(cls, title): - return title - - @classmethod - def _mangle_property(cls, properties, name): - if name in properties: - properties[name] = getattr(cls, f"_get_{name}")(properties[name]) - - @classmethod - def from_filename(cls, filename) -> "FileInfo": - # Mutate filename in-place before parsing its components - # by applying at most one of the configured transformations. - for pattern, repl in settings.FILENAME_PARSE_TRANSFORMS: - (filename, count) = pattern.subn(repl, filename) - if count: - break - - # do this after the transforms so that the transforms can do whatever - # with the file extension. - filename_no_ext = os.path.splitext(filename)[0] - - if filename_no_ext == filename and filename.startswith("."): - # This is a very special case where there is no text before the - # file type. - # TODO: this should be handled better. The ext is not removed - # because usually, files like '.pdf' are just hidden files - # with the name pdf, but in our case, its more likely that - # there's just no name to begin with. - filename = "" - # This isn't too bad either, since we'll just not match anything - # and return an empty title. 
TODO: actually, this is kinda bad. - else: - filename = filename_no_ext - - # Parse filename components. - for regex in cls.REGEXES.values(): - m = regex.match(filename) - if m: - properties = m.groupdict() - cls._mangle_property(properties, "created") - cls._mangle_property(properties, "title") - return cls(**properties) - - # Extending User Model Using a One-To-One Link class UiSettings(models.Model): user = models.OneToOneField( diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 6f576ab24..ff684804e 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,12 +1,10 @@ import datetime import os -import re import shutil import stat import tempfile import zoneinfo from pathlib import Path -from unittest import TestCase as UnittestTestCase from unittest import mock from unittest.mock import MagicMock @@ -26,7 +24,6 @@ from documents.models import Correspondent from documents.models import CustomField from documents.models import Document from documents.models import DocumentType -from documents.models import FileInfo from documents.models import StoragePath from documents.models import Tag from documents.parsers import DocumentParser @@ -40,143 +37,6 @@ from paperless_mail.models import MailRule from paperless_mail.parsers import MailDocumentParser -class TestAttributes(UnittestTestCase): - TAGS = ("tag1", "tag2", "tag3") - - def _test_guess_attributes_from_name(self, filename, sender, title, tags): - file_info = FileInfo.from_filename(filename) - - if sender: - self.assertEqual(file_info.correspondent.name, sender, filename) - else: - self.assertIsNone(file_info.correspondent, filename) - - self.assertEqual(file_info.title, title, filename) - - self.assertEqual(tuple(t.name for t in file_info.tags), tags, filename) - - def test_guess_attributes_from_name_when_title_starts_with_dash(self): - self._test_guess_attributes_from_name( - "- weird but should not break.pdf", - None, - "- 
weird but should not break", - (), - ) - - def test_guess_attributes_from_name_when_title_ends_with_dash(self): - self._test_guess_attributes_from_name( - "weird but should not break -.pdf", - None, - "weird but should not break -", - (), - ) - - -class TestFieldPermutations(TestCase): - valid_dates = ( - "20150102030405Z", - "20150102Z", - ) - valid_correspondents = ["timmy", "Dr. McWheelie", "Dash Gor-don", "o Θεpμaoτής", ""] - valid_titles = ["title", "Title w Spaces", "Title a-dash", "Tίτλoς", ""] - valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"] - - def _test_guessed_attributes( - self, - filename, - created=None, - correspondent=None, - title=None, - tags=None, - ): - info = FileInfo.from_filename(filename) - - # Created - if created is None: - self.assertIsNone(info.created, filename) - else: - self.assertEqual(info.created.year, int(created[:4]), filename) - self.assertEqual(info.created.month, int(created[4:6]), filename) - self.assertEqual(info.created.day, int(created[6:8]), filename) - - # Correspondent - if correspondent: - self.assertEqual(info.correspondent.name, correspondent, filename) - else: - self.assertEqual(info.correspondent, None, filename) - - # Title - self.assertEqual(info.title, title, filename) - - # Tags - if tags is None: - self.assertEqual(info.tags, (), filename) - else: - self.assertEqual([t.name for t in info.tags], tags.split(","), filename) - - def test_just_title(self): - template = "{title}.pdf" - for title in self.valid_titles: - spec = dict(title=title) - filename = template.format(**spec) - self._test_guessed_attributes(filename, **spec) - - def test_created_and_title(self): - template = "{created} - {title}.pdf" - - for created in self.valid_dates: - for title in self.valid_titles: - spec = {"created": created, "title": title} - self._test_guessed_attributes(template.format(**spec), **spec) - - def test_invalid_date_format(self): - info = FileInfo.from_filename("06112017Z - title.pdf") - self.assertEqual(info.title, 
"title") - self.assertIsNone(info.created) - - def test_filename_parse_transforms(self): - filename = "tag1,tag2_20190908_180610_0001.pdf" - all_patt = re.compile("^.*$") - none_patt = re.compile("$a") - re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.") - - # No transformations configured (= default) - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") - self.assertEqual(info.tags, ()) - self.assertIsNone(info.created) - - # Pattern doesn't match (filename unaltered) - with self.settings(FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") - - # Simple transformation (match all) - with self.settings(FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "all") - - # Multiple transformations configured (first pattern matches) - with self.settings( - FILENAME_PARSE_TRANSFORMS=[ - (all_patt, "all.gif"), - (all_patt, "anotherall.gif"), - ], - ): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "all") - - # Multiple transformations configured (second pattern matches) - with self.settings( - FILENAME_PARSE_TRANSFORMS=[ - (none_patt, "none.gif"), - (all_patt, "anotherall.gif"), - ], - ): - info = FileInfo.from_filename(filename) - self.assertEqual(info.title, "anotherall") - - class _BaseTestParser(DocumentParser): def get_settings(self): """ diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 0c8c71ab9..ff1829528 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -3,7 +3,6 @@ import json import math import multiprocessing import os -import re import tempfile from os import PathLike from pathlib import Path @@ -1089,11 +1088,6 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") # fewer dates shown. 
NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3) -# Transformations applied before filename parsing -FILENAME_PARSE_TRANSFORMS = [] -for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")): - FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"])) - # Specify the filename format for out files FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") diff --git a/uv.lock b/uv.lock index 61fa6f255..d4b0a45d4 100644 --- a/uv.lock +++ b/uv.lock @@ -1,4 +1,5 @@ version = 1 +revision = 1 requires-python = ">=3.10" resolution-markers = [ "sys_platform == 'darwin'", @@ -1840,7 +1841,7 @@ wheels = [ [[package]] name = "paperless-ngx" -version = "0.1.0" +version = "2.14.7" source = { virtual = "." } dependencies = [ { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -1868,21 +1869,15 @@ dependencies = [ { name = "filelock", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "flower", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "gotenberg-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "granian", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "httpx-oauth", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "imap-tools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "inotifyrecursive", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "langdetect", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "mysqlclient", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "nltk", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "ocrmypdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pathvalidate", 
marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pdf2image", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "psycopg", extra = ["c"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "psycopg-c", version = "3.2.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version != '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version != '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, - { name = "psycopg-c", version = "3.2.4", source = { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.4/psycopg_c-3.2.4-cp312-cp312-linux_aarch64.whl" }, marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "psycopg-c", version = "3.2.4", source = { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.4/psycopg_c-3.2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "python-gnupg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -1903,6 +1898,20 @@ dependencies = [ { name = "zxing-cpp", version = "2.3.0", source = { url = "https://github.com/paperless-ngx/builder/releases/download/zxing-2.3.0/zxing_cpp-2.3.0-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] +[package.optional-dependencies] +mariadb = [ + { name = "mysqlclient", marker = "sys_platform == 'darwin' or sys_platform == 
'linux'" }, +] +postgres = [ + { name = "psycopg", extra = ["c"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "psycopg-c", version = "3.2.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version != '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version != '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "psycopg-c", version = "3.2.4", source = { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.4/psycopg_c-3.2.4-cp312-cp312-linux_aarch64.whl" }, marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "psycopg-c", version = "3.2.4", source = { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.4/psycopg_c-3.2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +webserver = [ + { name = "granian", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, +] + [package.dev-dependencies] dev = [ { name = "daphne", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -1990,21 +1999,21 @@ requires-dist = [ { name = "filelock", specifier = "~=3.17.0" }, { name = "flower", specifier = "~=2.0.1" }, { name = "gotenberg-client", specifier = "~=0.9.0" }, - { name = "granian", specifier = "~=1.7.6" }, + { name = "granian", marker = "extra == 'webserver'", specifier = "~=1.7.6" }, { name = "httpx-oauth", specifier = "~=0.16" }, { name = "imap-tools", specifier = "~=1.10.0" }, { name = "inotifyrecursive", specifier = "~=0.3" }, { name = "jinja2", specifier = "~=3.1.5" }, { name = "langdetect", specifier = "~=1.0.9" }, - { name = "mysqlclient", specifier = "~=2.2.7" }, + { name = 
"mysqlclient", marker = "extra == 'mariadb'", specifier = "~=2.2.7" }, { name = "nltk", specifier = "~=3.9.1" }, { name = "ocrmypdf", specifier = "~=16.9.0" }, { name = "pathvalidate", specifier = "~=3.2.3" }, { name = "pdf2image", specifier = "~=1.17.0" }, - { name = "psycopg", extras = ["c"], specifier = "==3.2.4" }, - { name = "psycopg-c", marker = "(python_full_version != '3.12.*' and platform_machine == 'aarch64') or (python_full_version != '3.12.*' and platform_machine == 'x86_64') or (platform_machine != 'aarch64' and platform_machine != 'x86_64') or sys_platform != 'linux'", specifier = "==3.2.4" }, - { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.4/psycopg_c-3.2.4-cp312-cp312-linux_aarch64.whl" }, - { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.4/psycopg_c-3.2.4-cp312-cp312-linux_x86_64.whl" }, + { name = "psycopg", extras = ["c"], marker = "extra == 'postgres'", specifier = "==3.2.4" }, + { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.4/psycopg_c-3.2.4-cp312-cp312-linux_aarch64.whl" }, + { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.4/psycopg_c-3.2.4-cp312-cp312-linux_x86_64.whl" }, + { name = "psycopg-c", marker = "(python_full_version != '3.12.*' and platform_machine == 'aarch64' and extra == 'postgres') or (python_full_version != '3.12.*' and platform_machine == 'x86_64' and extra 
== 'postgres') or (platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'postgres') or (sys_platform != 'linux' and extra == 'postgres')", specifier = "==3.2.4" }, { name = "python-dateutil", specifier = "~=2.9.0" }, { name = "python-dotenv", specifier = "~=1.0.1" }, { name = "python-gnupg", specifier = "~=0.5.4" }, @@ -2024,6 +2033,7 @@ requires-dist = [ { name = "zxing-cpp", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", url = "https://github.com/paperless-ngx/builder/releases/download/zxing-2.3.0/zxing_cpp-2.3.0-cp312-cp312-linux_aarch64.whl" }, { name = "zxing-cpp", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'", url = "https://github.com/paperless-ngx/builder/releases/download/zxing-2.3.0/zxing_cpp-2.3.0-cp312-cp312-linux_x86_64.whl" }, ] +provides-extras = ["mariadb", "postgres", "webserver"] [package.metadata.requires-dev] dev = [