From ff25c8025a15e0f0cad0574791ec39138810a6f8 Mon Sep 17 00:00:00 2001
From: Marcel2508
Date: Sat, 14 Dec 2024 21:18:25 +0100
Subject: [PATCH] feat: add xrechnung to pdf visualization/parser

---
 .pre-commit-config.yaml                       |   3 +-
 Dockerfile                                    |   6 +-
 src/paperless/settings.py                     |   1 +
 src/paperless_xml/__init__.py                 |   0
 src/paperless_xml/apps.py                     |  14 +++
 src/paperless_xml/parsers.py                  | 135 ++++++++++++++++++
 src/paperless_xml/signals.py                  |  16 +++
 src/paperless_xml/tests/__init__.py           |   0
 src/paperless_xml/tests/conftest.py           |  30 +++++
 .../tests/samples/decode_error.txt            |   1 +
 src/paperless_xml/tests/samples/test.txt      |   1 +
 src/paperless_xml/tests/test_parser.py        |  37 ++++++
 12 files changed, 242 insertions(+), 2 deletions(-)
 create mode 100644 src/paperless_xml/__init__.py
 create mode 100644 src/paperless_xml/apps.py
 create mode 100644 src/paperless_xml/parsers.py
 create mode 100644 src/paperless_xml/signals.py
 create mode 100644 src/paperless_xml/tests/__init__.py
 create mode 100644 src/paperless_xml/tests/conftest.py
 create mode 100644 src/paperless_xml/tests/samples/decode_error.txt
 create mode 100644 src/paperless_xml/tests/samples/test.txt
 create mode 100644 src/paperless_xml/tests/test_parser.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 76a884747..fe9e3fcaa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,8 @@
 # This file configures pre-commit hooks.
 # See https://pre-commit.com/ for general information
 # See https://pre-commit.com/hooks.html for a listing of possible hooks
-
+default_language_version:
+  python: python3.11
 repos:
   # General hooks
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/Dockerfile b/Dockerfile
index 6e6bf6977..550f86690 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -118,7 +118,9 @@ ARG RUNTIME_PACKAGES="\
   zlib1g \
   # Barcode splitter
   libzbar0 \
-  poppler-utils"
+  poppler-utils \
+  # XRechnung
+  default-jre"

 # Install basic runtime packages.
 # These change very infrequently
@@ -160,6 +162,8 @@ RUN set -eux \
   && echo "Installing supervisor" \
     && python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5

+RUN curl --fail --location --output /usr/local/bin/mustang-cli.jar https://github.com/ZUGFeRD/mustangproject/releases/download/core-2.15.1/Mustang-CLI-2.15.1.jar && chmod +x /usr/local/bin/mustang-cli.jar
+
 # Copy gunicorn config
 # Changes very infrequently
 WORKDIR /usr/src/paperless/
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index a32c78ef5..26b46e8fc 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -318,6 +318,7 @@ INSTALLED_APPS = [
     "paperless_tesseract.apps.PaperlessTesseractConfig",
     "paperless_text.apps.PaperlessTextConfig",
     "paperless_mail.apps.PaperlessMailConfig",
+    "paperless_xml.apps.PaperlessXMLConfig",
     "django.contrib.admin",
     "rest_framework",
     "rest_framework.authtoken",
diff --git a/src/paperless_xml/__init__.py b/src/paperless_xml/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/paperless_xml/apps.py b/src/paperless_xml/apps.py
new file mode 100644
index 000000000..fc7e6ea06
--- /dev/null
+++ b/src/paperless_xml/apps.py
@@ -0,0 +1,14 @@
+from django.apps import AppConfig
+
+from paperless_xml.signals import xml_consumer_declaration
+
+
+class PaperlessXMLConfig(AppConfig):
+    name = "paperless_xml"
+
+    def ready(self):
+        from documents.signals import document_consumer_declaration
+
+        document_consumer_declaration.connect(xml_consumer_declaration)
+
+        AppConfig.ready(self)
diff --git a/src/paperless_xml/parsers.py b/src/paperless_xml/parsers.py
new file mode 100644
index 000000000..d8f5a1f58
--- /dev/null
+++ b/src/paperless_xml/parsers.py
@@ -0,0 +1,135 @@
+import shutil
+import subprocess
+from pathlib import Path
+
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from paperless_text.parsers import TextDocumentParser
+
+# Where the Dockerfile installs the Mustang CLI; used when the jar is not
+# found on PATH.
+MUSTANG_JAR = "/usr/local/bin/mustang-cli.jar"
+# Seconds to wait for a single Mustang invocation.
+MUSTANG_TIMEOUT = 20
+
+
+class XMLDocumentParser(TextDocumentParser):
+    """
+    This parser parses an XML document (.xml).
+
+    The text is extracted by the plain text parser. If Mustang validates the
+    file as an invoice, a PDF rendering with the XML attached is created and
+    used as the archive version and as the source of the thumbnail.
+    """
+
+    logging_name = "paperless.parsing.xml"
+
+    is_invoice = False
+
+    def _run_mustang(self, *args) -> int:
+        """
+        Runs the Mustang CLI with the given arguments and returns its exit code.
+
+        The jar is started through "java -jar" because executing a jar directly
+        only works on hosts with binfmt_misc support for jar files. Launch
+        failures and timeouts are raised as ParseError.
+        """
+        jar = shutil.which("mustang-cli.jar") or MUSTANG_JAR
+        try:
+            res = subprocess.run(
+                ["java", "-jar", jar, *map(str, args)],
+                timeout=MUSTANG_TIMEOUT,
+            )
+        except (OSError, subprocess.TimeoutExpired) as err:
+            raise ParseError(f"Mustang CLI could not be run: {err}") from err
+        return res.returncode
+
+    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
+        if self.is_invoice:
+            return make_thumbnail_from_pdf(
+                self.archive_path,
+                self.tempdir,
+                self.logging_group,
+            )
+        return super().get_thumbnail(document_path, mime_type, file_name)
+
+    def xml_to_pdf_mustang(
+        self,
+        document_path: Path,
+        mime_type,
+        file_name=None,
+    ) -> Path:
+        """
+        Renders the invoice XML to a PDF inside the temporary directory.
+
+        Raises ParseError if Mustang exits with a non-zero code.
+        """
+        outpdf = Path(self.tempdir, "out.pdf")
+        returncode = self._run_mustang(
+            "--action",
+            "pdf",
+            "--source",
+            document_path,
+            "--out",
+            outpdf,
+        )
+        if returncode != 0:
+            raise ParseError(f"Mustang CLI exited with code: {returncode}")
+        return outpdf
+
+    def attach_xml_pdf_mustang(self, pdf_path, xml_path) -> Path:
+        """
+        Combines the rendered PDF and the invoice XML into a single PDF.
+
+        Raises ParseError if Mustang exits with a non-zero code.
+        """
+        outpdf = Path(self.tempdir, "combined.pdf")
+        returncode = self._run_mustang(
+            "--action",
+            "combine",
+            "--source",
+            pdf_path,
+            "--source-xml",
+            xml_path,
+            "--format",
+            "zf",
+            "--version",
+            "2",
+            "--profile",
+            "X",
+            "--no-additional-attachments",
+            "--out",
+            outpdf,
+        )
+        if returncode != 0:
+            raise ParseError(f"Mustang CLI exited with code: {returncode}")
+        return outpdf
+
+    def is_xrechnung_mustang(
+        self,
+        document_path: Path,
+        mime_type,
+        file_name=None,
+    ) -> bool:
+        """
+        Returns True if Mustang's validation of the document succeeds.
+        """
+        returncode = self._run_mustang(
+            "--action",
+            "validate",
+            "--source",
+            document_path,
+            "--no-notices",
+        )
+        return returncode == 0
+
+    def parse(self, document_path, mime_type, file_name=None):
+        super().parse(document_path, mime_type, file_name)
+        self.is_invoice = self.is_xrechnung_mustang(
+            document_path,
+            mime_type,
+            file_name,
+        )
+        if self.is_invoice:
+            pdf_only = self.xml_to_pdf_mustang(document_path, mime_type, file_name)
+            self.archive_path = self.attach_xml_pdf_mustang(pdf_only, document_path)
diff --git a/src/paperless_xml/signals.py b/src/paperless_xml/signals.py
new file mode 100644
index 000000000..a8924ba21
--- /dev/null
+++ b/src/paperless_xml/signals.py
@@ -0,0 +1,16 @@
+def get_parser(*args, **kwargs):
+    from paperless_xml.parsers import XMLDocumentParser
+
+    return XMLDocumentParser(*args, **kwargs)
+
+
+def xml_consumer_declaration(sender, **kwargs):
+    return {
+        "parser": get_parser,
+        "weight": 11,
+        "mime_types": {
+            "text/plain": ".txt",
+            "text/xml": ".xml",
+            "application/xml": ".xml",
+        },
+    }
diff --git a/src/paperless_xml/tests/__init__.py b/src/paperless_xml/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/paperless_xml/tests/conftest.py b/src/paperless_xml/tests/conftest.py
new file mode 100644
index 000000000..1d9e4fc2f
--- /dev/null
+++ b/src/paperless_xml/tests/conftest.py
@@ -0,0 +1,30 @@
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_text.parsers import TextDocumentParser
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture()
+def text_parser() -> Generator[TextDocumentParser, None, None]:
+    try:
+        parser = TextDocumentParser(logging_group=None)
+        yield parser
+    finally:
+        parser.cleanup()
+
+
+@pytest.fixture(scope="session")
+def sample_txt_file(sample_dir: Path) -> Path:
+    return sample_dir / "test.txt"
+
+
+@pytest.fixture(scope="session")
+def malformed_txt_file(sample_dir: Path) -> Path:
+    return sample_dir / "decode_error.txt"
diff --git a/src/paperless_xml/tests/samples/decode_error.txt b/src/paperless_xml/tests/samples/decode_error.txt
new file mode 100644
index 000000000..2137cd2b7
--- /dev/null
+++ b/src/paperless_xml/tests/samples/decode_error.txt
@@ -0,0 +1 @@
+Pantothensäure
diff --git a/src/paperless_xml/tests/samples/test.txt b/src/paperless_xml/tests/samples/test.txt
new file mode 100644
index 000000000..6de7b8c69
--- /dev/null
+++ b/src/paperless_xml/tests/samples/test.txt
@@ -0,0 +1 @@
+This is a test file.
diff --git a/src/paperless_xml/tests/test_parser.py b/src/paperless_xml/tests/test_parser.py
new file mode 100644
index 000000000..0f8cc19ba
--- /dev/null
+++ b/src/paperless_xml/tests/test_parser.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+from paperless_text.parsers import TextDocumentParser
+
+
+class TestTextParser:
+    def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
+        # just make sure that it does not crash
+        f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
+        assert f.exists()
+        assert f.is_file()
+
+    def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
+        text_parser.parse(sample_txt_file, "text/plain")
+
+        assert text_parser.get_text() == "This is a test file.\n"
+        assert text_parser.get_archive_path() is None
+
+    def test_parse_invalid_bytes(
+        self,
+        text_parser: TextDocumentParser,
+        malformed_txt_file: Path,
+    ):
+        """
+        GIVEN:
+            - Text file which contains invalid UTF bytes
+        WHEN:
+            - The file is parsed
+        THEN:
+            - Parsing continues
+            - Invalid bytes are removed
+        """
+
+        text_parser.parse(malformed_txt_file, "text/plain")
+
+        assert text_parser.get_text() == "Pantothens�ure\n"
+        assert text_parser.get_archive_path() is None