feat: add xrechnung to pdf visualization/parser

2025-07-26 18:14:37 -05:00 · 2024-12-14 21:18:25 +01:00 · 2024-12-14 21:18:25 +01:00 · ff25c8025a
commit ff25c8025a
parent 740bb39d7a
12 changed files with 214 additions and 2 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,8 @@
 # This file configures pre-commit hooks.
 # See https://pre-commit.com/ for general information
 # See https://pre-commit.com/hooks.html for a listing of possible hooks
-
+default_language_version:
+  python: python3.11
 repos:
  # General hooks
  - repo: https://github.com/pre-commit/pre-commit-hooks
--- a/Dockerfile
+++ b/Dockerfile
@@ -118,7 +118,9 @@ ARG RUNTIME_PACKAGES="\
  zlib1g \
  # Barcode splitter
  libzbar0 \
-  poppler-utils"
+  poppler-utils \
+  # XRechnung
+  default-jre"

 # Install basic runtime packages.
 # These change very infrequently
@@ -160,6 +162,9 @@ RUN set -eux \
  && echo "Installing supervisor" \
    && python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5

+# Mustang CLI, invoked by the XML parser. -L follows the GitHub release redirect; -f fails the build on an HTTP error instead of saving the error body.
+RUN curl -fsSL -o /usr/local/bin/mustang-cli.jar https://github.com/ZUGFeRD/mustangproject/releases/download/core-2.15.1/Mustang-CLI-2.15.1.jar && chmod +x /usr/local/bin/mustang-cli.jar
+
 # Copy gunicorn config
 # Changes very infrequently
 WORKDIR /usr/src/paperless/
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -318,6 +318,7 @@ INSTALLED_APPS = [
    "paperless_tesseract.apps.PaperlessTesseractConfig",
    "paperless_text.apps.PaperlessTextConfig",
    "paperless_mail.apps.PaperlessMailConfig",
+    "paperless_xml.apps.PaperlessXMLConfig",
    "django.contrib.admin",
    "rest_framework",
    "rest_framework.authtoken",
--- a/src/paperless_xml/__init__.py
+++ b/src/paperless_xml/__init__.py
--- a/src/paperless_xml/apps.py
+++ b/src/paperless_xml/apps.py
@@ -0,0 +1,17 @@
+from django.apps import AppConfig
+
+from paperless_xml.signals import xml_consumer_declaration
+
+
+class PaperlessXMLConfig(AppConfig):
+    """Django app config that registers the XML parser with the consumer."""
+
+    name = "paperless_xml"
+
+    def ready(self):
+        """Connect the XML parser declaration to the consumer signal."""
+        # Deferred import: resolved only when ready() is called.
+        from documents.signals import document_consumer_declaration
+
+        document_consumer_declaration.connect(xml_consumer_declaration)
+        super().ready()
--- a/src/paperless_xml/parsers.py
+++ b/src/paperless_xml/parsers.py
@@ -0,0 +1,146 @@
+import shutil
+import subprocess
+from pathlib import Path
+
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from paperless_text.parsers import TextDocumentParser
+
+
+class XMLDocumentParser(TextDocumentParser):
+    """
+    This parser parses an XML document (.xml).
+
+    After the text has been extracted, the document is validated with the
+    Mustang CLI. If validation succeeds it is rendered to a PDF with the
+    source XML attached, and that PDF is used as archive file and as
+    thumbnail source.
+    """
+
+    logging_name = "paperless.parsing.xml"
+
+    # Name the Mustang CLI jar is looked up under on PATH, and the location
+    # used when that lookup fails.
+    MUSTANG_JAR_NAME = "mustang-cli.jar"
+    MUSTANG_JAR_FALLBACK = "/usr/local/bin/mustang-cli.jar"
+    # Seconds a single Mustang CLI call may take.
+    MUSTANG_TIMEOUT = 20
+
+    # Set by parse(): True when the document passed Mustang validation.
+    is_invoice = False
+
+    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
+        """
+        Return the thumbnail path: made from the archive PDF for invoices,
+        otherwise delegated to the text parser.
+        """
+        if not self.is_invoice:
+            return super().get_thumbnail(document_path, mime_type, file_name)
+        return make_thumbnail_from_pdf(
+            self.archive_path,
+            self.tempdir,
+            self.logging_group,
+        )
+
+    def _run_mustang(self, action: str, *args) -> int:
+        """
+        Run one Mustang CLI action and return the process exit code.
+
+        The jar is started through "java -jar": a jar is not a native
+        executable, so invoking it directly only works on hosts that have a
+        binfmt handler registered for it.
+
+        Raises ParseError when the process cannot be started or exceeds
+        MUSTANG_TIMEOUT, instead of leaking OSError / TimeoutExpired.
+        """
+        jar = shutil.which(self.MUSTANG_JAR_NAME) or self.MUSTANG_JAR_FALLBACK
+        command = ["java", "-jar", jar, "--action", action, *map(str, args)]
+        try:
+            res = subprocess.run(command, timeout=self.MUSTANG_TIMEOUT)
+        except subprocess.TimeoutExpired as err:
+            raise ParseError(f"Mustang CLI timed out during '{action}'") from err
+        except OSError as err:
+            raise ParseError(f"Mustang CLI could not be started: {err}") from err
+        return res.returncode
+
+    def xml_to_pdf_mustang(
+        self,
+        document_path: Path,
+        mime_type,
+        file_name=None,
+    ) -> Path:
+        """
+        Render document_path to "out.pdf" in the temp dir and return its path.
+
+        mime_type and file_name are not used.
+
+        Raises ParseError if the Mustang CLI does not exit with code 0.
+        """
+        outpdf = Path(self.tempdir, "out.pdf")
+        code = self._run_mustang("pdf", "--source", document_path, "--out", outpdf)
+        if code != 0:
+            raise ParseError(f"Mustang CLI exited with code: {code}")
+        return outpdf
+
+    def attach_xml_pdf_mustang(self, pdf_path, xml_path) -> Path:
+        """
+        Combine pdf_path and xml_path into "combined.pdf" in the temp dir and
+        return its path.
+
+        Raises ParseError if the Mustang CLI does not exit with code 0.
+        """
+        outpdf = Path(self.tempdir, "combined.pdf")
+        code = self._run_mustang(
+            "combine",
+            "--source",
+            pdf_path,
+            "--source-xml",
+            xml_path,
+            "--format",
+            "zf",
+            "--version",
+            "2",
+            "--profile",
+            "X",
+            "--no-additional-attachments",
+            "--out",
+            outpdf,
+        )
+        if code != 0:
+            raise ParseError(f"Mustang CLI exited with code: {code}")
+        return outpdf
+
+    def is_xrechnung_mustang(
+        self,
+        document_path: Path,
+        mime_type,
+        file_name=None,
+    ) -> bool:
+        """
+        Return True if the Mustang CLI "validate" action exits with code 0.
+
+        mime_type and file_name are not used.
+        """
+        code = self._run_mustang(
+            "validate",
+            "--source",
+            document_path,
+            "--no-notices",
+        )
+        return code == 0
+
+    def parse(self, document_path, mime_type, file_name=None):
+        """
+        Extract the text via the text parser, then, if the document validates
+        as an invoice, build the PDF with embedded XML as archive file.
+        """
+        super().parse(document_path, mime_type, file_name)
+        self.is_invoice = self.is_xrechnung_mustang(
+            document_path,
+            mime_type,
+            file_name,
+        )
+        if not self.is_invoice:
+            return
+        pdf_only = self.xml_to_pdf_mustang(document_path, mime_type, file_name)
+        self.archive_path = self.attach_xml_pdf_mustang(pdf_only, document_path)
--- a/src/paperless_xml/signals.py
+++ b/src/paperless_xml/signals.py
@@ -0,0 +1,18 @@
+def get_parser(*args, **kwargs):
+    """Build an XMLDocumentParser, importing the parser module lazily."""
+    from paperless_xml.parsers import XMLDocumentParser
+
+    parser = XMLDocumentParser(*args, **kwargs)
+    return parser
+
+
+def xml_consumer_declaration(sender, **kwargs):
+    """Describe the XML parser: factory, weight and handled MIME types."""
+    # NOTE(review): "text/plain" is claimed as well, so plain .txt files may be
+    # routed to this parser too — confirm that is intended.
+    handled_types = {
+        "text/plain": ".txt",
+        "text/xml": ".xml",
+        "application/xml": ".xml",
+    }
+    return {"parser": get_parser, "weight": 11, "mime_types": handled_types}
--- a/src/paperless_xml/tests/__init__.py
+++ b/src/paperless_xml/tests/__init__.py
--- a/src/paperless_xml/tests/conftest.py
+++ b/src/paperless_xml/tests/conftest.py
@@ -0,0 +1,36 @@
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_text.parsers import TextDocumentParser
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    """Absolute path of the samples directory next to this file."""
+    return (Path(__file__).parent / "samples").resolve()
+
+
+@pytest.fixture()
+def text_parser() -> Generator[TextDocumentParser, None, None]:
+    """Yield a TextDocumentParser and clean it up after the test."""
+    # Built outside the try block: if construction fails there is nothing to
+    # clean up, and the finally clause must not hit an unbound name.
+    parser = TextDocumentParser(logging_group=None)
+    try:
+        yield parser
+    finally:
+        parser.cleanup()
+
+
+@pytest.fixture(scope="session")
+def sample_txt_file(sample_dir: Path) -> Path:
+    """Path of the well-formed sample text file."""
+    return sample_dir / "test.txt"
+
+
+@pytest.fixture(scope="session")
+def malformed_txt_file(sample_dir: Path) -> Path:
+    """Path of the sample file used for the invalid-bytes test."""
+    return sample_dir / "decode_error.txt"
--- a/src/paperless_xml/tests/samples/decode_error.txt
+++ b/src/paperless_xml/tests/samples/decode_error.txt
@@ -0,0 +1 @@
+Pantothensäure
--- a/src/paperless_xml/tests/samples/test.txt
+++ b/src/paperless_xml/tests/samples/test.txt
@@ -0,0 +1 @@
+This is a test file.
--- a/src/paperless_xml/tests/test_parser.py
+++ b/src/paperless_xml/tests/test_parser.py
@@ -0,0 +1,44 @@
+from pathlib import Path
+
+from paperless_text.parsers import TextDocumentParser
+
+
+class TestTextParser:
+    # NOTE(review): these tests exercise TextDocumentParser only; nothing in
+    # this module instantiates the XML parser added by this package.
+
+    def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
+        """Thumbnail generation yields an existing regular file."""
+        # just make sure that it does not crash
+        f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
+        assert f.exists()
+        assert f.is_file()
+
+    def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
+        """Parsing the sample returns its text and creates no archive file."""
+        text_parser.parse(sample_txt_file, "text/plain")
+
+        assert text_parser.get_text() == "This is a test file.\n"
+        assert text_parser.get_archive_path() is None
+
+    def test_parse_invalid_bytes(
+        self,
+        text_parser: TextDocumentParser,
+        malformed_txt_file: Path,
+    ):
+        """
+        GIVEN:
+            - Text file which contains invalid UTF bytes
+        WHEN:
+            - The file is parsed
+        THEN:
+            - Parsing continues
+            - Invalid bytes are replaced with U+FFFD
+        """
+
+        text_parser.parse(malformed_txt_file, "text/plain")
+
+        # U+FFFD is spelled as an escape so the expected value cannot be
+        # mangled when this file is re-encoded.
+        assert text_parser.get_text() == "Pantothens\ufffdure\n"
+        assert text_parser.get_archive_path() is None