feat: add xrechnung to pdf visualization/parser

This commit is contained in:
Marcel2508 2024-12-14 21:18:25 +01:00
parent 740bb39d7a
commit ff25c8025a
12 changed files with 214 additions and 2 deletions

View File

@ -1,7 +1,8 @@
# This file configures pre-commit hooks.
# See https://pre-commit.com/ for general information
# See https://pre-commit.com/hooks.html for a listing of possible hooks
default_language_version:
python: python3.11
repos:
# General hooks
- repo: https://github.com/pre-commit/pre-commit-hooks

View File

@ -118,7 +118,9 @@ ARG RUNTIME_PACKAGES="\
zlib1g \
# Barcode splitter
libzbar0 \
poppler-utils"
poppler-utils \
# XRechnung
default-jre"
# Install basic runtime packages.
# These change very infrequently
@ -160,6 +162,8 @@ RUN set -eux \
&& echo "Installing supervisor" \
&& python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5
# Download the Mustang CLI used for XRechnung validation and PDF rendering.
# --location follows the redirect GitHub issues for release assets, and
# --fail aborts the build on an HTTP error instead of saving the error page.
RUN curl --fail --silent --show-error --location \
      --output /usr/local/bin/mustang-cli.jar \
      https://github.com/ZUGFeRD/mustangproject/releases/download/core-2.15.1/Mustang-CLI-2.15.1.jar \
  && chmod +x /usr/local/bin/mustang-cli.jar
# Copy gunicorn config
# Changes very infrequently
WORKDIR /usr/src/paperless/

View File

@ -318,6 +318,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"paperless_xml.apps.PaperlessXMLConfig",
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",

View File

14
src/paperless_xml/apps.py Normal file
View File

@ -0,0 +1,14 @@
from django.apps import AppConfig
from paperless_xml.signals import xml_consumer_declaration
class PaperlessXMLConfig(AppConfig):
    """Django application configuration for the ``paperless_xml`` app."""

    name = "paperless_xml"

    def ready(self):
        """
        Connect ``xml_consumer_declaration`` to the
        ``document_consumer_declaration`` signal, then run the base hook.
        """
        # Resolved when ready() is called rather than at module import time.
        from documents import signals as document_signals

        document_signals.document_consumer_declaration.connect(
            xml_consumer_declaration,
        )
        super().ready()

View File

@ -0,0 +1,107 @@
import subprocess
from pathlib import Path
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless_text.parsers import TextDocumentParser
class XMLDocumentParser(TextDocumentParser):
    """
    This parser parses an XML document (.xml).

    The text content is produced by the parent text parser. If the Mustang
    CLI validates the file, a PDF rendering with the source XML attached is
    additionally generated and used as the archive file.
    """

    logging_name = "paperless.parsing.xml"

    # File name of the Mustang CLI jar; it is looked up on PATH.
    mustang_jar = "mustang-cli.jar"
    # Upper bound, in seconds, for a single Mustang CLI invocation.
    mustang_timeout = 20

    # Set by parse(): True once the current document has been validated by
    # Mustang and converted to a PDF.
    is_invoice = False

    def _run_mustang(self, *args) -> int:
        """
        Run the Mustang CLI with the given arguments and return its exit code.

        Raises:
            ParseError: if the process cannot be started or does not finish
                within ``mustang_timeout`` seconds.
        """
        import shutil

        # A jar is not a native executable, so it is handed to the JVM
        # explicitly instead of being executed directly.
        jar_path = shutil.which(self.mustang_jar) or self.mustang_jar
        command = ["java", "-jar", jar_path, *args]
        try:
            completed = subprocess.run(
                command,
                timeout=self.mustang_timeout,
                check=False,
            )
        except subprocess.TimeoutExpired as e:
            raise ParseError(
                f"Mustang CLI timed out after {self.mustang_timeout} seconds",
            ) from e
        except OSError as e:
            raise ParseError(f"Mustang CLI could not be started: {e}") from e
        return completed.returncode

    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
        """
        Return the thumbnail for the document.

        Invoices use a thumbnail made from the generated archive PDF; any
        other document falls back to the parent implementation.
        """
        if self.is_invoice:
            return make_thumbnail_from_pdf(
                self.archive_path,
                self.tempdir,
                self.logging_group,
            )
        return super().get_thumbnail(document_path, mime_type, file_name)

    def xml_to_pdf_mustang(
        self,
        document_path: Path,
        mime_type,
        file_name=None,
    ) -> Path:
        """
        Render the XML at ``document_path`` to ``out.pdf`` in the temp dir.

        Raises:
            ParseError: if Mustang cannot be run or exits with a non-zero code.
        """
        outpdf = Path(self.tempdir, "out.pdf")
        returncode = self._run_mustang(
            "--action",
            "pdf",
            "--source",
            document_path,
            "--out",
            outpdf,
        )
        if returncode != 0:
            raise ParseError(f"Mustang CLI exited with code: {returncode}")
        return outpdf

    def attach_xml_pdf_mustang(self, pdf_path, xml_path) -> Path:
        """
        Combine ``pdf_path`` and ``xml_path`` into ``combined.pdf`` in the
        temp dir using Mustang's ``combine`` action.

        Raises:
            ParseError: if Mustang cannot be run or exits with a non-zero code.
        """
        outpdf = Path(self.tempdir, "combined.pdf")
        returncode = self._run_mustang(
            "--action",
            "combine",
            "--source",
            pdf_path,
            "--source-xml",
            xml_path,
            "--format",
            "zf",
            "--version",
            "2",
            "--profile",
            "X",
            "--no-additional-attachments",
            "--out",
            outpdf,
        )
        if returncode != 0:
            raise ParseError(f"Mustang CLI exited with code: {returncode}")
        return outpdf

    def is_xrechnung_mustang(
        self,
        document_path: Path,
        mime_type,
        file_name=None,
    ) -> bool:
        """
        Return True if Mustang's ``validate`` action exits with code 0 for
        ``document_path``.

        If Mustang cannot be run at all (missing runtime, timeout), the
        document is reported as not being an invoice so it can still be
        handled as plain text; a warning is logged in that case.
        """
        import logging

        try:
            returncode = self._run_mustang(
                "--action",
                "validate",
                "--source",
                document_path,
                "--no-notices",
            )
        except ParseError as e:
            logging.getLogger(self.logging_name).warning(
                "Could not validate %s with Mustang CLI: %s",
                document_path,
                e,
            )
            return False
        return returncode == 0

    def parse(self, document_path, mime_type, file_name=None):
        """
        Parse the document as text and, if it validates as an invoice,
        generate a combined PDF/XML file as the archive version.

        Raises:
            ParseError: if the document validates but the PDF conversion fails.
        """
        super().parse(document_path, mime_type, file_name)

        # Only flag the document as an invoice once the archive PDF exists,
        # so get_thumbnail() never sees the flag without an archive file.
        self.is_invoice = False
        if not self.is_xrechnung_mustang(document_path, mime_type, file_name):
            return

        rendered_pdf = self.xml_to_pdf_mustang(document_path, mime_type, file_name)
        self.archive_path = self.attach_xml_pdf_mustang(rendered_pdf, document_path)
        self.is_invoice = True

View File

@ -0,0 +1,16 @@
def get_parser(*args, **kwargs):
    """Build an ``XMLDocumentParser``, forwarding all arguments to it."""
    # Imported on each call rather than when this module is loaded.
    from paperless_xml import parsers as xml_parsers

    parser_class = xml_parsers.XMLDocumentParser
    return parser_class(*args, **kwargs)
def xml_consumer_declaration(sender, **kwargs):
    """
    Describe the XML parser: its factory, its weight and the MIME types it
    handles.

    ``sender`` and ``kwargs`` are accepted but not used.
    """
    # Maps each handled MIME type to its default file extension.
    # NOTE(review): text/plain is claimed here as well - confirm that this
    # parser is meant to handle plain text files too.
    handled_mime_types = dict(
        [
            ("text/plain", ".txt"),
            ("text/xml", ".xml"),
            ("application/xml", ".xml"),
        ],
    )
    return dict(
        parser=get_parser,
        weight=11,
        mime_types=handled_mime_types,
    )

View File

View File

@ -0,0 +1,30 @@
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_text.parsers import TextDocumentParser
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """Resolved path of the ``samples`` directory next to this file."""
    here = Path(__file__).parent
    return here.joinpath("samples").resolve()
@pytest.fixture()
def text_parser() -> Generator[TextDocumentParser, None, None]:
    """Yield a ``TextDocumentParser`` and call ``cleanup()`` on it afterwards."""
    # Construct outside the try block: if construction fails there is nothing
    # to clean up, and `parser` would be unbound in the finally clause,
    # replacing the real error with an UnboundLocalError.
    parser = TextDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
@pytest.fixture(scope="session")
def sample_txt_file(sample_dir: Path) -> Path:
    """Path of ``test.txt`` inside the sample directory."""
    return sample_dir.joinpath("test.txt")
@pytest.fixture(scope="session")
def malformed_txt_file(sample_dir: Path) -> Path:
    """Path of ``decode_error.txt`` inside the sample directory."""
    return sample_dir.joinpath("decode_error.txt")

View File

@ -0,0 +1 @@
Pantothensäure

View File

@ -0,0 +1 @@
This is a test file.

View File

@ -0,0 +1,37 @@
from pathlib import Path
from paperless_text.parsers import TextDocumentParser
class TestTextParser:
    """Tests for ``TextDocumentParser`` run against the sample text files."""

    def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        # just make sure that it does not crash
        f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
        assert f.exists()
        assert f.is_file()

    def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
        assert text_parser.get_archive_path() is None

    def test_parse_invalid_bytes(
        self,
        text_parser: TextDocumentParser,
        malformed_txt_file: Path,
    ):
        """
        GIVEN:
            - Text file which contains invalid UTF bytes
        WHEN:
            - The file is parsed
        THEN:
            - Parsing continues
            - Invalid bytes are replaced with U+FFFD (REPLACEMENT CHARACTER)
        """
        text_parser.parse(malformed_txt_file, "text/plain")

        # Written as an escape so the expected character survives any
        # re-encoding of this source file.
        assert text_parser.get_text() == "Pantothens\ufffdure\n"
        assert text_parser.get_archive_path() is None