mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-29 11:09:27 -05:00
feat: add xrechnung to pdf visualization/parser
This commit is contained in:
parent
740bb39d7a
commit
ff25c8025a
@ -1,7 +1,8 @@
|
||||
# This file configures pre-commit hooks.
|
||||
# See https://pre-commit.com/ for general information
|
||||
# See https://pre-commit.com/hooks.html for a listing of possible hooks
|
||||
|
||||
default_language_version:
|
||||
python: python3.11
|
||||
repos:
|
||||
# General hooks
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
|
@ -118,7 +118,9 @@ ARG RUNTIME_PACKAGES="\
|
||||
zlib1g \
|
||||
# Barcode splitter
|
||||
libzbar0 \
|
||||
poppler-utils"
|
||||
poppler-utils \
|
||||
# XRechnung
|
||||
default-jre"
|
||||
|
||||
# Install basic runtime packages.
|
||||
# These change very infrequently
|
||||
@ -160,6 +162,8 @@ RUN set -eux \
|
||||
&& echo "Installing supervisor" \
|
||||
&& python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5
|
||||
|
||||
# Download the Mustang CLI jar used for XRechnung validation and PDF rendering.
# -L follows the redirect GitHub issues for release assets (without it the
# saved file is not the jar); -f makes the build fail on an HTTP error instead
# of silently storing an error page.
RUN curl -fL -o /usr/local/bin/mustang-cli.jar https://github.com/ZUGFeRD/mustangproject/releases/download/core-2.15.1/Mustang-CLI-2.15.1.jar && chmod +x /usr/local/bin/mustang-cli.jar
|
||||
|
||||
# Copy gunicorn config
|
||||
# Changes very infrequently
|
||||
WORKDIR /usr/src/paperless/
|
||||
|
@ -318,6 +318,7 @@ INSTALLED_APPS = [
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
"paperless_xml.apps.PaperlessXMLConfig",
|
||||
"django.contrib.admin",
|
||||
"rest_framework",
|
||||
"rest_framework.authtoken",
|
||||
|
0
src/paperless_xml/__init__.py
Normal file
0
src/paperless_xml/__init__.py
Normal file
14
src/paperless_xml/apps.py
Normal file
14
src/paperless_xml/apps.py
Normal file
@ -0,0 +1,14 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_xml.signals import xml_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessXMLConfig(AppConfig):
    """Django application configuration for the ``paperless_xml`` app."""

    name = "paperless_xml"

    def ready(self):
        """Connect the XML parser declaration to the consumer-declaration signal."""
        # The signal is imported inside ready() rather than at module level.
        from documents.signals import (
            document_consumer_declaration as declaration_signal,
        )

        declaration_signal.connect(xml_consumer_declaration)
        super().ready()
|
107
src/paperless_xml/parsers.py
Normal file
107
src/paperless_xml/parsers.py
Normal file
@ -0,0 +1,107 @@
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
|
||||
|
||||
class XMLDocumentParser(TextDocumentParser):
    """
    Parser for XML documents (.xml).

    The file is first handled by the inherited text parser. If the Mustang
    CLI then validates it successfully, the XML is rendered to a PDF, the XML
    is embedded into that PDF, and the combined file becomes the archive
    version of the document.
    """

    logging_name = "paperless.parsing.xml"

    # File name the Mustang jar is searched for on PATH, and the location
    # used when that search finds nothing.
    MUSTANG_JAR_NAME = "mustang-cli.jar"
    MUSTANG_JAR_FALLBACK = "/usr/local/bin/mustang-cli.jar"
    # Upper bound, in seconds, for a single Mustang invocation.
    MUSTANG_TIMEOUT = 20

    # True once parse() has produced a PDF archive for the current document.
    is_invoice = False

    def _run_mustang(self, *cli_args) -> int:
        """
        Run the Mustang CLI with ``cli_args`` and return its exit code.

        The jar is started through ``java -jar`` instead of being executed
        directly, because executing a jar file only works on systems that
        have a binary-format handler registered for it.

        Raises:
            ParseError: if the process cannot be started or does not finish
                within ``MUSTANG_TIMEOUT`` seconds.
        """
        # Function-scope import so this class does not depend on a change to
        # the module's import block.
        import shutil

        jar = shutil.which(self.MUSTANG_JAR_NAME) or self.MUSTANG_JAR_FALLBACK
        command = ["java", "-jar", jar, *(str(arg) for arg in cli_args)]
        try:
            completed = subprocess.run(command, timeout=self.MUSTANG_TIMEOUT)
        except subprocess.TimeoutExpired as err:
            raise ParseError(
                f"Mustang CLI timed out after {self.MUSTANG_TIMEOUT} seconds",
            ) from err
        except OSError as err:
            # Covers a missing java binary as well as a non-executable target.
            raise ParseError(f"Mustang CLI could not be started: {err}") from err
        return completed.returncode

    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
        """
        Return the thumbnail for the document.

        Invoices get a thumbnail made from the generated PDF archive; every
        other document uses the inherited text thumbnail.
        """
        if not self.is_invoice:
            return super().get_thumbnail(document_path, mime_type, file_name)
        return make_thumbnail_from_pdf(
            self.archive_path,
            self.tempdir,
            self.logging_group,
        )

    def xml_to_pdf_mustang(
        self,
        document_path: Path,
        mime_type,
        file_name=None,
    ) -> Path:
        """
        Render the XML at ``document_path`` to ``out.pdf`` in the temp dir.

        ``mime_type`` and ``file_name`` are accepted but not used.

        Raises:
            ParseError: if Mustang cannot be run or exits with a non-zero code.
        """
        pdf_path = Path(self.tempdir, "out.pdf")
        exit_code = self._run_mustang(
            "--action",
            "pdf",
            "--source",
            document_path,
            "--out",
            pdf_path,
        )
        if exit_code != 0:
            raise ParseError("Mustang CLI exited with code: " + str(exit_code))
        return pdf_path

    def attach_xml_pdf_mustang(self, pdf_path, xml_path) -> Path:
        """
        Embed ``xml_path`` into ``pdf_path`` and return ``combined.pdf``.

        Raises:
            ParseError: if Mustang cannot be run or exits with a non-zero code.
        """
        combined_path = Path(self.tempdir, "combined.pdf")
        exit_code = self._run_mustang(
            "--action",
            "combine",
            "--source",
            pdf_path,
            "--source-xml",
            xml_path,
            "--format",
            "zf",
            "--version",
            "2",
            "--profile",
            "X",
            "--no-additional-attachments",
            "--out",
            combined_path,
        )
        if exit_code != 0:
            raise ParseError("Mustang CLI exited with code: " + str(exit_code))
        return combined_path

    def is_xrechnung_mustang(
        self,
        document_path: Path,
        mime_type,
        file_name=None,
    ) -> bool:
        """
        Return True if Mustang's ``validate`` action exits with code 0.

        ``mime_type`` and ``file_name`` are accepted but not used.

        Raises:
            ParseError: if Mustang cannot be run at all or times out.
        """
        exit_code = self._run_mustang(
            "--action",
            "validate",
            "--source",
            document_path,
            "--no-notices",
        )
        return exit_code == 0

    def parse(self, document_path, mime_type, file_name=None):
        """
        Parse the document as text and, for valid invoices, build a PDF archive.

        Raises:
            ParseError: if a Mustang step cannot be run or fails.
        """
        super().parse(document_path, mime_type, file_name)

        # Only flag the document as an invoice after the archive exists, so a
        # failed conversion never leaves is_invoice set without an archive.
        self.is_invoice = False
        if not self.is_xrechnung_mustang(document_path, mime_type, file_name):
            return

        pdf_only = self.xml_to_pdf_mustang(document_path, mime_type, file_name)
        pdf_with_xml = self.attach_xml_pdf_mustang(pdf_only, document_path)
        self.archive_path = pdf_with_xml
        self.is_invoice = True
|
16
src/paperless_xml/signals.py
Normal file
16
src/paperless_xml/signals.py
Normal file
@ -0,0 +1,16 @@
|
||||
def get_parser(*args, **kwargs):
    """Create an ``XMLDocumentParser``, forwarding every argument to it."""
    # Imported inside the function, so the parser module is only loaded when
    # a parser is actually requested.
    import paperless_xml.parsers as xml_parsers

    return xml_parsers.XMLDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def xml_consumer_declaration(sender, **kwargs):
    """
    Return the declaration for the XML parser.

    The result names the parser factory, a weight, and the MIME types the
    parser is declared for. ``sender`` and ``kwargs`` are accepted but unused.
    """
    # NOTE(review): "text/plain" is declared here as well - presumably so XML
    # that is detected as plain text is still picked up; confirm this does
    # not unintentionally take precedence over the plain-text parser.
    supported_types = {
        "text/plain": ".txt",
        "text/xml": ".xml",
        "application/xml": ".xml",
    }
    declaration = dict(parser=get_parser, weight=11, mime_types=supported_types)
    return declaration
|
0
src/paperless_xml/tests/__init__.py
Normal file
0
src/paperless_xml/tests/__init__.py
Normal file
30
src/paperless_xml/tests/conftest.py
Normal file
30
src/paperless_xml/tests/conftest.py
Normal file
@ -0,0 +1,30 @@
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """Return the resolved path of the ``samples`` directory beside this file."""
    tests_dir = Path(__file__).parent
    return tests_dir.joinpath("samples").resolve()
|
||||
|
||||
|
||||
@pytest.fixture()
def text_parser() -> Generator[TextDocumentParser, None, None]:
    """Yield a ``TextDocumentParser`` and call its ``cleanup()`` afterwards."""
    # Construct the parser before entering the try block: if the constructor
    # raises there is nothing to clean up, and a finally clause referencing
    # the unassigned name would raise UnboundLocalError and hide the real
    # error.
    parser = TextDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def sample_txt_file(sample_dir: Path) -> Path:
    """Return the path of ``test.txt`` inside the sample directory."""
    return sample_dir.joinpath("test.txt")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def malformed_txt_file(sample_dir: Path) -> Path:
    """Return the path of ``decode_error.txt`` inside the sample directory."""
    return sample_dir.joinpath("decode_error.txt")
|
1
src/paperless_xml/tests/samples/decode_error.txt
Normal file
1
src/paperless_xml/tests/samples/decode_error.txt
Normal file
@ -0,0 +1 @@
|
||||
Pantothensäure
|
1
src/paperless_xml/tests/samples/test.txt
Normal file
1
src/paperless_xml/tests/samples/test.txt
Normal file
@ -0,0 +1 @@
|
||||
This is a test file.
|
37
src/paperless_xml/tests/test_parser.py
Normal file
37
src/paperless_xml/tests/test_parser.py
Normal file
@ -0,0 +1,37 @@
|
||||
from pathlib import Path
|
||||
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
|
||||
|
||||
class TestTextParser:
    """
    Tests run against the ``text_parser`` fixture.

    NOTE(review): every test here uses ``TextDocumentParser``; nothing in this
    class exercises ``XMLDocumentParser`` - confirm whether XML-specific tests
    are still to be added.
    """

    def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        """Thumbnail generation returns a path to an existing regular file."""
        # just make sure that it does not crash
        thumbnail = text_parser.get_thumbnail(sample_txt_file, "text/plain")
        assert thumbnail.exists()
        assert thumbnail.is_file()

    def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        """Parsing the sample yields its text and no archive file."""
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
        assert text_parser.get_archive_path() is None

    def test_parse_invalid_bytes(
        self,
        text_parser: TextDocumentParser,
        malformed_txt_file: Path,
    ):
        """
        GIVEN:
            - Text file which contains invalid UTF bytes
        WHEN:
            - The file is parsed
        THEN:
            - Parsing continues
            - Invalid bytes are replaced with U+FFFD
        """

        text_parser.parse(malformed_txt_file, "text/plain")

        # Written as an escape so the replacement character survives copying
        # and rendering of this file.
        assert text_parser.get_text() == "Pantothens\ufffdure\n"
        assert text_parser.get_archive_path() is None
|
Loading…
x
Reference in New Issue
Block a user