mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-29 11:09:27 -05:00
feat: add xrechnung to pdf visualization/parser
This commit is contained in:
parent
740bb39d7a
commit
ff25c8025a
@ -1,7 +1,8 @@
|
|||||||
# This file configures pre-commit hooks.
|
# This file configures pre-commit hooks.
|
||||||
# See https://pre-commit.com/ for general information
|
# See https://pre-commit.com/ for general information
|
||||||
# See https://pre-commit.com/hooks.html for a listing of possible hooks
|
# See https://pre-commit.com/hooks.html for a listing of possible hooks
|
||||||
|
default_language_version:
|
||||||
|
python: python3.11
|
||||||
repos:
|
repos:
|
||||||
# General hooks
|
# General hooks
|
||||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
|
@ -118,7 +118,9 @@ ARG RUNTIME_PACKAGES="\
|
|||||||
zlib1g \
|
zlib1g \
|
||||||
# Barcode splitter
|
# Barcode splitter
|
||||||
libzbar0 \
|
libzbar0 \
|
||||||
poppler-utils"
|
poppler-utils \
|
||||||
|
# XRechnung
|
||||||
|
default-jre"
|
||||||
|
|
||||||
# Install basic runtime packages.
|
# Install basic runtime packages.
|
||||||
# These change very infrequently
|
# These change very infrequently
|
||||||
@ -160,6 +162,8 @@ RUN set -eux \
|
|||||||
&& echo "Installing supervisor" \
|
&& echo "Installing supervisor" \
|
||||||
&& python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5
|
&& python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5
|
||||||
|
|
||||||
|
# Download the Mustang CLI jar used for XRechnung validation and PDF rendering.
# --location: GitHub release download URLs answer with a redirect to the asset
#             host; without it curl stores the redirect response, not the jar.
# --fail:     abort the build on an HTTP error instead of saving an error page.
RUN set -eux \
  && curl --fail --silent --show-error --location \
    --output /usr/local/bin/mustang-cli.jar \
    https://github.com/ZUGFeRD/mustangproject/releases/download/core-2.15.1/Mustang-CLI-2.15.1.jar \
  && chmod +x /usr/local/bin/mustang-cli.jar
|
||||||
|
|
||||||
# Copy gunicorn config
|
# Copy gunicorn config
|
||||||
# Changes very infrequently
|
# Changes very infrequently
|
||||||
WORKDIR /usr/src/paperless/
|
WORKDIR /usr/src/paperless/
|
||||||
|
@ -318,6 +318,7 @@ INSTALLED_APPS = [
|
|||||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||||
"paperless_text.apps.PaperlessTextConfig",
|
"paperless_text.apps.PaperlessTextConfig",
|
||||||
"paperless_mail.apps.PaperlessMailConfig",
|
"paperless_mail.apps.PaperlessMailConfig",
|
||||||
|
"paperless_xml.apps.PaperlessXMLConfig",
|
||||||
"django.contrib.admin",
|
"django.contrib.admin",
|
||||||
"rest_framework",
|
"rest_framework",
|
||||||
"rest_framework.authtoken",
|
"rest_framework.authtoken",
|
||||||
|
0
src/paperless_xml/__init__.py
Normal file
0
src/paperless_xml/__init__.py
Normal file
14
src/paperless_xml/apps.py
Normal file
14
src/paperless_xml/apps.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
from paperless_xml.signals import xml_consumer_declaration
|
||||||
|
|
||||||
|
|
||||||
|
class PaperlessXMLConfig(AppConfig):
    """Django application configuration for the ``paperless_xml`` app."""

    name = "paperless_xml"

    def ready(self):
        """Connect ``xml_consumer_declaration`` to the consumer-declaration signal."""
        # Imported inside ready() rather than at module import time.
        from documents import signals as document_signals

        document_signals.document_consumer_declaration.connect(
            xml_consumer_declaration,
        )

        super().ready()
|
107
src/paperless_xml/parsers.py
Normal file
107
src/paperless_xml/parsers.py
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
|
from paperless_text.parsers import TextDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class XMLDocumentParser(TextDocumentParser):
    """
    Parser for XML documents (.xml).

    The text content is extracted by ``TextDocumentParser.parse``. When the
    Mustang CLI ``validate`` action exits successfully for the file, a PDF
    rendering with the source XML attached is generated and stored as the
    archive version of the document.
    """

    logging_name = "paperless.parsing.xml"

    # File name of the Mustang CLI jar; resolved through PATH.
    MUSTANG_JAR = "mustang-cli.jar"
    # Seconds a single Mustang invocation may run before it is aborted.
    MUSTANG_TIMEOUT = 20

    # Set by parse(): True when the current document validated as an invoice.
    is_invoice = False

    def _run_mustang(self, *args) -> int:
        """
        Run the Mustang CLI with ``args`` and return its exit code.

        Raises:
            ParseError: if the jar is not found on PATH, the process cannot
                be started, or it does not finish within the timeout.
        """
        import shutil

        jar_path = shutil.which(self.MUSTANG_JAR)
        if jar_path is None:
            raise ParseError(f"Mustang CLI ({self.MUSTANG_JAR}) not found on PATH")

        # Executing the jar file itself only works where a jar binfmt handler
        # is registered, so it is launched through the JVM explicitly.
        command = ["java", "-jar", jar_path, *(str(arg) for arg in args)]
        try:
            completed = subprocess.run(command, timeout=self.MUSTANG_TIMEOUT)
        except subprocess.TimeoutExpired as e:
            raise ParseError(
                f"Mustang CLI timed out after {self.MUSTANG_TIMEOUT} seconds",
            ) from e
        except OSError as e:
            raise ParseError(f"Mustang CLI could not be started: {e}") from e
        return completed.returncode

    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
        """
        Return the path of a thumbnail for the document.

        Invoices get a thumbnail made from the generated archive PDF; every
        other document uses the parent class implementation.
        """
        if not self.is_invoice:
            return super().get_thumbnail(document_path, mime_type, file_name)
        return make_thumbnail_from_pdf(
            self.archive_path,
            self.tempdir,
            self.logging_group,
        )

    def xml_to_pdf_mustang(
        self,
        document_path: Path,
        mime_type,
        file_name=None,
    ) -> Path:
        """
        Render the XML at ``document_path`` to ``out.pdf`` in the temp dir.

        Returns:
            Path of the generated PDF.

        Raises:
            ParseError: if Mustang cannot be run or exits with a non-zero code.
        """
        outpdf = Path(self.tempdir, "out.pdf")
        returncode = self._run_mustang(
            "--action",
            "pdf",
            "--source",
            document_path,
            "--out",
            outpdf,
        )
        if returncode != 0:
            raise ParseError("Mustang CLI exited with code: " + str(returncode))
        return outpdf

    def attach_xml_pdf_mustang(self, pdf_path, xml_path) -> Path:
        """
        Combine ``pdf_path`` and ``xml_path`` into ``combined.pdf`` in the temp dir.

        Returns:
            Path of the combined PDF.

        Raises:
            ParseError: if Mustang cannot be run or exits with a non-zero code.
        """
        outpdf = Path(self.tempdir, "combined.pdf")
        returncode = self._run_mustang(
            "--action",
            "combine",
            "--source",
            pdf_path,
            "--source-xml",
            xml_path,
            "--format",
            "zf",
            "--version",
            "2",
            "--profile",
            "X",
            "--no-additional-attachments",
            "--out",
            outpdf,
        )
        if returncode != 0:
            raise ParseError("Mustang CLI exited with code: " + str(returncode))
        return outpdf

    def is_xrechnung_mustang(
        self,
        document_path: Path,
        mime_type,
        file_name=None,
    ) -> bool:
        """
        Return True when Mustang's ``validate`` action exits with code 0.

        Raises:
            ParseError: if Mustang cannot be run or times out.
        """
        returncode = self._run_mustang(
            "--action",
            "validate",
            "--source",
            document_path,
            "--no-notices",
        )
        return returncode == 0

    def parse(self, document_path, mime_type, file_name=None):
        """
        Extract the text and, for invoices, build the PDF archive version.

        Raises:
            ParseError: if a Mustang invocation fails.
        """
        super().parse(document_path, mime_type, file_name)
        self.is_invoice = self.is_xrechnung_mustang(
            document_path,
            mime_type,
            file_name,
        )
        if self.is_invoice:
            pdf_only = self.xml_to_pdf_mustang(document_path, mime_type, file_name)
            self.archive_path = self.attach_xml_pdf_mustang(pdf_only, document_path)
|
16
src/paperless_xml/signals.py
Normal file
16
src/paperless_xml/signals.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
def get_parser(*args, **kwargs):
    """Create an ``XMLDocumentParser``, forwarding every argument to it."""
    # The parser module is imported on call, not when this module is loaded.
    from paperless_xml import parsers

    return parsers.XMLDocumentParser(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def xml_consumer_declaration(sender, **kwargs):
    """
    Signal receiver that declares the XML parser.

    Returns a dict holding the parser factory, the parser weight and the
    MIME types (each mapped to a file extension) declared for this parser.
    """
    # NOTE(review): "text/plain" is declared here in addition to the XML
    # types -- confirm that taking over plain-text files is intended.
    declared_mime_types = {
        "text/plain": ".txt",
        "text/xml": ".xml",
        "application/xml": ".xml",
    }
    declaration = dict(
        parser=get_parser,
        weight=11,
        mime_types=declared_mime_types,
    )
    return declaration
|
0
src/paperless_xml/tests/__init__.py
Normal file
0
src/paperless_xml/tests/__init__.py
Normal file
30
src/paperless_xml/tests/conftest.py
Normal file
30
src/paperless_xml/tests/conftest.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
from collections.abc import Generator
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperless_text.parsers import TextDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """Resolved path of the ``samples`` directory located beside this file."""
    tests_dir = Path(__file__).parent
    return tests_dir.joinpath("samples").resolve()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def text_parser() -> Generator[TextDocumentParser, None, None]:
    """
    Yield a ``TextDocumentParser`` and call its ``cleanup()`` after the test.

    NOTE(review): this fixture builds the plain text parser, not the XML
    parser of this app -- confirm that is the intended subject under test.
    """
    # Construct outside the try block: if construction raises there is
    # nothing to clean up, and referencing the unbound name in ``finally``
    # would mask the original error.
    parser = TextDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def sample_txt_file(sample_dir: Path) -> Path:
    """Path of the ``test.txt`` sample inside the samples directory."""
    return sample_dir.joinpath("test.txt")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def malformed_txt_file(sample_dir: Path) -> Path:
    """Path of the ``decode_error.txt`` sample inside the samples directory."""
    return sample_dir.joinpath("decode_error.txt")
|
1
src/paperless_xml/tests/samples/decode_error.txt
Normal file
1
src/paperless_xml/tests/samples/decode_error.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
Pantothensäure
|
1
src/paperless_xml/tests/samples/test.txt
Normal file
1
src/paperless_xml/tests/samples/test.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
This is a test file.
|
37
src/paperless_xml/tests/test_parser.py
Normal file
37
src/paperless_xml/tests/test_parser.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from paperless_text.parsers import TextDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextParser:
    """Tests run against the parser supplied by the ``text_parser`` fixture."""

    def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        """Thumbnail generation yields an existing regular file."""
        # just make sure that it does not crash
        f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
        assert f.exists()
        assert f.is_file()

    def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        """Parsing the sample returns its text and produces no archive file."""
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
        assert text_parser.get_archive_path() is None

    def test_parse_invalid_bytes(
        self,
        text_parser: TextDocumentParser,
        malformed_txt_file: Path,
    ):
        """
        GIVEN:
            - Text file which contains invalid UTF bytes
        WHEN:
            - The file is parsed
        THEN:
            - Parsing continues
            - Invalid bytes are replaced with U+FFFD (REPLACEMENT CHARACTER)
        """

        text_parser.parse(malformed_txt_file, "text/plain")

        # Written as an escape so the expected replacement character cannot
        # be mangled by editors or transcoding.
        assert text_parser.get_text() == "Pantothens\ufffdure\n"
        assert text_parser.get_archive_path() is None
|
Loading…
x
Reference in New Issue
Block a user