refactor: move xml parser to separate container with rechnungless

This commit is contained in:
Marcel2508 2024-12-20 21:06:34 +01:00
parent 01c502a0e4
commit 5699b58fae
8 changed files with 121 additions and 86 deletions

View File

@ -118,9 +118,7 @@ ARG RUNTIME_PACKAGES="\
zlib1g \
# Barcode splitter
libzbar0 \
poppler-utils \
# XRechnung
default-jre"
poppler-utils"
# Install basic runtime packages.
# These change very infrequently
@ -162,8 +160,6 @@ RUN set -eux \
&& echo "Installing supervisor" \
&& python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5
RUN curl -o /usr/local/bin/mustang-cli.jar https://github.com/ZUGFeRD/mustangproject/releases/download/core-2.15.1/Mustang-CLI-2.15.1.jar && chmod +x /usr/local/bin/mustang-cli.jar
# Copy gunicorn config
# Changes very infrequently
WORKDIR /usr/src/paperless/

View File

@ -0,0 +1,10 @@
# Compose override that adds the Rechnungless service and switches the feature
# on for the webserver. It declares no image for `webserver`, so it has to be
# layered on top of the base file:
#   docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml up --detach
services:
  rechnungless:
    image: marcel2508/rechnungless:latest
    restart: unless-stopped
    environment:
      # Required for Apple M4 chip / macOS 15.2
      # NOTE(review): UseSVE is an AArch64 JVM flag - confirm the image still
      # starts with it on x86_64 hosts.
      JAVA_OPTS: -XX:UseSVE=0
  webserver:
    environment:
      # Must match the variable name read by the paperless settings.
      PAPERLESS_RECHNUNGLESS_ENABLED: 1

View File

@ -198,6 +198,18 @@ Docker, this may be the `environment` key of the webserver or a
containing the configuration parameters. Be sure to use the correct format
and watch out for indentation if editing the YAML file.
#### [`PAPERLESS_RECHNUNGLESS_ENABLED=<bool>`](#PAPERLESS_RECHNUNGLESS_ENABLED) {#PAPERLESS_RECHNUNGLESS_ENABLED}
: Enable (or disable) the Rechnungless XML-to-PDF converter and validator.
Defaults to false.
#### [`PAPERLESS_RECHNUNGLESS_ENDPOINT=<url>`](#PAPERLESS_RECHNUNGLESS_ENDPOINT) {#PAPERLESS_RECHNUNGLESS_ENDPOINT}
: Set the endpoint URL where Paperless can reach your Rechnungless API server.
Defaults to "<http://rechnungless:8080/rechnungless>".
## Paths and folders
#### [`PAPERLESS_CONSUMPTION_DIR=<path>`](#PAPERLESS_CONSUMPTION_DIR) {#PAPERLESS_CONSUMPTION_DIR}

View File

@ -142,6 +142,14 @@ echo ""
ask "Enable Apache Tika?" "no" "yes no"
TIKA_ENABLED=$ask_result
echo ""
echo "Paperless is able to use Mustang Library to support XML files in XInvoice schema"
echo "This feature requires more resources due to the required services."
echo ""
ask "Enable Rechnungless XInvoice service?" "no" "yes no"
RECHNUNGLESS_ENABLED=$ask_result
echo ""
echo "Specify the default language that most of your documents are written in."
echo "Use ISO 639-2, (T) variant language codes: "
@ -322,6 +330,10 @@ if [[ $TIKA_ENABLED == "yes" ]] ; then
DOCKER_COMPOSE_VERSION="$DOCKER_COMPOSE_VERSION-tika"
fi
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.rechnungless.yml" -O docker-compose.rechnungless.yml
fi
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.$DOCKER_COMPOSE_VERSION.yml" -O docker-compose.yml
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/.env" -O .env
@ -391,6 +403,9 @@ if [ "$l1" -eq "$l2" ] ; then
sed -i "/^volumes:/d" docker-compose.yml
fi
# The Rechnungless file is only an override (its webserver entry declares no
# image), so it is a valid project only when layered on docker-compose.yml.
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
	docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml pull
fi
docker compose pull
@ -404,4 +419,8 @@ fi
docker compose run --rm -e DJANGO_SUPERUSER_PASSWORD="$PASSWORD" webserver createsuperuser --noinput --username "$USERNAME" --email "$EMAIL"
docker compose up --detach
# `-f` is an option of `docker compose` itself, not of the `up` subcommand, so
# the compose file list has to come before `up`.
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
	docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml up --detach
else
	docker compose up --detach
fi

View File

@ -88,6 +88,11 @@
#PAPERLESS_TIKA_ENDPOINT=http://localhost:9998
#PAPERLESS_TIKA_GOTENBERG_ENDPOINT=http://localhost:3000
# Rechnungless settings
#PAPERLESS_RECHNUNGLESS_ENABLED=false
#PAPERLESS_RECHNUNGLESS_ENDPOINT=http://rechnungless:8080/rechnungless
# Binaries
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert

View File

@ -318,7 +318,6 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"paperless_xml.apps.PaperlessXMLConfig",
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",
@ -1090,6 +1089,16 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
if TIKA_ENABLED:
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
# XML / RECHNUNGLESS settings
RECHNUNGLESS_ENABLED = __get_boolean("PAPERLESS_RECHNUNGLESS_ENABLED", "NO")
RECHNUNGLESS_ENDPOINT = os.getenv(
"PAPERLESS_RECHNUNGLESS_ENDPOINT",
"http://rechnungless:8080/rechnungless",
)
if RECHNUNGLESS_ENABLED:
INSTALLED_APPS.append("paperless_xml.apps.PaperlessXMLConfig")
AUDIT_LOG_ENABLED = __get_boolean("PAPERLESS_AUDIT_LOG_ENABLED", "true")
if AUDIT_LOG_ENABLED:
INSTALLED_APPS.append("auditlog")

View File

@ -1,4 +1,5 @@
from django.apps import AppConfig
from django.conf import settings
from paperless_xml.signals import xml_consumer_declaration
@ -9,6 +10,7 @@ class PaperlessXMLConfig(AppConfig):
def ready(self):
    """Connect the XML consumer declaration when Rechnungless is enabled.

    The handler is registered on ``document_consumer_declaration`` only if
    ``settings.RECHNUNGLESS_ENABLED`` is truthy; otherwise the signal is left
    untouched.
    """
    # Imported inside ready(), as before - presumably so the documents app is
    # only touched once the app registry is populated (confirm if moving it).
    from documents.signals import document_consumer_declaration

    # No unconditional connect here: it would make the setting check pointless.
    if settings.RECHNUNGLESS_ENABLED:
        document_consumer_declaration.connect(xml_consumer_declaration)
    AppConfig.ready(self)

View File

@ -1,6 +1,13 @@
import subprocess
import base64
import json
from datetime import datetime
from pathlib import Path
import httpx
from django.conf import settings
from django.utils.timezone import is_naive
from django.utils.timezone import make_aware
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless_text.parsers import TextDocumentParser
@ -25,83 +32,58 @@ class XMLDocumentParser(TextDocumentParser):
else:
return super().get_thumbnail(document_path, mime_type, file_name)
def xml_to_pdf_mustang(
self,
document_path: Path,
mime_type,
file_name=None,
) -> Path:
outpdf = Path(self.tempdir, "out.pdf")
res = subprocess.run(
[
"mustang-cli.jar",
"--action",
"pdf",
"--source",
document_path,
"--out",
outpdf,
],
timeout=20,
)
if res.returncode != 0:
raise ParseError("Mustang CLI exited with code: " + str(res.returncode))
else:
return outpdf
def attach_xml_pdf_mustang(self, pdf_path, xml_path) -> Path:
outpdf = Path(self.tempdir, "combined.pdf")
res = subprocess.run(
[
"mustang-cli.jar",
"--action",
"combine",
"--source",
pdf_path,
"--source-xml",
xml_path,
"--format",
"zf",
"--version",
"2",
"--profile",
"X",
"--no-additional-attachments",
"--out",
outpdf,
],
timeout=20,
)
if res.returncode != 0:
raise ParseError("Mustang CLI exited with code: " + str(res.returncode))
else:
return outpdf
def is_xrechnung_mustang(
self,
document_path: Path,
mime_type,
file_name=None,
) -> bool:
res = subprocess.run(
[
"mustang-cli.jar",
"--action",
"validate",
"--source",
document_path,
"--no-notices",
],
timeout=20,
)
return res.returncode == 0
def parse(self, document_path, mime_type, file_name=None):
    """Parse an XML document and, for a convertible invoice, attach a PDF.

    The parent parser runs first and provides ``self.text``. That text is
    posted to ``<settings.RECHNUNGLESS_ENDPOINT>/convert``. Depending on the
    answer the document is either kept as plain text
    (``self.is_invoice = False``) or marked as an invoice, in which case the
    base64 PDF from the response is written to ``invoice.pdf`` in the temp
    directory and used as ``self.archive_path``. If the response carries an
    ``issue_date`` in ``YYYYMMDD`` form it becomes ``self.date``.

    Raises:
        ParseError: if the service cannot be reached, answers with a status
            other than 200/422, or returns a body that cannot be used.
    """
    super().parse(document_path, mime_type, file_name)
    # Stays False on every early return below.
    self.is_invoice = False

    try:
        http_response = httpx.post(
            settings.RECHNUNGLESS_ENDPOINT + "/convert",
            headers={"Content-Type": "application/xml"},
            # Raw text belongs in `content=`; passing it via `data=` is
            # deprecated in httpx.
            content=self.text,
            timeout=60.0,
        )
    except httpx.HTTPError as err:
        # Connection failures and timeouts must surface as ParseError so the
        # consumer can report them like any other parser failure.
        raise ParseError(f"Error while contacting Rechnungless: {err}") from err

    status = http_response.status_code
    if status == httpx.codes.INTERNAL_SERVER_ERROR:
        raise ParseError("Server Error: " + http_response.text)
    if status not in (httpx.codes.OK, httpx.codes.UNPROCESSABLE_ENTITY):
        raise ParseError(f"Unknown Error: HTTP {status} {http_response.text}")

    try:
        response = json.loads(http_response.content)
    except ValueError as err:
        raise ParseError("Rechnungless returned an invalid JSON response") from err
    if not isinstance(response, dict):
        raise ParseError("Rechnungless returned an unexpected JSON response")

    result = response.get("result")
    messages = "\n".join(str(msg) for msg in response.get("messages") or [])

    if result == "failed":
        self.log.info(f"Invalid schema: Conversion failed:\n{messages}")
        return
    if status == httpx.codes.UNPROCESSABLE_ENTITY:
        self.log.info(f"Invalid schema: The XML file is not valid:\n{messages}")
        return
    if result == "invalid":
        # Log only the messages: the full body also carries the base64 PDF.
        self.log.warning(f"The file received is technically invalid: {messages}")

    try:
        pdf_bytes = base64.b64decode(response["archive_pdf"])
    except (KeyError, TypeError, ValueError) as err:
        raise ParseError(
            "Rechnungless response contains no usable archive PDF",
        ) from err

    self.archive_path = Path(self.tempdir, "invoice.pdf")
    self.archive_path.write_bytes(pdf_bytes)
    self.is_invoice = True

    issue_date = response.get("issue_date")
    if issue_date:
        try:
            parsed = datetime.strptime(issue_date, "%Y%m%d")
        except (TypeError, ValueError):
            # A bad date should not discard an otherwise successful conversion.
            self.log.warning(f"Ignoring unparsable issue date: {issue_date!r}")
        else:
            self.date = make_aware(parsed) if is_naive(parsed) else parsed