refactor: move xml parser to separate container with rechnungless

This commit is contained in:
Marcel2508 2024-12-20 21:06:34 +01:00
parent 01c502a0e4
commit 5699b58fae
8 changed files with 121 additions and 86 deletions

View File

@ -118,9 +118,7 @@ ARG RUNTIME_PACKAGES="\
zlib1g \ zlib1g \
# Barcode splitter # Barcode splitter
libzbar0 \ libzbar0 \
poppler-utils \ poppler-utils"
# XRechnung
default-jre"
# Install basic runtime packages. # Install basic runtime packages.
# These change very infrequently # These change very infrequently
@ -162,8 +160,6 @@ RUN set -eux \
&& echo "Installing supervisor" \ && echo "Installing supervisor" \
&& python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5 && python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5
RUN curl -o /usr/local/bin/mustang-cli.jar https://github.com/ZUGFeRD/mustangproject/releases/download/core-2.15.1/Mustang-CLI-2.15.1.jar && chmod +x /usr/local/bin/mustang-cli.jar
# Copy gunicorn config # Copy gunicorn config
# Changes very infrequently # Changes very infrequently
WORKDIR /usr/src/paperless/ WORKDIR /usr/src/paperless/

View File

@ -0,0 +1,10 @@
# Compose override that adds the Rechnungless XML invoice service.
# It is meant to be layered on top of the main compose file, e.g.:
#   docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml up --detach
services:
  rechnungless:
    image: marcel2508/rechnungless:latest
    restart: unless-stopped
    environment:
      # Required for Apple M4 chip / macOS 15.2
      # NOTE(review): UseSVE looks like an AArch64-specific JVM option - confirm
      # the container still starts with it on x86 hosts.
      JAVA_OPTS: -XX:UseSVE=0
  webserver:
    environment:
      # Must match the variable name read by the Paperless settings
      # (PAPERLESS_RECHNUNGLESS_ENABLED), otherwise the parser stays disabled.
      PAPERLESS_RECHNUNGLESS_ENABLED: 1

View File

@ -198,6 +198,18 @@ Docker, this may be the `environment` key of the webserver or a
containing the configuration parameters. Be sure to use the correct format containing the configuration parameters. Be sure to use the correct format
and watch out for indentation if editing the YAML file. and watch out for indentation if editing the YAML file.
#### [`PAPERLESS_RECHNUNGLESS_ENABLED=<bool>`](#PAPERLESS_RECHNUNGLESS_ENABLED) {#PAPERLESS_RECHNUNGLESS_ENABLED}
: Enable (or disable) the Rechnungless XML-to-PDF converter and validator.
Defaults to false.
#### [`PAPERLESS_RECHNUNGLESS_ENDPOINT=<url>`](#PAPERLESS_RECHNUNGLESS_ENDPOINT) {#PAPERLESS_RECHNUNGLESS_ENDPOINT}
: Set the endpoint URL where Paperless can reach your Rechnungless API server.
Defaults to "<http://rechnungless:8080/rechnungless>".
## Paths and folders ## Paths and folders
#### [`PAPERLESS_CONSUMPTION_DIR=<path>`](#PAPERLESS_CONSUMPTION_DIR) {#PAPERLESS_CONSUMPTION_DIR} #### [`PAPERLESS_CONSUMPTION_DIR=<path>`](#PAPERLESS_CONSUMPTION_DIR) {#PAPERLESS_CONSUMPTION_DIR}

View File

@ -142,6 +142,14 @@ echo ""
ask "Enable Apache Tika?" "no" "yes no" ask "Enable Apache Tika?" "no" "yes no"
TIKA_ENABLED=$ask_result TIKA_ENABLED=$ask_result
echo ""
echo "Paperless is able to use Mustang Library to support XML files in XInvoice schema"
echo "This feature requires more resources due to the required services."
echo ""
ask "Enable Rechnungless XInvoice service?" "no" "yes no"
RECHNUNGLESS_ENABLED=$ask_result
echo "" echo ""
echo "Specify the default language that most of your documents are written in." echo "Specify the default language that most of your documents are written in."
echo "Use ISO 639-2, (T) variant language codes: " echo "Use ISO 639-2, (T) variant language codes: "
@ -322,6 +330,10 @@ if [[ $TIKA_ENABLED == "yes" ]] ; then
DOCKER_COMPOSE_VERSION="$DOCKER_COMPOSE_VERSION-tika" DOCKER_COMPOSE_VERSION="$DOCKER_COMPOSE_VERSION-tika"
fi fi
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.rechnungless.yml" -O docker-compose.rechnungless.yml
fi
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.$DOCKER_COMPOSE_VERSION.yml" -O docker-compose.yml wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.$DOCKER_COMPOSE_VERSION.yml" -O docker-compose.yml
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/.env" -O .env wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/.env" -O .env
@ -391,6 +403,9 @@ if [ "$l1" -eq "$l2" ] ; then
sed -i "/^volumes:/d" docker-compose.yml sed -i "/^volumes:/d" docker-compose.yml
fi fi
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
	# docker-compose.rechnungless.yml is only an override (its webserver entry
	# carries just an environment key), so it has to be combined with the base
	# compose file to form a complete project before images can be pulled.
	docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml pull
fi
docker compose pull docker compose pull
@ -404,4 +419,8 @@ fi
docker compose run --rm -e DJANGO_SUPERUSER_PASSWORD="$PASSWORD" webserver createsuperuser --noinput --username "$USERNAME" --email "$EMAIL" docker compose run --rm -e DJANGO_SUPERUSER_PASSWORD="$PASSWORD" webserver createsuperuser --noinput --username "$USERNAME" --email "$EMAIL"
# Start the stack; include the Rechnungless override when it was enabled.
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
	# -f is an option of "docker compose" itself, not of the "up" subcommand,
	# so the compose files must be listed before "up".
	docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml up --detach
else
	docker compose up --detach
fi

View File

@ -88,6 +88,11 @@
#PAPERLESS_TIKA_ENDPOINT=http://localhost:9998 #PAPERLESS_TIKA_ENDPOINT=http://localhost:9998
#PAPERLESS_TIKA_GOTENBERG_ENDPOINT=http://localhost:3000 #PAPERLESS_TIKA_GOTENBERG_ENDPOINT=http://localhost:3000
# Rechnungless settings
#PAPERLESS_RECHNUNGLESS_ENABLED=false
#PAPERLESS_RECHNUNGLESS_ENDPOINT=http://rechnungless:8080/rechnungless
# Binaries # Binaries
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert #PAPERLESS_CONVERT_BINARY=/usr/bin/convert

View File

@ -318,7 +318,6 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig", "paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig", "paperless_mail.apps.PaperlessMailConfig",
"paperless_xml.apps.PaperlessXMLConfig",
"django.contrib.admin", "django.contrib.admin",
"rest_framework", "rest_framework",
"rest_framework.authtoken", "rest_framework.authtoken",
@ -1090,6 +1089,16 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
if TIKA_ENABLED: if TIKA_ENABLED:
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
# XML / RECHNUNGLESS settings
RECHNUNGLESS_ENABLED = __get_boolean("PAPERLESS_RECHNUNGLESS_ENABLED", "NO")
RECHNUNGLESS_ENDPOINT = os.getenv(
"PAPERLESS_RECHNUNGLESS_ENDPOINT",
"http://rechnungless:8080/rechnungless",
)
if RECHNUNGLESS_ENABLED:
INSTALLED_APPS.append("paperless_xml.apps.PaperlessXMLConfig")
AUDIT_LOG_ENABLED = __get_boolean("PAPERLESS_AUDIT_LOG_ENABLED", "true") AUDIT_LOG_ENABLED = __get_boolean("PAPERLESS_AUDIT_LOG_ENABLED", "true")
if AUDIT_LOG_ENABLED: if AUDIT_LOG_ENABLED:
INSTALLED_APPS.append("auditlog") INSTALLED_APPS.append("auditlog")

View File

@ -1,4 +1,5 @@
from django.apps import AppConfig from django.apps import AppConfig
from django.conf import settings
from paperless_xml.signals import xml_consumer_declaration from paperless_xml.signals import xml_consumer_declaration
@ -9,6 +10,7 @@ class PaperlessXMLConfig(AppConfig):
def ready(self): def ready(self):
from documents.signals import document_consumer_declaration from documents.signals import document_consumer_declaration
if settings.RECHNUNGLESS_ENABLED:
document_consumer_declaration.connect(xml_consumer_declaration) document_consumer_declaration.connect(xml_consumer_declaration)
AppConfig.ready(self) AppConfig.ready(self)

View File

@ -1,6 +1,13 @@
import subprocess import base64
import json
from datetime import datetime
from pathlib import Path from pathlib import Path
import httpx
from django.conf import settings
from django.utils.timezone import is_naive
from django.utils.timezone import make_aware
from documents.parsers import ParseError from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf from documents.parsers import make_thumbnail_from_pdf
from paperless_text.parsers import TextDocumentParser from paperless_text.parsers import TextDocumentParser
@ -25,83 +32,58 @@ class XMLDocumentParser(TextDocumentParser):
else: else:
return super().get_thumbnail(document_path, mime_type, file_name) return super().get_thumbnail(document_path, mime_type, file_name)
def xml_to_pdf_mustang(
self,
document_path: Path,
mime_type,
file_name=None,
) -> Path:
outpdf = Path(self.tempdir, "out.pdf")
res = subprocess.run(
[
"mustang-cli.jar",
"--action",
"pdf",
"--source",
document_path,
"--out",
outpdf,
],
timeout=20,
)
if res.returncode != 0:
raise ParseError("Mustang CLI exited with code: " + str(res.returncode))
else:
return outpdf
def attach_xml_pdf_mustang(self, pdf_path, xml_path) -> Path:
outpdf = Path(self.tempdir, "combined.pdf")
res = subprocess.run(
[
"mustang-cli.jar",
"--action",
"combine",
"--source",
pdf_path,
"--source-xml",
xml_path,
"--format",
"zf",
"--version",
"2",
"--profile",
"X",
"--no-additional-attachments",
"--out",
outpdf,
],
timeout=20,
)
if res.returncode != 0:
raise ParseError("Mustang CLI exited with code: " + str(res.returncode))
else:
return outpdf
def is_xrechnung_mustang(
self,
document_path: Path,
mime_type,
file_name=None,
) -> bool:
res = subprocess.run(
[
"mustang-cli.jar",
"--action",
"validate",
"--source",
document_path,
"--no-notices",
],
timeout=20,
)
return res.returncode == 0
def parse(self, document_path, mime_type, file_name=None):
    """
    Parse an XML document and, if it is an e-invoice, attach a PDF rendering.

    The parent text parser runs first. The resulting ``self.text`` is then
    posted to the ``/convert`` route of ``settings.RECHNUNGLESS_ENDPOINT``.
    When the service reports a failed conversion or answers with HTTP 422,
    the document is kept as plain text and ``self.is_invoice`` stays False.
    Otherwise the base64 encoded PDF from the response is written to
    ``invoice.pdf`` in the parser's temp dir and used as the archive file,
    and the invoice issue date (format ``YYYYMMDD``) is stored in
    ``self.date`` when present.

    :param document_path: path of the XML file to parse
    :param mime_type: mime type of the document, passed to the parent parser
    :param file_name: optional original file name, passed to the parent parser
    :raises ParseError: if the service is unreachable, answers with an
        unexpected status code, or returns a malformed response
    """
    super().parse(document_path, mime_type, file_name)
    self.is_invoice = False

    # Tolerate a configured endpoint with a trailing slash
    url = settings.RECHNUNGLESS_ENDPOINT.rstrip("/") + "/convert"
    try:
        http_response = httpx.post(
            url,
            headers={"Content-Type": "application/xml"},
            # content= sends the raw request body; data= is meant for form fields
            content=self.text,
            timeout=60.0,
        )
    except httpx.HTTPError as err:
        # Connection problems and timeouts should surface as parser errors
        raise ParseError(
            f"Error while contacting Rechnungless at {url}: {err}",
        ) from err

    status = http_response.status_code
    if status == httpx.codes.INTERNAL_SERVER_ERROR:
        raise ParseError("Server Error: " + http_response.text)
    if status not in (httpx.codes.OK, httpx.codes.UNPROCESSABLE_ENTITY):
        raise ParseError(f"Unknown Error: HTTP {status} {http_response.text}")

    try:
        response = json.loads(http_response.content)
    except ValueError as err:
        raise ParseError("Rechnungless returned an invalid JSON response") from err
    if not isinstance(response, dict):
        raise ParseError("Rechnungless returned an unexpected JSON response")

    raw_messages = response.get("messages") or []
    if isinstance(raw_messages, str):
        raw_messages = [raw_messages]
    messages = [str(msg) for msg in raw_messages]
    result = response.get("result")

    if result == "failed":
        message = "\n".join(["Conversion failed: ", *messages])
        self.log.info(f"Invalid schema: {message}")
        return

    if status == httpx.codes.UNPROCESSABLE_ENTITY:
        message = "\n".join(["The XML file is not valid:", *messages])
        self.log.info(f"Invalid schema: {message}")
        return

    if result == "invalid":
        # A PDF is still produced in this case, so only warn
        self.log.warning(
            f"The file received is technically invalid: {http_response.text}",
        )

    try:
        archive_pdf = base64.b64decode(response["archive_pdf"])
    except (KeyError, TypeError, ValueError) as err:
        raise ParseError(
            "Rechnungless response does not contain a valid archive PDF",
        ) from err

    archive_path = Path(self.tempdir, "invoice.pdf")
    archive_path.write_bytes(archive_pdf)
    self.archive_path = archive_path
    self.is_invoice = True

    issue_date = response.get("issue_date")
    if issue_date:
        try:
            parsed_date = datetime.strptime(issue_date, "%Y%m%d")
        except (TypeError, ValueError):
            # The date is optional metadata; a malformed value must not fail the parse
            self.log.warning(f"Ignoring unparsable issue date: {issue_date!r}")
        else:
            self.date = make_aware(parsed_date) if is_naive(parsed_date) else parsed_date