mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-29 11:09:27 -05:00
refactor: move xml parser to separate container with rechnungless
This commit is contained in:
parent
01c502a0e4
commit
5699b58fae
@ -118,9 +118,7 @@ ARG RUNTIME_PACKAGES="\
|
||||
zlib1g \
|
||||
# Barcode splitter
|
||||
libzbar0 \
|
||||
poppler-utils \
|
||||
# XRechnung
|
||||
default-jre"
|
||||
poppler-utils"
|
||||
|
||||
# Install basic runtime packages.
|
||||
# These change very infrequently
|
||||
@ -162,8 +160,6 @@ RUN set -eux \
|
||||
&& echo "Installing supervisor" \
|
||||
&& python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5
|
||||
|
||||
RUN curl -o /usr/local/bin/mustang-cli.jar https://github.com/ZUGFeRD/mustangproject/releases/download/core-2.15.1/Mustang-CLI-2.15.1.jar && chmod +x /usr/local/bin/mustang-cli.jar
|
||||
|
||||
# Copy gunicorn config
|
||||
# Changes very infrequently
|
||||
WORKDIR /usr/src/paperless/
|
||||
|
10
docker/compose/docker-compose.rechnungless.yml
Normal file
10
docker/compose/docker-compose.rechnungless.yml
Normal file
@ -0,0 +1,10 @@
|
||||
services:
|
||||
rechnungless:
|
||||
image: marcel2508/rechnungless:latest
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
# REQUIRED FOR APPLE M4 CHIP / MACOS 15.2
|
||||
JAVA_OPTS: -XX:UseSVE=0
|
||||
webserver:
|
||||
environment:
|
||||
# Must match the variable name read by the settings module (PAPERLESS_RECHNUNGLESS_ENABLED).
PAPERLESS_RECHNUNGLESS_ENABLED: 1
|
@ -198,6 +198,18 @@ Docker, this may be the `environment` key of the webserver or a
|
||||
containing the configuration parameters. Be sure to use the correct format
|
||||
and watch out for indentation if editing the YAML file.
|
||||
|
||||
#### [`PAPERLESS_RECHNUNGLESS_ENABLED=<bool>`](#PAPERLESS_RECHNUNGLESS_ENABLED) {#PAPERLESS_RECHNUNGLESS_ENABLED}
|
||||
|
||||
: Enable (or disable) the Rechnungless XML-to-PDF converter and validator.
|
||||
|
||||
Defaults to false.
|
||||
|
||||
#### [`PAPERLESS_RECHNUNGLESS_ENDPOINT=<url>`](#PAPERLESS_RECHNUNGLESS_ENDPOINT) {#PAPERLESS_RECHNUNGLESS_ENDPOINT}
|
||||
|
||||
: Set the endpoint URL where Paperless can reach your Rechnungless API server.
|
||||
|
||||
Defaults to "<http://rechnungless:8080/rechnungless>".
|
||||
|
||||
## Paths and folders
|
||||
|
||||
#### [`PAPERLESS_CONSUMPTION_DIR=<path>`](#PAPERLESS_CONSUMPTION_DIR) {#PAPERLESS_CONSUMPTION_DIR}
|
||||
|
@ -142,6 +142,14 @@ echo ""
|
||||
ask "Enable Apache Tika?" "no" "yes no"
|
||||
TIKA_ENABLED=$ask_result
|
||||
|
||||
echo ""
|
||||
echo "Paperless is able to use Mustang Library to support XML files in XInvoice schema"
|
||||
echo "This feature requires more resources due to the required services."
|
||||
echo ""
|
||||
|
||||
ask "Enable Rechnungless XInvoice service?" "no" "yes no"
|
||||
RECHNUNGLESS_ENABLED=$ask_result
|
||||
|
||||
echo ""
|
||||
echo "Specify the default language that most of your documents are written in."
|
||||
echo "Use ISO 639-2, (T) variant language codes: "
|
||||
@ -322,6 +330,10 @@ if [[ $TIKA_ENABLED == "yes" ]] ; then
|
||||
DOCKER_COMPOSE_VERSION="$DOCKER_COMPOSE_VERSION-tika"
|
||||
fi
|
||||
|
||||
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
|
||||
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.rechnungless.yml" -O docker-compose.rechnungless.yml
|
||||
fi
|
||||
|
||||
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.$DOCKER_COMPOSE_VERSION.yml" -O docker-compose.yml
|
||||
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/.env" -O .env
|
||||
|
||||
@ -391,6 +403,9 @@ if [ "$l1" -eq "$l2" ] ; then
|
||||
sed -i "/^volumes:/d" docker-compose.yml
|
||||
fi
|
||||
|
||||
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
|
||||
# The rechnungless file is only an override (its webserver service has no image),
# so it must be combined with the base compose file to form a valid project.
docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml pull
|
||||
fi
|
||||
|
||||
docker compose pull
|
||||
|
||||
@ -404,4 +419,8 @@ fi
|
||||
|
||||
docker compose run --rm -e DJANGO_SUPERUSER_PASSWORD="$PASSWORD" webserver createsuperuser --noinput --username "$USERNAME" --email "$EMAIL"
|
||||
|
||||
docker compose up --detach
|
||||
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
|
||||
# -f is a global `docker compose` option and must precede the `up` subcommand.
docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml up --detach
|
||||
else
|
||||
docker compose up --detach
|
||||
fi
|
||||
|
@ -88,6 +88,11 @@
|
||||
#PAPERLESS_TIKA_ENDPOINT=http://localhost:9998
|
||||
#PAPERLESS_TIKA_GOTENBERG_ENDPOINT=http://localhost:3000
|
||||
|
||||
# Rechnungless settings
|
||||
|
||||
#PAPERLESS_RECHNUNGLESS_ENABLED=false
|
||||
#PAPERLESS_RECHNUNGLESS_ENDPOINT=http://rechnungless:8080/rechnungless
|
||||
|
||||
# Binaries
|
||||
|
||||
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
|
||||
|
@ -318,7 +318,6 @@ INSTALLED_APPS = [
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
"paperless_xml.apps.PaperlessXMLConfig",
|
||||
"django.contrib.admin",
|
||||
"rest_framework",
|
||||
"rest_framework.authtoken",
|
||||
@ -1090,6 +1089,16 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
||||
if TIKA_ENABLED:
|
||||
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
||||
|
||||
# XML / RECHNUNGLESS settings
|
||||
RECHNUNGLESS_ENABLED = __get_boolean("PAPERLESS_RECHNUNGLESS_ENABLED", "NO")
|
||||
RECHNUNGLESS_ENDPOINT = os.getenv(
|
||||
"PAPERLESS_RECHNUNGLESS_ENDPOINT",
|
||||
"http://rechnungless:8080/rechnungless",
|
||||
)
|
||||
|
||||
if RECHNUNGLESS_ENABLED:
|
||||
INSTALLED_APPS.append("paperless_xml.apps.PaperlessXMLConfig")
|
||||
|
||||
AUDIT_LOG_ENABLED = __get_boolean("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
||||
if AUDIT_LOG_ENABLED:
|
||||
INSTALLED_APPS.append("auditlog")
|
||||
|
@ -1,4 +1,5 @@
|
||||
from django.apps import AppConfig
|
||||
from django.conf import settings
|
||||
|
||||
from paperless_xml.signals import xml_consumer_declaration
|
||||
|
||||
@ -9,6 +10,7 @@ class PaperlessXMLConfig(AppConfig):
|
||||
def ready(self):
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
document_consumer_declaration.connect(xml_consumer_declaration)
|
||||
if settings.RECHNUNGLESS_ENABLED:
|
||||
document_consumer_declaration.connect(xml_consumer_declaration)
|
||||
|
||||
AppConfig.ready(self)
|
||||
|
@ -1,6 +1,13 @@
|
||||
import subprocess
|
||||
import base64
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from django.conf import settings
|
||||
from django.utils.timezone import is_naive
|
||||
from django.utils.timezone import make_aware
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
@ -25,83 +32,58 @@ class XMLDocumentParser(TextDocumentParser):
|
||||
else:
|
||||
return super().get_thumbnail(document_path, mime_type, file_name)
|
||||
|
||||
def xml_to_pdf_mustang(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type,
|
||||
file_name=None,
|
||||
) -> Path:
|
||||
outpdf = Path(self.tempdir, "out.pdf")
|
||||
res = subprocess.run(
|
||||
[
|
||||
"mustang-cli.jar",
|
||||
"--action",
|
||||
"pdf",
|
||||
"--source",
|
||||
document_path,
|
||||
"--out",
|
||||
outpdf,
|
||||
],
|
||||
timeout=20,
|
||||
)
|
||||
if res.returncode != 0:
|
||||
raise ParseError("Mustang CLI exited with code: " + str(res.returncode))
|
||||
else:
|
||||
return outpdf
|
||||
|
||||
def attach_xml_pdf_mustang(self, pdf_path, xml_path) -> Path:
|
||||
outpdf = Path(self.tempdir, "combined.pdf")
|
||||
res = subprocess.run(
|
||||
[
|
||||
"mustang-cli.jar",
|
||||
"--action",
|
||||
"combine",
|
||||
"--source",
|
||||
pdf_path,
|
||||
"--source-xml",
|
||||
xml_path,
|
||||
"--format",
|
||||
"zf",
|
||||
"--version",
|
||||
"2",
|
||||
"--profile",
|
||||
"X",
|
||||
"--no-additional-attachments",
|
||||
"--out",
|
||||
outpdf,
|
||||
],
|
||||
timeout=20,
|
||||
)
|
||||
if res.returncode != 0:
|
||||
raise ParseError("Mustang CLI exited with code: " + str(res.returncode))
|
||||
else:
|
||||
return outpdf
|
||||
|
||||
def is_xrechnung_mustang(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type,
|
||||
file_name=None,
|
||||
) -> bool:
|
||||
res = subprocess.run(
|
||||
[
|
||||
"mustang-cli.jar",
|
||||
"--action",
|
||||
"validate",
|
||||
"--source",
|
||||
document_path,
|
||||
"--no-notices",
|
||||
],
|
||||
timeout=20,
|
||||
)
|
||||
return res.returncode == 0
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
super().parse(document_path, mime_type, file_name)
|
||||
if self.is_xrechnung_mustang(document_path, mime_type, file_name):
|
||||
self.is_invoice = True
|
||||
pdfOnly = self.xml_to_pdf_mustang(document_path, mime_type, file_name)
|
||||
pdfWith = self.attach_xml_pdf_mustang(pdfOnly, document_path)
|
||||
self.archive_path = pdfWith
|
||||
else:
|
||||
self.is_invoice = False
|
||||
|
||||
header = {"Content-Type": "application/xml"}
|
||||
url = settings.RECHNUNGLESS_ENDPOINT
|
||||
# POST the XML document as the raw request body to the Rechnungless
# /convert endpoint. httpx takes raw text/bytes via `content=`; `data=` is
# meant for form fields and is deprecated for raw payloads.
# NOTE(review): assumes self.text holds the XML source after super().parse() — confirm.
httpResponse = httpx.post(
    url + "/convert",
    headers=header,
    content=self.text,
    timeout=60.0,
)
|
||||
if httpResponse.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
|
||||
raise ParseError("Server Error: " + str(httpResponse.content))
|
||||
if httpResponse.status_code not in (
|
||||
httpx.codes.OK,
|
||||
httpx.codes.UNPROCESSABLE_ENTITY,
|
||||
):
|
||||
raise ParseError(
|
||||
"Unknown Error: HTTP"
|
||||
+ str(httpResponse.status_code)
|
||||
+ " "
|
||||
+ str(httpResponse.content),
|
||||
)
|
||||
response = json.loads(httpResponse.content)
|
||||
|
||||
if response["result"] == "failed":
|
||||
message = "Conversion failed: \n"
|
||||
for msg in response["messages"]:
|
||||
message += msg
|
||||
self.log.info(f"Invalid schema: {message}")
|
||||
self.is_invoice = False
|
||||
return
|
||||
if httpResponse.status_code == httpx.codes.UNPROCESSABLE_ENTITY:
|
||||
message = "The XML file is not valid:"
|
||||
for msg in response["messages"]:
|
||||
message += "\n" + msg
|
||||
self.log.info(f"Invalid schema: {message}")
|
||||
self.is_invoice = False
|
||||
return
|
||||
|
||||
if response["result"] == "invalid":
|
||||
contStr = str(httpResponse.content)
|
||||
self.log.warning(f"The file received is technically invalid: {contStr}")
|
||||
|
||||
self.archive_path = Path(self.tempdir, "invoice.pdf")
|
||||
self.is_invoice = True
|
||||
|
||||
with self.archive_path.open("wb") as archiveFile:
|
||||
archiveFile.write(base64.b64decode(response["archive_pdf"]))
|
||||
|
||||
if "issue_date" in response:
|
||||
self.date = datetime.strptime(response["issue_date"], "%Y%m%d")
|
||||
if is_naive(self.date):
|
||||
self.date = make_aware(self.date)
|
||||
|
Loading…
x
Reference in New Issue
Block a user