mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-29 11:09:27 -05:00
refactor: move xml parser to separate container with rechnungless
This commit is contained in:
parent
01c502a0e4
commit
5699b58fae
@ -118,9 +118,7 @@ ARG RUNTIME_PACKAGES="\
|
|||||||
zlib1g \
|
zlib1g \
|
||||||
# Barcode splitter
|
# Barcode splitter
|
||||||
libzbar0 \
|
libzbar0 \
|
||||||
poppler-utils \
|
poppler-utils"
|
||||||
# XRechnung
|
|
||||||
default-jre"
|
|
||||||
|
|
||||||
# Install basic runtime packages.
|
# Install basic runtime packages.
|
||||||
# These change very infrequently
|
# These change very infrequently
|
||||||
@ -162,8 +160,6 @@ RUN set -eux \
|
|||||||
&& echo "Installing supervisor" \
|
&& echo "Installing supervisor" \
|
||||||
&& python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5
|
&& python3 -m pip install --default-timeout=1000 --upgrade --no-cache-dir supervisor==4.2.5
|
||||||
|
|
||||||
RUN curl -o /usr/local/bin/mustang-cli.jar https://github.com/ZUGFeRD/mustangproject/releases/download/core-2.15.1/Mustang-CLI-2.15.1.jar && chmod +x /usr/local/bin/mustang-cli.jar
|
|
||||||
|
|
||||||
# Copy gunicorn config
|
# Copy gunicorn config
|
||||||
# Changes very infrequently
|
# Changes very infrequently
|
||||||
WORKDIR /usr/src/paperless/
|
WORKDIR /usr/src/paperless/
|
||||||
|
10
docker/compose/docker-compose.rechnungless.yml
Normal file
10
docker/compose/docker-compose.rechnungless.yml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
services:
|
||||||
|
rechnungless:
|
||||||
|
image: marcel2508/rechnungless:latest
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
# REQUIRED FOR APPLE M4 CHIP / MACOS 15.2
|
||||||
|
JAVA_OPTS: -XX:UseSVE=0
|
||||||
|
webserver:
|
||||||
|
environment:
|
||||||
|
# Must match the variable name read by the Paperless settings module.
PAPERLESS_RECHNUNGLESS_ENABLED: 1
|
@ -198,6 +198,18 @@ Docker, this may be the `environment` key of the webserver or a
|
|||||||
containing the configuration parameters. Be sure to use the correct format
|
containing the configuration parameters. Be sure to use the correct format
|
||||||
and watch out for indentation if editing the YAML file.
|
and watch out for indentation if editing the YAML file.
|
||||||
|
|
||||||
|
#### [`PAPERLESS_RECHNUNGLESS_ENABLED=<bool>`](#PAPERLESS_RECHNUNGLESS_ENABLED) {#PAPERLESS_RECHNUNGLESS_ENABLED}
|
||||||
|
|
||||||
|
: Enable (or disable) the Rechnungless XML to PDF converter and validator.
|
||||||
|
|
||||||
|
Defaults to false.
|
||||||
|
|
||||||
|
#### [`PAPERLESS_RECHNUNGLESS_ENDPOINT=<url>`](#PAPERLESS_RECHNUNGLESS_ENDPOINT) {#PAPERLESS_RECHNUNGLESS_ENDPOINT}
|
||||||
|
|
||||||
|
: Set the endpoint URL where Paperless can reach your Rechnungless API server.
|
||||||
|
|
||||||
|
Defaults to "<http://rechnungless:8080/rechnungless>".
|
||||||
|
|
||||||
## Paths and folders
|
## Paths and folders
|
||||||
|
|
||||||
#### [`PAPERLESS_CONSUMPTION_DIR=<path>`](#PAPERLESS_CONSUMPTION_DIR) {#PAPERLESS_CONSUMPTION_DIR}
|
#### [`PAPERLESS_CONSUMPTION_DIR=<path>`](#PAPERLESS_CONSUMPTION_DIR) {#PAPERLESS_CONSUMPTION_DIR}
|
||||||
|
@ -142,6 +142,14 @@ echo ""
|
|||||||
ask "Enable Apache Tika?" "no" "yes no"
|
ask "Enable Apache Tika?" "no" "yes no"
|
||||||
TIKA_ENABLED=$ask_result
|
TIKA_ENABLED=$ask_result
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "Paperless is able to use Mustang Library to support XML files in XInvoice schema"
|
||||||
|
echo "This feature requires more resources due to the required services."
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
ask "Enable Rechnungless XInvoice service?" "no" "yes no"
|
||||||
|
RECHNUNGLESS_ENABLED=$ask_result
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Specify the default language that most of your documents are written in."
|
echo "Specify the default language that most of your documents are written in."
|
||||||
echo "Use ISO 639-2, (T) variant language codes: "
|
echo "Use ISO 639-2, (T) variant language codes: "
|
||||||
@ -322,6 +330,10 @@ if [[ $TIKA_ENABLED == "yes" ]] ; then
|
|||||||
DOCKER_COMPOSE_VERSION="$DOCKER_COMPOSE_VERSION-tika"
|
DOCKER_COMPOSE_VERSION="$DOCKER_COMPOSE_VERSION-tika"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
|
||||||
|
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.rechnungless.yml" -O docker-compose.rechnungless.yml
|
||||||
|
fi
|
||||||
|
|
||||||
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.$DOCKER_COMPOSE_VERSION.yml" -O docker-compose.yml
|
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/docker-compose.$DOCKER_COMPOSE_VERSION.yml" -O docker-compose.yml
|
||||||
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/.env" -O .env
|
wget "https://raw.githubusercontent.com/paperless-ngx/paperless-ngx/main/docker/compose/.env" -O .env
|
||||||
|
|
||||||
@ -391,6 +403,9 @@ if [ "$l1" -eq "$l2" ] ; then
|
|||||||
sed -i "/^volumes:/d" docker-compose.yml
|
sed -i "/^volumes:/d" docker-compose.yml
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
|
||||||
|
# The rechnungless file is only an override (its webserver service has no image),
# so it has to be layered on top of the base compose file.
docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml pull
|
||||||
|
fi
|
||||||
|
|
||||||
docker compose pull
|
docker compose pull
|
||||||
|
|
||||||
@ -404,4 +419,8 @@ fi
|
|||||||
|
|
||||||
docker compose run --rm -e DJANGO_SUPERUSER_PASSWORD="$PASSWORD" webserver createsuperuser --noinput --username "$USERNAME" --email "$EMAIL"
|
docker compose run --rm -e DJANGO_SUPERUSER_PASSWORD="$PASSWORD" webserver createsuperuser --noinput --username "$USERNAME" --email "$EMAIL"
|
||||||
|
|
||||||
|
if [[ $RECHNUNGLESS_ENABLED == "yes" ]] ; then
|
||||||
|
# -f is a global `docker compose` option and must precede the `up` subcommand.
docker compose -f docker-compose.yml -f docker-compose.rechnungless.yml up --detach
|
||||||
|
else
|
||||||
docker compose up --detach
|
docker compose up --detach
|
||||||
|
fi
|
||||||
|
@ -88,6 +88,11 @@
|
|||||||
#PAPERLESS_TIKA_ENDPOINT=http://localhost:9998
|
#PAPERLESS_TIKA_ENDPOINT=http://localhost:9998
|
||||||
#PAPERLESS_TIKA_GOTENBERG_ENDPOINT=http://localhost:3000
|
#PAPERLESS_TIKA_GOTENBERG_ENDPOINT=http://localhost:3000
|
||||||
|
|
||||||
|
# Rechnungless settings
|
||||||
|
|
||||||
|
#PAPERLESS_RECHNUNGLESS_ENABLED=false
|
||||||
|
#PAPERLESS_RECHNUNGLESS_ENDPOINT=http://rechnungless:8080/rechnungless
|
||||||
|
|
||||||
# Binaries
|
# Binaries
|
||||||
|
|
||||||
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
|
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
|
||||||
|
@ -318,7 +318,6 @@ INSTALLED_APPS = [
|
|||||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||||
"paperless_text.apps.PaperlessTextConfig",
|
"paperless_text.apps.PaperlessTextConfig",
|
||||||
"paperless_mail.apps.PaperlessMailConfig",
|
"paperless_mail.apps.PaperlessMailConfig",
|
||||||
"paperless_xml.apps.PaperlessXMLConfig",
|
|
||||||
"django.contrib.admin",
|
"django.contrib.admin",
|
||||||
"rest_framework",
|
"rest_framework",
|
||||||
"rest_framework.authtoken",
|
"rest_framework.authtoken",
|
||||||
@ -1090,6 +1089,16 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
|||||||
if TIKA_ENABLED:
|
if TIKA_ENABLED:
|
||||||
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
||||||
|
|
||||||
|
# XML / RECHNUNGLESS settings
|
||||||
|
RECHNUNGLESS_ENABLED = __get_boolean("PAPERLESS_RECHNUNGLESS_ENABLED", "NO")
|
||||||
|
RECHNUNGLESS_ENDPOINT = os.getenv(
|
||||||
|
"PAPERLESS_RECHNUNGLESS_ENDPOINT",
|
||||||
|
"http://rechnungless:8080/rechnungless",
|
||||||
|
)
|
||||||
|
|
||||||
|
if RECHNUNGLESS_ENABLED:
|
||||||
|
INSTALLED_APPS.append("paperless_xml.apps.PaperlessXMLConfig")
|
||||||
|
|
||||||
AUDIT_LOG_ENABLED = __get_boolean("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
AUDIT_LOG_ENABLED = __get_boolean("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
||||||
if AUDIT_LOG_ENABLED:
|
if AUDIT_LOG_ENABLED:
|
||||||
INSTALLED_APPS.append("auditlog")
|
INSTALLED_APPS.append("auditlog")
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from django.apps import AppConfig
|
from django.apps import AppConfig
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
from paperless_xml.signals import xml_consumer_declaration
|
from paperless_xml.signals import xml_consumer_declaration
|
||||||
|
|
||||||
@ -9,6 +10,7 @@ class PaperlessXMLConfig(AppConfig):
|
|||||||
def ready(self):
|
def ready(self):
|
||||||
from documents.signals import document_consumer_declaration
|
from documents.signals import document_consumer_declaration
|
||||||
|
|
||||||
|
if settings.RECHNUNGLESS_ENABLED:
|
||||||
document_consumer_declaration.connect(xml_consumer_declaration)
|
document_consumer_declaration.connect(xml_consumer_declaration)
|
||||||
|
|
||||||
AppConfig.ready(self)
|
AppConfig.ready(self)
|
||||||
|
@ -1,6 +1,13 @@
|
|||||||
import subprocess
|
import base64
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from django.conf import settings
|
||||||
|
from django.utils.timezone import is_naive
|
||||||
|
from django.utils.timezone import make_aware
|
||||||
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.parsers import make_thumbnail_from_pdf
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
from paperless_text.parsers import TextDocumentParser
|
from paperless_text.parsers import TextDocumentParser
|
||||||
@ -25,83 +32,58 @@ class XMLDocumentParser(TextDocumentParser):
|
|||||||
else:
|
else:
|
||||||
return super().get_thumbnail(document_path, mime_type, file_name)
|
return super().get_thumbnail(document_path, mime_type, file_name)
|
||||||
|
|
||||||
def xml_to_pdf_mustang(
|
|
||||||
self,
|
|
||||||
document_path: Path,
|
|
||||||
mime_type,
|
|
||||||
file_name=None,
|
|
||||||
) -> Path:
|
|
||||||
outpdf = Path(self.tempdir, "out.pdf")
|
|
||||||
res = subprocess.run(
|
|
||||||
[
|
|
||||||
"mustang-cli.jar",
|
|
||||||
"--action",
|
|
||||||
"pdf",
|
|
||||||
"--source",
|
|
||||||
document_path,
|
|
||||||
"--out",
|
|
||||||
outpdf,
|
|
||||||
],
|
|
||||||
timeout=20,
|
|
||||||
)
|
|
||||||
if res.returncode != 0:
|
|
||||||
raise ParseError("Mustang CLI exited with code: " + str(res.returncode))
|
|
||||||
else:
|
|
||||||
return outpdf
|
|
||||||
|
|
||||||
def attach_xml_pdf_mustang(self, pdf_path, xml_path) -> Path:
|
|
||||||
outpdf = Path(self.tempdir, "combined.pdf")
|
|
||||||
res = subprocess.run(
|
|
||||||
[
|
|
||||||
"mustang-cli.jar",
|
|
||||||
"--action",
|
|
||||||
"combine",
|
|
||||||
"--source",
|
|
||||||
pdf_path,
|
|
||||||
"--source-xml",
|
|
||||||
xml_path,
|
|
||||||
"--format",
|
|
||||||
"zf",
|
|
||||||
"--version",
|
|
||||||
"2",
|
|
||||||
"--profile",
|
|
||||||
"X",
|
|
||||||
"--no-additional-attachments",
|
|
||||||
"--out",
|
|
||||||
outpdf,
|
|
||||||
],
|
|
||||||
timeout=20,
|
|
||||||
)
|
|
||||||
if res.returncode != 0:
|
|
||||||
raise ParseError("Mustang CLI exited with code: " + str(res.returncode))
|
|
||||||
else:
|
|
||||||
return outpdf
|
|
||||||
|
|
||||||
def is_xrechnung_mustang(
|
|
||||||
self,
|
|
||||||
document_path: Path,
|
|
||||||
mime_type,
|
|
||||||
file_name=None,
|
|
||||||
) -> bool:
|
|
||||||
res = subprocess.run(
|
|
||||||
[
|
|
||||||
"mustang-cli.jar",
|
|
||||||
"--action",
|
|
||||||
"validate",
|
|
||||||
"--source",
|
|
||||||
document_path,
|
|
||||||
"--no-notices",
|
|
||||||
],
|
|
||||||
timeout=20,
|
|
||||||
)
|
|
||||||
return res.returncode == 0
|
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
super().parse(document_path, mime_type, file_name)
|
super().parse(document_path, mime_type, file_name)
|
||||||
if self.is_xrechnung_mustang(document_path, mime_type, file_name):
|
|
||||||
self.is_invoice = True
|
|
||||||
pdfOnly = self.xml_to_pdf_mustang(document_path, mime_type, file_name)
|
|
||||||
pdfWith = self.attach_xml_pdf_mustang(pdfOnly, document_path)
|
|
||||||
self.archive_path = pdfWith
|
|
||||||
else:
|
|
||||||
self.is_invoice = False
|
self.is_invoice = False
|
||||||
|
|
||||||
|
header = {"Content-Type": "application/xml"}
|
||||||
|
url = settings.RECHNUNGLESS_ENDPOINT
|
||||||
|
httpResponse = httpx.post(
|
||||||
|
url + "/convert",
|
||||||
|
headers=header,
|
||||||
|
data=self.text,
|
||||||
|
timeout=60.0,
|
||||||
|
)
|
||||||
|
if httpResponse.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
|
||||||
|
raise ParseError("Server Error: " + str(httpResponse.content))
|
||||||
|
if httpResponse.status_code not in (
|
||||||
|
httpx.codes.OK,
|
||||||
|
httpx.codes.UNPROCESSABLE_ENTITY,
|
||||||
|
):
|
||||||
|
raise ParseError(
|
||||||
|
"Unknown Error: HTTP"
|
||||||
|
+ str(httpResponse.status_code)
|
||||||
|
+ " "
|
||||||
|
+ str(httpResponse.content),
|
||||||
|
)
|
||||||
|
response = json.loads(httpResponse.content)
|
||||||
|
|
||||||
|
if response["result"] == "failed":
|
||||||
|
message = "Conversion failed: \n"
|
||||||
|
for msg in response["messages"]:
|
||||||
|
message += msg
|
||||||
|
self.log.info(f"Invalid schema: {message}")
|
||||||
|
self.is_invoice = False
|
||||||
|
return
|
||||||
|
if httpResponse.status_code == httpx.codes.UNPROCESSABLE_ENTITY:
|
||||||
|
message = "The XML file is not valid:"
|
||||||
|
for msg in response["messages"]:
|
||||||
|
message += "\n" + msg
|
||||||
|
self.log.info(f"Invalid schema: {message}")
|
||||||
|
self.is_invoice = False
|
||||||
|
return
|
||||||
|
|
||||||
|
if response["result"] == "invalid":
|
||||||
|
contStr = str(httpResponse.content)
|
||||||
|
self.log.warning(f"The file received is technically invalid: {contStr}")
|
||||||
|
|
||||||
|
self.archive_path = Path(self.tempdir, "invoice.pdf")
|
||||||
|
self.is_invoice = True
|
||||||
|
|
||||||
|
with self.archive_path.open("wb") as archiveFile:
|
||||||
|
archiveFile.write(base64.b64decode(response["archive_pdf"]))
|
||||||
|
|
||||||
|
if "issue_date" in response:
|
||||||
|
self.date = datetime.strptime(response["issue_date"], "%Y%m%d")
|
||||||
|
if is_naive(self.date):
|
||||||
|
self.date = make_aware(self.date)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user