From b8e8bf3dd47d738f2e4c8ca67f14b6e8131071dc Mon Sep 17 00:00:00 2001 From: Jo Vandeginste Date: Tue, 29 Dec 2020 01:23:40 +0100 Subject: [PATCH 1/2] Add the new paperless_tika parser This parser will use an external Tika and Gotenberg server to parse "Office" documents (.doc, .xls, .odt, etc.) Signed-off-by: Jo Vandeginste --- Pipfile | 1 + docker/hub/docker-compose.tika.yml | 43 ++++++++++ docker/local/docker-compose.tika.yml | 43 ++++++++++ docs/configuration.rst | 29 +++++++ src/paperless/settings.py | 5 ++ src/paperless_tika/apps.py | 14 ++++ src/paperless_tika/parsers.py | 118 +++++++++++++++++++++++++++ src/paperless_tika/signals.py | 20 +++++ src/paperless_tika/test.py | 3 + 9 files changed, 276 insertions(+) create mode 100644 docker/hub/docker-compose.tika.yml create mode 100644 docker/local/docker-compose.tika.yml create mode 100644 src/paperless_tika/apps.py create mode 100644 src/paperless_tika/parsers.py create mode 100644 src/paperless_tika/signals.py create mode 100644 src/paperless_tika/test.py diff --git a/Pipfile b/Pipfile index 48759307c..f6301b98f 100644 --- a/Pipfile +++ b/Pipfile @@ -42,6 +42,7 @@ whoosh="~=2.7.4" inotifyrecursive = "~=0.3.4" ocrmypdf = "*" tqdm = "*" +tika = "*" [dev-packages] coveralls = "*" diff --git a/docker/hub/docker-compose.tika.yml b/docker/hub/docker-compose.tika.yml new file mode 100644 index 000000000..04dd3260e --- /dev/null +++ b/docker/hub/docker-compose.tika.yml @@ -0,0 +1,43 @@ +version: "3.4" +services: + broker: + image: redis:6.0 + restart: always + + webserver: + image: jonaswinkler/paperless-ng:0.9.9 + restart: always + depends_on: + - broker + ports: + - 8000:8000 + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000"] + interval: 30s + timeout: 10s + retries: 5 + volumes: + - data:/usr/src/paperless/data + - media:/usr/src/paperless/media + - ./export:/usr/src/paperless/export + - ./consume:/usr/src/paperless/consume + env_file: docker-compose.env + environment: + PAPERLESS_REDIS: 
redis://broker:6379 + PAPERLESS_TIKA: 1 + GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 + TIKA_SERVER_ENDPOINT: http://tika:9998 + + gotenberg: + image: thecodingmachine/gotenberg + restart: unless-stopped + environment: + DISABLE_GOOGLE_CHROME: 1 + + tika: + image: apache/tika + restart: unless-stopped + +volumes: + data: + media: diff --git a/docker/local/docker-compose.tika.yml b/docker/local/docker-compose.tika.yml new file mode 100644 index 000000000..ab901f306 --- /dev/null +++ b/docker/local/docker-compose.tika.yml @@ -0,0 +1,43 @@ +version: "3.4" +services: + broker: + image: redis:6.0 + restart: always + + webserver: + build: . + restart: always + depends_on: + - broker + ports: + - 8000:8000 + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000"] + interval: 30s + timeout: 10s + retries: 5 + volumes: + - data:/usr/src/paperless/data + - media:/usr/src/paperless/media + - ./export:/usr/src/paperless/export + - ./consume:/usr/src/paperless/consume + env_file: docker-compose.env + environment: + PAPERLESS_REDIS: redis://broker:6379 + PAPERLESS_TIKA: 1 + GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 + TIKA_SERVER_ENDPOINT: http://tika:9998 + + gotenberg: + image: thecodingmachine/gotenberg + restart: unless-stopped + environment: + DISABLE_GOOGLE_CHROME: 1 + + tika: + image: apache/tika + restart: unless-stopped + +volumes: + data: + media: diff --git a/docs/configuration.rst b/docs/configuration.rst index 5ccb80b3a..f53acb633 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -277,6 +277,35 @@ PAPERLESS_OCR_USER_ARG= {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"} +.. _configuration-tika: + +Tika settings +############# + +Paperless can make use of `Tika <https://tika.apache.org/>`_ and +`Gotenberg <https://thecodingmachine.github.io/gotenberg/>`_ for parsing and +converting "Office" documents (such as ".doc", ".xlsx" and ".odt"). If you +wish to use this, you must provide a Tika server and a Gotenberg server, +configure their endpoints, and enable the feature. 
+ +If you run paperless on docker, you can add those services to the docker-compose +file (see the examples provided). + +PAPERLESS_TIKA= + Enable (or disable) the Tika parser. + + Defaults to false. + +TIKA_SERVER_ENDPOINT= + Set the endpoint URL were Paperless can reach your Tika server. + + Defaults to "http://localhost:9998". + +GOTENBERG_SERVER_ENDPOINT= + Set the endpoint URL were Paperless can reach your Gotenberg server. + + Defaults to "http://localhost:3000". + Software tweaks ############### diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 5af1be85e..219166cf5 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -87,6 +87,7 @@ INSTALLED_APPS = [ "documents.apps.DocumentsConfig", "paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_text.apps.PaperlessTextConfig", + "paperless_tika.apps.PaperlessTikaConfig", "paperless_mail.apps.PaperlessMailConfig", "django.contrib.admin", @@ -424,3 +425,7 @@ for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")): PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") THUMBNAIL_FONT_NAME = os.getenv("PAPERLESS_THUMBNAIL_FONT_NAME", "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf") + +# Tika settings +PAPERLESS_TIKA = __get_boolean("PAPERLESS_TIKA", "NO") +GOTENBERG_SERVER_ENDPOINT = os.getenv("GOTENBERG_SERVER_ENDPOINT", "http://localhost:3000") diff --git a/src/paperless_tika/apps.py b/src/paperless_tika/apps.py new file mode 100644 index 000000000..c29586d6a --- /dev/null +++ b/src/paperless_tika/apps.py @@ -0,0 +1,14 @@ +from django.apps import AppConfig +from django.conf import settings +from paperless_tika.signals import tika_consumer_declaration + + +class PaperlessTikaConfig(AppConfig): + name = "paperless_tika" + + def ready(self): + from documents.signals import document_consumer_declaration + + if settings.PAPERLESS_TIKA: + document_consumer_declaration.connect(tika_consumer_declaration) + AppConfig.ready(self) diff 
--git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py new file mode 100644 index 000000000..5a77681f2 --- /dev/null +++ b/src/paperless_tika/parsers.py @@ -0,0 +1,118 @@ +import os +import subprocess +import tika +import requests +import dateutil.parser + +from PIL import ImageDraw, ImageFont, Image +from django.conf import settings + +from documents.parsers import DocumentParser, ParseError, run_convert +from paperless_tesseract.parsers import RasterisedDocumentParser +from tika import parser + + +class TikaDocumentParser(DocumentParser): + """ + This parser sends documents to a local tika server + """ + + def get_thumbnail(self, document_path, mime_type): + self.log("info", f"[TIKA_THUMB] Generating thumbnail for{document_path}") + archive_path = self.archive_path + + out_path = os.path.join(self.tempdir, "convert.png") + + # Run convert to get a decent thumbnail + try: + run_convert( + density=300, + scale="500x5000>", + alpha="remove", + strip=True, + trim=False, + input_file="{}[0]".format(archive_path), + output_file=out_path, + logging_group=self.logging_group, + ) + except ParseError: + # if convert fails, fall back to extracting + # the first PDF page as a PNG using Ghostscript + self.log( + "warning", + "Thumbnail generation with ImageMagick failed, falling back " + "to ghostscript. 
Check your /etc/ImageMagick-x/policy.xml!", + ) + gs_out_path = os.path.join(self.tempdir, "gs_out.png") + cmd = [ + settings.GS_BINARY, + "-q", + "-sDEVICE=pngalpha", + "-o", + gs_out_path, + archive_path, + ] + if not subprocess.Popen(cmd).wait() == 0: + raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) + # then run convert on the output from gs + run_convert( + density=300, + scale="500x5000>", + alpha="remove", + strip=True, + trim=False, + input_file=gs_out_path, + output_file=out_path, + logging_group=self.logging_group, + ) + + return out_path + + def parse(self, document_path, mime_type): + self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server") + + try: + parsed = parser.from_file(document_path) + except requests.exceptions.HTTPError as err: + raise ParseError(f"Could not parse {document_path} with tika server: {err}") + + try: + content = parsed["content"].strip() + except: + content = "" + + try: + creation_date = dateutil.parser.isoparse( + parsed["metadata"]["Creation-Date"] + ) + except: + creation_date = None + + archive_path = os.path.join(self.tempdir, "convert.pdf") + convert_to_pdf(self, document_path, archive_path) + + self.archive_path = archive_path + self.date = creation_date + self.text = content + + +def convert_to_pdf(self, document_path, pdf_path): + pdf_path = os.path.join(self.tempdir, "convert.pdf") + gotenberg_server = settings.GOTENBERG_SERVER_ENDPOINT + url = gotenberg_server + "/convert/office" + + self.log("info", f"[TIKA] Converting {document_path} to PDF as {pdf_path}") + files = {"files": open(document_path, "rb")} + headers = {} + + try: + response = requests.post(url, files=files, headers=headers) + response.raise_for_status() # ensure we notice bad responses + except requests.exceptions.HTTPError as err: + raise ParseError( + f"Could not contact gotenberg server at {gotenberg_server}: {err}" + ) + + file = open(pdf_path, "wb") + file.write(response.content) + file.close() diff --git 
a/src/paperless_tika/signals.py b/src/paperless_tika/signals.py new file mode 100644 index 000000000..409daebe2 --- /dev/null +++ b/src/paperless_tika/signals.py @@ -0,0 +1,20 @@ +from .parsers import TikaDocumentParser + + +def tika_consumer_declaration(sender, **kwargs): + return { + "parser": TikaDocumentParser, + "weight": 10, + "mime_types": { + "application/msword": ".doc", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", + "application/vnd.ms-excel": ".xls", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx", + "application/vnd.ms-powerpoint": ".ppt", + "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", + "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx", + "application/vnd.oasis.opendocument.presentation": ".odp", + "application/vnd.oasis.opendocument.spreadsheet": ".ods", + "application/vnd.oasis.opendocument.text": ".odt", + }, + } diff --git a/src/paperless_tika/test.py b/src/paperless_tika/test.py new file mode 100644 index 000000000..0a2885226 --- /dev/null +++ b/src/paperless_tika/test.py @@ -0,0 +1,3 @@ +import magic +m = magic.from_file("/nfsstorage/jo/syncthing/Documenten/20R-309.153.052.pdf") +print(m) From 5236f4e58d26508a4c4d06c1b2e8a4a18822fc2f Mon Sep 17 00:00:00 2001 From: Jo Vandeginste Date: Thu, 31 Dec 2020 14:41:47 +0100 Subject: [PATCH 2/2] Refactor after feedback: - rename PAPERLESS_TIKA to PAPERLESS_TIKA_ENABLED - all other env params now start with PAPERLESS_TIKA - convert_to_pdf as class instance method - smaller details Signed-off-by: Jo Vandeginste --- docker/hub/docker-compose.tika.yml | 6 +-- docker/local/docker-compose.tika.yml | 6 +-- docs/configuration.rst | 6 +-- src/paperless/settings.py | 7 ++- src/paperless_tika/apps.py | 2 +- src/paperless_tika/parsers.py | 65 +++++++++++++--------------- src/paperless_tika/test.py | 3 -- 7 files changed, 46 insertions(+), 49 deletions(-) delete mode 100644 
src/paperless_tika/test.py diff --git a/docker/hub/docker-compose.tika.yml b/docker/hub/docker-compose.tika.yml index 04dd3260e..af8f575a0 100644 --- a/docker/hub/docker-compose.tika.yml +++ b/docker/hub/docker-compose.tika.yml @@ -24,9 +24,9 @@ services: env_file: docker-compose.env environment: PAPERLESS_REDIS: redis://broker:6379 - PAPERLESS_TIKA: 1 - GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 - TIKA_SERVER_ENDPOINT: http://tika:9998 + PAPERLESS_TIKA_ENABLED: 1 + PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000 + PAPERLESS_TIKA_ENDPOINT: http://tika:9998 gotenberg: image: thecodingmachine/gotenberg diff --git a/docker/local/docker-compose.tika.yml b/docker/local/docker-compose.tika.yml index ab901f306..889713908 100644 --- a/docker/local/docker-compose.tika.yml +++ b/docker/local/docker-compose.tika.yml @@ -24,9 +24,9 @@ services: env_file: docker-compose.env environment: PAPERLESS_REDIS: redis://broker:6379 - PAPERLESS_TIKA: 1 - GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 - TIKA_SERVER_ENDPOINT: http://tika:9998 + PAPERLESS_TIKA_ENABLED: 1 + PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000 + PAPERLESS_TIKA_ENDPOINT: http://tika:9998 gotenberg: image: thecodingmachine/gotenberg diff --git a/docs/configuration.rst b/docs/configuration.rst index f53acb633..49c95bff1 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -291,17 +291,17 @@ configure their endpoints, and enable the feature. If you run paperless on docker, you can add those services to the docker-compose file (see the examples provided). -PAPERLESS_TIKA= +PAPERLESS_TIKA_ENABLED= Enable (or disable) the Tika parser. Defaults to false. -TIKA_SERVER_ENDPOINT= +PAPERLESS_TIKA_ENDPOINT= Set the endpoint URL were Paperless can reach your Tika server. Defaults to "http://localhost:9998". -GOTENBERG_SERVER_ENDPOINT= +PAPERLESS_TIKA_GOTENBERG_ENDPOINT= Set the endpoint URL were Paperless can reach your Gotenberg server. Defaults to "http://localhost:3000". 
diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 219166cf5..caa1b9b18 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -427,5 +427,8 @@ PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") THUMBNAIL_FONT_NAME = os.getenv("PAPERLESS_THUMBNAIL_FONT_NAME", "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf") # Tika settings -PAPERLESS_TIKA = __get_boolean("PAPERLESS_TIKA", "NO") -GOTENBERG_SERVER_ENDPOINT = os.getenv("GOTENBERG_SERVER_ENDPOINT", "http://localhost:3000") +PAPERLESS_TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO") +PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost:9998") +PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( + "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000" +) diff --git a/src/paperless_tika/apps.py b/src/paperless_tika/apps.py index c29586d6a..5cab21427 100644 --- a/src/paperless_tika/apps.py +++ b/src/paperless_tika/apps.py @@ -9,6 +9,6 @@ class PaperlessTikaConfig(AppConfig): def ready(self): from documents.signals import document_consumer_declaration - if settings.PAPERLESS_TIKA: + if settings.PAPERLESS_TIKA_ENABLED: document_consumer_declaration.connect(tika_consumer_declaration) AppConfig.ready(self) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 5a77681f2..81f213a6b 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -70,49 +70,46 @@ class TikaDocumentParser(DocumentParser): def parse(self, document_path, mime_type): self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server") + tika_server = settings.PAPERLESS_TIKA_ENDPOINT try: - parsed = parser.from_file(document_path) + parsed = parser.from_file(document_path, tika_server) except requests.exceptions.HTTPError as err: - raise ParseError(f"Could not parse {document_path} with tika server: {err}") - - try: - content = parsed["content"].strip() - except: - content = "" - - try: - 
creation_date = dateutil.parser.isoparse( - parsed["metadata"]["Creation-Date"] + raise ParseError( + f"Could not parse {document_path} with tika server at {tika_server}: {err}" ) + + try: + self.text = parsed["content"].strip() except: - creation_date = None + pass + + try: + self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"]) + except: + pass archive_path = os.path.join(self.tempdir, "convert.pdf") - convert_to_pdf(self, document_path, archive_path) - + self.convert_to_pdf(document_path, archive_path) self.archive_path = archive_path - self.date = creation_date - self.text = content + def convert_to_pdf(self, document_path, pdf_path): # uses self.tempdir and self.log + pdf_path = os.path.join(self.tempdir, "convert.pdf") + gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT + url = gotenberg_server + "/convert/office" -def convert_to_pdf(self, document_path, pdf_path): - pdf_path = os.path.join(self.tempdir, "convert.pdf") - gotenberg_server = settings.GOTENBERG_SERVER_ENDPOINT - url = gotenberg_server + "/convert/office" + self.log("info", f"[TIKA] Converting {document_path} to PDF as {pdf_path}") + files = {"files": open(document_path, "rb")} + headers = {} - self.log("info", f"[TIKA] Converting {document_path} to PDF as {pdf_path}") - files = {"files": open(document_path, "rb")} - headers = {} + try: + response = requests.post(url, files=files, headers=headers) + response.raise_for_status() # ensure we notice bad responses + except requests.exceptions.HTTPError as err: + raise ParseError( + f"Could not contact gotenberg server at {gotenberg_server}: {err}" + ) - try: - response = requests.post(url, files=files, headers=headers) - response.raise_for_status() # ensure we notice bad responses - except requests.exceptions.HTTPError as err: - raise ParseError( - f"Could not contact gotenberg server at {gotenberg_server}: {err}" - ) - - file = open(pdf_path, "wb") - file.write(response.content) - file.close() + file = open(pdf_path, 
"wb") + file.write(response.content) + 
file.close() diff --git a/src/paperless_tika/test.py b/src/paperless_tika/test.py deleted file mode 100644 index 0a2885226..000000000 --- a/src/paperless_tika/test.py +++ /dev/null @@ -1,3 +0,0 @@ -import magic -m = magic.from_file("/nfsstorage/jo/syncthing/Documenten/20R-309.153.052.pdf") -print(m)