diff --git a/docker/hub/docker-compose.tika.yml b/docker/hub/docker-compose.tika.yml index 04dd3260e..af8f575a0 100644 --- a/docker/hub/docker-compose.tika.yml +++ b/docker/hub/docker-compose.tika.yml @@ -24,9 +24,9 @@ services: env_file: docker-compose.env environment: PAPERLESS_REDIS: redis://broker:6379 - PAPERLESS_TIKA: 1 - GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 - TIKA_SERVER_ENDPOINT: http://tika:9998 + PAPERLESS_TIKA_ENABLED: 1 + PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000 + PAPERLESS_TIKA_ENDPOINT: http://tika:9998 gotenberg: image: thecodingmachine/gotenberg diff --git a/docker/local/docker-compose.tika.yml b/docker/local/docker-compose.tika.yml index ab901f306..889713908 100644 --- a/docker/local/docker-compose.tika.yml +++ b/docker/local/docker-compose.tika.yml @@ -24,9 +24,9 @@ services: env_file: docker-compose.env environment: PAPERLESS_REDIS: redis://broker:6379 - PAPERLESS_TIKA: 1 - GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 - TIKA_SERVER_ENDPOINT: http://tika:9998 + PAPERLESS_TIKA_ENABLED: 1 + PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000 + PAPERLESS_TIKA_ENDPOINT: http://tika:9998 gotenberg: image: thecodingmachine/gotenberg diff --git a/docs/configuration.rst b/docs/configuration.rst index f53acb633..49c95bff1 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -291,17 +291,17 @@ configure their endpoints, and enable the feature. If you run paperless on docker, you can add those services to the docker-compose file (see the examples provided). -PAPERLESS_TIKA= +PAPERLESS_TIKA_ENABLED= Enable (or disable) the Tika parser. Defaults to false. -TIKA_SERVER_ENDPOINT= +PAPERLESS_TIKA_ENDPOINT= Set the endpoint URL were Paperless can reach your Tika server. Defaults to "http://localhost:9998". -GOTENBERG_SERVER_ENDPOINT= +PAPERLESS_TIKA_GOTENBERG_ENDPOINT= Set the endpoint URL were Paperless can reach your Gotenberg server. Defaults to "http://localhost:3000". 
diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 219166cf5..caa1b9b18 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -427,5 +427,8 @@ PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") THUMBNAIL_FONT_NAME = os.getenv("PAPERLESS_THUMBNAIL_FONT_NAME", "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf") # Tika settings -PAPERLESS_TIKA = __get_boolean("PAPERLESS_TIKA", "NO") -GOTENBERG_SERVER_ENDPOINT = os.getenv("GOTENBERG_SERVER_ENDPOINT", "http://localhost:3000") +PAPERLESS_TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO") +PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost:9998") +PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( + "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000" +) diff --git a/src/paperless_tika/apps.py b/src/paperless_tika/apps.py index c29586d6a..5cab21427 100644 --- a/src/paperless_tika/apps.py +++ b/src/paperless_tika/apps.py @@ -9,6 +9,6 @@ class PaperlessTikaConfig(AppConfig): def ready(self): from documents.signals import document_consumer_declaration - if settings.PAPERLESS_TIKA: + if settings.PAPERLESS_TIKA_ENABLED: document_consumer_declaration.connect(tika_consumer_declaration) AppConfig.ready(self) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 5a77681f2..81f213a6b 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -70,49 +70,46 @@ class TikaDocumentParser(DocumentParser): def parse(self, document_path, mime_type): self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server") + tika_server = settings.PAPERLESS_TIKA_ENDPOINT try: - parsed = parser.from_file(document_path) + parsed = parser.from_file(document_path, tika_server) except requests.exceptions.HTTPError as err: - raise ParseError(f"Could not parse {document_path} with tika server: {err}") - - try: - content = parsed["content"].strip() - except: - content = "" - - try: - 
creation_date = dateutil.parser.isoparse( - parsed["metadata"]["Creation-Date"] + raise ParseError( + f"Could not parse {document_path} with tika server at {tika_server}: {err}" ) + + try: + self.text = parsed["content"].strip() except: - creation_date = None + pass + + try: + self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"]) + except: + pass archive_path = os.path.join(self.tempdir, "convert.pdf") - convert_to_pdf(self, document_path, archive_path) - + self.convert_to_pdf(document_path, archive_path) self.archive_path = archive_path - self.date = creation_date - self.text = content + def convert_to_pdf(self, document_path, pdf_path): + pdf_path = os.path.join(self.tempdir, "convert.pdf") + gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT + url = gotenberg_server + "/convert/office" -def convert_to_pdf(self, document_path, pdf_path): - pdf_path = os.path.join(self.tempdir, "convert.pdf") - gotenberg_server = settings.GOTENBERG_SERVER_ENDPOINT - url = gotenberg_server + "/convert/office" + self.log("info", f"[TIKA] Converting {document_path} to PDF as {pdf_path}") + files = {"files": open(document_path, "rb")} + headers = {} - self.log("info", f"[TIKA] Converting {document_path} to PDF as {pdf_path}") - files = {"files": open(document_path, "rb")} - headers = {} + try: + response = requests.post(url, files=files, headers=headers) + response.raise_for_status() # ensure we notice bad responses + except requests.exceptions.HTTPError as err: + raise ParseError( + f"Could not contact gotenberg server at {gotenberg_server}: {err}" + ) - try: - response = requests.post(url, files=files, headers=headers) - response.raise_for_status() # ensure we notice bad responses - except requests.exceptions.HTTPError as err: - raise ParseError( - f"Could not contact gotenberg server at {gotenberg_server}: {err}" - ) - - file = open(pdf_path, "wb") - file.write(response.content) - file.close() + file = open(pdf_path, "wb") + file.write(response.content) + 
file.close() diff --git a/src/paperless_tika/test.py b/src/paperless_tika/test.py deleted file mode 100644 index 0a2885226..000000000 --- a/src/paperless_tika/test.py +++ /dev/null @@ -1,3 +0,0 @@ -import magic -m = magic.from_file("/nfsstorage/jo/syncthing/Documenten/20R-309.153.052.pdf") -print(m)