mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Add the new paperless_tika parser
This parser will use an external Tika and Gotenberg server to parse "Office" documents (.doc, .xls, .odt, etc.) Signed-off-by: Jo Vandeginste <Jo.Vandeginste@kuleuven.be>
This commit is contained in:
		
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							| @@ -42,6 +42,7 @@ whoosh="~=2.7.4" | |||||||
| inotifyrecursive = "~=0.3.4" | inotifyrecursive = "~=0.3.4" | ||||||
| ocrmypdf = "*" | ocrmypdf = "*" | ||||||
| tqdm = "*" | tqdm = "*" | ||||||
|  | tika = "*" | ||||||
|  |  | ||||||
| [dev-packages] | [dev-packages] | ||||||
| coveralls = "*" | coveralls = "*" | ||||||
|   | |||||||
							
								
								
									
										43
									
								
								docker/hub/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								docker/hub/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | |||||||
|  | version: "3.4" | ||||||
|  | services: | ||||||
|  |   broker: | ||||||
|  |     image: redis:6.0 | ||||||
|  |     restart: always | ||||||
|  |  | ||||||
|  |   webserver: | ||||||
|  |     image: jonaswinkler/paperless-ng:0.9.9 | ||||||
|  |     restart: always | ||||||
|  |     depends_on: | ||||||
|  |       - broker | ||||||
|  |     ports: | ||||||
|  |       - 8000:8000 | ||||||
|  |     healthcheck: | ||||||
|  |       test: ["CMD", "curl", "-f", "http://localhost:8000"] | ||||||
|  |       interval: 30s | ||||||
|  |       timeout: 10s | ||||||
|  |       retries: 5 | ||||||
|  |     volumes: | ||||||
|  |       - data:/usr/src/paperless/data | ||||||
|  |       - media:/usr/src/paperless/media | ||||||
|  |       - ./export:/usr/src/paperless/export | ||||||
|  |       - ./consume:/usr/src/paperless/consume | ||||||
|  |     env_file: docker-compose.env | ||||||
|  |     environment: | ||||||
|  |       PAPERLESS_REDIS: redis://broker:6379 | ||||||
|  |       PAPERLESS_TIKA: 1 | ||||||
|  |       GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 | ||||||
|  |       TIKA_SERVER_ENDPOINT: http://tika:9998 | ||||||
|  |  | ||||||
|  |   gotenberg: | ||||||
|  |     image: thecodingmachine/gotenberg | ||||||
|  |     restart: unless-stopped | ||||||
|  |     environment: | ||||||
|  |       DISABLE_GOOGLE_CHROME: 1 | ||||||
|  |  | ||||||
|  |   tika: | ||||||
|  |     image: apache/tika | ||||||
|  |     restart: unless-stopped | ||||||
|  |  | ||||||
|  | volumes: | ||||||
|  |   data: | ||||||
|  |   media: | ||||||
							
								
								
									
										43
									
								
								docker/local/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								docker/local/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | |||||||
|  | version: "3.4" | ||||||
|  | services: | ||||||
|  |   broker: | ||||||
|  |     image: redis:6.0 | ||||||
|  |     restart: always | ||||||
|  |  | ||||||
|  |   webserver: | ||||||
|  |     build: . | ||||||
|  |     restart: always | ||||||
|  |     depends_on: | ||||||
|  |       - broker | ||||||
|  |     ports: | ||||||
|  |       - 8000:8000 | ||||||
|  |     healthcheck: | ||||||
|  |       test: ["CMD", "curl", "-f", "http://localhost:8000"] | ||||||
|  |       interval: 30s | ||||||
|  |       timeout: 10s | ||||||
|  |       retries: 5 | ||||||
|  |     volumes: | ||||||
|  |       - data:/usr/src/paperless/data | ||||||
|  |       - media:/usr/src/paperless/media | ||||||
|  |       - ./export:/usr/src/paperless/export | ||||||
|  |       - ./consume:/usr/src/paperless/consume | ||||||
|  |     env_file: docker-compose.env | ||||||
|  |     environment: | ||||||
|  |       PAPERLESS_REDIS: redis://broker:6379 | ||||||
|  |       PAPERLESS_TIKA: 1 | ||||||
|  |       GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 | ||||||
|  |       TIKA_SERVER_ENDPOINT: http://tika:9998 | ||||||
|  |  | ||||||
|  |   gotenberg: | ||||||
|  |     image: thecodingmachine/gotenberg | ||||||
|  |     restart: unless-stopped | ||||||
|  |     environment: | ||||||
|  |       DISABLE_GOOGLE_CHROME: 1 | ||||||
|  |  | ||||||
|  |   tika: | ||||||
|  |     image: apache/tika | ||||||
|  |     restart: unless-stopped | ||||||
|  |  | ||||||
|  | volumes: | ||||||
|  |   data: | ||||||
|  |   media: | ||||||
| @@ -277,6 +277,35 @@ PAPERLESS_OCR_USER_ARG=<json> | |||||||
|  |  | ||||||
|         {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}     |         {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}     | ||||||
|      |      | ||||||
|  | .. _configuration-tika: | ||||||
|  |  | ||||||
|  | Tika settings | ||||||
|  | ############# | ||||||
|  |  | ||||||
|  | Paperless can make use of `Tika <https://tika.apache.org/>`_ and  | ||||||
|  | `Gotenberg <https://thecodingmachine.github.io/gotenberg/>`_ for parsing and | ||||||
|  | converting "Office" documents (such as ".doc", ".xlsx" and ".odt"). If you | ||||||
|  | wish to use this, you must provide a Tika server and a Gotenberg server, | ||||||
|  | configure their endpoints, and enable the feature. | ||||||
|  |  | ||||||
|  | If you run paperless on docker, you can add those services to the docker-compose | ||||||
|  | file (see the examples provided). | ||||||
|  |  | ||||||
|  | PAPERLESS_TIKA=<bool> | ||||||
|  |     Enable (or disable) the Tika parser. | ||||||
|  |  | ||||||
|  |     Defaults to false. | ||||||
|  |  | ||||||
|  | TIKA_SERVER_ENDPOINT=<url> | ||||||
|  |     Set the endpoint URL where Paperless can reach your Tika server. | ||||||
|  |  | ||||||
|  |     Defaults to "http://localhost:9998". | ||||||
|  |  | ||||||
|  | GOTENBERG_SERVER_ENDPOINT=<url> | ||||||
|  |     Set the endpoint URL where Paperless can reach your Gotenberg server. | ||||||
|  |  | ||||||
|  |     Defaults to "http://localhost:3000". | ||||||
|  |  | ||||||
|      |      | ||||||
| Software tweaks | Software tweaks | ||||||
| ############### | ############### | ||||||
|   | |||||||
| @@ -87,6 +87,7 @@ INSTALLED_APPS = [ | |||||||
|     "documents.apps.DocumentsConfig", |     "documents.apps.DocumentsConfig", | ||||||
|     "paperless_tesseract.apps.PaperlessTesseractConfig", |     "paperless_tesseract.apps.PaperlessTesseractConfig", | ||||||
|     "paperless_text.apps.PaperlessTextConfig", |     "paperless_text.apps.PaperlessTextConfig", | ||||||
|  |     "paperless_tika.apps.PaperlessTikaConfig", | ||||||
|     "paperless_mail.apps.PaperlessMailConfig", |     "paperless_mail.apps.PaperlessMailConfig", | ||||||
|  |  | ||||||
|     "django.contrib.admin", |     "django.contrib.admin", | ||||||
| @@ -424,3 +425,7 @@ for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")): | |||||||
| PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") | PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") | ||||||
|  |  | ||||||
| THUMBNAIL_FONT_NAME = os.getenv("PAPERLESS_THUMBNAIL_FONT_NAME", "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf") | THUMBNAIL_FONT_NAME = os.getenv("PAPERLESS_THUMBNAIL_FONT_NAME", "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf") | ||||||
|  |  | ||||||
|  | # Tika settings | ||||||
|  | PAPERLESS_TIKA = __get_boolean("PAPERLESS_TIKA", "NO") | ||||||
|  | GOTENBERG_SERVER_ENDPOINT = os.getenv("GOTENBERG_SERVER_ENDPOINT", "http://localhost:3000") | ||||||
|   | |||||||
							
								
								
									
										14
									
								
								src/paperless_tika/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								src/paperless_tika/apps.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | |||||||
|  | from django.apps import AppConfig | ||||||
|  | from django.conf import settings | ||||||
|  | from paperless_tika.signals import tika_consumer_declaration | ||||||
|  |  | ||||||
|  |  | ||||||
class PaperlessTikaConfig(AppConfig):
    """Django application configuration for the ``paperless_tika`` app."""

    name = "paperless_tika"

    def ready(self):
        """
        Connect the Tika parser declaration to the consumer signal.

        The receiver is only connected when ``settings.PAPERLESS_TIKA`` is
        truthy; the base class hook is called in either case.
        """
        # Imported here rather than at module level, as in the original code.
        from documents.signals import document_consumer_declaration

        tika_enabled = settings.PAPERLESS_TIKA
        if tika_enabled:
            document_consumer_declaration.connect(tika_consumer_declaration)

        super().ready()
							
								
								
									
										118
									
								
								src/paperless_tika/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										118
									
								
								src/paperless_tika/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,118 @@ | |||||||
|  | import os | ||||||
|  | import subprocess | ||||||
|  | import tika | ||||||
|  | import requests | ||||||
|  | import dateutil.parser | ||||||
|  |  | ||||||
|  | from PIL import ImageDraw, ImageFont, Image | ||||||
|  | from django.conf import settings | ||||||
|  |  | ||||||
|  | from documents.parsers import DocumentParser, ParseError, run_convert | ||||||
|  | from paperless_tesseract.parsers import RasterisedDocumentParser | ||||||
|  | from tika import parser | ||||||
|  |  | ||||||
|  |  | ||||||
class TikaDocumentParser(DocumentParser):
    """
    Parser for "Office" documents that relies on external servers.

    Text and metadata are obtained through the ``tika`` client library; the
    PDF archive version is produced by ``convert_to_pdf``.
    """

    def _render_thumbnail(self, input_file, output_file):
        """Call ``run_convert`` with the settings shared by every thumbnail path."""
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            input_file=input_file,
            output_file=output_file,
            logging_group=self.logging_group,
        )

    def get_thumbnail(self, document_path, mime_type):
        """
        Create a PNG thumbnail from the first page of the PDF archive version.

        The source is ``self.archive_path`` (assigned by ``parse``), not
        ``document_path``; ``mime_type`` is not used.

        :param document_path: path of the original document (only logged).
        :param mime_type: unused.
        :return: path of the generated PNG inside ``self.tempdir``.
        :raises ParseError: if the Ghostscript fallback exits with a non-zero
            status, or if the follow-up ``run_convert`` call raises it.
        """
        self.log("info", f"[TIKA_THUMB] Generating thumbnail for {document_path}")
        archive_path = self.archive_path

        out_path = os.path.join(self.tempdir, "convert.png")

        # Run convert to get a decent thumbnail
        try:
            # "[0]" selects the first page only.
            self._render_thumbnail("{}[0]".format(archive_path), out_path)
        except ParseError:
            # if convert fails, fall back to extracting
            # the first PDF page as a PNG using Ghostscript
            self.log(
                "warning",
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            )
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [
                settings.GS_BINARY,
                "-q",
                "-sDEVICE=pngalpha",
                "-o",
                gs_out_path,
                archive_path,
            ]
            if subprocess.run(cmd).returncode != 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
            self._render_thumbnail(gs_out_path, out_path)

        return out_path

    def parse(self, document_path, mime_type):
        """
        Extract text and creation date, and build the PDF archive version.

        On success this sets ``self.text``, ``self.date`` and
        ``self.archive_path``. Missing or malformed content and creation
        date are tolerated and fall back to ``""`` and ``None``.

        :param document_path: path of the document to parse.
        :param mime_type: unused.
        :raises ParseError: if the Tika request fails, or if
            ``convert_to_pdf`` raises it.
        """
        self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server")

        try:
            parsed = parser.from_file(document_path)
        except requests.exceptions.RequestException as err:
            # RequestException is the base class of HTTPError, so connection
            # failures and timeouts are reported as ParseError as well.
            raise ParseError(
                f"Could not parse {document_path} with tika server: {err}"
            ) from err

        # Best effort: the key may be missing or its value may be None.
        try:
            content = parsed["content"].strip()
        except (KeyError, TypeError, AttributeError):
            content = ""

        # Best effort: the metadata may be missing or not an ISO-8601 string.
        try:
            creation_date = dateutil.parser.isoparse(
                parsed["metadata"]["Creation-Date"]
            )
        except (KeyError, TypeError, ValueError):
            creation_date = None

        archive_path = os.path.join(self.tempdir, "convert.pdf")
        convert_to_pdf(self, document_path, archive_path)

        self.archive_path = archive_path
        self.date = creation_date
        self.text = content
|  |  | ||||||
def convert_to_pdf(self, document_path, pdf_path):
    """
    Convert an office document to PDF through the Gotenberg server.

    The document is posted to the ``/convert/office`` route of
    ``settings.GOTENBERG_SERVER_ENDPOINT`` and the response body is written
    to ``pdf_path``.

    :param self: parser instance; only its ``log`` method is used here.
    :param document_path: path of the document to upload.
    :param pdf_path: path the resulting PDF is written to.
    :raises ParseError: if the request fails or the server answers with an
        HTTP error status.
    """
    gotenberg_server = settings.GOTENBERG_SERVER_ENDPOINT
    url = gotenberg_server + "/convert/office"

    self.log("info", f"[TIKA] Converting {document_path} to PDF as {pdf_path}")

    try:
        # The context manager closes the upload handle even when the request
        # raises.
        with open(document_path, "rb") as document:
            response = requests.post(url, files={"files": document})
        response.raise_for_status()  # ensure we notice bad responses
    except requests.exceptions.RequestException as err:
        # RequestException also covers connection errors and timeouts, which
        # is what "could not contact" is about.
        raise ParseError(
            f"Could not contact gotenberg server at {gotenberg_server}: {err}"
        ) from err

    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(response.content)
							
								
								
									
										20
									
								
								src/paperless_tika/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								src/paperless_tika/signals.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | |||||||
|  | from .parsers import TikaDocumentParser | ||||||
|  |  | ||||||
|  |  | ||||||
def tika_consumer_declaration(sender, **kwargs):
    """
    Describe the Tika parser for the document consumer.

    :param sender: unused.
    :param kwargs: unused.
    :return: dict with the parser class, its weight, and a mapping from each
        supported mime type to a file extension.
    """
    supported_mime_types = {
        # Microsoft Office
        "application/msword": ".doc",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "application/vnd.ms-excel": ".xls",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
        "application/vnd.ms-powerpoint": ".ppt",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
        # OpenDocument
        "application/vnd.oasis.opendocument.presentation": ".odp",
        "application/vnd.oasis.opendocument.spreadsheet": ".ods",
        "application/vnd.oasis.opendocument.text": ".odt",
    }

    declaration = {
        "parser": TikaDocumentParser,
        "weight": 10,
        "mime_types": supported_mime_types,
    }
    return declaration
							
								
								
									
										3
									
								
								src/paperless_tika/test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								src/paperless_tika/test.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
import sys

import magic


def main(path="/nfsstorage/jo/syncthing/Documenten/20R-309.153.052.pdf"):
    """
    Print what ``magic.from_file`` reports for ``path``.

    :param path: file to inspect; defaults to the path the original scratch
        script had hard-coded.
    """
    print(magic.from_file(path))


# Only run when executed as a script, so that importing this module (for
# example by a tool that collects ``test*.py`` files) has no side effects and
# does not depend on a machine-specific path.
if __name__ == "__main__":
    # Optional first command-line argument overrides the default path.
    main(*sys.argv[1:2])
		Reference in New Issue
	
	Block a user
	 Jo Vandeginste
					Jo Vandeginste