mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Add the new paperless_tika parser
This parser will use an external Tika and Gotenberg server to parse "Office" documents (.doc, .xls, .odt, etc.) Signed-off-by: Jo Vandeginste <Jo.Vandeginste@kuleuven.be>
This commit is contained in:
		
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							| @@ -42,6 +42,7 @@ whoosh="~=2.7.4" | ||||
| inotifyrecursive = "~=0.3.4" | ||||
| ocrmypdf = "*" | ||||
| tqdm = "*" | ||||
| tika = "*" | ||||
|  | ||||
| [dev-packages] | ||||
| coveralls = "*" | ||||
|   | ||||
							
								
								
									
										43
									
								
								docker/hub/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								docker/hub/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | ||||
| version: "3.4" | ||||
| services: | ||||
|   broker: | ||||
|     image: redis:6.0 | ||||
|     restart: always | ||||
|  | ||||
|   webserver: | ||||
|     image: jonaswinkler/paperless-ng:0.9.9 | ||||
|     restart: always | ||||
|     depends_on: | ||||
|       - broker | ||||
|     ports: | ||||
|       - 8000:8000 | ||||
|     healthcheck: | ||||
|       test: ["CMD", "curl", "-f", "http://localhost:8000"] | ||||
|       interval: 30s | ||||
|       timeout: 10s | ||||
|       retries: 5 | ||||
|     volumes: | ||||
|       - data:/usr/src/paperless/data | ||||
|       - media:/usr/src/paperless/media | ||||
|       - ./export:/usr/src/paperless/export | ||||
|       - ./consume:/usr/src/paperless/consume | ||||
|     env_file: docker-compose.env | ||||
|     environment: | ||||
|       PAPERLESS_REDIS: redis://broker:6379 | ||||
|       PAPERLESS_TIKA: 1 | ||||
|       GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 | ||||
|       TIKA_SERVER_ENDPOINT: http://tika:9998 | ||||
|  | ||||
|   gotenberg: | ||||
|     image: thecodingmachine/gotenberg | ||||
|     restart: unless-stopped | ||||
|     environment: | ||||
|       DISABLE_GOOGLE_CHROME: 1 | ||||
|  | ||||
|   tika: | ||||
|     image: apache/tika | ||||
|     restart: unless-stopped | ||||
|  | ||||
| volumes: | ||||
|   data: | ||||
|   media: | ||||
							
								
								
									
										43
									
								
								docker/local/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								docker/local/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | ||||
| version: "3.4" | ||||
| services: | ||||
|   broker: | ||||
|     image: redis:6.0 | ||||
|     restart: always | ||||
|  | ||||
|   webserver: | ||||
|     build: . | ||||
|     restart: always | ||||
|     depends_on: | ||||
|       - broker | ||||
|     ports: | ||||
|       - 8000:8000 | ||||
|     healthcheck: | ||||
|       test: ["CMD", "curl", "-f", "http://localhost:8000"] | ||||
|       interval: 30s | ||||
|       timeout: 10s | ||||
|       retries: 5 | ||||
|     volumes: | ||||
|       - data:/usr/src/paperless/data | ||||
|       - media:/usr/src/paperless/media | ||||
|       - ./export:/usr/src/paperless/export | ||||
|       - ./consume:/usr/src/paperless/consume | ||||
|     env_file: docker-compose.env | ||||
|     environment: | ||||
|       PAPERLESS_REDIS: redis://broker:6379 | ||||
|       PAPERLESS_TIKA: 1 | ||||
|       GOTENBERG_SERVER_ENDPOINT: http://gotenberg:3000 | ||||
|       TIKA_SERVER_ENDPOINT: http://tika:9998 | ||||
|  | ||||
|   gotenberg: | ||||
|     image: thecodingmachine/gotenberg | ||||
|     restart: unless-stopped | ||||
|     environment: | ||||
|       DISABLE_GOOGLE_CHROME: 1 | ||||
|  | ||||
|   tika: | ||||
|     image: apache/tika | ||||
|     restart: unless-stopped | ||||
|  | ||||
| volumes: | ||||
|   data: | ||||
|   media: | ||||
| @@ -277,6 +277,35 @@ PAPERLESS_OCR_USER_ARG=<json> | ||||
|  | ||||
|         {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}     | ||||
|      | ||||
| .. _configuration-tika: | ||||
|  | ||||
| Tika settings | ||||
| ############# | ||||
|  | ||||
| Paperless can make use of `Tika <https://tika.apache.org/>`_ and  | ||||
| `Gotenberg <https://thecodingmachine.github.io/gotenberg/>`_ for parsing and | ||||
| converting "Office" documents (such as ".doc", ".xlsx" and ".odt"). If you | ||||
| wish to use this, you must provide a Tika server and a Gotenberg server, | ||||
| configure their endpoints, and enable the feature. | ||||
|  | ||||
| If you run paperless on docker, you can add those services to the docker-compose | ||||
| file (see the examples provided). | ||||
|  | ||||
| PAPERLESS_TIKA=<bool> | ||||
|     Enable (or disable) the Tika parser. | ||||
|  | ||||
|     Defaults to false. | ||||
|  | ||||
| TIKA_SERVER_ENDPOINT=<url> | ||||
|     Set the endpoint URL where Paperless can reach your Tika server. | ||||
|  | ||||
|     Defaults to "http://localhost:9998". | ||||
|  | ||||
| GOTENBERG_SERVER_ENDPOINT=<url> | ||||
|     Set the endpoint URL where Paperless can reach your Gotenberg server. | ||||
|  | ||||
|     Defaults to "http://localhost:3000". | ||||
|  | ||||
|      | ||||
| Software tweaks | ||||
| ############### | ||||
|   | ||||
| @@ -87,6 +87,7 @@ INSTALLED_APPS = [ | ||||
|     "documents.apps.DocumentsConfig", | ||||
|     "paperless_tesseract.apps.PaperlessTesseractConfig", | ||||
|     "paperless_text.apps.PaperlessTextConfig", | ||||
|     "paperless_tika.apps.PaperlessTikaConfig", | ||||
|     "paperless_mail.apps.PaperlessMailConfig", | ||||
|  | ||||
|     "django.contrib.admin", | ||||
| @@ -424,3 +425,7 @@ for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")): | ||||
| PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") | ||||
|  | ||||
| THUMBNAIL_FONT_NAME = os.getenv("PAPERLESS_THUMBNAIL_FONT_NAME", "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf") | ||||
|  | ||||
| # Tika settings | ||||
| PAPERLESS_TIKA = __get_boolean("PAPERLESS_TIKA", "NO") | ||||
| GOTENBERG_SERVER_ENDPOINT = os.getenv("GOTENBERG_SERVER_ENDPOINT", "http://localhost:3000") | ||||
|   | ||||
							
								
								
									
										14
									
								
								src/paperless_tika/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								src/paperless_tika/apps.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | ||||
| from django.apps import AppConfig | ||||
| from django.conf import settings | ||||
| from paperless_tika.signals import tika_consumer_declaration | ||||
|  | ||||
|  | ||||
class PaperlessTikaConfig(AppConfig):
    """
    Django app configuration for the ``paperless_tika`` app.

    When ``settings.PAPERLESS_TIKA`` is truthy, the Tika parser declaration is
    connected to the ``document_consumer_declaration`` signal.
    """

    name = "paperless_tika"

    def ready(self):
        """Connect the Tika parser declaration if the feature is enabled."""
        # Kept as a function-level import, exactly like the original.
        from documents.signals import (
            document_consumer_declaration as declaration_signal,
        )

        tika_enabled = settings.PAPERLESS_TIKA
        if tika_enabled:
            declaration_signal.connect(tika_consumer_declaration)

        super().ready()
							
								
								
									
										118
									
								
								src/paperless_tika/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										118
									
								
								src/paperless_tika/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,118 @@ | ||||
| import os | ||||
| import subprocess | ||||
| import tika | ||||
| import requests | ||||
| import dateutil.parser | ||||
|  | ||||
| from PIL import ImageDraw, ImageFont, Image | ||||
| from django.conf import settings | ||||
|  | ||||
| from documents.parsers import DocumentParser, ParseError, run_convert | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
| from tika import parser | ||||
|  | ||||
|  | ||||
class TikaDocumentParser(DocumentParser):
    """
    Parse "Office" documents with the help of a Tika and a Gotenberg server.

    Text and creation date are extracted through the ``tika`` client library,
    the PDF archive version is produced by ``convert_to_pdf`` and the
    thumbnail is rendered from that PDF.
    """

    def get_thumbnail(self, document_path, mime_type):
        """
        Render a PNG thumbnail from the archived PDF.

        The image is rendered from ``self.archive_path`` (set by ``parse``),
        not from ``document_path``.

        :param document_path: path of the original document (only logged)
        :param mime_type: MIME type of the original document (unused)
        :returns: path of the generated PNG inside ``self.tempdir``
        :raises ParseError: if the Ghostscript fallback exits with a non-zero
            status, or if converting the Ghostscript output fails
        """
        self.log(
            "info", f"[TIKA_THUMB] Generating thumbnail for {document_path}"
        )
        archive_path = self.archive_path
        out_path = os.path.join(self.tempdir, "convert.png")

        # Options shared by the direct conversion and the fallback conversion.
        convert_options = {
            "density": 300,
            "scale": "500x5000>",
            "alpha": "remove",
            "strip": True,
            "trim": False,
            "output_file": out_path,
            "logging_group": self.logging_group,
        }

        # Run convert to get a decent thumbnail
        try:
            # The "[0]" suffix asks ImageMagick for the first page only.
            run_convert(input_file=f"{archive_path}[0]", **convert_options)
        except ParseError:
            # If convert fails, rasterise the PDF with Ghostscript instead and
            # feed the resulting PNG to convert.
            self.log(
                "warning",
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            )
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [
                settings.GS_BINARY,
                "-q",
                "-sDEVICE=pngalpha",
                "-o",
                gs_out_path,
                archive_path,
            ]
            if subprocess.Popen(cmd).wait() != 0:
                raise ParseError(f"Thumbnail (gs) failed at {cmd}")
            # then run convert on the output from gs
            run_convert(input_file=gs_out_path, **convert_options)

        return out_path

    def parse(self, document_path, mime_type):
        """
        Extract text and creation date via Tika and build the PDF archive.

        On success ``self.text``, ``self.date`` and ``self.archive_path`` are
        set. Missing or unusable content yields an empty text; a missing or
        unparsable creation date yields ``None``.

        :param document_path: path of the document to parse
        :param mime_type: MIME type of the document (unused)
        :raises ParseError: if the request to the Tika server fails or the
            PDF conversion fails
        """
        self.log(
            "info", f"[TIKA_PARSE] Sending {document_path} to Tika server"
        )

        try:
            parsed = parser.from_file(document_path)
        except requests.exceptions.RequestException as err:
            # RequestException also covers connection errors and timeouts,
            # not just HTTP error statuses.
            raise ParseError(
                f"Could not parse {document_path} with tika server: {err}"
            ) from err

        try:
            content = parsed["content"].strip()
        except (AttributeError, KeyError, TypeError):
            # No usable content in the response (missing key or not a string).
            content = ""

        try:
            creation_date = dateutil.parser.isoparse(
                parsed["metadata"]["Creation-Date"]
            )
        except (AttributeError, KeyError, TypeError, ValueError):
            # Metadata absent, or the value is not a single ISO-8601 string.
            creation_date = None

        archive_path = os.path.join(self.tempdir, "convert.pdf")
        convert_to_pdf(self, document_path, archive_path)

        self.archive_path = archive_path
        self.date = creation_date
        self.text = content
|  | ||||
|  | ||||
def convert_to_pdf(self, document_path, pdf_path):
    """
    Convert an office document to PDF through the Gotenberg server.

    The document is posted to the ``/convert/office`` route of
    ``settings.GOTENBERG_SERVER_ENDPOINT`` and the response body is written
    to ``pdf_path``.

    :param self: the calling parser; only its ``log`` method is used
    :param document_path: path of the document to convert
    :param pdf_path: path the resulting PDF is written to
    :raises ParseError: if the server cannot be reached or answers with an
        HTTP error status
    """
    gotenberg_server = settings.GOTENBERG_SERVER_ENDPOINT
    url = gotenberg_server + "/convert/office"

    self.log("info", f"[TIKA] Converting {document_path} to PDF as {pdf_path}")

    try:
        # Keep the upload handle open only for the duration of the request.
        with open(document_path, "rb") as document:
            response = requests.post(url, files={"files": document})
        response.raise_for_status()  # ensure we notice bad responses
    except requests.exceptions.RequestException as err:
        # RequestException covers connection failures and timeouts as well as
        # the HTTPError raised by raise_for_status().
        raise ParseError(
            f"Could not contact gotenberg server at {gotenberg_server}: {err}"
        ) from err

    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(response.content)
							
								
								
									
										20
									
								
								src/paperless_tika/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								src/paperless_tika/signals.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | ||||
| from .parsers import TikaDocumentParser | ||||
|  | ||||
|  | ||||
def tika_consumer_declaration(sender, **kwargs):
    """
    Describe the Tika parser to whoever sends the declaration signal.

    :param sender: the signal sender (unused)
    :param kwargs: additional signal arguments (unused)
    :returns: a dict with the parser class, its weight and a mapping of the
        supported MIME types to their default file extension
    """
    supported_mime_types = {
        # Microsoft Office formats
        "application/msword": ".doc",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "application/vnd.ms-excel": ".xls",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
        "application/vnd.ms-powerpoint": ".ppt",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
        # OpenDocument formats
        "application/vnd.oasis.opendocument.presentation": ".odp",
        "application/vnd.oasis.opendocument.spreadsheet": ".ods",
        "application/vnd.oasis.opendocument.text": ".odt",
    }

    declaration = dict(
        parser=TikaDocumentParser,
        weight=10,
        mime_types=supported_mime_types,
    )
    return declaration
							
								
								
									
										3
									
								
								src/paperless_tika/test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								src/paperless_tika/test.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
"""Ad-hoc debugging helper: print what ``magic.from_file`` reports for a file.

Nothing runs at import time, so test discovery can import this module without
touching the file system.
"""
import sys

import magic

# Path inspected by the original version of this script; kept as the default
# so running the script without arguments behaves as before.
DEFAULT_DOCUMENT = "/nfsstorage/jo/syncthing/Documenten/20R-309.153.052.pdf"


def main(argv=None):
    """
    Print the ``magic.from_file`` result for one file.

    :param argv: argument list; defaults to ``sys.argv[1:]``. The first
        argument, if present, is the path to inspect.
    """
    args = sys.argv[1:] if argv is None else argv
    document = args[0] if args else DEFAULT_DOCUMENT
    print(magic.from_file(document))


if __name__ == "__main__":
    main()
		Reference in New Issue
	
	Block a user
	 Jo Vandeginste
					Jo Vandeginste