mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Merge pull request #204 from jovandeginste/paperless_tika
WIP: Add the new paperless_tika parser
This commit is contained in:
		
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							@@ -42,6 +42,7 @@ whoosh="~=2.7.4"
 | 
			
		||||
inotifyrecursive = "~=0.3.4"
 | 
			
		||||
ocrmypdf = "*"
 | 
			
		||||
tqdm = "*"
 | 
			
		||||
tika = "*"
 | 
			
		||||
 | 
			
		||||
[dev-packages]
 | 
			
		||||
coveralls = "*"
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										43
									
								
								docker/hub/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								docker/hub/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,43 @@
 | 
			
		||||
# Compose setup that runs paperless from the published image together with
# the Gotenberg and Tika services used for parsing "Office" documents.
version: "3.4"
services:
  broker:
    image: redis:6.0
    restart: always

  webserver:
    image: jonaswinkler/paperless-ng:0.9.9
    restart: always
    depends_on:
      - broker
      # The webserver is configured below to talk to these two services, so
      # start them first. Note that depends_on only orders container start-up;
      # it does not wait for the services to be ready.
      - gotenberg
      - tika
    ports:
      - 8000:8000
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000"]
      interval: 30s
      timeout: 10s
      retries: 5
    volumes:
      - data:/usr/src/paperless/data
      - media:/usr/src/paperless/media
      - ./export:/usr/src/paperless/export
      - ./consume:/usr/src/paperless/consume
    env_file: docker-compose.env
    environment:
      PAPERLESS_REDIS: redis://broker:6379
      # Enable the Tika parser and point it at the services defined below.
      PAPERLESS_TIKA_ENABLED: 1
      PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000
      PAPERLESS_TIKA_ENDPOINT: http://tika:9998

  gotenberg:
    image: thecodingmachine/gotenberg
    restart: unless-stopped
    environment:
      DISABLE_GOOGLE_CHROME: 1

  tika:
    image: apache/tika
    restart: unless-stopped

volumes:
  data:
  media:
 | 
			
		||||
							
								
								
									
										43
									
								
								docker/local/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								docker/local/docker-compose.tika.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,43 @@
 | 
			
		||||
# Compose setup that builds paperless from the local checkout and runs it
# together with the Gotenberg and Tika services used for parsing "Office"
# documents.
version: "3.4"
services:
  broker:
    image: redis:6.0
    restart: always

  webserver:
    build: .
    restart: always
    depends_on:
      - broker
      # The webserver is configured below to talk to these two services, so
      # start them first. Note that depends_on only orders container start-up;
      # it does not wait for the services to be ready.
      - gotenberg
      - tika
    ports:
      - 8000:8000
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000"]
      interval: 30s
      timeout: 10s
      retries: 5
    volumes:
      - data:/usr/src/paperless/data
      - media:/usr/src/paperless/media
      - ./export:/usr/src/paperless/export
      - ./consume:/usr/src/paperless/consume
    env_file: docker-compose.env
    environment:
      PAPERLESS_REDIS: redis://broker:6379
      # Enable the Tika parser and point it at the services defined below.
      PAPERLESS_TIKA_ENABLED: 1
      PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000
      PAPERLESS_TIKA_ENDPOINT: http://tika:9998

  gotenberg:
    image: thecodingmachine/gotenberg
    restart: unless-stopped
    environment:
      DISABLE_GOOGLE_CHROME: 1

  tika:
    image: apache/tika
    restart: unless-stopped

volumes:
  data:
  media:
 | 
			
		||||
@@ -277,6 +277,35 @@ PAPERLESS_OCR_USER_ARG=<json>
 | 
			
		||||
 | 
			
		||||
        {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}    
 | 
			
		||||
    
 | 
			
		||||
.. _configuration-tika:
 | 
			
		||||
 | 
			
		||||
Tika settings
 | 
			
		||||
#############
 | 
			
		||||
 | 
			
		||||
Paperless can make use of `Tika <https://tika.apache.org/>`_ and 
 | 
			
		||||
`Gotenberg <https://thecodingmachine.github.io/gotenberg/>`_ for parsing and
 | 
			
		||||
converting "Office" documents (such as ".doc", ".xlsx" and ".odt"). If you
 | 
			
		||||
wish to use this, you must provide a Tika server and a Gotenberg server,
 | 
			
		||||
configure their endpoints, and enable the feature.
 | 
			
		||||
 | 
			
		||||
If you run paperless on docker, you can add those services to the docker-compose
 | 
			
		||||
file (see the examples provided).
 | 
			
		||||
 | 
			
		||||
PAPERLESS_TIKA_ENABLED=<bool>
 | 
			
		||||
    Enable (or disable) the Tika parser.
 | 
			
		||||
 | 
			
		||||
    Defaults to false.
 | 
			
		||||
 | 
			
		||||
PAPERLESS_TIKA_ENDPOINT=<url>
 | 
			
		||||
    Set the endpoint URL where Paperless can reach your Tika server.
 | 
			
		||||
 | 
			
		||||
    Defaults to "http://localhost:9998".
 | 
			
		||||
 | 
			
		||||
PAPERLESS_TIKA_GOTENBERG_ENDPOINT=<url>
 | 
			
		||||
    Set the endpoint URL where Paperless can reach your Gotenberg server.
 | 
			
		||||
 | 
			
		||||
    Defaults to "http://localhost:3000".
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
Software tweaks
 | 
			
		||||
###############
 | 
			
		||||
 
 | 
			
		||||
@@ -87,6 +87,7 @@ INSTALLED_APPS = [
 | 
			
		||||
    "documents.apps.DocumentsConfig",
 | 
			
		||||
    "paperless_tesseract.apps.PaperlessTesseractConfig",
 | 
			
		||||
    "paperless_text.apps.PaperlessTextConfig",
 | 
			
		||||
    "paperless_tika.apps.PaperlessTikaConfig",
 | 
			
		||||
    "paperless_mail.apps.PaperlessMailConfig",
 | 
			
		||||
 | 
			
		||||
    "django.contrib.admin",
 | 
			
		||||
@@ -424,3 +425,10 @@ for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
 | 
			
		||||
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
 | 
			
		||||
 | 
			
		||||
THUMBNAIL_FONT_NAME = os.getenv("PAPERLESS_THUMBNAIL_FONT_NAME", "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf")
 | 
			
		||||
 | 
			
		||||
# Tika settings
#
# The Tika parser is switched on via PAPERLESS_TIKA_ENABLED (fallback value
# "NO"). The two endpoints locate the Tika and Gotenberg servers and fall back
# to localhost addresses when the environment does not set them.
PAPERLESS_TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO")
PAPERLESS_TIKA_ENDPOINT = os.environ.get(
    "PAPERLESS_TIKA_ENDPOINT",
    "http://localhost:9998",
)
PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.environ.get(
    "PAPERLESS_TIKA_GOTENBERG_ENDPOINT",
    "http://localhost:3000",
)
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										14
									
								
								src/paperless_tika/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								src/paperless_tika/apps.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,14 @@
 | 
			
		||||
from django.apps import AppConfig
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from paperless_tika.signals import tika_consumer_declaration
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PaperlessTikaConfig(AppConfig):
    """Django application config for the ``paperless_tika`` app."""

    name = "paperless_tika"

    def ready(self):
        """Hook the Tika parser into the consumer when the feature is enabled."""
        # Imported here rather than at module level, as in the original code.
        from documents.signals import document_consumer_declaration

        tika_enabled = settings.PAPERLESS_TIKA_ENABLED
        if tika_enabled:
            # Only announce the parser when PAPERLESS_TIKA_ENABLED is truthy.
            document_consumer_declaration.connect(tika_consumer_declaration)

        AppConfig.ready(self)
 | 
			
		||||
							
								
								
									
										115
									
								
								src/paperless_tika/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										115
									
								
								src/paperless_tika/parsers.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,115 @@
 | 
			
		||||
import os
 | 
			
		||||
import subprocess
 | 
			
		||||
import tika
 | 
			
		||||
import requests
 | 
			
		||||
import dateutil.parser
 | 
			
		||||
 | 
			
		||||
from PIL import ImageDraw, ImageFont, Image
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
 | 
			
		||||
from documents.parsers import DocumentParser, ParseError, run_convert
 | 
			
		||||
from paperless_tesseract.parsers import RasterisedDocumentParser
 | 
			
		||||
from tika import parser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TikaDocumentParser(DocumentParser):
    """
    Parser for "Office" documents backed by two external services.

    Text and metadata are requested from the Tika server configured in
    ``settings.PAPERLESS_TIKA_ENDPOINT``; the archive PDF is produced by the
    Gotenberg server configured in
    ``settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT``.
    """

    def _convert_to_thumbnail(self, input_file, output_file):
        """Call ``run_convert`` with the thumbnail options shared by both paths."""
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            input_file=input_file,
            output_file=output_file,
            logging_group=self.logging_group,
        )

    def get_thumbnail(self, document_path, mime_type):
        """
        Create a PNG thumbnail from the archive PDF and return its path.

        The thumbnail is rendered from ``self.archive_path``, which is set by
        :meth:`parse`. If ``run_convert`` raises ``ParseError``, Ghostscript
        is used to render the PDF to a PNG first and ``run_convert`` is then
        applied to that image.

        :param document_path: path of the original document (used for logging)
        :param mime_type: MIME type of the original document (unused here)
        :returns: path of the generated ``convert.png`` inside ``self.tempdir``
        :raises ParseError: if the Ghostscript fallback exits with a non-zero
            status, or if the second ``run_convert`` call fails
        """
        self.log("info", f"[TIKA_THUMB] Generating thumbnail for {document_path}")
        archive_path = self.archive_path

        out_path = os.path.join(self.tempdir, "convert.png")

        # Run convert to get a decent thumbnail
        try:
            # "[0]" selects the first page of the PDF.
            self._convert_to_thumbnail("{}[0]".format(archive_path), out_path)
        except ParseError:
            # if convert fails, fall back to extracting
            # the first PDF page as a PNG using Ghostscript
            self.log(
                "warning",
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            )
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [
                settings.GS_BINARY,
                "-q",
                "-sDEVICE=pngalpha",
                "-o",
                gs_out_path,
                archive_path,
            ]
            if subprocess.Popen(cmd).wait() != 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
            self._convert_to_thumbnail(gs_out_path, out_path)

        return out_path

    def parse(self, document_path, mime_type):
        """
        Extract text and creation date via Tika and build the archive PDF.

        ``self.text`` and ``self.date`` are only assigned when Tika returns
        usable values; missing or malformed values are logged and skipped.
        ``self.archive_path`` is set to the converted PDF on success.

        :param document_path: path of the document to parse
        :param mime_type: MIME type of the document (unused here)
        :raises ParseError: if the request to Tika fails or the PDF conversion
            through Gotenberg fails
        """
        self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server")
        tika_server = settings.PAPERLESS_TIKA_ENDPOINT

        try:
            parsed = parser.from_file(document_path, tika_server)
        except requests.exceptions.RequestException as err:
            # RequestException also covers connection failures and timeouts,
            # not just HTTP error statuses.
            raise ParseError(
                f"Could not parse {document_path} with tika server at {tika_server}: {err}"
            ) from err

        # Text extraction is best effort: keep going if Tika gave us nothing.
        try:
            self.text = parsed["content"].strip()
        except (KeyError, TypeError, AttributeError):
            self.log(
                "warning",
                f"[TIKA_PARSE] No text content returned for {document_path}",
            )

        # The creation date is optional metadata; ignore it when it is missing
        # or not a valid ISO-8601 string.
        try:
            self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
        except (KeyError, TypeError, ValueError, AttributeError):
            self.log(
                "info",
                f"[TIKA_PARSE] No usable creation date found for {document_path}",
            )

        archive_path = os.path.join(self.tempdir, "convert.pdf")
        self.convert_to_pdf(document_path, archive_path)
        self.archive_path = archive_path

    def convert_to_pdf(self, document_path, pdf_path):
        """
        Convert a document to PDF through the Gotenberg server.

        The document is posted to the ``/convert/office`` route of
        ``settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT`` and the response body
        is written to ``pdf_path``.

        :param document_path: path of the document to convert
        :param pdf_path: path the resulting PDF is written to
        :raises ParseError: if the request fails or returns an error status
        """
        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
        url = gotenberg_server + "/convert/office"

        self.log("info", f"[TIKA] Converting {document_path} to PDF as {pdf_path}")

        try:
            # Keep the upload handle open only for the duration of the request.
            with open(document_path, "rb") as document:
                response = requests.post(url, files={"files": document})
            response.raise_for_status()  # ensure we notice bad responses
        except requests.exceptions.RequestException as err:
            raise ParseError(
                f"Could not contact gotenberg server at {gotenberg_server}: {err}"
            ) from err

        with open(pdf_path, "wb") as pdf_file:
            pdf_file.write(response.content)
 | 
			
		||||
							
								
								
									
										20
									
								
								src/paperless_tika/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								src/paperless_tika/signals.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,20 @@
 | 
			
		||||
from .parsers import TikaDocumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def tika_consumer_declaration(sender, **kwargs):
    """
    Describe the Tika parser for the document consumer.

    Returns a dict with the parser class, its weight and the mapping of
    supported MIME types to their default file extensions.
    """
    # Office formats handled by the Tika parser, keyed by MIME type.
    office_mime_types = {
        "application/msword": ".doc",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "application/vnd.ms-excel": ".xls",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
        "application/vnd.ms-powerpoint": ".ppt",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
        "application/vnd.oasis.opendocument.presentation": ".odp",
        "application/vnd.oasis.opendocument.spreadsheet": ".ods",
        "application/vnd.oasis.opendocument.text": ".odt",
    }

    declaration = {
        "parser": TikaDocumentParser,
        "weight": 10,
        "mime_types": office_mime_types,
    }
    return declaration
 | 
			
		||||
		Reference in New Issue
	
	Block a user