Add the new paperless_tika parser

This parser uses external Tika and Gotenberg servers to parse
"Office" documents (.doc, .xls, .odt, etc.)

Signed-off-by: Jo Vandeginste <Jo.Vandeginste@kuleuven.be>
This commit is contained in:
Jo Vandeginste
2020-12-29 01:23:40 +01:00
parent 99c7ff3123
commit bf8739864d
9 changed files with 276 additions and 0 deletions

View File

@@ -87,6 +87,7 @@ INSTALLED_APPS = [
"documents.apps.DocumentsConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_tika.apps.PaperlessTikaConfig",
"paperless_mail.apps.PaperlessMailConfig",
"django.contrib.admin",
@@ -424,3 +425,7 @@ for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
# Read straight from the environment; None when the variable is unset.
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")

# Path to a TrueType font file; falls back to Liberation Serif Regular
# when PAPERLESS_THUMBNAIL_FONT_NAME is not set.
THUMBNAIL_FONT_NAME = os.getenv(
    "PAPERLESS_THUMBNAIL_FONT_NAME",
    "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf",
)

# Tika settings
# Boolean flag read from PAPERLESS_TIKA, with "NO" as the default value.
# NOTE(review): presumably this switches the paperless_tika parser on or
# off -- confirm against the parser's consumer declaration.
PAPERLESS_TIKA = __get_boolean(
    "PAPERLESS_TIKA",
    "NO",
)

# Base URL of the Gotenberg server; defaults to a local instance on
# port 3000 when GOTENBERG_SERVER_ENDPOINT is not set.
GOTENBERG_SERVER_ENDPOINT = os.getenv(
    "GOTENBERG_SERVER_ENDPOINT",
    "http://localhost:3000",
)