Trenton H c8a62715ec Feature: Allow setting backend configuration settings via the UI (#5126)
* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00

125 lines
4.4 KiB
Python

from pathlib import Path
import httpx
from django.conf import settings
from django.utils import timezone
from gotenberg_client import GotenbergClient
from gotenberg_client.options import PdfAFormat
from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OutputTypeConfig
from paperless.models import OutputTypeChoices
class TikaDocumentParser(DocumentParser):
"""
This parser sends documents to a local tika server
"""
logging_name = "paperless.parsing.tika"
def get_thumbnail(self, document_path, mime_type, file_name=None):
if not self.archive_path:
self.archive_path = self.convert_to_pdf(document_path, file_name)
return make_thumbnail_from_pdf(
self.archive_path,
self.tempdir,
self.logging_group,
)
def extract_metadata(self, document_path, mime_type):
try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
parsed = client.metadata.from_file(document_path, mime_type)
return [
{
"namespace": "",
"prefix": "",
"key": key,
"value": parsed.data[key],
}
for key in parsed.data
]
except Exception as e:
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
)
return []
def parse(self, document_path: Path, mime_type: str, file_name=None):
self.log.info(f"Sending {document_path} to Tika server")
try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
try:
parsed = client.tika.as_text.from_file(document_path, mime_type)
except httpx.HTTPStatusError as err:
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
# Tika fails with some files as multi-part form data
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
parsed = client.tika.as_text.from_buffer(
document_path.read_bytes(),
mime_type,
)
else: # pragma: no cover
raise
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
self.text = parsed.content
if self.text is not None:
self.text = self.text.strip()
self.date = parsed.created
if self.date is not None and timezone.is_naive(self.date):
self.date = timezone.make_aware(self.date)
self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path: Path, file_name):
pdf_path = Path(self.tempdir) / "convert.pdf"
self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
with GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client, client.libre_office.to_pdf() as route:
# Set the output format of the resulting PDF
if settings.OCR_OUTPUT_TYPE in {
OutputTypeChoices.PDF_A,
OutputTypeChoices.PDF_A2,
}:
route.pdf_format(PdfAFormat.A2b)
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
route.pdf_format(PdfAFormat.A1a)
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
route.pdf_format(PdfAFormat.A3b)
route.convert(document_path)
try:
response = route.run()
pdf_path.write_bytes(response.content)
return pdf_path
except Exception as err:
raise ParseError(
f"Error while converting document to PDF: {err}",
) from err
def get_settings(self) -> OutputTypeConfig:
"""
This parser only uses the PDF output type configuration currently
"""
return OutputTypeConfig()