mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-11 12:09:27 -05:00

* Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon 
<4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
125 lines
4.4 KiB
Python
125 lines
4.4 KiB
Python
from pathlib import Path
|
|
|
|
import httpx
|
|
from django.conf import settings
|
|
from django.utils import timezone
|
|
from gotenberg_client import GotenbergClient
|
|
from gotenberg_client.options import PdfAFormat
|
|
from tika_client import TikaClient
|
|
|
|
from documents.parsers import DocumentParser
|
|
from documents.parsers import ParseError
|
|
from documents.parsers import make_thumbnail_from_pdf
|
|
from paperless.config import OutputTypeConfig
|
|
from paperless.models import OutputTypeChoices
|
|
|
|
|
|
class TikaDocumentParser(DocumentParser):
    """
    Parser which hands documents to a Tika server for text and metadata
    extraction, and to a Gotenberg server for conversion to PDF
    """

    logging_name = "paperless.parsing.tika"

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Build the thumbnail from the PDF rendition of the document, converting
        the document to PDF first when no archive file has been set yet
        """
        if not self.archive_path:
            self.archive_path = self.convert_to_pdf(document_path, file_name)

        thumbnail_source = self.archive_path
        return make_thumbnail_from_pdf(
            thumbnail_source,
            self.tempdir,
            self.logging_group,
        )

    def extract_metadata(self, document_path, mime_type):
        """
        Ask the Tika server for the metadata of the document and return it as
        a list of dictionaries with the keys "namespace", "prefix", "key" and
        "value".  Namespace and prefix are always empty strings.

        This is best effort: any failure is logged as a warning and an empty
        list is returned instead
        """
        try:
            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
                parsed = client.metadata.from_file(document_path, mime_type)
                # Build the result while the client is still open
                fields = parsed.data
                return [
                    {"namespace": "", "prefix": "", "key": name, "value": fields[name]}
                    for name in fields
                ]
        except Exception as e:
            self.log.warning(
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []

    def parse(self, document_path: Path, mime_type: str, file_name=None):
        """
        Extract the text and creation date of the document through the Tika
        server, then convert the document to PDF and store the result as the
        archive file.

        Raises ParseError when the text extraction fails; failures during the
        PDF conversion are raised as ParseError by convert_to_pdf
        """
        self.log.info(f"Sending {document_path} to Tika server")

        try:
            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
                try:
                    parsed = client.tika.as_text.from_file(document_path, mime_type)
                except httpx.HTTPStatusError as http_err:
                    server_error = httpx.codes.INTERNAL_SERVER_ERROR
                    if http_err.response.status_code != server_error:
                        raise  # pragma: no cover
                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
                    # Tika fails with some files as multi-part form data, so
                    # retry by sending the raw file content instead
                    parsed = client.tika.as_text.from_buffer(
                        document_path.read_bytes(),
                        mime_type,
                    )
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err

        # Surrounding whitespace is dropped, but a missing text stays None
        content = parsed.content
        self.text = content.strip() if content is not None else None

        # A naive creation date is made timezone aware before it is stored
        created = parsed.created
        if created is not None and timezone.is_naive(created):
            created = timezone.make_aware(created)
        self.date = created

        self.archive_path = self.convert_to_pdf(document_path, file_name)

    def convert_to_pdf(self, document_path: Path, file_name):
        """
        Convert the document to PDF using the LibreOffice route of the
        Gotenberg server, write the result to "convert.pdf" inside the
        temporary directory of this parser and return that path.

        The file_name argument is accepted but not used here.

        Raises ParseError when running the conversion or writing the
        resulting file fails
        """
        pdf_path = Path(self.tempdir) / "convert.pdf"

        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")

        with GotenbergClient(
            host=settings.TIKA_GOTENBERG_ENDPOINT,
            timeout=settings.CELERY_TASK_TIME_LIMIT,
        ) as client:
            with client.libre_office.to_pdf() as route:
                # Set the output format of the resulting PDF, if the configured
                # output type asks for one of the PDF/A variants
                requested_type = settings.OCR_OUTPUT_TYPE
                if requested_type in {
                    OutputTypeChoices.PDF_A,
                    OutputTypeChoices.PDF_A2,
                }:
                    pdf_a_format = PdfAFormat.A2b
                elif requested_type == OutputTypeChoices.PDF_A1:
                    pdf_a_format = PdfAFormat.A1a
                elif requested_type == OutputTypeChoices.PDF_A3:
                    pdf_a_format = PdfAFormat.A3b
                else:
                    pdf_a_format = None

                if pdf_a_format is not None:
                    route.pdf_format(pdf_a_format)

                route.convert(document_path)

                try:
                    response = route.run()
                    pdf_path.write_bytes(response.content)
                except Exception as err:
                    raise ParseError(
                        f"Error while converting document to PDF: {err}",
                    ) from err

                return pdf_path

    def get_settings(self) -> OutputTypeConfig:
        """
        Return the configuration for this parser, which currently is only the
        PDF output type configuration
        """
        return OutputTypeConfig()
|