From da38efebdf46c2a06b86a7615406522f93e66b0c Mon Sep 17 00:00:00 2001 From: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Date: Sun, 1 Jan 2023 08:59:43 -0800 Subject: [PATCH 1/3] Use correct direction for RTL content --- .../document-detail/document-detail.component.html | 2 +- .../document-detail/document-detail.component.scss | 4 ++++ .../document-detail/document-detail.component.ts | 7 +++++++ src-ui/src/app/data/paperless-document-metadata.ts | 2 ++ src/documents/views.py | 8 ++++++++ 5 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html index 0384de371..54ac665e0 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.html +++ b/src-ui/src/app/components/document-detail/document-detail.component.html @@ -91,7 +91,7 @@ Content
- +
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.scss b/src-ui/src/app/components/document-detail/document-detail.component.scss index 3ae922564..71d50ca61 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.scss +++ b/src-ui/src/app/components/document-detail/document-detail.component.scss @@ -28,3 +28,7 @@ left: 30%; right: 30%; } + +textarea.rtl { + direction: rtl; +} diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts index 08d0b0e82..f99f547e6 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.ts @@ -135,6 +135,13 @@ export class DocumentDetailComponent : this.metadata?.original_mime_type } + get isRTL() { + if (!this.metadata || !this.metadata.lang) return false + else { + return ['ar', 'he', 'fa'].includes(this.metadata.lang) /* RTL languages: Arabic, Hebrew, Persian */ + } + } + ngOnInit(): void { this.documentForm.valueChanges .pipe(takeUntil(this.unsubscribeNotifier)) diff --git a/src-ui/src/app/data/paperless-document-metadata.ts b/src-ui/src/app/data/paperless-document-metadata.ts index 152f69046..b8c030ee8 100644 --- a/src-ui/src/app/data/paperless-document-metadata.ts +++ b/src-ui/src/app/data/paperless-document-metadata.ts @@ -10,4 +10,6 @@ export interface PaperlessDocumentMetadata { original_filename?: string has_archive_version?: boolean + + lang?: string } diff --git a/src/documents/views.py b/src/documents/views.py index e313ae17e..52b230b40 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -29,6 +29,7 @@ from django.views.decorators.cache import cache_control from django.views.generic import TemplateView from django_filters.rest_framework import DjangoFilterBackend from documents.tasks import consume_file +from langdetect import detect from packaging import version as packaging_version from paperless 
import version from paperless.db import GnuPG @@ -325,6 +326,13 @@ class DocumentViewSet( "original_filename": doc.original_filename, } + lang = "en" + try: + lang = detect(doc.content) + except Exception: + pass + meta["lang"] = lang + if doc.has_archive_version: meta["archive_size"] = self.get_filesize(doc.archive_path) meta["archive_metadata"] = self.get_metadata( From 7be9ae9c023d54f39ce4ec03e3b85602949b3492 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Sun, 1 Jan 2023 15:57:22 -0800 Subject: [PATCH 2/3] Try a new way of extracting text from a given PDF file --- src/paperless_tesseract/parsers.py | 48 ++++++++------------ src/paperless_tesseract/tests/test_parser.py | 26 +++-------- 2 files changed, 26 insertions(+), 48 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 44671fa11..8e0bac5a7 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -2,6 +2,7 @@ import json import os import re import subprocess +import tempfile from pathlib import Path from typing import Optional @@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser): if not os.path.isfile(pdf_file): return None - from pdfminer.high_level import extract_text as pdfminer_extract_text - try: - stripped = post_process_text(pdfminer_extract_text(pdf_file)) + text = None + with tempfile.NamedTemporaryFile( + mode="w+", + dir=settings.SCRATCH_DIR, + ) as tmp: + subprocess.run( + [ + "pdftotext", + "-q", + "-layout", + "-enc", + "UTF-8", + pdf_file, + tmp.name, + ], + ) + text = tmp.read() - self.log("debug", f"Extracted text from PDF file {pdf_file}") + return post_process_text(text) - # pdfminer.six does not handle RTL text - # as a hack, for some languages, return no text, to force - # OCRMyPdf/Tesseract do handle this correctly - from langdetect import detect - - lang = detect(stripped) - - self.log("debug", f"Detected language {lang}") - - if ( - 
lang - in { - "ar", # Arabic - "he", # Hebrew, - "fa", # Persian - } - and pdf_file.name != "archive-fallback.pdf" - ): - raise RtlLanguageException() - return stripped - except RtlLanguageException: - self.log("warning", f"Detected RTL language {lang}") - return None except Exception: # TODO catch all for various issues with PDFminer.six. # If PDFminer fails, fall back to OCR. @@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser): ) if original_has_text: self.text = text_original - except (NoTextFoundException, RtlLanguageException, InputFileError) as e: + except (NoTextFoundException, InputFileError) as e: self.log( "warning", f"Encountered an error while running OCR: {str(e)}. " diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 28af8dec1..53af68f8d 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -661,28 +661,14 @@ class TestParser(DirectoriesMixin, TestCase): - Text from the document is extracted """ parser = RasterisedDocumentParser(None) - with mock.patch.object( - parser, - "construct_ocrmypdf_parameters", - wraps=parser.construct_ocrmypdf_parameters, - ) as wrapped: - parser.parse( - os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"), - "application/pdf", - ) + parser.parse( + os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"), + "application/pdf", + ) - # There isn't a good way to actually check this working, with RTL correctly return - # as it would require tesseract-ocr-ara installed for everyone running the - # test suite. This test does provide the coverage though and attempts to ensure - # the force OCR happens - self.assertIsNotNone(parser.get_text()) - - self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2) - # Check the last call kwargs - self.assertTrue( - parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"], - ) + # Copied from the PDF to here. 
Don't even look at it + self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text()) class TestParserFileTypes(DirectoriesMixin, TestCase): From 1e4923835b7e7eeea49d680630d43d35d9891a9d Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 3 Jan 2023 13:05:44 -0800 Subject: [PATCH 3/3] Small tweak to use the existing tempdir instead of a new one --- src/paperless_tesseract/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 8e0bac5a7..14068cb26 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -142,7 +142,7 @@ class RasterisedDocumentParser(DocumentParser): text = None with tempfile.NamedTemporaryFile( mode="w+", - dir=settings.SCRATCH_DIR, + dir=self.tempdir, ) as tmp: subprocess.run( [