diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html
index 19ae590a7..b79e56ca0 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.html
+++ b/src-ui/src/app/components/document-detail/document-detail.component.html
@@ -91,7 +91,7 @@
Content
-
+
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.scss b/src-ui/src/app/components/document-detail/document-detail.component.scss
index 3ae922564..71d50ca61 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.scss
+++ b/src-ui/src/app/components/document-detail/document-detail.component.scss
@@ -28,3 +28,7 @@
left: 30%;
right: 30%;
}
+
+textarea.rtl {
+ direction: rtl; /* lay out the textarea's text right-to-left */
+}
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts
index 08d0b0e82..f99f547e6 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.ts
+++ b/src-ui/src/app/components/document-detail/document-detail.component.ts
@@ -135,6 +135,13 @@ export class DocumentDetailComponent
: this.metadata?.original_mime_type
}
+ get isRTL() {
+   // Language codes of right-to-left scripts: Arabic, Hebrew, Persian ('fa', not 'fe').
+   const rtlLanguages = ['ar', 'he', 'fa']
+   const lang = this.metadata?.lang
+   return !!lang && rtlLanguages.includes(lang) // false when metadata or lang is missing
+ }
+
ngOnInit(): void {
this.documentForm.valueChanges
.pipe(takeUntil(this.unsubscribeNotifier))
diff --git a/src-ui/src/app/data/paperless-document-metadata.ts b/src-ui/src/app/data/paperless-document-metadata.ts
index 152f69046..b8c030ee8 100644
--- a/src-ui/src/app/data/paperless-document-metadata.ts
+++ b/src-ui/src/app/data/paperless-document-metadata.ts
@@ -10,4 +10,6 @@ export interface PaperlessDocumentMetadata {
original_filename?: string
has_archive_version?: boolean
+
+ lang?: string
}
diff --git a/src/documents/views.py b/src/documents/views.py
index c65b6f0b4..46cf06cfd 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -29,6 +29,7 @@ from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from documents.tasks import consume_file
+from langdetect import detect
from packaging import version as packaging_version
from paperless import version
from paperless.db import GnuPG
@@ -325,6 +326,13 @@ class DocumentViewSet(
"original_filename": doc.original_filename,
}
+ lang = "en"
+ try:
+ lang = detect(doc.content)
+ except Exception:
+ pass
+ meta["lang"] = lang
+
if doc.has_archive_version:
meta["archive_size"] = self.get_filesize(doc.archive_path)
meta["archive_metadata"] = self.get_metadata(
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index 44671fa11..14068cb26 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -2,6 +2,7 @@ import json
import os
import re
import subprocess
+import tempfile
from pathlib import Path
from typing import Optional
@@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
if not os.path.isfile(pdf_file):
return None
- from pdfminer.high_level import extract_text as pdfminer_extract_text
-
try:
- stripped = post_process_text(pdfminer_extract_text(pdf_file))
+ text = None
+ with tempfile.NamedTemporaryFile(
+ mode="w+",
+ dir=self.tempdir,
+ ) as tmp:
+ subprocess.run(
+ [
+ "pdftotext",
+ "-q",
+ "-layout",
+ "-enc",
+ "UTF-8",
+ pdf_file,
+ tmp.name,
+ ],
+ )
+ text = tmp.read()
- self.log("debug", f"Extracted text from PDF file {pdf_file}")
+ return post_process_text(text)
- # pdfminer.six does not handle RTL text
- # as a hack, for some languages, return no text, to force
- # OCRMyPdf/Tesseract do handle this correctly
- from langdetect import detect
-
- lang = detect(stripped)
-
- self.log("debug", f"Detected language {lang}")
-
- if (
- lang
- in {
- "ar", # Arabic
- "he", # Hebrew,
- "fa", # Persian
- }
- and pdf_file.name != "archive-fallback.pdf"
- ):
- raise RtlLanguageException()
- return stripped
- except RtlLanguageException:
- self.log("warning", f"Detected RTL language {lang}")
- return None
except Exception:
# TODO catch all for various issues with PDFminer.six.
# If PDFminer fails, fall back to OCR.
@@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
)
if original_has_text:
self.text = text_original
- except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
+ except (NoTextFoundException, InputFileError) as e:
self.log(
"warning",
f"Encountered an error while running OCR: {str(e)}. "
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py
index 956c56862..7fa399c97 100644
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -670,28 +670,14 @@ class TestParser(DirectoriesMixin, TestCase):
- Text from the document is extracted
"""
parser = RasterisedDocumentParser(None)
- with mock.patch.object(
- parser,
- "construct_ocrmypdf_parameters",
- wraps=parser.construct_ocrmypdf_parameters,
- ) as wrapped:
- parser.parse(
- os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
- "application/pdf",
- )
+ parser.parse(
+ os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
+ "application/pdf",
+ )
- # There isn't a good way to actually check this working, with RTL correctly return
- # as it would require tesseract-ocr-ara installed for everyone running the
- # test suite. This test does provide the coverage though and attempts to ensure
- # the force OCR happens
- self.assertIsNotNone(parser.get_text())
-
- self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
- # Check the last call kwargs
- self.assertTrue(
- parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
- )
+ # Expected RTL text, copied verbatim from rtl-test.pdf
+ self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
class TestParserFileTypes(DirectoriesMixin, TestCase):