Merge pull request #2302 from paperless-ngx/feature-fix-display-rtl-content

This commit is contained in:
shamoon 2023-01-10 07:30:52 -08:00 committed by GitHub
commit 985f298c46
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 48 additions and 49 deletions

View File

@ -91,7 +91,7 @@
<a ngbNavLink i18n>Content</a>
<ng-template ngbNavContent>
<div class="mb-3">
<textarea class="form-control" id="content" rows="20" formControlName='content'></textarea>
<textarea class="form-control" id="content" rows="20" formControlName='content' [class.rtl]="isRTL"></textarea>
</div>
</ng-template>
</li>

View File

@ -28,3 +28,7 @@
left: 30%;
right: 30%;
}
/* Lay out the text of any textarea carrying the `rtl` class right-to-left. */
textarea.rtl {
direction: rtl;
}

View File

@ -135,6 +135,13 @@ export class DocumentDetailComponent
: this.metadata?.original_mime_type
}
/**
 * Whether the document's detected language is written right-to-left.
 *
 * Returns `false` while metadata is not loaded or carries no `lang` value;
 * otherwise returns `true` only for Arabic, Hebrew and Persian.
 */
get isRTL() {
  const lang = this.metadata?.lang
  if (!lang) return false
  // ISO 639-1 codes: 'ar' Arabic, 'he' Hebrew, 'fa' Persian.
  // The list previously contained 'fe', which is not an assigned language
  // code, so Persian content was never treated as right-to-left.
  return ['ar', 'he', 'fa'].includes(lang)
}
ngOnInit(): void {
this.documentForm.valueChanges
.pipe(takeUntil(this.unsubscribeNotifier))

View File

@ -10,4 +10,6 @@ export interface PaperlessDocumentMetadata {
original_filename?: string
has_archive_version?: boolean
lang?: string
}

View File

@ -29,6 +29,7 @@ from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from documents.tasks import consume_file
from langdetect import detect
from packaging import version as packaging_version
from paperless import version
from paperless.db import GnuPG
@ -325,6 +326,13 @@ class DocumentViewSet(
"original_filename": doc.original_filename,
}
lang = "en"
try:
lang = detect(doc.content)
except Exception:
pass
meta["lang"] = lang
if doc.has_archive_version:
meta["archive_size"] = self.get_filesize(doc.archive_path)
meta["archive_metadata"] = self.get_metadata(

View File

@ -2,6 +2,7 @@ import json
import os
import re
import subprocess
import tempfile
from pathlib import Path
from typing import Optional
@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
if not os.path.isfile(pdf_file):
return None
from pdfminer.high_level import extract_text as pdfminer_extract_text
try:
stripped = post_process_text(pdfminer_extract_text(pdf_file))
text = None
with tempfile.NamedTemporaryFile(
mode="w+",
dir=self.tempdir,
) as tmp:
subprocess.run(
[
"pdftotext",
"-q",
"-layout",
"-enc",
"UTF-8",
pdf_file,
tmp.name,
],
)
text = tmp.read()
self.log("debug", f"Extracted text from PDF file {pdf_file}")
return post_process_text(text)
# pdfminer.six does not handle RTL text
# as a hack, for some languages, return no text, to force
# OCRMyPdf/Tesseract do handle this correctly
from langdetect import detect
lang = detect(stripped)
self.log("debug", f"Detected language {lang}")
if (
lang
in {
"ar", # Arabic
"he", # Hebrew,
"fa", # Persian
}
and pdf_file.name != "archive-fallback.pdf"
):
raise RtlLanguageException()
return stripped
except RtlLanguageException:
self.log("warning", f"Detected RTL language {lang}")
return None
except Exception:
# TODO catch all for various issues with PDFminer.six.
# If PDFminer fails, fall back to OCR.
@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
)
if original_has_text:
self.text = text_original
except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
except (NoTextFoundException, InputFileError) as e:
self.log(
"warning",
f"Encountered an error while running OCR: {str(e)}. "

View File

@ -670,28 +670,14 @@ class TestParser(DirectoriesMixin, TestCase):
- Text from the document is extracted
"""
parser = RasterisedDocumentParser(None)
with mock.patch.object(
parser,
"construct_ocrmypdf_parameters",
wraps=parser.construct_ocrmypdf_parameters,
) as wrapped:
parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
# There isn't a good way to actually check this working, with RTL correctly return
# as it would require tesseract-ocr-ara installed for everyone running the
# test suite. This test does provide the coverage though and attempts to ensure
# the force OCR happens
self.assertIsNotNone(parser.get_text())
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
# Check the last call kwargs
self.assertTrue(
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
)
# Copied from the PDF to here. Don't even look at it
self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
class TestParserFileTypes(DirectoriesMixin, TestCase):