Merge pull request #2302 from paperless-ngx/feature-fix-display-rtl-content

This commit is contained in:
shamoon 2023-01-10 07:30:52 -08:00 committed by GitHub
commit 985f298c46
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 48 additions and 49 deletions

View File

@ -91,7 +91,7 @@
<a ngbNavLink i18n>Content</a> <a ngbNavLink i18n>Content</a>
<ng-template ngbNavContent> <ng-template ngbNavContent>
<div class="mb-3"> <div class="mb-3">
<textarea class="form-control" id="content" rows="20" formControlName='content'></textarea> <textarea class="form-control" id="content" rows="20" formControlName='content' [class.rtl]="isRTL"></textarea>
</div> </div>
</ng-template> </ng-template>
</li> </li>

View File

@ -28,3 +28,7 @@
left: 30%; left: 30%;
right: 30%; right: 30%;
} }
/* Lay out the textarea's text right-to-left when the `rtl` class is present. */
textarea.rtl {
direction: rtl;
}

View File

@ -135,6 +135,13 @@ export class DocumentDetailComponent
: this.metadata?.original_mime_type : this.metadata?.original_mime_type
} }
/**
 * Whether the document's detected language is written right-to-left.
 *
 * Returns false when the metadata (or its `lang` field) is missing or empty;
 * otherwise true only for Arabic ('ar'), Hebrew ('he') and Persian ('fa').
 */
get isRTL() {
  const lang = this.metadata?.lang
  if (!lang) return false
  // 'fa' is the ISO 639-1 code for Persian; the previous 'fe' entry was a typo
  // that could never match a detected language code.
  return ['ar', 'he', 'fa'].includes(lang)
}
ngOnInit(): void { ngOnInit(): void {
this.documentForm.valueChanges this.documentForm.valueChanges
.pipe(takeUntil(this.unsubscribeNotifier)) .pipe(takeUntil(this.unsubscribeNotifier))

View File

@ -10,4 +10,6 @@ export interface PaperlessDocumentMetadata {
original_filename?: string original_filename?: string
has_archive_version?: boolean has_archive_version?: boolean
lang?: string
} }

View File

@ -29,6 +29,7 @@ from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend from django_filters.rest_framework import DjangoFilterBackend
from documents.tasks import consume_file from documents.tasks import consume_file
from langdetect import detect
from packaging import version as packaging_version from packaging import version as packaging_version
from paperless import version from paperless import version
from paperless.db import GnuPG from paperless.db import GnuPG
@ -325,6 +326,13 @@ class DocumentViewSet(
"original_filename": doc.original_filename, "original_filename": doc.original_filename,
} }
lang = "en"
try:
lang = detect(doc.content)
except Exception:
pass
meta["lang"] = lang
if doc.has_archive_version: if doc.has_archive_version:
meta["archive_size"] = self.get_filesize(doc.archive_path) meta["archive_size"] = self.get_filesize(doc.archive_path)
meta["archive_metadata"] = self.get_metadata( meta["archive_metadata"] = self.get_metadata(

View File

@ -2,6 +2,7 @@ import json
import os import os
import re import re
import subprocess import subprocess
import tempfile
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
if not os.path.isfile(pdf_file): if not os.path.isfile(pdf_file):
return None return None
from pdfminer.high_level import extract_text as pdfminer_extract_text
try: try:
stripped = post_process_text(pdfminer_extract_text(pdf_file)) text = None
with tempfile.NamedTemporaryFile(
mode="w+",
dir=self.tempdir,
) as tmp:
subprocess.run(
[
"pdftotext",
"-q",
"-layout",
"-enc",
"UTF-8",
pdf_file,
tmp.name,
],
)
text = tmp.read()
self.log("debug", f"Extracted text from PDF file {pdf_file}") return post_process_text(text)
# pdfminer.six does not handle RTL text
# as a hack, for some languages, return no text, to force
# OCRMyPdf/Tesseract do handle this correctly
from langdetect import detect
lang = detect(stripped)
self.log("debug", f"Detected language {lang}")
if (
lang
in {
"ar", # Arabic
"he", # Hebrew,
"fa", # Persian
}
and pdf_file.name != "archive-fallback.pdf"
):
raise RtlLanguageException()
return stripped
except RtlLanguageException:
self.log("warning", f"Detected RTL language {lang}")
return None
except Exception: except Exception:
# TODO catch all for various issues with PDFminer.six. # TODO catch all for various issues with PDFminer.six.
# If PDFminer fails, fall back to OCR. # If PDFminer fails, fall back to OCR.
@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
) )
if original_has_text: if original_has_text:
self.text = text_original self.text = text_original
except (NoTextFoundException, RtlLanguageException, InputFileError) as e: except (NoTextFoundException, InputFileError) as e:
self.log( self.log(
"warning", "warning",
f"Encountered an error while running OCR: {str(e)}. " f"Encountered an error while running OCR: {str(e)}. "

View File

@ -670,28 +670,14 @@ class TestParser(DirectoriesMixin, TestCase):
- Text from the document is extracted - Text from the document is extracted
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
with mock.patch.object(
parser,
"construct_ocrmypdf_parameters",
wraps=parser.construct_ocrmypdf_parameters,
) as wrapped:
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"), os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf", "application/pdf",
) )
# There isn't a good way to actually check this working, with RTL correctly return # Copied from the PDF to here. Don't even look at it
# as it would require tesseract-ocr-ara installed for everyone running the self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
# test suite. This test does provide the coverage though and attempts to ensure
# the force OCR happens
self.assertIsNotNone(parser.get_text())
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
# Check the last call kwargs
self.assertTrue(
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
)
class TestParserFileTypes(DirectoriesMixin, TestCase): class TestParserFileTypes(DirectoriesMixin, TestCase):