mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #2302 from paperless-ngx/feature-fix-display-rtl-content
This commit is contained in:
commit
985f298c46
@ -91,7 +91,7 @@
|
||||
<a ngbNavLink i18n>Content</a>
|
||||
<ng-template ngbNavContent>
|
||||
<div class="mb-3">
|
||||
<textarea class="form-control" id="content" rows="20" formControlName='content'></textarea>
|
||||
<textarea class="form-control" id="content" rows="20" formControlName='content' [class.rtl]="isRTL"></textarea>
|
||||
</div>
|
||||
</ng-template>
|
||||
</li>
|
||||
|
@ -28,3 +28,7 @@
|
||||
left: 30%;
|
||||
right: 30%;
|
||||
}
|
||||
|
||||
/* Lay the textarea's text out right-to-left whenever it carries the `rtl` class. */
textarea.rtl {
|
||||
direction: rtl;
|
||||
}
|
||||
|
@ -135,6 +135,13 @@ export class DocumentDetailComponent
|
||||
: this.metadata?.original_mime_type
|
||||
}
|
||||
|
||||
/**
 * Whether the document's detected language is written right-to-left.
 *
 * Returns false while metadata is missing or carries no `lang` value;
 * otherwise returns true only for the language codes listed below.
 */
get isRTL() {
  if (!this.metadata || !this.metadata.lang) return false

  // Arabic, Hebrew, Persian. The Persian code is 'fa'; the previous 'fe'
  // is not a language code, so Persian documents never matched.
  return ['ar', 'he', 'fa'].includes(this.metadata.lang)
}
|
||||
|
||||
ngOnInit(): void {
|
||||
this.documentForm.valueChanges
|
||||
.pipe(takeUntil(this.unsubscribeNotifier))
|
||||
|
@ -10,4 +10,6 @@ export interface PaperlessDocumentMetadata {
|
||||
original_filename?: string
|
||||
|
||||
has_archive_version?: boolean
|
||||
|
||||
lang?: string
|
||||
}
|
||||
|
@ -29,6 +29,7 @@ from django.views.decorators.cache import cache_control
|
||||
from django.views.generic import TemplateView
|
||||
from django_filters.rest_framework import DjangoFilterBackend
|
||||
from documents.tasks import consume_file
|
||||
from langdetect import detect
|
||||
from packaging import version as packaging_version
|
||||
from paperless import version
|
||||
from paperless.db import GnuPG
|
||||
@ -325,6 +326,13 @@ class DocumentViewSet(
|
||||
"original_filename": doc.original_filename,
|
||||
}
|
||||
|
||||
lang = "en"
|
||||
try:
|
||||
lang = detect(doc.content)
|
||||
except Exception:
|
||||
pass
|
||||
meta["lang"] = lang
|
||||
|
||||
if doc.has_archive_version:
|
||||
meta["archive_size"] = self.get_filesize(doc.archive_path)
|
||||
meta["archive_metadata"] = self.get_metadata(
|
||||
|
@ -2,6 +2,7 @@ import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
if not os.path.isfile(pdf_file):
|
||||
return None
|
||||
|
||||
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
||||
|
||||
try:
|
||||
stripped = post_process_text(pdfminer_extract_text(pdf_file))
|
||||
text = None
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w+",
|
||||
dir=self.tempdir,
|
||||
) as tmp:
|
||||
subprocess.run(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
pdf_file,
|
||||
tmp.name,
|
||||
],
|
||||
)
|
||||
text = tmp.read()
|
||||
|
||||
self.log("debug", f"Extracted text from PDF file {pdf_file}")
|
||||
return post_process_text(text)
|
||||
|
||||
# pdfminer.six does not handle RTL text
|
||||
# as a hack, for some languages, return no text, to force
|
||||
# OCRMyPdf/Tesseract do handle this correctly
|
||||
from langdetect import detect
|
||||
|
||||
lang = detect(stripped)
|
||||
|
||||
self.log("debug", f"Detected language {lang}")
|
||||
|
||||
if (
|
||||
lang
|
||||
in {
|
||||
"ar", # Arabic
|
||||
"he", # Hebrew,
|
||||
"fa", # Persian
|
||||
}
|
||||
and pdf_file.name != "archive-fallback.pdf"
|
||||
):
|
||||
raise RtlLanguageException()
|
||||
return stripped
|
||||
except RtlLanguageException:
|
||||
self.log("warning", f"Detected RTL language {lang}")
|
||||
return None
|
||||
except Exception:
|
||||
# TODO catch all for various issues with PDFminer.six.
|
||||
# If PDFminer fails, fall back to OCR.
|
||||
@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
)
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
|
||||
except (NoTextFoundException, InputFileError) as e:
|
||||
self.log(
|
||||
"warning",
|
||||
f"Encountered an error while running OCR: {str(e)}. "
|
||||
|
@ -670,28 +670,14 @@ class TestParser(DirectoriesMixin, TestCase):
|
||||
- Text from the document is extracted
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
with mock.patch.object(
|
||||
parser,
|
||||
"construct_ocrmypdf_parameters",
|
||||
wraps=parser.construct_ocrmypdf_parameters,
|
||||
) as wrapped:
|
||||
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
# There isn't a good way to actually check this working, with RTL correctly return
|
||||
# as it would require tesseract-ocr-ara installed for everyone running the
|
||||
# test suite. This test does provide the coverage though and attempts to ensure
|
||||
# the force OCR happens
|
||||
self.assertIsNotNone(parser.get_text())
|
||||
|
||||
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
|
||||
# Check the last call kwargs
|
||||
self.assertTrue(
|
||||
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
|
||||
)
|
||||
# Copied from the PDF to here. Don't even look at it
|
||||
self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
|
||||
|
||||
|
||||
class TestParserFileTypes(DirectoriesMixin, TestCase):
|
||||
|
Loading…
x
Reference in New Issue
Block a user