mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Merge pull request #2302 from paperless-ngx/feature-fix-display-rtl-content
This commit is contained in:
commit
985f298c46
@ -91,7 +91,7 @@
|
|||||||
<a ngbNavLink i18n>Content</a>
|
<a ngbNavLink i18n>Content</a>
|
||||||
<ng-template ngbNavContent>
|
<ng-template ngbNavContent>
|
||||||
<div class="mb-3">
|
<div class="mb-3">
|
||||||
<textarea class="form-control" id="content" rows="20" formControlName='content'></textarea>
|
<textarea class="form-control" id="content" rows="20" formControlName='content' [class.rtl]="isRTL"></textarea>
|
||||||
</div>
|
</div>
|
||||||
</ng-template>
|
</ng-template>
|
||||||
</li>
|
</li>
|
||||||
|
@ -28,3 +28,7 @@
|
|||||||
left: 30%;
|
left: 30%;
|
||||||
right: 30%;
|
right: 30%;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
textarea.rtl {
|
||||||
|
direction: rtl;
|
||||||
|
}
|
||||||
|
@ -135,6 +135,13 @@ export class DocumentDetailComponent
|
|||||||
: this.metadata?.original_mime_type
|
: this.metadata?.original_mime_type
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Whether the document content should be rendered right-to-left.
 *
 * Decided from the language code the backend reports in the document
 * metadata (`lang`). If the metadata has not loaded yet, or carries no
 * language, the content is treated as left-to-right.
 */
get isRTL() {
const lang = this.metadata?.lang
if (!lang) return false
// Arabic, Hebrew, Persian. Persian's code is 'fa'; the previous 'fe'
// is not a language code, so Persian documents were never matched.
return ['ar', 'he', 'fa'].includes(lang)
}
|
||||||
|
|
||||||
ngOnInit(): void {
|
ngOnInit(): void {
|
||||||
this.documentForm.valueChanges
|
this.documentForm.valueChanges
|
||||||
.pipe(takeUntil(this.unsubscribeNotifier))
|
.pipe(takeUntil(this.unsubscribeNotifier))
|
||||||
|
@ -10,4 +10,6 @@ export interface PaperlessDocumentMetadata {
|
|||||||
original_filename?: string
|
original_filename?: string
|
||||||
|
|
||||||
has_archive_version?: boolean
|
has_archive_version?: boolean
|
||||||
|
|
||||||
|
lang?: string
|
||||||
}
|
}
|
||||||
|
@ -29,6 +29,7 @@ from django.views.decorators.cache import cache_control
|
|||||||
from django.views.generic import TemplateView
|
from django.views.generic import TemplateView
|
||||||
from django_filters.rest_framework import DjangoFilterBackend
|
from django_filters.rest_framework import DjangoFilterBackend
|
||||||
from documents.tasks import consume_file
|
from documents.tasks import consume_file
|
||||||
|
from langdetect import detect
|
||||||
from packaging import version as packaging_version
|
from packaging import version as packaging_version
|
||||||
from paperless import version
|
from paperless import version
|
||||||
from paperless.db import GnuPG
|
from paperless.db import GnuPG
|
||||||
@ -325,6 +326,13 @@ class DocumentViewSet(
|
|||||||
"original_filename": doc.original_filename,
|
"original_filename": doc.original_filename,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lang = "en"
|
||||||
|
try:
|
||||||
|
lang = detect(doc.content)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
meta["lang"] = lang
|
||||||
|
|
||||||
if doc.has_archive_version:
|
if doc.has_archive_version:
|
||||||
meta["archive_size"] = self.get_filesize(doc.archive_path)
|
meta["archive_size"] = self.get_filesize(doc.archive_path)
|
||||||
meta["archive_metadata"] = self.get_metadata(
|
meta["archive_metadata"] = self.get_metadata(
|
||||||
|
@ -2,6 +2,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if not os.path.isfile(pdf_file):
|
if not os.path.isfile(pdf_file):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
stripped = post_process_text(pdfminer_extract_text(pdf_file))
|
text = None
|
||||||
|
with tempfile.NamedTemporaryFile(
|
||||||
|
mode="w+",
|
||||||
|
dir=self.tempdir,
|
||||||
|
) as tmp:
|
||||||
|
subprocess.run(
|
||||||
|
[
|
||||||
|
"pdftotext",
|
||||||
|
"-q",
|
||||||
|
"-layout",
|
||||||
|
"-enc",
|
||||||
|
"UTF-8",
|
||||||
|
pdf_file,
|
||||||
|
tmp.name,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
text = tmp.read()
|
||||||
|
|
||||||
self.log("debug", f"Extracted text from PDF file {pdf_file}")
|
return post_process_text(text)
|
||||||
|
|
||||||
# pdfminer.six does not handle RTL text
|
|
||||||
# as a hack, for some languages, return no text, to force
|
|
||||||
# OCRMyPdf/Tesseract do handle this correctly
|
|
||||||
from langdetect import detect
|
|
||||||
|
|
||||||
lang = detect(stripped)
|
|
||||||
|
|
||||||
self.log("debug", f"Detected language {lang}")
|
|
||||||
|
|
||||||
if (
|
|
||||||
lang
|
|
||||||
in {
|
|
||||||
"ar", # Arabic
|
|
||||||
"he", # Hebrew,
|
|
||||||
"fa", # Persian
|
|
||||||
}
|
|
||||||
and pdf_file.name != "archive-fallback.pdf"
|
|
||||||
):
|
|
||||||
raise RtlLanguageException()
|
|
||||||
return stripped
|
|
||||||
except RtlLanguageException:
|
|
||||||
self.log("warning", f"Detected RTL language {lang}")
|
|
||||||
return None
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# TODO catch all for various issues with PDFminer.six.
|
# TODO catch all for various issues with PDFminer.six.
|
||||||
# If PDFminer fails, fall back to OCR.
|
# If PDFminer fails, fall back to OCR.
|
||||||
@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
)
|
)
|
||||||
if original_has_text:
|
if original_has_text:
|
||||||
self.text = text_original
|
self.text = text_original
|
||||||
except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
|
except (NoTextFoundException, InputFileError) as e:
|
||||||
self.log(
|
self.log(
|
||||||
"warning",
|
"warning",
|
||||||
f"Encountered an error while running OCR: {str(e)}. "
|
f"Encountered an error while running OCR: {str(e)}. "
|
||||||
|
@ -670,28 +670,14 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
- Text from the document is extracted
|
- Text from the document is extracted
|
||||||
"""
|
"""
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
with mock.patch.object(
|
|
||||||
parser,
|
|
||||||
"construct_ocrmypdf_parameters",
|
|
||||||
wraps=parser.construct_ocrmypdf_parameters,
|
|
||||||
) as wrapped:
|
|
||||||
|
|
||||||
parser.parse(
|
parser.parse(
|
||||||
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
|
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
)
|
)
|
||||||
|
|
||||||
# There isn't a good way to actually check this working, with RTL correctly return
|
# Copied from the PDF to here. Don't even look at it
|
||||||
# as it would require tesseract-ocr-ara installed for everyone running the
|
self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
|
||||||
# test suite. This test does provide the coverage though and attempts to ensure
|
|
||||||
# the force OCR happens
|
|
||||||
self.assertIsNotNone(parser.get_text())
|
|
||||||
|
|
||||||
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
|
|
||||||
# Check the last call kwargs
|
|
||||||
self.assertTrue(
|
|
||||||
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestParserFileTypes(DirectoriesMixin, TestCase):
|
class TestParserFileTypes(DirectoriesMixin, TestCase):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user