mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Merge pull request #2302 from paperless-ngx/feature-fix-display-rtl-content
This commit is contained in:
		@@ -91,7 +91,7 @@
 | 
				
			|||||||
                    <a ngbNavLink i18n>Content</a>
 | 
					                    <a ngbNavLink i18n>Content</a>
 | 
				
			||||||
                    <ng-template ngbNavContent>
 | 
					                    <ng-template ngbNavContent>
 | 
				
			||||||
                        <div class="mb-3">
 | 
					                        <div class="mb-3">
 | 
				
			||||||
                            <textarea class="form-control" id="content" rows="20" formControlName='content'></textarea>
 | 
					                            <textarea class="form-control" id="content" rows="20" formControlName='content' [class.rtl]="isRTL"></textarea>
 | 
				
			||||||
                        </div>
 | 
					                        </div>
 | 
				
			||||||
                    </ng-template>
 | 
					                    </ng-template>
 | 
				
			||||||
                </li>
 | 
					                </li>
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -28,3 +28,7 @@
 | 
				
			|||||||
  left: 30%;
 | 
					  left: 30%;
 | 
				
			||||||
  right: 30%;
 | 
					  right: 30%;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					textarea.rtl {
 | 
				
			||||||
 | 
					  direction: rtl;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -135,6 +135,13 @@ export class DocumentDetailComponent
 | 
				
			|||||||
      : this.metadata?.original_mime_type
 | 
					      : this.metadata?.original_mime_type
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  get isRTL() {
 | 
				
			||||||
 | 
					    if (!this.metadata || !this.metadata.lang) return false
 | 
				
			||||||
 | 
					    else {
 | 
				
			||||||
 | 
					      return ['ar', 'he', 'fa'].includes(this.metadata.lang)
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ngOnInit(): void {
 | 
					  ngOnInit(): void {
 | 
				
			||||||
    this.documentForm.valueChanges
 | 
					    this.documentForm.valueChanges
 | 
				
			||||||
      .pipe(takeUntil(this.unsubscribeNotifier))
 | 
					      .pipe(takeUntil(this.unsubscribeNotifier))
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -10,4 +10,6 @@ export interface PaperlessDocumentMetadata {
 | 
				
			|||||||
  original_filename?: string
 | 
					  original_filename?: string
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  has_archive_version?: boolean
 | 
					  has_archive_version?: boolean
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  lang?: string
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -29,6 +29,7 @@ from django.views.decorators.cache import cache_control
 | 
				
			|||||||
from django.views.generic import TemplateView
 | 
					from django.views.generic import TemplateView
 | 
				
			||||||
from django_filters.rest_framework import DjangoFilterBackend
 | 
					from django_filters.rest_framework import DjangoFilterBackend
 | 
				
			||||||
from documents.tasks import consume_file
 | 
					from documents.tasks import consume_file
 | 
				
			||||||
 | 
					from langdetect import detect
 | 
				
			||||||
from packaging import version as packaging_version
 | 
					from packaging import version as packaging_version
 | 
				
			||||||
from paperless import version
 | 
					from paperless import version
 | 
				
			||||||
from paperless.db import GnuPG
 | 
					from paperless.db import GnuPG
 | 
				
			||||||
@@ -325,6 +326,13 @@ class DocumentViewSet(
 | 
				
			|||||||
            "original_filename": doc.original_filename,
 | 
					            "original_filename": doc.original_filename,
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        lang = "en"
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            lang = detect(doc.content)
 | 
				
			||||||
 | 
					        except Exception:
 | 
				
			||||||
 | 
					            pass
 | 
				
			||||||
 | 
					        meta["lang"] = lang
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if doc.has_archive_version:
 | 
					        if doc.has_archive_version:
 | 
				
			||||||
            meta["archive_size"] = self.get_filesize(doc.archive_path)
 | 
					            meta["archive_size"] = self.get_filesize(doc.archive_path)
 | 
				
			||||||
            meta["archive_metadata"] = self.get_metadata(
 | 
					            meta["archive_metadata"] = self.get_metadata(
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -2,6 +2,7 @@ import json
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
import subprocess
 | 
					import subprocess
 | 
				
			||||||
 | 
					import tempfile
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from typing import Optional
 | 
					from typing import Optional
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
        if not os.path.isfile(pdf_file):
 | 
					        if not os.path.isfile(pdf_file):
 | 
				
			||||||
            return None
 | 
					            return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        from pdfminer.high_level import extract_text as pdfminer_extract_text
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            stripped = post_process_text(pdfminer_extract_text(pdf_file))
 | 
					            text = None
 | 
				
			||||||
 | 
					            with tempfile.NamedTemporaryFile(
 | 
				
			||||||
 | 
					                mode="w+",
 | 
				
			||||||
 | 
					                dir=self.tempdir,
 | 
				
			||||||
 | 
					            ) as tmp:
 | 
				
			||||||
 | 
					                subprocess.run(
 | 
				
			||||||
 | 
					                    [
 | 
				
			||||||
 | 
					                        "pdftotext",
 | 
				
			||||||
 | 
					                        "-q",
 | 
				
			||||||
 | 
					                        "-layout",
 | 
				
			||||||
 | 
					                        "-enc",
 | 
				
			||||||
 | 
					                        "UTF-8",
 | 
				
			||||||
 | 
					                        pdf_file,
 | 
				
			||||||
 | 
					                        tmp.name,
 | 
				
			||||||
 | 
					                    ],
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					                text = tmp.read()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.log("debug", f"Extracted text from PDF file {pdf_file}")
 | 
					            return post_process_text(text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # pdfminer.six does not handle RTL text
 | 
					 | 
				
			||||||
            # as a hack, for some languages, return no text, to force
 | 
					 | 
				
			||||||
            # OCRMyPdf/Tesseract do handle this correctly
 | 
					 | 
				
			||||||
            from langdetect import detect
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            lang = detect(stripped)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            self.log("debug", f"Detected language {lang}")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            if (
 | 
					 | 
				
			||||||
                lang
 | 
					 | 
				
			||||||
                in {
 | 
					 | 
				
			||||||
                    "ar",  # Arabic
 | 
					 | 
				
			||||||
                    "he",  # Hebrew,
 | 
					 | 
				
			||||||
                    "fa",  # Persian
 | 
					 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
                and pdf_file.name != "archive-fallback.pdf"
 | 
					 | 
				
			||||||
            ):
 | 
					 | 
				
			||||||
                raise RtlLanguageException()
 | 
					 | 
				
			||||||
            return stripped
 | 
					 | 
				
			||||||
        except RtlLanguageException:
 | 
					 | 
				
			||||||
            self.log("warning", f"Detected RTL language {lang}")
 | 
					 | 
				
			||||||
            return None
 | 
					 | 
				
			||||||
        except Exception:
 | 
					        except Exception:
 | 
				
			||||||
            # TODO catch all for various issues with PDFminer.six.
 | 
					            # TODO catch all for various issues with PDFminer.six.
 | 
				
			||||||
            #  If PDFminer fails, fall back to OCR.
 | 
					            #  If PDFminer fails, fall back to OCR.
 | 
				
			||||||
@@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            )
 | 
					            )
 | 
				
			||||||
            if original_has_text:
 | 
					            if original_has_text:
 | 
				
			||||||
                self.text = text_original
 | 
					                self.text = text_original
 | 
				
			||||||
        except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
 | 
					        except (NoTextFoundException, InputFileError) as e:
 | 
				
			||||||
            self.log(
 | 
					            self.log(
 | 
				
			||||||
                "warning",
 | 
					                "warning",
 | 
				
			||||||
                f"Encountered an error while running OCR: {str(e)}. "
 | 
					                f"Encountered an error while running OCR: {str(e)}. "
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -670,28 +670,14 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
            - Text from the document is extracted
 | 
					            - Text from the document is extracted
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        parser = RasterisedDocumentParser(None)
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
        with mock.patch.object(
 | 
					 | 
				
			||||||
            parser,
 | 
					 | 
				
			||||||
            "construct_ocrmypdf_parameters",
 | 
					 | 
				
			||||||
            wraps=parser.construct_ocrmypdf_parameters,
 | 
					 | 
				
			||||||
        ) as wrapped:
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            parser.parse(
 | 
					        parser.parse(
 | 
				
			||||||
                os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
 | 
					            os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
 | 
				
			||||||
                "application/pdf",
 | 
					            "application/pdf",
 | 
				
			||||||
            )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # There isn't a good way to actually check this working, with RTL correctly return
 | 
					        # Copied from the PDF to here.  Don't even look at it
 | 
				
			||||||
            #  as it would require tesseract-ocr-ara installed for everyone running the
 | 
					        self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
 | 
				
			||||||
            #  test suite.  This test does provide the coverage though and attempts to ensure
 | 
					 | 
				
			||||||
            # the force OCR happens
 | 
					 | 
				
			||||||
            self.assertIsNotNone(parser.get_text())
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
 | 
					 | 
				
			||||||
            # Check the last call kwargs
 | 
					 | 
				
			||||||
            self.assertTrue(
 | 
					 | 
				
			||||||
                parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TestParserFileTypes(DirectoriesMixin, TestCase):
 | 
					class TestParserFileTypes(DirectoriesMixin, TestCase):
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user