mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Reworked the interface of the parsers.
This commit is contained in:
		| @@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | |||||||
| from .file_handling import generate_filename, create_source_path_directory | from .file_handling import generate_filename, create_source_path_directory | ||||||
| from .loggers import LoggingMixin | from .loggers import LoggingMixin | ||||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||||
| from .parsers import ParseError, get_parser_class_for_mime_type | from .parsers import ParseError, get_parser_class_for_mime_type, parse_date | ||||||
| from .signals import ( | from .signals import ( | ||||||
|     document_consumption_finished, |     document_consumption_finished, | ||||||
|     document_consumption_started |     document_consumption_started | ||||||
| @@ -121,7 +121,7 @@ class Consumer(LoggingMixin): | |||||||
|  |  | ||||||
|         # This doesn't parse the document yet, but gives us a parser. |         # This doesn't parse the document yet, but gives us a parser. | ||||||
|  |  | ||||||
|         document_parser = parser_class(self.path, self.logging_group) |         document_parser = parser_class(self.logging_group) | ||||||
|  |  | ||||||
|         # However, this already created working directories which we have to |         # However, this already created working directories which we have to | ||||||
|         # clean up. |         # clean up. | ||||||
| @@ -129,12 +129,18 @@ class Consumer(LoggingMixin): | |||||||
|         # Parse the document. This may take some time. |         # Parse the document. This may take some time. | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             self.log("debug", f"Generating thumbnail for {self.filename}...") |  | ||||||
|             thumbnail = document_parser.get_optimised_thumbnail() |  | ||||||
|             self.log("debug", "Parsing {}...".format(self.filename)) |             self.log("debug", "Parsing {}...".format(self.filename)) | ||||||
|  |             document_parser.parse(self.path, mime_type) | ||||||
|  |  | ||||||
|  |             self.log("debug", f"Generating thumbnail for {self.filename}...") | ||||||
|  |             thumbnail = document_parser.get_optimised_thumbnail(self.path, mime_type) | ||||||
|  |  | ||||||
|             text = document_parser.get_text() |             text = document_parser.get_text() | ||||||
|             date = document_parser.get_date() |             date = document_parser.get_date() | ||||||
|  |             if not date: | ||||||
|  |                 date = parse_date(self.filename, text) | ||||||
|             archive_path = document_parser.get_archive_path() |             archive_path = document_parser.get_archive_path() | ||||||
|  |  | ||||||
|         except ParseError as e: |         except ParseError as e: | ||||||
|             document_parser.cleanup() |             document_parser.cleanup() | ||||||
|             raise ConsumerError(e) |             raise ConsumerError(e) | ||||||
|   | |||||||
| @@ -107,59 +107,7 @@ def run_convert(input_file, | |||||||
|         raise ParseError("Convert failed at {}".format(args)) |         raise ParseError("Convert failed at {}".format(args)) | ||||||
|  |  | ||||||
|  |  | ||||||
| class ParseError(Exception): | def parse_date(filename, text): | ||||||
|     pass |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class DocumentParser(LoggingMixin): |  | ||||||
|     """ |  | ||||||
|     Subclass this to make your own parser.  Have a look at |  | ||||||
|     `paperless_tesseract.parsers` for inspiration. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group): |  | ||||||
|         super().__init__() |  | ||||||
|         self.logging_group = logging_group |  | ||||||
|         self.document_path = path |  | ||||||
|         self.tempdir = tempfile.mkdtemp( |  | ||||||
|             prefix="paperless-", dir=settings.SCRATCH_DIR) |  | ||||||
|  |  | ||||||
|     def get_archive_path(self): |  | ||||||
|         return None |  | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |  | ||||||
|         """ |  | ||||||
|         Returns the path to a file we can use as a thumbnail for this document. |  | ||||||
|         """ |  | ||||||
|         raise NotImplementedError() |  | ||||||
|  |  | ||||||
|     def optimise_thumbnail(self, in_path): |  | ||||||
|  |  | ||||||
|         if settings.OPTIMIZE_THUMBNAILS: |  | ||||||
|             out_path = os.path.join(self.tempdir, "optipng.png") |  | ||||||
|  |  | ||||||
|             args = (settings.OPTIPNG_BINARY, |  | ||||||
|                     "-silent", "-o5", in_path, "-out", out_path) |  | ||||||
|  |  | ||||||
|             self.log('debug', f"Execute: {' '.join(args)}") |  | ||||||
|  |  | ||||||
|             if not subprocess.Popen(args).wait() == 0: |  | ||||||
|                 raise ParseError("Optipng failed at {}".format(args)) |  | ||||||
|  |  | ||||||
|             return out_path |  | ||||||
|         else: |  | ||||||
|             return in_path |  | ||||||
|  |  | ||||||
|     def get_optimised_thumbnail(self): |  | ||||||
|         return self.optimise_thumbnail(self.get_thumbnail()) |  | ||||||
|  |  | ||||||
|     def get_text(self): |  | ||||||
|         """ |  | ||||||
|         Returns the text from the document and only the text. |  | ||||||
|         """ |  | ||||||
|         raise NotImplementedError() |  | ||||||
|  |  | ||||||
|     def get_date(self): |  | ||||||
|     """ |     """ | ||||||
|     Returns the date of the document. |     Returns the date of the document. | ||||||
|     """ |     """ | ||||||
| @@ -179,15 +127,12 @@ class DocumentParser(LoggingMixin): | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     date = None |     date = None | ||||||
|         date_string = None |  | ||||||
|  |  | ||||||
|     next_year = timezone.now().year + 5  # Arbitrary 5 year future limit |     next_year = timezone.now().year + 5  # Arbitrary 5 year future limit | ||||||
|         title = os.path.basename(self.document_path) |  | ||||||
|  |  | ||||||
|     # if filename date parsing is enabled, search there first: |     # if filename date parsing is enabled, search there first: | ||||||
|     if settings.FILENAME_DATE_ORDER: |     if settings.FILENAME_DATE_ORDER: | ||||||
|             self.log("info", "Checking document title for date") |         for m in re.finditer(DATE_REGEX, filename): | ||||||
|             for m in re.finditer(DATE_REGEX, title): |  | ||||||
|             date_string = m.group(0) |             date_string = m.group(0) | ||||||
|  |  | ||||||
|             try: |             try: | ||||||
| @@ -197,21 +142,8 @@ class DocumentParser(LoggingMixin): | |||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             if date is not None and next_year > date.year > 1900: |             if date is not None and next_year > date.year > 1900: | ||||||
|                     self.log( |  | ||||||
|                         "info", |  | ||||||
|                         "Detected document date {} based on string {} " |  | ||||||
|                         "from document title" |  | ||||||
|                         "".format(date.isoformat(), date_string) |  | ||||||
|                     ) |  | ||||||
|                 return date |                 return date | ||||||
|  |  | ||||||
|         try: |  | ||||||
|             # getting text after checking filename will save time if only |  | ||||||
|             # looking at the filename instead of the whole text |  | ||||||
|             text = self.get_text() |  | ||||||
|         except ParseError: |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|     # Iterate through all regex matches in text and try to parse the date |     # Iterate through all regex matches in text and try to parse the date | ||||||
|     for m in re.finditer(DATE_REGEX, text): |     for m in re.finditer(DATE_REGEX, text): | ||||||
|         date_string = m.group(0) |         date_string = m.group(0) | ||||||
| @@ -227,19 +159,64 @@ class DocumentParser(LoggingMixin): | |||||||
|         else: |         else: | ||||||
|             date = None |             date = None | ||||||
|  |  | ||||||
|         if date is not None: |  | ||||||
|             self.log( |  | ||||||
|                 "info", |  | ||||||
|                 "Detected document date {} based on string {}".format( |  | ||||||
|                     date.isoformat(), |  | ||||||
|                     date_string |  | ||||||
|                 ) |  | ||||||
|             ) |  | ||||||
|         else: |  | ||||||
|             self.log("info", "Unable to detect date for document") |  | ||||||
|  |  | ||||||
|     return date |     return date | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ParseError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class DocumentParser(LoggingMixin): | ||||||
|  |     """ | ||||||
|  |     Subclass this to make your own parser.  Have a look at | ||||||
|  |     `paperless_tesseract.parsers` for inspiration. | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     def __init__(self, logging_group): | ||||||
|  |         super().__init__() | ||||||
|  |         self.logging_group = logging_group | ||||||
|  |         self.tempdir = tempfile.mkdtemp( | ||||||
|  |             prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||||
|  |  | ||||||
|  |         self.archive_path = None | ||||||
|  |         self.text = None | ||||||
|  |         self.date = None | ||||||
|  |  | ||||||
|  |     def parse(self, document_path, mime_type): | ||||||
|  |         raise NotImplementedError() | ||||||
|  |  | ||||||
|  |     def get_archive_path(self): | ||||||
|  |         return self.archive_path | ||||||
|  |  | ||||||
|  |     def get_thumbnail(self, document_path, mime_type): | ||||||
|  |         """ | ||||||
|  |         Returns the path to a file we can use as a thumbnail for this document. | ||||||
|  |         """ | ||||||
|  |         raise NotImplementedError() | ||||||
|  |  | ||||||
|  |     def get_optimised_thumbnail(self, document_path, mime_type): | ||||||
|  |         thumbnail = self.get_thumbnail(document_path, mime_type) | ||||||
|  |         if settings.OPTIMIZE_THUMBNAILS: | ||||||
|  |             out_path = os.path.join(self.tempdir, "thumb_optipng.png") | ||||||
|  |  | ||||||
|  |             args = (settings.OPTIPNG_BINARY, | ||||||
|  |                     "-silent", "-o5", thumbnail, "-out", out_path) | ||||||
|  |  | ||||||
|  |             self.log('debug', f"Execute: {' '.join(args)}") | ||||||
|  |  | ||||||
|  |             if not subprocess.Popen(args).wait() == 0: | ||||||
|  |                 raise ParseError("Optipng failed at {}".format(args)) | ||||||
|  |  | ||||||
|  |             return out_path | ||||||
|  |         else: | ||||||
|  |             return thumbnail | ||||||
|  |  | ||||||
|  |     def get_text(self): | ||||||
|  |         return self.text | ||||||
|  |  | ||||||
|  |     def get_date(self): | ||||||
|  |         return self.date | ||||||
|  |  | ||||||
|     def cleanup(self): |     def cleanup(self): | ||||||
|         self.log("debug", "Deleting directory {}".format(self.tempdir)) |         self.log("debug", "Deleting directory {}".format(self.tempdir)) | ||||||
|         shutil.rmtree(self.tempdir) |         shutil.rmtree(self.tempdir) | ||||||
|   | |||||||
| @@ -2,7 +2,6 @@ import os | |||||||
| import re | import re | ||||||
| import subprocess | import subprocess | ||||||
|  |  | ||||||
| import langdetect |  | ||||||
| import ocrmypdf | import ocrmypdf | ||||||
| import pdftotext | import pdftotext | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| @@ -17,12 +16,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) |     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         super().__init__(path, logging_group) |  | ||||||
|         self._text = None |  | ||||||
|         self._archive_path = None |  | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |  | ||||||
|         """ |         """ | ||||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. |         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||||
|         """ |         """ | ||||||
| @@ -36,7 +30,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                         alpha="remove", |                         alpha="remove", | ||||||
|                         strip=True, |                         strip=True, | ||||||
|                         trim=True, |                         trim=True, | ||||||
|                         input_file="{}[0]".format(self.document_path), |                         input_file="{}[0]".format(document_path), | ||||||
|                         output_file=out_path, |                         output_file=out_path, | ||||||
|                         logging_group=self.logging_group) |                         logging_group=self.logging_group) | ||||||
|         except ParseError: |         except ParseError: | ||||||
| @@ -51,7 +45,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                    "-q", |                    "-q", | ||||||
|                    "-sDEVICE=pngalpha", |                    "-sDEVICE=pngalpha", | ||||||
|                    "-o", gs_out_path, |                    "-o", gs_out_path, | ||||||
|                    self.document_path] |                    document_path] | ||||||
|             if not subprocess.Popen(cmd).wait() == 0: |             if not subprocess.Popen(cmd).wait() == 0: | ||||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) |                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) | ||||||
|             # then run convert on the output from gs |             # then run convert on the output from gs | ||||||
| @@ -71,10 +65,11 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|         if self._text: |         if self._text: | ||||||
|             return self._text |             return self._text | ||||||
|  |  | ||||||
|  |     def parse(self, document_path, mime_type): | ||||||
|         archive_path = os.path.join(self.tempdir, "archive.pdf") |         archive_path = os.path.join(self.tempdir, "archive.pdf") | ||||||
|  |  | ||||||
|         ocr_args = { |         ocr_args = { | ||||||
|             'input_file': self.document_path, |             'input_file': document_path, | ||||||
|             'output_file': archive_path, |             'output_file': archive_path, | ||||||
|             'use_threads': True, |             'use_threads': True, | ||||||
|             'jobs': settings.THREADS_PER_WORKER, |             'jobs': settings.THREADS_PER_WORKER, | ||||||
| @@ -96,17 +91,17 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             ocrmypdf.ocr(**ocr_args) |             ocrmypdf.ocr(**ocr_args) | ||||||
|             # success! announce that we have an archive document |             # success! announce results | ||||||
|             self._archive_path = archive_path |             self.archive_path = archive_path | ||||||
|             self._text = get_text_from_pdf(self._archive_path) |             self.text = get_text_from_pdf(archive_path) | ||||||
|  |  | ||||||
|         except InputFileError as e: |         except InputFileError as e: | ||||||
|             # This happens with some PDFs when used with the redo_ocr option. |             # This happens with some PDFs when used with the redo_ocr option. | ||||||
|             # This is not the end of the world, we'll just use what we already |             # This is not the end of the world, we'll just use what we already | ||||||
|             # have in the document. |             # have in the document. | ||||||
|             self._text = get_text_from_pdf(self.document_path) |             self.text = get_text_from_pdf(document_path) | ||||||
|             # Also, no archived file. |             # Also, no archived file. | ||||||
|             if not self._text: |             if not self.text: | ||||||
|                 # However, if we don't have anything, fail: |                 # However, if we don't have anything, fail: | ||||||
|                 raise ParseError(e) |                 raise ParseError(e) | ||||||
|  |  | ||||||
| @@ -114,27 +109,14 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|             # Anything else is probably serious. |             # Anything else is probably serious. | ||||||
|             raise ParseError(e) |             raise ParseError(e) | ||||||
|  |  | ||||||
|         if not self._text: |         if not self.text: | ||||||
|             # This may happen for files that don't have any text. |             # This may happen for files that don't have any text. | ||||||
|             self.log( |             self.log( | ||||||
|                 'warning', |                 'warning', | ||||||
|                 f"Document {self.document_path} does not have any text." |                 f"Document {document_path} does not have any text." | ||||||
|                 f"This is probably an error or you tried to add an image " |                 f"This is probably an error or you tried to add an image " | ||||||
|                 f"without text.") |                 f"without text.") | ||||||
|             return "" |             self.text = "" | ||||||
|  |  | ||||||
|         return self._text |  | ||||||
|  |  | ||||||
|     def get_archive_path(self): |  | ||||||
|         return self._archive_path |  | ||||||
|  |  | ||||||
|     def _guess_language(self, text): |  | ||||||
|         try: |  | ||||||
|             guess = langdetect.detect(text) |  | ||||||
|             return guess |  | ||||||
|         except Exception as e: |  | ||||||
|             self.log('warning', f"Language detection failed with: {e}") |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def strip_excess_whitespace(text): | def strip_excess_whitespace(text): | ||||||
|   | |||||||
| @@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser): | |||||||
|     This parser directly parses a text document (.txt, .md, or .csv) |     This parser directly parses a text document (.txt, .md, or .csv) | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         super().__init__(path, logging_group) |  | ||||||
|         self._text = None |  | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |  | ||||||
|         """ |         """ | ||||||
|         The thumbnail of a text file is just a 500px wide image of the text |         The thumbnail of a text file is just a 500px wide image of the text | ||||||
|         rendered onto a letter-sized page. |         rendered onto a letter-sized page. | ||||||
| @@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser): | |||||||
|             ) |             ) | ||||||
|  |  | ||||||
|         def read_text(): |         def read_text(): | ||||||
|             with open(self.document_path, 'r') as src: |             with open(document_path, 'r') as src: | ||||||
|                 lines = [line.strip() for line in src.readlines()] |                 lines = [line.strip() for line in src.readlines()] | ||||||
|                 text = "\n".join([line for line in lines[:n_lines]]) |                 text = "\n".join([line for line in lines[:n_lines]]) | ||||||
|                 return text.replace('"', "'") |                 return text.replace('"', "'") | ||||||
| @@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         return out_path |         return out_path | ||||||
|  |  | ||||||
|     def get_text(self): |     def parse(self, document_path, mime_type): | ||||||
|  |         with open(document_path, 'r') as f: | ||||||
|         if self._text is not None: |             self.text = f.read() | ||||||
|             return self._text |  | ||||||
|  |  | ||||||
|         with open(self.document_path, 'r') as f: |  | ||||||
|             self._text = f.read() |  | ||||||
|  |  | ||||||
|         return self._text |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def run_command(*args): | def run_command(*args): | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler