mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	refactor
This commit is contained in:
		| @@ -30,19 +30,19 @@ class Consumer: | |||||||
|  |  | ||||||
|         self.logger = logging.getLogger(__name__) |         self.logger = logging.getLogger(__name__) | ||||||
|         self.logging_group = None |         self.logging_group = None | ||||||
|  |         self.path = None | ||||||
|  |         self.filename = None | ||||||
|  |         self.override_title = None | ||||||
|  |         self.override_correspondent_id = None | ||||||
|  |         self.override_tag_ids = None | ||||||
|  |         self.override_document_type_id = None | ||||||
|  |  | ||||||
|         self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED |     def pre_check_file_exists(self): | ||||||
|         if settings.PASSPHRASE: |         if not os.path.isfile(self.path): | ||||||
|             self.storage_type = Document.STORAGE_TYPE_GPG |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def pre_check_file_exists(filename): |  | ||||||
|         if not os.path.isfile(filename): |  | ||||||
|             raise ConsumerError("Cannot consume {}: It is not a file".format( |             raise ConsumerError("Cannot consume {}: It is not a file".format( | ||||||
|                 filename)) |                 self.path)) | ||||||
|  |  | ||||||
|     @staticmethod |     def pre_check_consumption_dir(self): | ||||||
|     def pre_check_consumption_dir(): |  | ||||||
|         if not settings.CONSUMPTION_DIR: |         if not settings.CONSUMPTION_DIR: | ||||||
|             raise ConsumerError( |             raise ConsumerError( | ||||||
|                 "The CONSUMPTION_DIR settings variable does not appear to be " |                 "The CONSUMPTION_DIR settings variable does not appear to be " | ||||||
| @@ -53,26 +53,23 @@ class Consumer: | |||||||
|                 "Consumption directory {} does not exist".format( |                 "Consumption directory {} does not exist".format( | ||||||
|                     settings.CONSUMPTION_DIR)) |                     settings.CONSUMPTION_DIR)) | ||||||
|  |  | ||||||
|     @staticmethod |     def pre_check_regex(self): | ||||||
|     def pre_check_regex(filename): |         if not re.match(FileInfo.REGEXES["title"], self.filename): | ||||||
|         if not re.match(FileInfo.REGEXES["title"], filename): |  | ||||||
|             raise ConsumerError( |             raise ConsumerError( | ||||||
|                 "Filename {} does not seem to be safe to " |                 "Filename {} does not seem to be safe to " | ||||||
|                 "consume".format(filename)) |                 "consume".format(self.filename)) | ||||||
|  |  | ||||||
|     @staticmethod |     def pre_check_duplicate(self): | ||||||
|     def pre_check_duplicate(filename): |         with open(self.path, "rb") as f: | ||||||
|         with open(filename, "rb") as f: |  | ||||||
|             checksum = hashlib.md5(f.read()).hexdigest() |             checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|         if Document.objects.filter(checksum=checksum).exists(): |         if Document.objects.filter(checksum=checksum).exists(): | ||||||
|             if settings.CONSUMER_DELETE_DUPLICATES: |             if settings.CONSUMER_DELETE_DUPLICATES: | ||||||
|                 os.unlink(filename) |                 os.unlink(self.path) | ||||||
|             raise ConsumerError( |             raise ConsumerError( | ||||||
|                 "Not consuming {}: It is a duplicate.".format(filename) |                 "Not consuming {}: It is a duplicate.".format(self.filename) | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|     @staticmethod |     def pre_check_directories(self): | ||||||
|     def pre_check_directories(): |  | ||||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) |         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) |         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) |         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||||
| @@ -83,16 +80,23 @@ class Consumer: | |||||||
|         }) |         }) | ||||||
|  |  | ||||||
|     def try_consume_file(self, |     def try_consume_file(self, | ||||||
|                          filename, |                          path, | ||||||
|                          original_filename=None, |                          override_filename=None, | ||||||
|                          force_title=None, |                          override_title=None, | ||||||
|                          force_correspondent_id=None, |                          override_correspondent_id=None, | ||||||
|                          force_document_type_id=None, |                          override_document_type_id=None, | ||||||
|                          force_tag_ids=None): |                          override_tag_ids=None): | ||||||
|         """ |         """ | ||||||
|         Return the document object if it was successfully created. |         Return the document object if it was successfully created. | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|  |         self.path = path | ||||||
|  |         self.filename = override_filename or os.path.basename(path) | ||||||
|  |         self.override_title = override_title | ||||||
|  |         self.override_correspondent_id = override_correspondent_id | ||||||
|  |         self.override_document_type_id = override_document_type_id | ||||||
|  |         self.override_tag_ids = override_tag_ids | ||||||
|  |  | ||||||
|         # this is for grouping logging entries for this particular file |         # this is for grouping logging entries for this particular file | ||||||
|         # together. |         # together. | ||||||
|  |  | ||||||
| @@ -100,19 +104,19 @@ class Consumer: | |||||||
|  |  | ||||||
|         # Make sure that preconditions for consuming the file are met. |         # Make sure that preconditions for consuming the file are met. | ||||||
|  |  | ||||||
|         self.pre_check_file_exists(filename) |         self.pre_check_file_exists() | ||||||
|         self.pre_check_consumption_dir() |         self.pre_check_consumption_dir() | ||||||
|         self.pre_check_directories() |         self.pre_check_directories() | ||||||
|         self.pre_check_regex(filename) |         self.pre_check_regex() | ||||||
|         self.pre_check_duplicate(filename) |         self.pre_check_duplicate() | ||||||
|  |  | ||||||
|         self.log("info", "Consuming {}".format(filename)) |         self.log("info", "Consuming {}".format(self.filename)) | ||||||
|  |  | ||||||
|         # Determine the parser class. |         # Determine the parser class. | ||||||
|  |  | ||||||
|         parser_class = get_parser_class(original_filename or filename) |         parser_class = get_parser_class(self.filename) | ||||||
|         if not parser_class: |         if not parser_class: | ||||||
|             raise ConsumerError("No parsers abvailable for {}".format(filename)) |             raise ConsumerError("No parsers abvailable for {}".format(self.filename)) | ||||||
|         else: |         else: | ||||||
|             self.log("debug", "Parser: {}".format(parser_class.__name__)) |             self.log("debug", "Parser: {}".format(parser_class.__name__)) | ||||||
|  |  | ||||||
| @@ -120,13 +124,13 @@ class Consumer: | |||||||
|  |  | ||||||
|         document_consumption_started.send( |         document_consumption_started.send( | ||||||
|             sender=self.__class__, |             sender=self.__class__, | ||||||
|             filename=filename, |             filename=self.path, | ||||||
|             logging_group=self.logging_group |             logging_group=self.logging_group | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|         # This doesn't parse the document yet, but gives us a parser. |         # This doesn't parse the document yet, but gives us a parser. | ||||||
|  |  | ||||||
|         document_parser = parser_class(filename, self.logging_group) |         document_parser = parser_class(self.path, self.logging_group) | ||||||
|  |  | ||||||
|         # However, this already created working directories which we have to |         # However, this already created working directories which we have to | ||||||
|         # clean up. |         # clean up. | ||||||
| @@ -134,9 +138,9 @@ class Consumer: | |||||||
|         # Parse the document. This may take some time. |         # Parse the document. This may take some time. | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             self.log("debug", "Generating thumbnail for {}...".format(filename)) |             self.log("debug", "Generating thumbnail for {}...".format(self.filename)) | ||||||
|             thumbnail = document_parser.get_optimised_thumbnail() |             thumbnail = document_parser.get_optimised_thumbnail() | ||||||
|             self.log("debug", "Parsing {}...".format(filename)) |             self.log("debug", "Parsing {}...".format(self.filename)) | ||||||
|             text = document_parser.get_text() |             text = document_parser.get_text() | ||||||
|             date = document_parser.get_date() |             date = document_parser.get_date() | ||||||
|         except ParseError as e: |         except ParseError as e: | ||||||
| @@ -165,14 +169,7 @@ class Consumer: | |||||||
|                 # store the document. |                 # store the document. | ||||||
|                 document = self._store( |                 document = self._store( | ||||||
|                     text=text, |                     text=text, | ||||||
|                     doc=filename, |                     date=date | ||||||
|                     thumbnail=thumbnail, |  | ||||||
|                     date=date, |  | ||||||
|                     original_filename=original_filename, |  | ||||||
|                     force_title=force_title, |  | ||||||
|                     force_correspondent_id=force_correspondent_id, |  | ||||||
|                     force_document_type_id=force_document_type_id, |  | ||||||
|                     force_tag_ids=force_tag_ids |  | ||||||
|                 ) |                 ) | ||||||
|  |  | ||||||
|                 # If we get here, it was successful. Proceed with post-consume |                 # If we get here, it was successful. Proceed with post-consume | ||||||
| @@ -189,12 +186,12 @@ class Consumer: | |||||||
|                 # place. If this fails, we'll also rollback the transaction. |                 # place. If this fails, we'll also rollback the transaction. | ||||||
|  |  | ||||||
|                 create_source_path_directory(document.source_path) |                 create_source_path_directory(document.source_path) | ||||||
|                 self._write(document, filename, document.source_path) |                 self._write(document, self.path, document.source_path) | ||||||
|                 self._write(document, thumbnail, document.thumbnail_path) |                 self._write(document, thumbnail, document.thumbnail_path) | ||||||
|  |  | ||||||
|                 # Delete the file only if it was successfully consumed |                 # Delete the file only if it was successfully consumed | ||||||
|                 self.log("debug", "Deleting document {}".format(filename)) |                 self.log("debug", "Deleting file {}".format(self.path)) | ||||||
|                 os.unlink(filename) |                 os.unlink(self.path) | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             raise ConsumerError(e) |             raise ConsumerError(e) | ||||||
|         finally: |         finally: | ||||||
| @@ -207,25 +204,25 @@ class Consumer: | |||||||
|  |  | ||||||
|         return document |         return document | ||||||
|  |  | ||||||
|     def _store(self, text, doc, thumbnail, date, |     def _store(self, text, date): | ||||||
|                original_filename=None, |  | ||||||
|                force_title=None, |  | ||||||
|                force_correspondent_id=None, |  | ||||||
|                force_document_type_id=None, |  | ||||||
|                force_tag_ids=None): |  | ||||||
|  |  | ||||||
|         # If someone gave us the original filename, use it instead of doc. |         # If someone gave us the original filename, use it instead of doc. | ||||||
|  |  | ||||||
|         file_info = FileInfo.from_path(original_filename or doc) |         file_info = FileInfo.from_path(self.filename) | ||||||
|  |  | ||||||
|         stats = os.stat(doc) |         stats = os.stat(self.path) | ||||||
|  |  | ||||||
|         self.log("debug", "Saving record to database") |         self.log("debug", "Saving record to database") | ||||||
|  |  | ||||||
|         created = file_info.created or date or timezone.make_aware( |         created = file_info.created or date or timezone.make_aware( | ||||||
|             datetime.datetime.fromtimestamp(stats.st_mtime)) |             datetime.datetime.fromtimestamp(stats.st_mtime)) | ||||||
|  |  | ||||||
|         with open(doc, "rb") as f: |         if settings.PASSPHRASE: | ||||||
|  |             storage_type = Document.STORAGE_TYPE_GPG | ||||||
|  |         else: | ||||||
|  |             storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||||
|  |  | ||||||
|  |         with open(self.path, "rb") as f: | ||||||
|             document = Document.objects.create( |             document = Document.objects.create( | ||||||
|                 correspondent=file_info.correspondent, |                 correspondent=file_info.correspondent, | ||||||
|                 title=file_info.title, |                 title=file_info.title, | ||||||
| @@ -234,7 +231,7 @@ class Consumer: | |||||||
|                 checksum=hashlib.md5(f.read()).hexdigest(), |                 checksum=hashlib.md5(f.read()).hexdigest(), | ||||||
|                 created=created, |                 created=created, | ||||||
|                 modified=created, |                 modified=created, | ||||||
|                 storage_type=self.storage_type |                 storage_type=storage_type | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|         relevant_tags = set(file_info.tags) |         relevant_tags = set(file_info.tags) | ||||||
| @@ -243,18 +240,7 @@ class Consumer: | |||||||
|             self.log("debug", "Tagging with {}".format(tag_names)) |             self.log("debug", "Tagging with {}".format(tag_names)) | ||||||
|             document.tags.add(*relevant_tags) |             document.tags.add(*relevant_tags) | ||||||
|  |  | ||||||
|         if force_title: |         self.apply_overrides(document) | ||||||
|             document.title = force_title |  | ||||||
|  |  | ||||||
|         if force_correspondent_id: |  | ||||||
|             document.correspondent = Correspondent.objects.get(pk=force_correspondent_id) |  | ||||||
|  |  | ||||||
|         if force_document_type_id: |  | ||||||
|             document.document_type = DocumentType.objects.get(pk=force_document_type_id) |  | ||||||
|  |  | ||||||
|         if force_tag_ids: |  | ||||||
|             for tag_id in force_tag_ids: |  | ||||||
|                 document.tags.add(Tag.objects.get(pk=tag_id)) |  | ||||||
|  |  | ||||||
|         document.filename = generate_filename(document) |         document.filename = generate_filename(document) | ||||||
|  |  | ||||||
| @@ -264,6 +250,20 @@ class Consumer: | |||||||
|  |  | ||||||
|         return document |         return document | ||||||
|  |  | ||||||
|  |     def apply_overrides(self, document): | ||||||
|  |         if self.override_title: | ||||||
|  |             document.title = self.override_title | ||||||
|  |  | ||||||
|  |         if self.override_correspondent_id: | ||||||
|  |             document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id) | ||||||
|  |  | ||||||
|  |         if self.override_document_type_id: | ||||||
|  |             document.document_type = DocumentType.objects.get(pk=self.override_document_type_id) | ||||||
|  |  | ||||||
|  |         if self.override_tag_ids: | ||||||
|  |             for tag_id in self.override_tag_ids: | ||||||
|  |                 document.tags.add(Tag.objects.get(pk=tag_id)) | ||||||
|  |  | ||||||
|     def _write(self, document, source, target): |     def _write(self, document, source, target): | ||||||
|         with open(source, "rb") as read_file: |         with open(source, "rb") as read_file: | ||||||
|             with open(target, "wb") as write_file: |             with open(target, "wb") as write_file: | ||||||
|   | |||||||
| @@ -37,4 +37,4 @@ class UploadForm(forms.Form): | |||||||
|             f.write(document) |             f.write(document) | ||||||
|             os.utime(f.name, times=(t, t)) |             os.utime(f.name, times=(t, t)) | ||||||
|  |  | ||||||
|             async_task("documents.tasks.consume_file", f.name, original_filename, task_name=os.path.basename(original_filename)) |             async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename)) | ||||||
|   | |||||||
| @@ -113,6 +113,7 @@ class DocumentType(MatchingModel): | |||||||
|  |  | ||||||
| class Document(models.Model): | class Document(models.Model): | ||||||
|  |  | ||||||
|  |     # TODO: why do we need an explicit list | ||||||
|     TYPE_PDF = "pdf" |     TYPE_PDF = "pdf" | ||||||
|     TYPE_PNG = "png" |     TYPE_PNG = "png" | ||||||
|     TYPE_JPG = "jpg" |     TYPE_JPG = "jpg" | ||||||
| @@ -291,7 +292,7 @@ class FileInfo: | |||||||
|             non_separated_word=r"([\w,. ]|([^\s]-))" |             non_separated_word=r"([\w,. ]|([^\s]-))" | ||||||
|         ) |         ) | ||||||
|     ) |     ) | ||||||
|  |     # TODO: what is this used for | ||||||
|     formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv" |     formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv" | ||||||
|     REGEXES = OrderedDict([ |     REGEXES = OrderedDict([ | ||||||
|         ("created-correspondent-title-tags", re.compile( |         ("created-correspondent-title-tags", re.compile( | ||||||
|   | |||||||
| @@ -57,20 +57,20 @@ def train_classifier(): | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |  | ||||||
| def consume_file(file, | def consume_file(path, | ||||||
|                  original_filename=None, |                  override_filename=None, | ||||||
|                  force_title=None, |                  override_title=None, | ||||||
|                  force_correspondent_id=None, |                  override_correspondent_id=None, | ||||||
|                  force_document_type_id=None, |                  override_document_type_id=None, | ||||||
|                  force_tag_ids=None): |                  override_tag_ids=None): | ||||||
|  |  | ||||||
|     document = Consumer().try_consume_file( |     document = Consumer().try_consume_file( | ||||||
|         file, |         path, | ||||||
|         original_filename=original_filename, |         override_filename=override_filename, | ||||||
|         force_title=force_title, |         override_title=override_title, | ||||||
|         force_correspondent_id=force_correspondent_id, |         override_correspondent_id=override_correspondent_id, | ||||||
|         force_document_type_id=force_document_type_id, |         override_document_type_id=override_document_type_id, | ||||||
|         force_tag_ids=force_tag_ids) |         override_tag_ids=override_tag_ids) | ||||||
|  |  | ||||||
|     if document: |     if document: | ||||||
|         return "Success. New document id {} created".format( |         return "Success. New document id {} created".format( | ||||||
|   | |||||||
| @@ -503,33 +503,33 @@ class TestConsumer(TestCase): | |||||||
|         filename = self.get_test_file() |         filename = self.get_test_file() | ||||||
|         overrideFilename = "My Bank - Statement for November.pdf" |         overrideFilename = "My Bank - Statement for November.pdf" | ||||||
|  |  | ||||||
|         document = self.consumer.try_consume_file(filename, original_filename=overrideFilename) |         document = self.consumer.try_consume_file(filename, override_filename=overrideFilename) | ||||||
|  |  | ||||||
|         self.assertEqual(document.correspondent.name, "My Bank") |         self.assertEqual(document.correspondent.name, "My Bank") | ||||||
|         self.assertEqual(document.title, "Statement for November") |         self.assertEqual(document.title, "Statement for November") | ||||||
|  |  | ||||||
|     def testOverrideTitle(self): |     def testOverrideTitle(self): | ||||||
|  |  | ||||||
|         document = self.consumer.try_consume_file(self.get_test_file(), force_title="Override Title") |         document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title") | ||||||
|         self.assertEqual(document.title, "Override Title") |         self.assertEqual(document.title, "Override Title") | ||||||
|  |  | ||||||
|     def testOverrideCorrespondent(self): |     def testOverrideCorrespondent(self): | ||||||
|         c = Correspondent.objects.create(name="test") |         c = Correspondent.objects.create(name="test") | ||||||
|  |  | ||||||
|         document = self.consumer.try_consume_file(self.get_test_file(), force_correspondent_id=c.pk) |         document = self.consumer.try_consume_file(self.get_test_file(), override_correspondent_id=c.pk) | ||||||
|         self.assertEqual(document.correspondent.id, c.id) |         self.assertEqual(document.correspondent.id, c.id) | ||||||
|  |  | ||||||
|     def testOverrideDocumentType(self): |     def testOverrideDocumentType(self): | ||||||
|         dt = DocumentType.objects.create(name="test") |         dt = DocumentType.objects.create(name="test") | ||||||
|  |  | ||||||
|         document = self.consumer.try_consume_file(self.get_test_file(), force_document_type_id=dt.pk) |         document = self.consumer.try_consume_file(self.get_test_file(), override_document_type_id=dt.pk) | ||||||
|         self.assertEqual(document.document_type.id, dt.id) |         self.assertEqual(document.document_type.id, dt.id) | ||||||
|  |  | ||||||
|     def testOverrideTags(self): |     def testOverrideTags(self): | ||||||
|         t1 = Tag.objects.create(name="t1") |         t1 = Tag.objects.create(name="t1") | ||||||
|         t2 = Tag.objects.create(name="t2") |         t2 = Tag.objects.create(name="t2") | ||||||
|         t3 = Tag.objects.create(name="t3") |         t3 = Tag.objects.create(name="t3") | ||||||
|         document = self.consumer.try_consume_file(self.get_test_file(), force_tag_ids=[t1.id, t3.id]) |         document = self.consumer.try_consume_file(self.get_test_file(), override_tag_ids=[t1.id, t3.id]) | ||||||
|  |  | ||||||
|         self.assertIn(t1, document.tags.all()) |         self.assertIn(t1, document.tags.all()) | ||||||
|         self.assertNotIn(t2, document.tags.all()) |         self.assertNotIn(t2, document.tags.all()) | ||||||
| @@ -624,7 +624,7 @@ class TestConsumer(TestCase): | |||||||
|     def testFilenameHandling(self): |     def testFilenameHandling(self): | ||||||
|         filename = self.get_test_file() |         filename = self.get_test_file() | ||||||
|  |  | ||||||
|         document = self.consumer.try_consume_file(filename, original_filename="Bank - Test.pdf", force_title="new docs") |         document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs") | ||||||
|  |  | ||||||
|         print(document.source_path) |         print(document.source_path) | ||||||
|         print("===") |         print("===") | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler