Merge branch 'dev' into feature-websockets-status
| @@ -1 +1,2 @@ | ||||
| from .checks import changed_password_check | ||||
| # This is here so that Django finds the checks. | ||||
| from .checks import * | ||||
|   | ||||
| @@ -4,12 +4,13 @@ import os | ||||
| import pickle | ||||
| import re | ||||
|  | ||||
| from django.conf import settings | ||||
| from sklearn.feature_extraction.text import CountVectorizer | ||||
| from sklearn.neural_network import MLPClassifier | ||||
| from sklearn.preprocessing import MultiLabelBinarizer | ||||
| from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer | ||||
| from sklearn.utils.multiclass import type_of_target | ||||
|  | ||||
| from documents.models import Document, MatchingModel | ||||
| from paperless import settings | ||||
|  | ||||
|  | ||||
| class IncompatibleClassifierVersionError(Exception): | ||||
| @@ -27,7 +28,7 @@ def preprocess_content(content): | ||||
|  | ||||
| class DocumentClassifier(object): | ||||
|  | ||||
|     FORMAT_VERSION = 5 | ||||
|     FORMAT_VERSION = 6 | ||||
|  | ||||
|     def __init__(self): | ||||
|         # mtime of the model file on disk. used to prevent reloading when | ||||
| @@ -54,6 +55,8 @@ class DocumentClassifier(object): | ||||
|                         "Cannor load classifier, incompatible versions.") | ||||
|                 else: | ||||
|                     if self.classifier_version > 0: | ||||
|                         # Don't be confused by this check. It's simply here | ||||
|                         # so that we won't log anything on initial reload. | ||||
|                         logger.info("Classifier updated on disk, " | ||||
|                                     "reloading classifier models") | ||||
|                     self.data_hash = pickle.load(f) | ||||
| @@ -122,9 +125,14 @@ class DocumentClassifier(object): | ||||
|         labels_tags_unique = set([tag for tags in labels_tags for tag in tags]) | ||||
|  | ||||
|         num_tags = len(labels_tags_unique) | ||||
|  | ||||
|         # subtract 1 since -1 (null) is also part of the classes. | ||||
|         num_correspondents = len(set(labels_correspondent)) - 1 | ||||
|         num_document_types = len(set(labels_document_type)) - 1 | ||||
|  | ||||
|         # union with {-1} accounts for cases where all documents have | ||||
|         # correspondents and types assigned, so -1 isn't part of labels_x, which | ||||
|         # it usually is. | ||||
|         num_correspondents = len(set(labels_correspondent) | {-1}) - 1 | ||||
|         num_document_types = len(set(labels_document_type) | {-1}) - 1 | ||||
|  | ||||
|         logging.getLogger(__name__).debug( | ||||
|             "{} documents, {} tag(s), {} correspondent(s), " | ||||
| @@ -145,12 +153,23 @@ class DocumentClassifier(object): | ||||
|         ) | ||||
|         data_vectorized = self.data_vectorizer.fit_transform(data) | ||||
|  | ||||
|         self.tags_binarizer = MultiLabelBinarizer() | ||||
|         labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags) | ||||
|  | ||||
|         # Step 3: train the classifiers | ||||
|         if num_tags > 0: | ||||
|             logging.getLogger(__name__).debug("Training tags classifier...") | ||||
|  | ||||
|             if num_tags == 1: | ||||
|                 # Special case where only one tag has auto: | ||||
|                 # Fallback to binary classification. | ||||
|                 labels_tags = [label[0] if len(label) == 1 else -1 | ||||
|                                for label in labels_tags] | ||||
|                 self.tags_binarizer = LabelBinarizer() | ||||
|                 labels_tags_vectorized = self.tags_binarizer.fit_transform( | ||||
|                     labels_tags).ravel() | ||||
|             else: | ||||
|                 self.tags_binarizer = MultiLabelBinarizer() | ||||
|                 labels_tags_vectorized = self.tags_binarizer.fit_transform( | ||||
|                     labels_tags) | ||||
|  | ||||
|             self.tags_classifier = MLPClassifier(tol=0.01) | ||||
|             self.tags_classifier.fit(data_vectorized, labels_tags_vectorized) | ||||
|         else: | ||||
| @@ -222,6 +241,16 @@ class DocumentClassifier(object): | ||||
|             X = self.data_vectorizer.transform([preprocess_content(content)]) | ||||
|             y = self.tags_classifier.predict(X) | ||||
|             tags_ids = self.tags_binarizer.inverse_transform(y)[0] | ||||
|             return tags_ids | ||||
|             if type_of_target(y).startswith('multilabel'): | ||||
|                 # the usual case when there are multiple tags. | ||||
|                 return list(tags_ids) | ||||
|             elif type_of_target(y) == 'binary' and tags_ids != -1: | ||||
|                 # This is for when we have binary classification with only one | ||||
|                 # tag and the result is to assign this tag. | ||||
|                 return [tags_ids] | ||||
|             else: | ||||
|                 # Usually binary as well with -1 as the result, but we're | ||||
|                 # going to catch everything else here as well. | ||||
|                 return [] | ||||
|         else: | ||||
|             return [] | ||||
|   | ||||
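As an aside on the single-tag fallback introduced above: a minimal sketch (not part of the diff; it only assumes scikit-learn is installed) of why `MultiLabelBinarizer` and `LabelBinarizer` produce differently shaped targets, which is what the `type_of_target` branch at prediction time accounts for.

```python
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.utils.multiclass import type_of_target

# Several auto tags: one indicator column per tag, so a document can
# belong to any subset of tags.
mlb = MultiLabelBinarizer()
y_multi = mlb.fit_transform([[1, 2], [2], []])
print(type_of_target(y_multi))   # 'multilabel-indicator'

# Exactly one auto tag: collapse each label set to "the tag id" or -1
# (no tag) and binarize into a single column, as the fallback does.
lb = LabelBinarizer()
y_binary = lb.fit_transform([1, -1, 1]).ravel()
print(type_of_target(y_binary))  # 'binary'

# inverse_transform now yields plain ids (or -1) instead of id tuples,
# hence the extra branching when predicting tags.
print(lb.inverse_transform(y_binary))
```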
| @@ -8,14 +8,15 @@ from asgiref.sync import async_to_sync | ||||
| from channels.layers import get_channel_layer | ||||
| from django.conf import settings | ||||
| from django.db import transaction | ||||
| from django.db.models import Q | ||||
| from django.utils import timezone | ||||
|  | ||||
| from paperless.db import GnuPG | ||||
| from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||
| from .file_handling import generate_filename, create_source_path_directory | ||||
| from .file_handling import create_source_path_directory | ||||
| from .loggers import LoggingMixin | ||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||
| from .parsers import ParseError, get_parser_class_for_mime_type | ||||
| from .parsers import ParseError, get_parser_class_for_mime_type, \ | ||||
|     get_supported_file_extensions, parse_date | ||||
| from .signals import ( | ||||
|     document_consumption_finished, | ||||
|     document_consumption_started | ||||
| @@ -58,21 +59,10 @@ class Consumer(LoggingMixin): | ||||
|             raise ConsumerError("Cannot consume {}: It is not a file".format( | ||||
|                 self.path)) | ||||
|  | ||||
|     def pre_check_consumption_dir(self): | ||||
|         if not settings.CONSUMPTION_DIR: | ||||
|             raise ConsumerError( | ||||
|                 "The CONSUMPTION_DIR settings variable does not appear to be " | ||||
|                 "set.") | ||||
|  | ||||
|         if not os.path.isdir(settings.CONSUMPTION_DIR): | ||||
|             raise ConsumerError( | ||||
|                 "Consumption directory {} does not exist".format( | ||||
|                     settings.CONSUMPTION_DIR)) | ||||
|  | ||||
|     def pre_check_duplicate(self): | ||||
|         with open(self.path, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|         if Document.objects.filter(checksum=checksum).exists(): | ||||
|         if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501 | ||||
|             if settings.CONSUMER_DELETE_DUPLICATES: | ||||
|                 os.unlink(self.path) | ||||
|             raise ConsumerError( | ||||
| @@ -83,6 +73,7 @@ class Consumer(LoggingMixin): | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||
|  | ||||
|     def try_consume_file(self, | ||||
|                          path, | ||||
| @@ -110,7 +101,6 @@ class Consumer(LoggingMixin): | ||||
|         # Make sure that preconditions for consuming the file are met. | ||||
|  | ||||
|         self.pre_check_file_exists() | ||||
|         self.pre_check_consumption_dir() | ||||
|         self.pre_check_directories() | ||||
|         self.pre_check_duplicate() | ||||
|  | ||||
| @@ -145,7 +135,7 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         # This doesn't parse the document yet, but gives us a parser. | ||||
|  | ||||
|         document_parser = parser_class(self.path, self.logging_group, progress_callback) | ||||
|         document_parser = parser_class(self.logging_group, progress_callback) | ||||
|  | ||||
|         # However, this already created working directories which we have to | ||||
|         # clean up. | ||||
| @@ -153,19 +143,30 @@ class Consumer(LoggingMixin): | ||||
|         # Parse the document. This may take some time. | ||||
|  | ||||
|         try: | ||||
|             self.log("debug", f"Generating thumbnail for {self.filename}...") | ||||
|             self._send_progress(self.filename, 10, 100, 'WORKING', | ||||
|                                 'Generating thumbnail...') | ||||
|             thumbnail = document_parser.get_optimised_thumbnail() | ||||
|             self.log("debug", "Parsing {}...".format(self.filename)) | ||||
|             self._send_progress(self.filename, 20, 100, 'WORKING', | ||||
|                                 'Getting text from document...') | ||||
|                                 'Parsing document...') | ||||
|             self.log("debug", "Parsing {}...".format(self.filename)) | ||||
|             document_parser.parse(self.path, mime_type) | ||||
|  | ||||
|             self.log("debug", f"Generating thumbnail for {self.filename}...") | ||||
|             self._send_progress(self.filename, 70, 100, 'WORKING', | ||||
|                                 'Generating thumbnail...') | ||||
|             thumbnail = document_parser.get_optimised_thumbnail( | ||||
|                 self.path, mime_type) | ||||
|  | ||||
|             text = document_parser.get_text() | ||||
|             self._send_progress(self.filename, 80, 100, 'WORKING', | ||||
|                                 'Getting date from document...') | ||||
|             date = document_parser.get_date() | ||||
|             if not date: | ||||
|                 self._send_progress(self.filename, 90, 100, 'WORKING', | ||||
|                                     'Getting date from document...') | ||||
|                 date = parse_date(self.filename, text) | ||||
|             archive_path = document_parser.get_archive_path() | ||||
|  | ||||
|         except ParseError as e: | ||||
|             document_parser.cleanup() | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 f"Error while consuming document {self.filename}: {e}") | ||||
|             self._send_progress(self.filename, 100, 100, 'FAILED', | ||||
|                                 "Failed: {}".format(e)) | ||||
|             raise ConsumerError(e) | ||||
| @@ -183,7 +184,7 @@ class Consumer(LoggingMixin): | ||||
|             logging.getLogger(__name__).warning( | ||||
|                 "Cannot classify documents: {}.".format(e)) | ||||
|             classifier = None | ||||
|         self._send_progress(self.filename, 85, 100, 'WORKING', | ||||
|         self._send_progress(self.filename, 95, 100, 'WORKING', | ||||
|                             'Storing the document...') | ||||
|         # now that everything is done, we can start to store the document | ||||
|         # in the system. This will be a transaction and reasonably fast. | ||||
| @@ -200,9 +201,6 @@ class Consumer(LoggingMixin): | ||||
|                 # If we get here, it was successful. Proceed with post-consume | ||||
|                 # hooks. If they fail, nothing will get changed. | ||||
|  | ||||
|                 self._send_progress(self.filename, 90, 100, 'WORKING', | ||||
|                                     'Performing post-consumption tasks...') | ||||
|  | ||||
|                 document_consumption_finished.send( | ||||
|                     sender=self.__class__, | ||||
|                     document=document, | ||||
| @@ -213,14 +211,41 @@ class Consumer(LoggingMixin): | ||||
|                 # After everything is in the database, copy the files into | ||||
|                 # place. If this fails, we'll also rollback the transaction. | ||||
|  | ||||
|                 # TODO: not required, since this is done by the file handling | ||||
|                 #  logic | ||||
|                 create_source_path_directory(document.source_path) | ||||
|                 self._write(document, self.path, document.source_path) | ||||
|                 self._write(document, thumbnail, document.thumbnail_path) | ||||
|  | ||||
|                 self._write(document.storage_type, | ||||
|                             self.path, document.source_path) | ||||
|  | ||||
|                 self._write(document.storage_type, | ||||
|                             thumbnail, document.thumbnail_path) | ||||
|  | ||||
|                 if archive_path and os.path.isfile(archive_path): | ||||
|                     self._write(document.storage_type, | ||||
|                                 archive_path, document.archive_path) | ||||
|  | ||||
|                     with open(archive_path, 'rb') as f: | ||||
|                         document.archive_checksum = hashlib.md5( | ||||
|                             f.read()).hexdigest() | ||||
|                         document.save() | ||||
|  | ||||
|                 # After performing all database operations and moving files | ||||
|                 # into place, tell paperless where the file is. | ||||
|                 document.filename = os.path.basename(document.source_path) | ||||
|                 # Saving the document now will trigger the filename handling | ||||
|                 # logic. | ||||
|                 document.save() | ||||
|  | ||||
|                 # Delete the file only if it was successfully consumed | ||||
|                 self.log("debug", "Deleting file {}".format(self.path)) | ||||
|                 os.unlink(self.path) | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 f"The following error occured while consuming " | ||||
|                 f"{self.filename}: {e}" | ||||
|             ) | ||||
|             self._send_progress(self.filename, 100, 100, 'FAILED', | ||||
|                                 "Failed: {}".format(e)) | ||||
|             raise ConsumerError(e) | ||||
| @@ -250,10 +275,7 @@ class Consumer(LoggingMixin): | ||||
|         created = file_info.created or date or timezone.make_aware( | ||||
|             datetime.datetime.fromtimestamp(stats.st_mtime)) | ||||
|  | ||||
|         if settings.PASSPHRASE: | ||||
|             storage_type = Document.STORAGE_TYPE_GPG | ||||
|         else: | ||||
|             storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|         with open(self.path, "rb") as f: | ||||
|             document = Document.objects.create( | ||||
| @@ -275,12 +297,6 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         self.apply_overrides(document) | ||||
|  | ||||
|         document.filename = generate_filename(document) | ||||
|  | ||||
|         # We need to save the document twice, since we need the PK of the | ||||
|         # document in order to create its filename above. | ||||
|         document.save() | ||||
|  | ||||
|         return document | ||||
|  | ||||
|     def apply_overrides(self, document): | ||||
| @@ -299,11 +315,7 @@ class Consumer(LoggingMixin): | ||||
|             for tag_id in self.override_tag_ids: | ||||
|                 document.tags.add(Tag.objects.get(pk=tag_id)) | ||||
|  | ||||
|     def _write(self, document, source, target): | ||||
|     def _write(self, storage_type, source, target): | ||||
|         with open(source, "rb") as read_file: | ||||
|             with open(target, "wb") as write_file: | ||||
|                 if document.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: | ||||
|                     write_file.write(read_file.read()) | ||||
|                     return | ||||
|                 self.log("debug", "Encrypting") | ||||
|                 write_file.write(GnuPG.encrypted(read_file)) | ||||
|                 write_file.write(read_file.read()) | ||||
|   | ||||
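For illustration, here is a condensed, hypothetical helper equivalent to the widened duplicate check above (the `is_duplicate` name is made up; `Document` is the model imported in the diff). The point of the change is that a file counts as a duplicate if its MD5 matches either a stored original or a stored archive version.

```python
import hashlib

from django.db.models import Q

from documents.models import Document


def is_duplicate(path):
    # Hash the incoming file once...
    with open(path, "rb") as f:
        checksum = hashlib.md5(f.read()).hexdigest()
    # ...then match against both the original checksum and the checksum
    # of the archived (e.g. OCRed PDF) version of every known document.
    return Document.objects.filter(
        Q(checksum=checksum) | Q(archive_checksum=checksum)
    ).exists()
```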
| @@ -1,7 +1,9 @@ | ||||
| import datetime | ||||
| import logging | ||||
| import os | ||||
| from collections import defaultdict | ||||
|  | ||||
| import pathvalidate | ||||
| from django.conf import settings | ||||
| from django.template.defaultfilters import slugify | ||||
|  | ||||
| @@ -10,10 +12,13 @@ def create_source_path_directory(source_path): | ||||
|     os.makedirs(os.path.dirname(source_path), exist_ok=True) | ||||
|  | ||||
|  | ||||
| def delete_empty_directories(directory): | ||||
| def delete_empty_directories(directory, root): | ||||
|     if not os.path.isdir(directory): | ||||
|         return | ||||
|  | ||||
|     # Go up in the directory hierarchy and try to delete all directories | ||||
|     directory = os.path.normpath(directory) | ||||
|     root = os.path.normpath(settings.ORIGINALS_DIR) | ||||
|     root = os.path.normpath(root) | ||||
|  | ||||
|     if not directory.startswith(root + os.path.sep): | ||||
|         # don't do anything outside our originals folder. | ||||
| @@ -72,14 +77,31 @@ def generate_filename(doc): | ||||
|         if settings.PAPERLESS_FILENAME_FORMAT is not None: | ||||
|             tags = defaultdict(lambda: slugify(None), | ||||
|                                many_to_dictionary(doc.tags)) | ||||
|  | ||||
|             if doc.correspondent: | ||||
|                 correspondent = pathvalidate.sanitize_filename( | ||||
|                     doc.correspondent.name, replacement_text="-" | ||||
|                 ) | ||||
|             else: | ||||
|                 correspondent = "none" | ||||
|  | ||||
|             if doc.document_type: | ||||
|                 document_type = pathvalidate.sanitize_filename( | ||||
|                     doc.document_type.name, replacement_text="-" | ||||
|                 ) | ||||
|             else: | ||||
|                 document_type = "none" | ||||
|  | ||||
|             path = settings.PAPERLESS_FILENAME_FORMAT.format( | ||||
|                 correspondent=slugify(doc.correspondent), | ||||
|                 title=slugify(doc.title), | ||||
|                 created=slugify(doc.created), | ||||
|                 title=pathvalidate.sanitize_filename( | ||||
|                     doc.title, replacement_text="-"), | ||||
|                 correspondent=correspondent, | ||||
|                 document_type=document_type, | ||||
|                 created=datetime.date.isoformat(doc.created), | ||||
|                 created_year=doc.created.year if doc.created else "none", | ||||
|                 created_month=doc.created.month if doc.created else "none", | ||||
|                 created_day=doc.created.day if doc.created else "none", | ||||
|                 added=slugify(doc.added), | ||||
|                 added=datetime.date.isoformat(doc.added), | ||||
|                 added_year=doc.added.year if doc.added else "none", | ||||
|                 added_month=doc.added.month if doc.added else "none", | ||||
|                 added_day=doc.added.day if doc.added else "none", | ||||
| @@ -101,3 +123,8 @@ def generate_filename(doc): | ||||
|         filename += ".gpg" | ||||
|  | ||||
|     return filename | ||||
|  | ||||
|  | ||||
| def archive_name_from_filename(filename): | ||||
|  | ||||
|     return os.path.splitext(filename)[0] + ".pdf" | ||||
|   | ||||
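A quick illustration of the switch from `slugify` to `pathvalidate.sanitize_filename` in `generate_filename` above (the example title is made up; `django.utils.text.slugify` is the function `django.template.defaultfilters.slugify` delegates to):

```python
import pathvalidate
from django.utils.text import slugify

title = "Invoice: ACME/2020"

# slugify flattens the whole string into a lowercase URL-style slug...
print(slugify(title))  # invoice-acme2020

# ...whereas sanitize_filename only replaces characters that are illegal
# in file names, keeping the title readable in the resulting filename.
print(pathvalidate.sanitize_filename(title, replacement_text="-"))
```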
| @@ -1,59 +0,0 @@ | ||||
| import os | ||||
| import tempfile | ||||
| from datetime import datetime | ||||
| from time import mktime | ||||
|  | ||||
| import magic | ||||
| from django import forms | ||||
| from django.conf import settings | ||||
| from django_q.tasks import async_task | ||||
| from pathvalidate import validate_filename, ValidationError | ||||
|  | ||||
| from documents.parsers import is_mime_type_supported | ||||
|  | ||||
|  | ||||
| class UploadForm(forms.Form): | ||||
|  | ||||
|     document = forms.FileField() | ||||
|  | ||||
|     def clean_document(self): | ||||
|         document_name = self.cleaned_data.get("document").name | ||||
|  | ||||
|         try: | ||||
|             validate_filename(document_name) | ||||
|         except ValidationError: | ||||
|             raise forms.ValidationError("That filename is suspicious.") | ||||
|  | ||||
|         document_data = self.cleaned_data.get("document").read() | ||||
|  | ||||
|         mime_type = magic.from_buffer(document_data, mime=True) | ||||
|  | ||||
|         if not is_mime_type_supported(mime_type): | ||||
|             raise forms.ValidationError("This mime type is not supported.") | ||||
|  | ||||
|         return document_name, document_data | ||||
|  | ||||
|     def save(self): | ||||
|         """ | ||||
|         Since the consumer already does a lot of work, it's easier just to save | ||||
|         to-be-consumed files to the consumption directory rather than have the | ||||
|         form do that as well.  Think of it as a poor-man's queue server. | ||||
|         """ | ||||
|  | ||||
|         original_filename, data = self.cleaned_data.get("document") | ||||
|  | ||||
|         t = int(mktime(datetime.now().timetuple())) | ||||
|  | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|  | ||||
|         with tempfile.NamedTemporaryFile(prefix="paperless-upload-", | ||||
|                                          dir=settings.SCRATCH_DIR, | ||||
|                                          delete=False) as f: | ||||
|  | ||||
|             f.write(data) | ||||
|             os.utime(f.name, times=(t, t)) | ||||
|  | ||||
|             async_task("documents.tasks.consume_file", | ||||
|                        f.name, | ||||
|                        override_filename=original_filename, | ||||
|                        task_name=os.path.basename(original_filename)[:100]) | ||||
| @@ -4,10 +4,11 @@ from contextlib import contextmanager | ||||
|  | ||||
| from django.conf import settings | ||||
| from whoosh import highlight | ||||
| from whoosh.fields import Schema, TEXT, NUMERIC | ||||
| from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME | ||||
| from whoosh.highlight import Formatter, get_text | ||||
| from whoosh.index import create_in, exists_in, open_dir | ||||
| from whoosh.qparser import MultifieldParser | ||||
| from whoosh.qparser.dateparse import DateParserPlugin | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
|  | ||||
| @@ -59,33 +60,49 @@ def get_schema(): | ||||
|         id=NUMERIC(stored=True, unique=True, numtype=int), | ||||
|         title=TEXT(stored=True), | ||||
|         content=TEXT(), | ||||
|         correspondent=TEXT(stored=True) | ||||
|         correspondent=TEXT(stored=True), | ||||
|         tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True), | ||||
|         type=TEXT(stored=True), | ||||
|         created=DATETIME(stored=True, sortable=True), | ||||
|         modified=DATETIME(stored=True, sortable=True), | ||||
|         added=DATETIME(stored=True, sortable=True), | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def open_index(recreate=False): | ||||
|     if exists_in(settings.INDEX_DIR) and not recreate: | ||||
|         return open_dir(settings.INDEX_DIR) | ||||
|     else: | ||||
|         # TODO: this is not thread safe. If 2 instances try to create the index | ||||
|         #  at the same time, this fails. This currently prevents parallel | ||||
|         #  tests. | ||||
|         if not os.path.isdir(settings.INDEX_DIR): | ||||
|             os.makedirs(settings.INDEX_DIR, exist_ok=True) | ||||
|         return create_in(settings.INDEX_DIR, get_schema()) | ||||
|     try: | ||||
|         if exists_in(settings.INDEX_DIR) and not recreate: | ||||
|             return open_dir(settings.INDEX_DIR, schema=get_schema()) | ||||
|     except Exception as e: | ||||
|         logger.error(f"Error while opening the index: {e}, recreating.") | ||||
|  | ||||
|     if not os.path.isdir(settings.INDEX_DIR): | ||||
|         os.makedirs(settings.INDEX_DIR, exist_ok=True) | ||||
|     return create_in(settings.INDEX_DIR, get_schema()) | ||||
|  | ||||
|  | ||||
| def update_document(writer, doc): | ||||
|     # TODO: this line caused many issues all around, since we need to | ||||
|     #  make sure that this method does not get called with deserialized | ||||
|     #  documents (i.e., document objects that don't come from Django's | ||||
|     #  ORM interfaces directly). | ||||
|     logger.debug("Indexing {}...".format(doc)) | ||||
|     tags = ",".join([t.name for t in doc.tags.all()]) | ||||
|     writer.update_document( | ||||
|         id=doc.pk, | ||||
|         title=doc.title, | ||||
|         content=doc.content, | ||||
|         correspondent=doc.correspondent.name if doc.correspondent else None | ||||
|         correspondent=doc.correspondent.name if doc.correspondent else None, | ||||
|         tag=tags if tags else None, | ||||
|         type=doc.document_type.name if doc.document_type else None, | ||||
|         created=doc.created, | ||||
|         added=doc.added, | ||||
|         modified=doc.modified, | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def remove_document(writer, doc): | ||||
|     # TODO: see above. | ||||
|     logger.debug("Removing {} from index...".format(doc)) | ||||
|     writer.delete_by_term('id', doc.pk) | ||||
|  | ||||
| @@ -103,16 +120,27 @@ def remove_document_from_index(document): | ||||
|  | ||||
|  | ||||
| @contextmanager | ||||
| def query_page(ix, query, page): | ||||
| def query_page(ix, querystring, page): | ||||
|     searcher = ix.searcher() | ||||
|     try: | ||||
|         query_parser = MultifieldParser(["content", "title", "correspondent"], | ||||
|                                         ix.schema).parse(query) | ||||
|         result_page = searcher.search_page(query_parser, page) | ||||
|         qp = MultifieldParser( | ||||
|             ["content", "title", "correspondent", "tag", "type"], | ||||
|             ix.schema) | ||||
|         qp.add_plugin(DateParserPlugin()) | ||||
|  | ||||
|         q = qp.parse(querystring) | ||||
|         result_page = searcher.search_page(q, page) | ||||
|         result_page.results.fragmenter = highlight.ContextFragmenter( | ||||
|             surround=50) | ||||
|         result_page.results.formatter = JsonFormatter() | ||||
|         yield result_page | ||||
|  | ||||
|         corrected = searcher.correct_query(q, querystring) | ||||
|         if corrected.query != q: | ||||
|             corrected_query = corrected.string | ||||
|         else: | ||||
|             corrected_query = None | ||||
|  | ||||
|         yield result_page, corrected_query | ||||
|     finally: | ||||
|         searcher.close() | ||||
|  | ||||
|   | ||||
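To make the `query_page()` changes concrete, a self-contained sketch of a Whoosh parser with the date plugin enabled (the schema is trimmed to three fields, not the full one above):

```python
from whoosh.fields import DATETIME, TEXT, Schema
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin

schema = Schema(title=TEXT(stored=True), content=TEXT(),
                created=DATETIME(stored=True, sortable=True))

# Search several fields at once; the plugin lets queries contain date
# expressions such as created:2020 or created:yesterday.
qp = MultifieldParser(["content", "title"], schema)
qp.add_plugin(DateParserPlugin())
print(qp.parse("invoice created:2020"))
```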
| @@ -1,9 +1,14 @@ | ||||
| import logging | ||||
| import uuid | ||||
|  | ||||
| from django.conf import settings | ||||
|  | ||||
|  | ||||
| class PaperlessHandler(logging.Handler): | ||||
|     def emit(self, record): | ||||
|         if settings.DISABLE_DBHANDLER: | ||||
|             return | ||||
|  | ||||
|         # We have to do the import here or Django will barf when it tries to | ||||
|         # load this because the apps aren't loaded at that point | ||||
|         from .models import Log | ||||
| @@ -23,10 +28,10 @@ class LoggingMixin: | ||||
|     def renew_logging_group(self): | ||||
|         self.logging_group = uuid.uuid4() | ||||
|  | ||||
|     def log(self, level, message): | ||||
|     def log(self, level, message, **kwargs): | ||||
|         target = ".".join([self.__class__.__module__, self.__class__.__name__]) | ||||
|         logger = logging.getLogger(target) | ||||
|  | ||||
|         getattr(logger, level)(message, extra={ | ||||
|             "group": self.logging_group | ||||
|         }) | ||||
|         }, **kwargs) | ||||
|   | ||||
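The new `**kwargs` pass-through lets callers forward standard `logging` keyword arguments. A condensed, runnable sketch (the mixin body is copied from the diff; `DemoConsumer` is hypothetical):

```python
import logging
import uuid

logging.basicConfig(level=logging.DEBUG)


class LoggingMixin:
    def renew_logging_group(self):
        self.logging_group = uuid.uuid4()

    def log(self, level, message, **kwargs):
        target = ".".join([self.__class__.__module__, self.__class__.__name__])
        getattr(logging.getLogger(target), level)(
            message, extra={"group": self.logging_group}, **kwargs)


class DemoConsumer(LoggingMixin):
    pass


consumer = DemoConsumer()
consumer.renew_logging_group()
try:
    raise ValueError("boom")
except ValueError:
    # exc_info=True attaches the current traceback to the log record.
    consumer.log("error", "Something failed", exc_info=True)
```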
| @@ -17,16 +17,6 @@ class Command(BaseCommand): | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
|  | ||||
|         parser.add_argument( | ||||
|             "from", | ||||
|             choices=("gpg", "unencrypted"), | ||||
|             help="The state you want to change your documents from" | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "to", | ||||
|             choices=("gpg", "unencrypted"), | ||||
|             help="The state you want to change your documents to" | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "--passphrase", | ||||
|             help="If PAPERLESS_PASSPHRASE isn't set already, you need to " | ||||
| @@ -50,11 +40,6 @@ class Command(BaseCommand): | ||||
|         except KeyboardInterrupt: | ||||
|             return | ||||
|  | ||||
|         if options["from"] == options["to"]: | ||||
|             raise CommandError( | ||||
|                 'The "from" and "to" values can\'t be the same.' | ||||
|             ) | ||||
|  | ||||
|         passphrase = options["passphrase"] or settings.PASSPHRASE | ||||
|         if not passphrase: | ||||
|             raise CommandError( | ||||
| @@ -62,10 +47,7 @@ class Command(BaseCommand): | ||||
|                 "by declaring it in your environment or your config." | ||||
|             ) | ||||
|  | ||||
|         if options["from"] == "gpg" and options["to"] == "unencrypted": | ||||
|             self.__gpg_to_unencrypted(passphrase) | ||||
|         elif options["from"] == "unencrypted" and options["to"] == "gpg": | ||||
|             self.__unencrypted_to_gpg(passphrase) | ||||
|         self.__gpg_to_unencrypted(passphrase) | ||||
|  | ||||
|     @staticmethod | ||||
|     def __gpg_to_unencrypted(passphrase): | ||||
| @@ -79,42 +61,28 @@ class Command(BaseCommand): | ||||
|                 document).encode('utf-8'), "green")) | ||||
|  | ||||
|             old_paths = [document.source_path, document.thumbnail_path] | ||||
|  | ||||
|             raw_document = GnuPG.decrypted(document.source_file, passphrase) | ||||
|             raw_thumb = GnuPG.decrypted(document.thumbnail_file, passphrase) | ||||
|  | ||||
|             document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|             ext = os.path.splitext(document.filename)[1] | ||||
|  | ||||
|             if not ext == '.gpg': | ||||
|                 raise CommandError( | ||||
|                     f"Abort: encrypted file {document.source_path} does not " | ||||
|                     f"end with .gpg") | ||||
|  | ||||
|             document.filename = os.path.splitext(document.filename)[0] | ||||
|  | ||||
|             with open(document.source_path, "wb") as f: | ||||
|                 f.write(raw_document) | ||||
|  | ||||
|             with open(document.thumbnail_path, "wb") as f: | ||||
|                 f.write(raw_thumb) | ||||
|  | ||||
|             document.save(update_fields=("storage_type",)) | ||||
|  | ||||
|             for path in old_paths: | ||||
|                 os.unlink(path) | ||||
|  | ||||
|     @staticmethod | ||||
|     def __unencrypted_to_gpg(passphrase): | ||||
|  | ||||
|         unencrypted_files = Document.objects.filter( | ||||
|             storage_type=Document.STORAGE_TYPE_UNENCRYPTED) | ||||
|  | ||||
|         for document in unencrypted_files: | ||||
|  | ||||
|             print(coloured("Encrypting {}".format(document), "green")) | ||||
|  | ||||
|             old_paths = [document.source_path, document.thumbnail_path] | ||||
|             with open(document.source_path, "rb") as raw_document: | ||||
|                 with open(document.thumbnail_path, "rb") as raw_thumb: | ||||
|                     document.storage_type = Document.STORAGE_TYPE_GPG | ||||
|                     with open(document.source_path, "wb") as f: | ||||
|                         f.write(GnuPG.encrypted(raw_document, passphrase)) | ||||
|                     with open(document.thumbnail_path, "wb") as f: | ||||
|                         f.write(GnuPG.encrypted(raw_thumb, passphrase)) | ||||
|  | ||||
|             document.save(update_fields=("storage_type",)) | ||||
|             document.save(update_fields=("storage_type", "filename")) | ||||
|  | ||||
|             for path in old_paths: | ||||
|                 os.unlink(path) | ||||
|   | ||||
| src/documents/management/commands/document_archiver.py (new file, 128 additions) | ||||
						| @@ -0,0 +1,128 @@ | ||||
| import hashlib | ||||
| import multiprocessing | ||||
|  | ||||
| import logging | ||||
| import os | ||||
| import shutil | ||||
| import uuid | ||||
|  | ||||
| import tqdm | ||||
| from django import db | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand | ||||
| from django.db import transaction | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
| from documents.models import Document | ||||
| from ... import index | ||||
| from ...file_handling import create_source_path_directory | ||||
| from ...mixins import Renderable | ||||
| from ...parsers import get_parser_class_for_mime_type | ||||
|  | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def handle_document(document_id): | ||||
|     document = Document.objects.get(id=document_id) | ||||
|  | ||||
|     mime_type = document.mime_type | ||||
|  | ||||
|     parser_class = get_parser_class_for_mime_type(mime_type) | ||||
|  | ||||
|     parser = parser_class(logging_group=uuid.uuid4()) | ||||
|  | ||||
|     try: | ||||
|         parser.parse(document.source_path, mime_type) | ||||
|  | ||||
|         if parser.get_archive_path(): | ||||
|             with transaction.atomic(): | ||||
|                 with open(parser.get_archive_path(), 'rb') as f: | ||||
|                     checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                 # I'm going to save first so that, in case the file move | ||||
|                 # fails, the database is rolled back. | ||||
|                 # We also don't use save() since that triggers the file | ||||
|                 # handling logic, and we don't want that yet (the file is | ||||
|                 # not yet in place). | ||||
|                 Document.objects.filter(pk=document.pk).update( | ||||
|                     archive_checksum=checksum, | ||||
|                     content=parser.get_text() | ||||
|                 ) | ||||
|                 create_source_path_directory(document.archive_path) | ||||
|                 shutil.move(parser.get_archive_path(), document.archive_path) | ||||
|  | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             index.update_document(writer, document) | ||||
|  | ||||
|     except Exception as e: | ||||
|         logger.error(f"Error while parsing document {document}: {str(e)}") | ||||
|     finally: | ||||
|         parser.cleanup() | ||||
|  | ||||
|  | ||||
| class Command(Renderable, BaseCommand): | ||||
|  | ||||
|     help = """ | ||||
|         Using the current classification model, assigns correspondents, tags | ||||
|         and document types to all documents, effectively allowing you to | ||||
|         back-tag all previously indexed documents with metadata created (or | ||||
|         modified) after their initial import. | ||||
|     """.replace("    ", "") | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         self.verbosity = 0 | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
|         parser.add_argument( | ||||
|             "-f", "--overwrite", | ||||
|             default=False, | ||||
|             action="store_true", | ||||
|             help="Recreates the archived document for documents that already " | ||||
|                  "have an archived version." | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "-d", "--document", | ||||
|             default=None, | ||||
|             type=int, | ||||
|             required=False, | ||||
|             help="Specify the ID of a document, and this command will only " | ||||
|                  "run on this specific document." | ||||
|         ) | ||||
|  | ||||
|     def handle(self, *args, **options): | ||||
|  | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|  | ||||
|         overwrite = options["overwrite"] | ||||
|  | ||||
|         if options['document']: | ||||
|             documents = Document.objects.filter(pk=options['document']) | ||||
|         else: | ||||
|             documents = Document.objects.all() | ||||
|  | ||||
|         document_ids = list(map( | ||||
|             lambda doc: doc.id, | ||||
|             filter( | ||||
|                 lambda d: overwrite or not d.archive_checksum, | ||||
|                 documents | ||||
|             ) | ||||
|         )) | ||||
|  | ||||
|         # Note to future self: this prevents django from reusing database | ||||
|         # connections between processes, which is bad and does not work | ||||
|         # with postgres. | ||||
|         db.connections.close_all() | ||||
|  | ||||
|         try: | ||||
|  | ||||
|             logging.getLogger().handlers[0].level = logging.ERROR | ||||
|             with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool: | ||||
|                 list(tqdm.tqdm( | ||||
|                     pool.imap_unordered( | ||||
|                         handle_document, | ||||
|                         document_ids | ||||
|                     ), | ||||
|                     total=len(document_ids) | ||||
|                 )) | ||||
|         except KeyboardInterrupt: | ||||
|             print("Aborting...") | ||||
| @@ -1,37 +1,104 @@ | ||||
| import logging | ||||
| import os | ||||
| from pathlib import Path | ||||
| from time import sleep | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
| from django.utils.text import slugify | ||||
| from django_q.tasks import async_task | ||||
| from watchdog.events import FileSystemEventHandler | ||||
| from watchdog.observers import Observer | ||||
| from watchdog.observers.polling import PollingObserver | ||||
|  | ||||
| from documents.models import Tag | ||||
| from documents.parsers import is_file_ext_supported | ||||
|  | ||||
| try: | ||||
|     from inotify_simple import INotify, flags | ||||
|     from inotifyrecursive import INotify, flags | ||||
| except ImportError: | ||||
|     INotify = flags = None | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def _tags_from_path(filepath): | ||||
|     """Walk up the directory tree from filepath to CONSUMPTION_DIr | ||||
|        and get or create Tag IDs for every directory. | ||||
|     """ | ||||
|     tag_ids = set() | ||||
|     path_parts = Path(filepath).relative_to( | ||||
|                 settings.CONSUMPTION_DIR).parent.parts | ||||
|     for part in path_parts: | ||||
|         tag_ids.add(Tag.objects.get_or_create( | ||||
|             slug=slugify(part), | ||||
|             defaults={"name": part}, | ||||
|         )[0].pk) | ||||
|  | ||||
|     return tag_ids | ||||
|  | ||||
|  | ||||
| def _consume(filepath): | ||||
|     if os.path.isdir(filepath): | ||||
|         return | ||||
|  | ||||
|     if not os.path.isfile(filepath): | ||||
|         logger.debug( | ||||
|             f"Not consuming file {filepath}: File has moved.") | ||||
|         return | ||||
|  | ||||
|     if not is_file_ext_supported(os.path.splitext(filepath)[1]): | ||||
|         logger.debug( | ||||
|             f"Not consuming file {filepath}: Unknown file extension.") | ||||
|         return | ||||
|  | ||||
|     tag_ids = None | ||||
|     try: | ||||
|         if settings.CONSUMER_SUBDIRS_AS_TAGS: | ||||
|             tag_ids = _tags_from_path(filepath) | ||||
|     except Exception as e: | ||||
|         logger.error( | ||||
|             "Error creating tags from path: {}".format(e)) | ||||
|  | ||||
|     try: | ||||
|         async_task("documents.tasks.consume_file", | ||||
|                    filepath, | ||||
|                    override_tag_ids=tag_ids if tag_ids else None, | ||||
|                    task_name=os.path.basename(filepath)[:100]) | ||||
|     except Exception as e: | ||||
|         # Catch all so that the consumer won't crash. | ||||
|         # This is also what the test case is listening for to check for | ||||
|         # errors. | ||||
|         logger.error( | ||||
|             "Error while consuming document: {}".format(e)) | ||||
|  | ||||
|  | ||||
| def _consume_wait_unmodified(file, num_tries=20, wait_time=1): | ||||
|     mtime = -1 | ||||
|     current_try = 0 | ||||
|     while current_try < num_tries: | ||||
|         try: | ||||
|             new_mtime = os.stat(file).st_mtime | ||||
|         except FileNotFoundError: | ||||
|             logger.debug(f"File {file} moved while waiting for it to remain " | ||||
|                          f"unmodified.") | ||||
|             return | ||||
|         if new_mtime == mtime: | ||||
|             _consume(file) | ||||
|             return | ||||
|         mtime = new_mtime | ||||
|         sleep(wait_time) | ||||
|         current_try += 1 | ||||
|  | ||||
|     logger.error(f"Timeout while waiting on file {file} to remain unmodified.") | ||||
|  | ||||
|  | ||||
| class Handler(FileSystemEventHandler): | ||||
|  | ||||
|     def _consume(self, file): | ||||
|         if os.path.isfile(file): | ||||
|             try: | ||||
|                 async_task("documents.tasks.consume_file", | ||||
|                            file, | ||||
|                            task_name=os.path.basename(file)[:100]) | ||||
|             except Exception as e: | ||||
|                 # Catch all so that the consumer won't crash. | ||||
|                 logging.getLogger(__name__).error( | ||||
|                     "Error while consuming document: {}".format(e)) | ||||
|  | ||||
|     def on_created(self, event): | ||||
|         self._consume(event.src_path) | ||||
|         _consume_wait_unmodified(event.src_path) | ||||
|  | ||||
|     def on_moved(self, event): | ||||
|         self._consume(event.src_path) | ||||
|         _consume_wait_unmodified(event.dest_path) | ||||
|  | ||||
|  | ||||
| class Command(BaseCommand): | ||||
| @@ -40,12 +107,15 @@ class Command(BaseCommand): | ||||
|     consumption directory. | ||||
|     """ | ||||
|  | ||||
|     # This is here primarily for the tests and is irrelevant in production. | ||||
|     stop_flag = False | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|  | ||||
|         self.verbosity = 0 | ||||
|         self.logger = logging.getLogger(__name__) | ||||
|  | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|         self.observer = None | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
|         parser.add_argument( | ||||
| @@ -54,38 +124,81 @@ class Command(BaseCommand): | ||||
|             nargs="?", | ||||
|             help="The consumption directory." | ||||
|         ) | ||||
|  | ||||
|     def handle(self, *args, **options): | ||||
|  | ||||
|         self.verbosity = options["verbosity"] | ||||
|         directory = options["directory"] | ||||
|  | ||||
|         logging.getLogger(__name__).info( | ||||
|             "Starting document consumer at {}".format( | ||||
|                 directory | ||||
|             ) | ||||
|         parser.add_argument( | ||||
|             "--oneshot", | ||||
|             action="store_true", | ||||
|             help="Run only once." | ||||
|         ) | ||||
|  | ||||
|         # Consume all files as this is not done initially by the watchdog | ||||
|         for entry in os.scandir(directory): | ||||
|             if entry.is_file(): | ||||
|                 async_task("documents.tasks.consume_file", | ||||
|                            entry.path, | ||||
|                            task_name=os.path.basename(entry.path)[:100]) | ||||
|     def handle(self, *args, **options): | ||||
|         directory = options["directory"] | ||||
|         recursive = settings.CONSUMER_RECURSIVE | ||||
|  | ||||
|         # Start the watchdog. Woof! | ||||
|         if settings.CONSUMER_POLLING > 0: | ||||
|             logging.getLogger(__name__).info( | ||||
|                 "Using polling instead of file system notifications.") | ||||
|             observer = PollingObserver(timeout=settings.CONSUMER_POLLING) | ||||
|         if not directory: | ||||
|             raise CommandError( | ||||
|                 "CONSUMPTION_DIR does not appear to be set." | ||||
|             ) | ||||
|  | ||||
|         if not os.path.isdir(directory): | ||||
|             raise CommandError( | ||||
|                 f"Consumption directory {directory} does not exist") | ||||
|  | ||||
|         if recursive: | ||||
|             for dirpath, _, filenames in os.walk(directory): | ||||
|                 for filename in filenames: | ||||
|                     filepath = os.path.join(dirpath, filename) | ||||
|                     _consume(filepath) | ||||
|         else: | ||||
|             observer = Observer() | ||||
|         event_handler = Handler() | ||||
|         observer.schedule(event_handler, directory, recursive=True) | ||||
|         observer.start() | ||||
|             for entry in os.scandir(directory): | ||||
|                 _consume(entry.path) | ||||
|  | ||||
|         if options["oneshot"]: | ||||
|             return | ||||
|  | ||||
|         if settings.CONSUMER_POLLING == 0 and INotify: | ||||
|             self.handle_inotify(directory, recursive) | ||||
|         else: | ||||
|             self.handle_polling(directory, recursive) | ||||
|  | ||||
|         logger.debug("Consumer exiting.") | ||||
|  | ||||
|     def handle_polling(self, directory, recursive): | ||||
|         logging.getLogger(__name__).info( | ||||
|             f"Polling directory for changes: {directory}") | ||||
|         self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING) | ||||
|         self.observer.schedule(Handler(), directory, recursive=recursive) | ||||
|         self.observer.start() | ||||
|         try: | ||||
|             while observer.is_alive(): | ||||
|                 observer.join(1) | ||||
|             while self.observer.is_alive(): | ||||
|                 self.observer.join(1) | ||||
|                 if self.stop_flag: | ||||
|                     self.observer.stop() | ||||
|         except KeyboardInterrupt: | ||||
|             observer.stop() | ||||
|         observer.join() | ||||
|             self.observer.stop() | ||||
|         self.observer.join() | ||||
|  | ||||
|     def handle_inotify(self, directory, recursive): | ||||
|         logging.getLogger(__name__).info( | ||||
|             f"Using inotify to watch directory for changes: {directory}") | ||||
|  | ||||
|         inotify = INotify() | ||||
|         inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO | ||||
|         if recursive: | ||||
|             descriptor = inotify.add_watch_recursive(directory, inotify_flags) | ||||
|         else: | ||||
|             descriptor = inotify.add_watch(directory, inotify_flags) | ||||
|  | ||||
|         try: | ||||
|             while not self.stop_flag: | ||||
|                 for event in inotify.read(timeout=1000): | ||||
|                     if recursive: | ||||
|                         path = inotify.get_path(event.wd) | ||||
|                     else: | ||||
|                         path = directory | ||||
|                     filepath = os.path.join(path, event.name) | ||||
|                     _consume(filepath) | ||||
|         except KeyboardInterrupt: | ||||
|             pass | ||||
|  | ||||
|         inotify.rm_watch(descriptor) | ||||
|         inotify.close() | ||||
|   | ||||
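For context on `_tags_from_path` above, a standalone sketch of the path-to-tags idea (paths are made up; the real code additionally slugifies each part and runs `Tag.objects.get_or_create`):

```python
from pathlib import Path

consumption_dir = "/consume"  # stand-in for settings.CONSUMPTION_DIR
filepath = "/consume/taxes/2020/receipt.pdf"

# Every directory level between the consumption dir and the file itself
# becomes one tag.
parts = Path(filepath).relative_to(consumption_dir).parent.parts
print(parts)  # ('taxes', '2020')
```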
| @@ -7,7 +7,8 @@ from django.core import serializers | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
|  | ||||
| from documents.models import Document, Correspondent, Tag, DocumentType | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||
|     EXPORTER_ARCHIVE_NAME | ||||
| from paperless.db import GnuPG | ||||
| from ...mixins import Renderable | ||||
|  | ||||
| @@ -22,13 +23,6 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
|         parser.add_argument("target") | ||||
|         parser.add_argument( | ||||
|             "--legacy", | ||||
|             action="store_true", | ||||
|             help="Don't try to export all of the document data, just dump the " | ||||
|                  "original document files out in a format that makes " | ||||
|                  "re-consuming them easy." | ||||
|         ) | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
| @@ -44,10 +38,10 @@ class Command(Renderable, BaseCommand): | ||||
|         if not os.access(self.target, os.W_OK): | ||||
|             raise CommandError("That path doesn't appear to be writable") | ||||
|  | ||||
|         if options["legacy"]: | ||||
|             self.dump_legacy() | ||||
|         else: | ||||
|             self.dump() | ||||
|         if os.listdir(self.target): | ||||
|             raise CommandError("That directory is not empty.") | ||||
|  | ||||
|         self.dump() | ||||
|  | ||||
|     def dump(self): | ||||
|  | ||||
| @@ -63,34 +57,56 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|             document = document_map[document_dict["pk"]] | ||||
|  | ||||
|             unique_filename = f"{document.pk:07}_{document.file_name}" | ||||
|             print(f"Exporting: {document}") | ||||
|  | ||||
|             file_target = os.path.join(self.target, unique_filename) | ||||
|             filename_counter = 0 | ||||
|             while True: | ||||
|                 original_name = document.get_public_filename( | ||||
|                     counter=filename_counter) | ||||
|                 original_target = os.path.join(self.target, original_name) | ||||
|  | ||||
|             thumbnail_name = unique_filename + "-thumbnail.png" | ||||
|                 if not os.path.exists(original_target): | ||||
|                     break | ||||
|                 else: | ||||
|                     filename_counter += 1 | ||||
|  | ||||
|             thumbnail_name = original_name + "-thumbnail.png" | ||||
|             thumbnail_target = os.path.join(self.target, thumbnail_name) | ||||
|  | ||||
|             document_dict[EXPORTER_FILE_NAME] = unique_filename | ||||
|             document_dict[EXPORTER_FILE_NAME] = original_name | ||||
|             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name | ||||
|  | ||||
|             print(f"Exporting: {file_target}") | ||||
|             if os.path.exists(document.archive_path): | ||||
|                 archive_name = document.get_public_filename( | ||||
|                     archive=True, counter=filename_counter, suffix="_archive") | ||||
|                 archive_target = os.path.join(self.target, archive_name) | ||||
|                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name | ||||
|             else: | ||||
|                 archive_target = None | ||||
|  | ||||
|             t = int(time.mktime(document.created.timetuple())) | ||||
|             if document.storage_type == Document.STORAGE_TYPE_GPG: | ||||
|  | ||||
|                 with open(file_target, "wb") as f: | ||||
|                 with open(original_target, "wb") as f: | ||||
|                     f.write(GnuPG.decrypted(document.source_file)) | ||||
|                     os.utime(file_target, times=(t, t)) | ||||
|                     os.utime(original_target, times=(t, t)) | ||||
|  | ||||
|                 with open(thumbnail_target, "wb") as f: | ||||
|                     f.write(GnuPG.decrypted(document.thumbnail_file)) | ||||
|                     os.utime(thumbnail_target, times=(t, t)) | ||||
|  | ||||
|                 if archive_target: | ||||
|                     with open(archive_target, "wb") as f: | ||||
|                         f.write(GnuPG.decrypted(document.archive_path)) | ||||
|                         os.utime(archive_target, times=(t, t)) | ||||
|             else: | ||||
|  | ||||
|                 shutil.copy(document.source_path, file_target) | ||||
|                 shutil.copy(document.source_path, original_target) | ||||
|                 shutil.copy(document.thumbnail_path, thumbnail_target) | ||||
|  | ||||
|                 if archive_target: | ||||
|                     shutil.copy(document.archive_path, archive_target) | ||||
|  | ||||
|         manifest += json.loads( | ||||
|             serializers.serialize("json", Correspondent.objects.all())) | ||||
|  | ||||
| @@ -102,33 +118,3 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|         with open(os.path.join(self.target, "manifest.json"), "w") as f: | ||||
|             json.dump(manifest, f, indent=2) | ||||
|  | ||||
|     def dump_legacy(self): | ||||
|  | ||||
|         for document in Document.objects.all(): | ||||
|  | ||||
|             target = os.path.join( | ||||
|                 self.target, self._get_legacy_file_name(document)) | ||||
|  | ||||
|             print("Exporting: {}".format(target)) | ||||
|  | ||||
|             with open(target, "wb") as f: | ||||
|                 f.write(GnuPG.decrypted(document.source_file)) | ||||
|                 t = int(time.mktime(document.created.timetuple())) | ||||
|                 os.utime(target, times=(t, t)) | ||||
|  | ||||
|     @staticmethod | ||||
|     def _get_legacy_file_name(doc): | ||||
|  | ||||
|         if not doc.correspondent and not doc.title: | ||||
|             return os.path.basename(doc.source_path) | ||||
|  | ||||
|         created = doc.created.strftime("%Y%m%d%H%M%SZ") | ||||
|         tags = ",".join([t.slug for t in doc.tags.all()]) | ||||
|  | ||||
|         if tags: | ||||
|             return "{} - {} - {} - {}{}".format( | ||||
|                 created, doc.correspondent, doc.title, tags, doc.file_type) | ||||
|  | ||||
|         return "{} - {} - {}{}".format( | ||||
|             created, doc.correspondent, doc.title, doc.file_type) | ||||
|   | ||||
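The exporter's new naming loop boils down to probing with an increasing counter until a free name is found. A generic sketch of that pattern (the helper name and signature are illustrative, not from the codebase):

```python
import os


def find_free_name(target_dir, make_name):
    # make_name(counter) builds a candidate file name; bump the counter
    # until no file with that name exists in target_dir yet.
    counter = 0
    while os.path.exists(os.path.join(target_dir, make_name(counter))):
        counter += 1
    return make_name(counter), counter
```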
| @@ -7,8 +7,8 @@ from django.core.management import call_command | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
|  | ||||
| from documents.models import Document | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | ||||
| from paperless.db import GnuPG | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||
|     EXPORTER_ARCHIVE_NAME | ||||
| from ...file_handling import generate_filename, create_source_path_directory | ||||
| from ...mixins import Renderable | ||||
|  | ||||
| @@ -79,25 +79,41 @@ class Command(Renderable, BaseCommand): | ||||
|                     'appear to be in the source directory.'.format(doc_file) | ||||
|                 ) | ||||
|  | ||||
|             if EXPORTER_ARCHIVE_NAME in record: | ||||
|                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||
|                 if not os.path.exists(os.path.join(self.source, archive_file)): | ||||
|                     raise CommandError( | ||||
|                         f"The manifest file refers to {archive_file} which " | ||||
|                         f"does not appear to be in the source directory." | ||||
|                     ) | ||||
|  | ||||
|     def _import_files_from_manifest(self): | ||||
|  | ||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|         if settings.PASSPHRASE: | ||||
|             storage_type = Document.STORAGE_TYPE_GPG | ||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||
|  | ||||
|         for record in self.manifest: | ||||
|  | ||||
|             if not record["model"] == "documents.document": | ||||
|                 continue | ||||
|  | ||||
|             doc_file = record[EXPORTER_FILE_NAME] | ||||
|             thumb_file = record[EXPORTER_THUMBNAIL_NAME] | ||||
|             document = Document.objects.get(pk=record["pk"]) | ||||
|  | ||||
|             doc_file = record[EXPORTER_FILE_NAME] | ||||
|             document_path = os.path.join(self.source, doc_file) | ||||
|  | ||||
|             thumb_file = record[EXPORTER_THUMBNAIL_NAME] | ||||
|             thumbnail_path = os.path.join(self.source, thumb_file) | ||||
|  | ||||
|             document.storage_type = storage_type | ||||
|             if EXPORTER_ARCHIVE_NAME in record: | ||||
|                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||
|                 archive_path = os.path.join(self.source, archive_file) | ||||
|             else: | ||||
|                 archive_path = None | ||||
|  | ||||
|             document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|             document.filename = generate_filename(document) | ||||
|  | ||||
|             if os.path.isfile(document.source_path): | ||||
| @@ -105,23 +121,10 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|             create_source_path_directory(document.source_path) | ||||
|  | ||||
|             if settings.PASSPHRASE: | ||||
|  | ||||
|                 with open(document_path, "rb") as unencrypted: | ||||
|                     with open(document.source_path, "wb") as encrypted: | ||||
|                         print("Encrypting {} and saving it to {}".format( | ||||
|                             doc_file, document.source_path)) | ||||
|                         encrypted.write(GnuPG.encrypted(unencrypted)) | ||||
|  | ||||
|                 with open(thumbnail_path, "rb") as unencrypted: | ||||
|                     with open(document.thumbnail_path, "wb") as encrypted: | ||||
|                         print("Encrypting {} and saving it to {}".format( | ||||
|                             thumb_file, document.thumbnail_path)) | ||||
|                         encrypted.write(GnuPG.encrypted(unencrypted)) | ||||
|  | ||||
|             else: | ||||
|                 print(f"Moving {document_path} to {document.source_path}") | ||||
|                 shutil.copy(document_path, document.source_path) | ||||
|                 shutil.copy(thumbnail_path, document.thumbnail_path) | ||||
|             print(f"Moving {document_path} to {document.source_path}") | ||||
|             shutil.copy(document_path, document.source_path) | ||||
|             shutil.copy(thumbnail_path, document.thumbnail_path) | ||||
|             if archive_path: | ||||
|                 shutil.copy(archive_path, document.archive_path) | ||||
|  | ||||
|             document.save() | ||||
|   | ||||
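As a usage sketch of the round trip (assuming paperless's document_exporter and document_importer command names; the export directory is a placeholder):

    from django.core.management import call_command

    # Writes manifest.json plus originals, thumbnails and archived versions.
    call_command("document_exporter", "/tmp/export")
    # Copies everything back; the archive file is restored when the manifest
    # record carries an __exported_archive_name__ entry.
    call_command("document_importer", "/tmp/export")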
| @@ -5,23 +5,6 @@ from django.db import migrations, models | ||||
| import django.db.models.deletion | ||||
|  | ||||
|  | ||||
| def make_index(apps, schema_editor): | ||||
|     Document = apps.get_model("documents", "Document") | ||||
|     documents = Document.objects.all() | ||||
|     print() | ||||
|     try: | ||||
|         print("  --> Creating document index...") | ||||
|         from whoosh.writing import AsyncWriter | ||||
|         from documents import index | ||||
|         ix = index.open_index(recreate=True) | ||||
|         with AsyncWriter(ix) as writer: | ||||
|             for document in documents: | ||||
|                 index.update_document(writer, document) | ||||
|     except ImportError: | ||||
|         # index may not be relevant anymore | ||||
|         print("  --> Cannot create document index.") | ||||
|  | ||||
|  | ||||
| def logs_set_default_group(apps, schema_editor): | ||||
|     Log = apps.get_model('documents', 'Log') | ||||
|     for log in Log.objects.all(): | ||||
| @@ -99,8 +82,4 @@ class Migration(migrations.Migration): | ||||
|             code=django.db.migrations.operations.special.RunPython.noop, | ||||
|             reverse_code=logs_set_default_group | ||||
|         ), | ||||
|         migrations.RunPython( | ||||
|             code=make_index, | ||||
|             reverse_code=django.db.migrations.operations.special.RunPython.noop, | ||||
|         ), | ||||
|     ] | ||||
|   | ||||
src/documents/migrations/1004_sanity_check_schedule.py (new file, 26 lines)
						| @@ -0,0 +1,26 @@ | ||||
| # Generated by Django 3.1.3 on 2020-11-25 14:53 | ||||
|  | ||||
| from django.db import migrations | ||||
| from django.db.migrations import RunPython | ||||
| from django_q.models import Schedule | ||||
| from django_q.tasks import schedule | ||||
|  | ||||
|  | ||||
| def add_schedules(apps, schema_editor): | ||||
|     schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY) | ||||
|  | ||||
|  | ||||
| def remove_schedules(apps, schema_editor): | ||||
|     Schedule.objects.filter(func='documents.tasks.sanity_check').delete() | ||||
|  | ||||
|  | ||||
| class Migration(migrations.Migration): | ||||
|  | ||||
|     dependencies = [ | ||||
|         ('documents', '1003_mime_types'), | ||||
|         ('django_q', '0013_task_attempt_count'), | ||||
|     ] | ||||
|  | ||||
|     operations = [ | ||||
|         RunPython(add_schedules, remove_schedules) | ||||
|     ] | ||||
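To verify the schedule was created (for example from manage.py shell), one can filter on the same func string that remove_schedules() uses; this is a quick inspection sketch, not part of the migration:

    from django_q.models import Schedule

    # True once migration 1004 has run; the reverse migration deletes the row.
    Schedule.objects.filter(func='documents.tasks.sanity_check').exists()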
src/documents/migrations/1005_checksums.py (new file, 23 lines)
						| @@ -0,0 +1,23 @@ | ||||
| # Generated by Django 3.1.3 on 2020-11-29 00:48 | ||||
|  | ||||
| from django.db import migrations, models | ||||
|  | ||||
|  | ||||
| class Migration(migrations.Migration): | ||||
|  | ||||
|     dependencies = [ | ||||
|         ('documents', '1004_sanity_check_schedule'), | ||||
|     ] | ||||
|  | ||||
|     operations = [ | ||||
|         migrations.AddField( | ||||
|             model_name='document', | ||||
|             name='archive_checksum', | ||||
|             field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True), | ||||
|         ), | ||||
|         migrations.AlterField( | ||||
|             model_name='document', | ||||
|             name='checksum', | ||||
|             field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True), | ||||
|         ), | ||||
|     ] | ||||
| @@ -1,17 +1,21 @@ | ||||
| # coding=utf-8 | ||||
|  | ||||
| import datetime | ||||
| import logging | ||||
| import mimetypes | ||||
| import os | ||||
| import re | ||||
| from collections import OrderedDict | ||||
|  | ||||
| import pathvalidate | ||||
|  | ||||
| import dateutil.parser | ||||
| from django.conf import settings | ||||
| from django.db import models | ||||
| from django.utils import timezone | ||||
| from django.utils.text import slugify | ||||
|  | ||||
| from documents.file_handling import archive_name_from_filename | ||||
| from documents.parsers import get_default_file_extension | ||||
|  | ||||
|  | ||||
| class MatchingModel(models.Model): | ||||
|  | ||||
| @@ -157,9 +161,15 @@ class Document(models.Model): | ||||
|         max_length=32, | ||||
|         editable=False, | ||||
|         unique=True, | ||||
|         help_text="The checksum of the original document (before it was " | ||||
|                   "encrypted).  We use this to prevent duplicate document " | ||||
|                   "imports." | ||||
|         help_text="The checksum of the original document." | ||||
|     ) | ||||
|  | ||||
|     archive_checksum = models.CharField( | ||||
|         max_length=32, | ||||
|         editable=False, | ||||
|         blank=True, | ||||
|         null=True, | ||||
|         help_text="The checksum of the archived document." | ||||
|     ) | ||||
|  | ||||
|     created = models.DateTimeField( | ||||
| @@ -198,13 +208,11 @@ class Document(models.Model): | ||||
|         ordering = ("correspondent", "title") | ||||
|  | ||||
|     def __str__(self): | ||||
|         created = self.created.strftime("%Y%m%d%H%M%S") | ||||
|         created = datetime.date.isoformat(self.created) | ||||
|         if self.correspondent and self.title: | ||||
|             return "{}: {} - {}".format( | ||||
|                 created, self.correspondent, self.title) | ||||
|         if self.correspondent or self.title: | ||||
|             return "{}: {}".format(created, self.correspondent or self.title) | ||||
|         return str(created) | ||||
|             return f"{created} {self.correspondent} {self.title}" | ||||
|         else: | ||||
|             return f"{created} {self.title}" | ||||
|  | ||||
|     @property | ||||
|     def source_path(self): | ||||
| @@ -225,12 +233,40 @@ class Document(models.Model): | ||||
|         return open(self.source_path, "rb") | ||||
|  | ||||
|     @property | ||||
|     def file_name(self): | ||||
|         return slugify(str(self)) + self.file_type | ||||
|     def archive_path(self): | ||||
|         if self.filename: | ||||
|             fname = archive_name_from_filename(self.filename) | ||||
|         else: | ||||
|             fname = "{:07}.pdf".format(self.pk) | ||||
|  | ||||
|         return os.path.join( | ||||
|             settings.ARCHIVE_DIR, | ||||
|             fname | ||||
|         ) | ||||
|  | ||||
|     @property | ||||
|     def archive_file(self): | ||||
|         return open(self.archive_path, "rb") | ||||
|  | ||||
|     def get_public_filename(self, archive=False, counter=0, suffix=None): | ||||
|         result = str(self) | ||||
|  | ||||
|         if counter: | ||||
|             result += f"_{counter:02}" | ||||
|  | ||||
|         if suffix: | ||||
|             result += suffix | ||||
|  | ||||
|         if archive: | ||||
|             result += ".pdf" | ||||
|         else: | ||||
|             result += self.file_type | ||||
|  | ||||
|         return pathvalidate.sanitize_filename(result, replacement_text="-") | ||||
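To illustrate how get_public_filename() composes its parts (the example document and the resulting names are hypothetical; the prefix comes from Document.__str__):

    # A PDF titled "Invoice" from correspondent "ACME", created 2020-11-29:
    doc.get_public_filename()                # "2020-11-29 ACME Invoice.pdf"
    doc.get_public_filename(counter=2)       # "2020-11-29 ACME Invoice_02.pdf"
    doc.get_public_filename(suffix="_copy")  # "2020-11-29 ACME Invoice_copy.pdf"
    doc.get_public_filename(archive=True)    # archived version always gets ".pdf"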
|  | ||||
|     @property | ||||
|     def file_type(self): | ||||
|         return mimetypes.guess_extension(str(self.mime_type)) | ||||
|         return get_default_file_extension(self.mime_type) | ||||
|  | ||||
|     @property | ||||
|     def thumbnail_path(self): | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| import logging | ||||
| import mimetypes | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| @@ -42,6 +43,40 @@ def is_mime_type_supported(mime_type): | ||||
|     return get_parser_class_for_mime_type(mime_type) is not None | ||||
|  | ||||
|  | ||||
| def get_default_file_extension(mime_type): | ||||
|     for response in document_consumer_declaration.send(None): | ||||
|         parser_declaration = response[1] | ||||
|         supported_mime_types = parser_declaration["mime_types"] | ||||
|  | ||||
|         if mime_type in supported_mime_types: | ||||
|             return supported_mime_types[mime_type] | ||||
|  | ||||
|     ext = mimetypes.guess_extension(mime_type) | ||||
|     if ext: | ||||
|         return ext | ||||
|     else: | ||||
|         return "" | ||||
|  | ||||
|  | ||||
| def is_file_ext_supported(ext): | ||||
|     if ext: | ||||
|         return ext.lower() in get_supported_file_extensions() | ||||
|     else: | ||||
|         return False | ||||
|  | ||||
|  | ||||
| def get_supported_file_extensions(): | ||||
|     extensions = set() | ||||
|     for response in document_consumer_declaration.send(None): | ||||
|         parser_declaration = response[1] | ||||
|         supported_mime_types = parser_declaration["mime_types"] | ||||
|  | ||||
|         for mime_type in supported_mime_types: | ||||
|             extensions.update(mimetypes.guess_all_extensions(mime_type)) | ||||
|  | ||||
|     return extensions | ||||
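A few illustrative calls for the helpers above; actual results depend on which parsers are registered and on the host's mimetypes database:

    get_default_file_extension("application/pdf")  # ".pdf"
    get_default_file_extension("bogus/type")       # "" (no parser, no mimetypes guess)
    is_file_ext_supported(".PDF")                  # True, lower-cased before the lookup
    is_file_ext_supported("")                      # False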
|  | ||||
|  | ||||
| def get_parser_class_for_mime_type(mime_type): | ||||
|  | ||||
|     options = [] | ||||
| @@ -107,21 +142,59 @@ def run_convert(input_file, | ||||
|         raise ParseError("Convert failed at {}".format(args)) | ||||
|  | ||||
|  | ||||
| def run_unpaper(pnm, logging_group=None): | ||||
|     pnm_out = pnm.replace(".pnm", ".unpaper.pnm") | ||||
| def parse_date(filename, text): | ||||
|     """ | ||||
|     Returns the date of the document. | ||||
|     """ | ||||
|  | ||||
|     command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, | ||||
|                     pnm_out) | ||||
|     def __parser(ds, date_order): | ||||
|         """ | ||||
|         Call dateparser.parse with a particular date ordering | ||||
|         """ | ||||
|         return dateparser.parse( | ||||
|             ds, | ||||
|             settings={ | ||||
|                 "DATE_ORDER": date_order, | ||||
|                 "PREFER_DAY_OF_MONTH": "first", | ||||
|                 "RETURN_AS_TIMEZONE_AWARE": | ||||
|                 True | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|     logger.debug(f"Execute: {' '.join(command_args)}", | ||||
|                  extra={'group': logging_group}) | ||||
|     date = None | ||||
|  | ||||
|     if not subprocess.Popen(command_args, | ||||
|                             stdout=subprocess.DEVNULL, | ||||
|                             stderr=subprocess.DEVNULL).wait() == 0: | ||||
|         raise ParseError(f"Unpaper failed at {command_args}") | ||||
|     next_year = timezone.now().year + 5  # Arbitrary 5 year future limit | ||||
|  | ||||
|     return pnm_out | ||||
|     # if filename date parsing is enabled, search there first: | ||||
|     if settings.FILENAME_DATE_ORDER: | ||||
|         for m in re.finditer(DATE_REGEX, filename): | ||||
|             date_string = m.group(0) | ||||
|  | ||||
|             try: | ||||
|                 date = __parser(date_string, settings.FILENAME_DATE_ORDER) | ||||
|             except (TypeError, ValueError): | ||||
|                 # Skip all matches that do not parse to a proper date | ||||
|                 continue | ||||
|  | ||||
|             if date is not None and next_year > date.year > 1900: | ||||
|                 return date | ||||
|  | ||||
|     # Iterate through all regex matches in text and try to parse the date | ||||
|     for m in re.finditer(DATE_REGEX, text): | ||||
|         date_string = m.group(0) | ||||
|  | ||||
|         try: | ||||
|             date = __parser(date_string, settings.DATE_ORDER) | ||||
|         except (TypeError, ValueError): | ||||
|             # Skip all matches that do not parse to a proper date | ||||
|             continue | ||||
|  | ||||
|         if date is not None and next_year > date.year > 1900: | ||||
|             break | ||||
|         else: | ||||
|             date = None | ||||
|  | ||||
|     return date | ||||
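A usage sketch for parse_date(); both arguments are scanned with DATE_REGEX, the filename only when FILENAME_DATE_ORDER is set (the file name and text here are made up):

    # Returns a timezone-aware datetime, or None when no match parses to a
    # plausible date (1900 < year < current year + 5).
    date = parse_date("scan_2020-11-29.pdf", "Amount due by 29.11.2020.")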
|  | ||||
|  | ||||
| class ParseError(Exception): | ||||
| @@ -134,27 +207,36 @@ class DocumentParser(LoggingMixin): | ||||
|     `paperless_tesseract.parsers` for inspiration. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path, logging_group, progress_callback): | ||||
|     def __init__(self, logging_group, progress_callback): | ||||
|         super().__init__() | ||||
|         self.logging_group = logging_group | ||||
|         self.document_path = path | ||||
|         self.tempdir = tempfile.mkdtemp( | ||||
|             prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|  | ||||
|         self.archive_path = None | ||||
|         self.text = None | ||||
|         self.date = None | ||||
|         self.progress_callback = progress_callback | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def parse(self, document_path, mime_type): | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def get_archive_path(self): | ||||
|         return self.archive_path | ||||
|  | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         Returns the path to a file we can use as a thumbnail for this document. | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def optimise_thumbnail(self, in_path): | ||||
|  | ||||
|     def get_optimised_thumbnail(self, document_path, mime_type): | ||||
|         thumbnail = self.get_thumbnail(document_path, mime_type) | ||||
|         if settings.OPTIMIZE_THUMBNAILS: | ||||
|             out_path = os.path.join(self.tempdir, "optipng.png") | ||||
|             out_path = os.path.join(self.tempdir, "thumb_optipng.png") | ||||
|  | ||||
|             args = (settings.OPTIPNG_BINARY, | ||||
|                     "-silent", "-o5", in_path, "-out", out_path) | ||||
|                     "-silent", "-o5", thumbnail, "-out", out_path) | ||||
|  | ||||
|             self.log('debug', f"Execute: {' '.join(args)}") | ||||
|  | ||||
| @@ -163,97 +245,13 @@ class DocumentParser(LoggingMixin): | ||||
|  | ||||
|             return out_path | ||||
|         else: | ||||
|             return in_path | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|         return self.optimise_thumbnail(self.get_thumbnail()) | ||||
|             return thumbnail | ||||
|  | ||||
|     def get_text(self): | ||||
|         """ | ||||
|         Returns the text from the document and only the text. | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|         return self.text | ||||
|  | ||||
|     def get_date(self): | ||||
|         """ | ||||
|         Returns the date of the document. | ||||
|         """ | ||||
|  | ||||
|         def __parser(ds, date_order): | ||||
|             """ | ||||
|             Call dateparser.parse with a particular date ordering | ||||
|             """ | ||||
|             return dateparser.parse( | ||||
|                 ds, | ||||
|                 settings={ | ||||
|                     "DATE_ORDER": date_order, | ||||
|                     "PREFER_DAY_OF_MONTH": "first", | ||||
|                     "RETURN_AS_TIMEZONE_AWARE": | ||||
|                     True | ||||
|                 } | ||||
|             ) | ||||
|  | ||||
|         date = None | ||||
|         date_string = None | ||||
|  | ||||
|         next_year = timezone.now().year + 5  # Arbitrary 5 year future limit | ||||
|         title = os.path.basename(self.document_path) | ||||
|  | ||||
|         # if filename date parsing is enabled, search there first: | ||||
|         if settings.FILENAME_DATE_ORDER: | ||||
|             self.log("info", "Checking document title for date") | ||||
|             for m in re.finditer(DATE_REGEX, title): | ||||
|                 date_string = m.group(0) | ||||
|  | ||||
|                 try: | ||||
|                     date = __parser(date_string, settings.FILENAME_DATE_ORDER) | ||||
|                 except (TypeError, ValueError): | ||||
|                     # Skip all matches that do not parse to a proper date | ||||
|                     continue | ||||
|  | ||||
|                 if date is not None and next_year > date.year > 1900: | ||||
|                     self.log( | ||||
|                         "info", | ||||
|                         "Detected document date {} based on string {} " | ||||
|                         "from document title" | ||||
|                         "".format(date.isoformat(), date_string) | ||||
|                     ) | ||||
|                     return date | ||||
|  | ||||
|         try: | ||||
|             # getting text after checking filename will save time if only | ||||
|             # looking at the filename instead of the whole text | ||||
|             text = self.get_text() | ||||
|         except ParseError: | ||||
|             return None | ||||
|  | ||||
|         # Iterate through all regex matches in text and try to parse the date | ||||
|         for m in re.finditer(DATE_REGEX, text): | ||||
|             date_string = m.group(0) | ||||
|  | ||||
|             try: | ||||
|                 date = __parser(date_string, settings.DATE_ORDER) | ||||
|             except (TypeError, ValueError): | ||||
|                 # Skip all matches that do not parse to a proper date | ||||
|                 continue | ||||
|  | ||||
|             if date is not None and next_year > date.year > 1900: | ||||
|                 break | ||||
|             else: | ||||
|                 date = None | ||||
|  | ||||
|         if date is not None: | ||||
|             self.log( | ||||
|                 "info", | ||||
|                 "Detected document date {} based on string {}".format( | ||||
|                     date.isoformat(), | ||||
|                     date_string | ||||
|                 ) | ||||
|             ) | ||||
|         else: | ||||
|             self.log("info", "Unable to detect date for document") | ||||
|  | ||||
|         return date | ||||
|         return self.date | ||||
|  | ||||
|     def cleanup(self): | ||||
|         self.log("debug", "Deleting directory {}".format(self.tempdir)) | ||||
|   | ||||
src/documents/sanity_checker.py (new file, 117 lines)
						| @@ -0,0 +1,117 @@ | ||||
| import hashlib | ||||
| import os | ||||
|  | ||||
| from django.conf import settings | ||||
|  | ||||
| from documents.models import Document | ||||
|  | ||||
|  | ||||
| class SanityMessage: | ||||
|     message = None | ||||
|  | ||||
|  | ||||
| class SanityWarning(SanityMessage): | ||||
|     def __init__(self, message): | ||||
|         self.message = message | ||||
|  | ||||
|     def __str__(self): | ||||
|         return f"Warning: {self.message}" | ||||
|  | ||||
|  | ||||
| class SanityError(SanityMessage): | ||||
|     def __init__(self, message): | ||||
|         self.message = message | ||||
|  | ||||
|     def __str__(self): | ||||
|         return f"ERROR: {self.message}" | ||||
|  | ||||
|  | ||||
| class SanityFailedError(Exception): | ||||
|  | ||||
|     def __init__(self, messages): | ||||
|         self.messages = messages | ||||
|  | ||||
|     def __str__(self): | ||||
|         message_string = "\n".join([str(m) for m in self.messages]) | ||||
|         return ( | ||||
|             f"The following issuse were found by the sanity checker:\n" | ||||
|             f"{message_string}\n\n===============\n\n") | ||||
|  | ||||
|  | ||||
| def check_sanity(): | ||||
|     messages = [] | ||||
|  | ||||
|     present_files = [] | ||||
|     for root, subdirs, files in os.walk(settings.MEDIA_ROOT): | ||||
|         for f in files: | ||||
|             present_files.append(os.path.normpath(os.path.join(root, f))) | ||||
|  | ||||
|     for doc in Document.objects.all(): | ||||
|         # Check sanity of the thumbnail | ||||
|         if not os.path.isfile(doc.thumbnail_path): | ||||
|             messages.append(SanityError( | ||||
|                 f"Thumbnail of document {doc.pk} does not exist.")) | ||||
|         else: | ||||
|             present_files.remove(os.path.normpath(doc.thumbnail_path)) | ||||
|             try: | ||||
|                 with doc.thumbnail_file as f: | ||||
|                     f.read() | ||||
|             except OSError as e: | ||||
|                 messages.append(SanityError( | ||||
|                     f"Cannot read thumbnail file of document {doc.pk}: {e}" | ||||
|                 )) | ||||
|  | ||||
|         # Check sanity of the original file | ||||
|         # TODO: extract method | ||||
|         if not os.path.isfile(doc.source_path): | ||||
|             messages.append(SanityError( | ||||
|                 f"Original of document {doc.pk} does not exist.")) | ||||
|         else: | ||||
|             present_files.remove(os.path.normpath(doc.source_path)) | ||||
|             try: | ||||
|                 with doc.source_file as f: | ||||
|                     checksum = hashlib.md5(f.read()).hexdigest() | ||||
|             except OSError as e: | ||||
|                 messages.append(SanityError( | ||||
|                     f"Cannot read original file of document {doc.pk}: {e}")) | ||||
|             else: | ||||
|                 if not checksum == doc.checksum: | ||||
|                     messages.append(SanityError( | ||||
|                         f"Checksum mismatch of document {doc.pk}. " | ||||
|                         f"Stored: {doc.checksum}, actual: {checksum}." | ||||
|                     )) | ||||
|  | ||||
|         # Check sanity of the archive file. | ||||
|         if doc.archive_checksum: | ||||
|             if not os.path.isfile(doc.archive_path): | ||||
|                 messages.append(SanityError( | ||||
|                     f"Archived version of document {doc.pk} does not exist." | ||||
|                 )) | ||||
|             else: | ||||
|                 present_files.remove(os.path.normpath(doc.archive_path)) | ||||
|                 try: | ||||
|                     with doc.archive_file as f: | ||||
|                         checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                 except OSError as e: | ||||
|                     messages.append(SanityError( | ||||
|                         f"Cannot read archive file of document {doc.pk}: {e}" | ||||
|                     )) | ||||
|                 else: | ||||
|                     if not checksum == doc.archive_checksum: | ||||
|                         messages.append(SanityError( | ||||
|                             f"Checksum mismatch of archive {doc.pk}. " | ||||
|                             f"Stored: {doc.checksum}, actual: {checksum}." | ||||
|                         )) | ||||
|  | ||||
|         # other document checks | ||||
|         if not doc.content: | ||||
|             messages.append(SanityWarning( | ||||
|                 f"Document {doc.pk} has no content." | ||||
|             )) | ||||
|  | ||||
|     for extra_file in present_files: | ||||
|         messages.append(SanityWarning( | ||||
|             f"Orphaned file in media dir: {extra_file}" | ||||
|         )) | ||||
|  | ||||
|     return messages | ||||
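A minimal sketch of consuming the checker's result, mirroring what documents.tasks.sanity_check does later in this diff:

    messages = check_sanity()
    for message in messages:
        print(message)  # rendered as "ERROR: ..." or "Warning: ..." via __str__
    if messages:
        raise SanityFailedError(messages)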
| @@ -1,6 +1,9 @@ | ||||
| import magic | ||||
| from pathvalidate import validate_filename, ValidationError | ||||
| from rest_framework import serializers | ||||
|  | ||||
| from .models import Correspondent, Tag, Document, Log, DocumentType | ||||
| from .parsers import is_mime_type_supported | ||||
|  | ||||
|  | ||||
| class CorrespondentSerializer(serializers.HyperlinkedModelSerializer): | ||||
| @@ -76,11 +79,9 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField): | ||||
|  | ||||
| class DocumentSerializer(serializers.ModelSerializer): | ||||
|  | ||||
|     correspondent_id = CorrespondentField( | ||||
|         allow_null=True, source='correspondent') | ||||
|     tags_id = TagsField(many=True, source='tags') | ||||
|     document_type_id = DocumentTypeField( | ||||
|         allow_null=True, source='document_type') | ||||
|     correspondent = CorrespondentField(allow_null=True) | ||||
|     tags = TagsField(many=True) | ||||
|     document_type = DocumentTypeField(allow_null=True) | ||||
|  | ||||
|     class Meta: | ||||
|         model = Document | ||||
| @@ -88,19 +89,13 @@ class DocumentSerializer(serializers.ModelSerializer): | ||||
|         fields = ( | ||||
|             "id", | ||||
|             "correspondent", | ||||
|             "correspondent_id", | ||||
|             "document_type", | ||||
|             "document_type_id", | ||||
|             "title", | ||||
|             "content", | ||||
|             "mime_type", | ||||
|             "tags", | ||||
|             "tags_id", | ||||
|             "checksum", | ||||
|             "created", | ||||
|             "modified", | ||||
|             "added", | ||||
|             "file_name", | ||||
|             "archive_serial_number" | ||||
|         ) | ||||
|  | ||||
| @@ -116,3 +111,82 @@ class LogSerializer(serializers.ModelSerializer): | ||||
|             "group", | ||||
|             "level" | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class PostDocumentSerializer(serializers.Serializer): | ||||
|  | ||||
|     document = serializers.FileField( | ||||
|         label="Document", | ||||
|         write_only=True, | ||||
|     ) | ||||
|  | ||||
|     title = serializers.CharField( | ||||
|         label="Title", | ||||
|         write_only=True, | ||||
|         required=False, | ||||
|     ) | ||||
|  | ||||
|     correspondent = serializers.PrimaryKeyRelatedField( | ||||
|         queryset=Correspondent.objects.all(), | ||||
|         label="Correspondent", | ||||
|         allow_null=True, | ||||
|         write_only=True, | ||||
|         required=False, | ||||
|     ) | ||||
|  | ||||
|     document_type = serializers.PrimaryKeyRelatedField( | ||||
|         queryset=DocumentType.objects.all(), | ||||
|         label="Document type", | ||||
|         allow_null=True, | ||||
|         write_only=True, | ||||
|         required=False, | ||||
|     ) | ||||
|  | ||||
|     tags = serializers.PrimaryKeyRelatedField( | ||||
|         many=True, | ||||
|         queryset=Tag.objects.all(), | ||||
|         label="Tags", | ||||
|         write_only=True, | ||||
|         required=False, | ||||
|     ) | ||||
|  | ||||
|     def validate_document(self, document): | ||||
|  | ||||
|         try: | ||||
|             validate_filename(document.name) | ||||
|         except ValidationError: | ||||
|             raise serializers.ValidationError("Invalid filename.") | ||||
|  | ||||
|         document_data = document.file.read() | ||||
|         mime_type = magic.from_buffer(document_data, mime=True) | ||||
|  | ||||
|         if not is_mime_type_supported(mime_type): | ||||
|             raise serializers.ValidationError( | ||||
|                 "This file type is not supported.") | ||||
|  | ||||
|         return document.name, document_data | ||||
|  | ||||
|     def validate_title(self, title): | ||||
|         if title: | ||||
|             return title | ||||
|         else: | ||||
|             # do not return empty strings. | ||||
|             return None | ||||
|  | ||||
|     def validate_correspondent(self, correspondent): | ||||
|         if correspondent: | ||||
|             return correspondent.id | ||||
|         else: | ||||
|             return None | ||||
|  | ||||
|     def validate_document_type(self, document_type): | ||||
|         if document_type: | ||||
|             return document_type.id | ||||
|         else: | ||||
|             return None | ||||
|  | ||||
|     def validate_tags(self, tags): | ||||
|         if tags: | ||||
|             return [tag.id for tag in tags] | ||||
|         else: | ||||
|             return None | ||||
|   | ||||
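For reference, a hedged sketch of the upload request this serializer validates; the endpoint path appears in the tests further down, while the host, credentials and metadata values are placeholders:

    import requests

    with open("simple.pdf", "rb") as f:
        response = requests.post(
            "http://localhost:8000/api/documents/post_document/",
            files={"document": f},
            data={"title": "My invoice"},  # optional, like the other metadata fields
            auth=("user", "password"),
        )
    response.raise_for_status()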
| @@ -2,3 +2,4 @@ | ||||
| # for exporting/importing commands | ||||
| EXPORTER_FILE_NAME = "__exported_file_name__" | ||||
| EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__" | ||||
| EXPORTER_ARCHIVE_NAME = "__exported_archive_name__" | ||||
|   | ||||
| @@ -9,10 +9,11 @@ from django.contrib.contenttypes.models import ContentType | ||||
| from django.db import models, DatabaseError | ||||
| from django.dispatch import receiver | ||||
| from django.utils import timezone | ||||
| from rest_framework.reverse import reverse | ||||
|  | ||||
| from .. import index, matching | ||||
| from ..file_handling import delete_empty_directories, generate_filename, \ | ||||
|     create_source_path_directory | ||||
|     create_source_path_directory, archive_name_from_filename | ||||
| from ..models import Document, Tag | ||||
|  | ||||
|  | ||||
| @@ -156,11 +157,11 @@ def run_post_consume_script(sender, document, **kwargs): | ||||
|     Popen(( | ||||
|         settings.POST_CONSUME_SCRIPT, | ||||
|         str(document.pk), | ||||
|         document.file_name, | ||||
|         document.source_path, | ||||
|         document.thumbnail_path, | ||||
|         None, | ||||
|         None, | ||||
|         document.get_public_filename(), | ||||
|         os.path.normpath(document.source_path), | ||||
|         os.path.normpath(document.thumbnail_path), | ||||
|         reverse("document-download", kwargs={"pk": document.pk}), | ||||
|         reverse("document-thumb", kwargs={"pk": document.pk}), | ||||
|         str(document.correspondent), | ||||
|         str(",".join(document.tags.all().values_list("slug", flat=True))) | ||||
|     )).wait() | ||||
| @@ -168,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs): | ||||
|  | ||||
| @receiver(models.signals.post_delete, sender=Document) | ||||
| def cleanup_document_deletion(sender, instance, using, **kwargs): | ||||
|     for f in (instance.source_path, instance.thumbnail_path): | ||||
|         try: | ||||
|             os.unlink(f) | ||||
|         except FileNotFoundError: | ||||
|             pass  # The file's already gone, so we're cool with it. | ||||
|     for f in (instance.source_path, | ||||
|               instance.archive_path, | ||||
|               instance.thumbnail_path): | ||||
|         if os.path.isfile(f): | ||||
|             try: | ||||
|                 os.unlink(f) | ||||
|                 logging.getLogger(__name__).debug( | ||||
|                     f"Deleted file {f}.") | ||||
|             except OSError as e: | ||||
|                 logging.getLogger(__name__).warning( | ||||
|                     f"While deleting document {str(instance)}, the file " | ||||
|                     f"{f} could not be deleted: {e}" | ||||
|                 ) | ||||
|  | ||||
|     delete_empty_directories(os.path.dirname(instance.source_path)) | ||||
|     delete_empty_directories( | ||||
|         os.path.dirname(instance.source_path), | ||||
|         root=settings.ORIGINALS_DIR | ||||
|     ) | ||||
|  | ||||
|     delete_empty_directories( | ||||
|         os.path.dirname(instance.archive_path), | ||||
|         root=settings.ARCHIVE_DIR | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def validate_move(instance, old_path, new_path): | ||||
|     if not os.path.isfile(old_path): | ||||
|         # Can't do anything if the old file does not exist anymore. | ||||
|         logging.getLogger(__name__).fatal( | ||||
|             f"Document {str(instance)}: File {old_path} has gone.") | ||||
|         return False | ||||
|  | ||||
|     if os.path.isfile(new_path): | ||||
|         # Can't do anything if the new file already exists. Skip updating file. | ||||
|         logging.getLogger(__name__).warning( | ||||
|             f"Document {str(instance)}: Cannot rename file " | ||||
|             f"since target path {new_path} already exists.") | ||||
|         return False | ||||
|  | ||||
|     return True | ||||
|  | ||||
|  | ||||
| @receiver(models.signals.m2m_changed, sender=Document.tags.through) | ||||
| @@ -182,51 +216,91 @@ def cleanup_document_deletion(sender, instance, using, **kwargs): | ||||
| def update_filename_and_move_files(sender, instance, **kwargs): | ||||
|  | ||||
|     if not instance.filename: | ||||
|         # Can't update the filename if there is not filename to begin with | ||||
|         # This happens after the consumer creates a new document. | ||||
|         # The PK needs to be set first by saving the document once. When this | ||||
|         # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be | ||||
|         # renamed anyway. In all other cases, instance.filename will be set. | ||||
|         # Can't update the filename if there is no filename to begin with | ||||
|         # This happens when the consumer creates a new document. | ||||
|         # The document is modified and saved multiple times, and only after | ||||
|         # everything is done (i.e., the generated filename is final), | ||||
|         # filename will be set to the location where the consumer has put | ||||
|         # the file. | ||||
|         # | ||||
|         # This will in turn cause this logic to move the file where it belongs. | ||||
|         return | ||||
|  | ||||
|     old_filename = instance.filename | ||||
|     old_path = instance.source_path | ||||
|     new_filename = generate_filename(instance) | ||||
|  | ||||
|     if new_filename == instance.filename: | ||||
|         # Don't do anything if its the same. | ||||
|         return | ||||
|  | ||||
|     new_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||
|     old_source_path = instance.source_path | ||||
|     new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||
|  | ||||
|     if not os.path.isfile(old_path): | ||||
|         # Can't do anything if the old file does not exist anymore. | ||||
|         logging.getLogger(__name__).fatal( | ||||
|             f"Document {str(instance)}: File {old_path} has gone.") | ||||
|     if not validate_move(instance, old_source_path, new_source_path): | ||||
|         return | ||||
|  | ||||
|     if os.path.isfile(new_path): | ||||
|         # Can't do anything if the new file already exists. Skip updating file. | ||||
|         logging.getLogger(__name__).warning( | ||||
|             f"Document {str(instance)}: Cannot rename file " | ||||
|             f"since target path {new_path} already exists.") | ||||
|         return | ||||
|     # archive files are optional, archive checksum tells us if we have one, | ||||
|     # since this is None for documents without archived files. | ||||
|     if instance.archive_checksum: | ||||
|         new_archive_filename = archive_name_from_filename(new_filename) | ||||
|         old_archive_path = instance.archive_path | ||||
|         new_archive_path = os.path.join(settings.ARCHIVE_DIR, | ||||
|                                         new_archive_filename) | ||||
|  | ||||
|     create_source_path_directory(new_path) | ||||
|         if not validate_move(instance, old_archive_path, new_archive_path): | ||||
|             return | ||||
|  | ||||
|         create_source_path_directory(new_archive_path) | ||||
|     else: | ||||
|         old_archive_path = None | ||||
|         new_archive_path = None | ||||
|  | ||||
|     create_source_path_directory(new_source_path) | ||||
|  | ||||
|     try: | ||||
|         os.rename(old_path, new_path) | ||||
|         os.rename(old_source_path, new_source_path) | ||||
|         if instance.archive_checksum: | ||||
|             os.rename(old_archive_path, new_archive_path) | ||||
|         instance.filename = new_filename | ||||
|         instance.save() | ||||
|         # Don't save here to prevent infinite recursion. | ||||
|         Document.objects.filter(pk=instance.pk).update(filename=new_filename) | ||||
|  | ||||
|         logging.getLogger(__name__).debug( | ||||
|             f"Moved file {old_source_path} to {new_source_path}.") | ||||
|  | ||||
|         if instance.archive_checksum: | ||||
|             logging.getLogger(__name__).debug( | ||||
|                 f"Moved file {old_archive_path} to {new_archive_path}.") | ||||
|  | ||||
|     except OSError as e: | ||||
|         instance.filename = old_filename | ||||
|         # this happens when we can't move a file. If that's the case for the | ||||
|         # archive file, we try our best to revert the changes. | ||||
|         try: | ||||
|             os.rename(new_source_path, old_source_path) | ||||
|             os.rename(new_archive_path, old_archive_path) | ||||
|         except Exception as e: | ||||
|             # This is fine, since: | ||||
|             # A: if we managed to move the source from A to B, we will also | ||||
|             #    manage to move it back from B to A. If not, we have a serious | ||||
|             #    issue that the sanity checker will catch. All files remain in | ||||
|             #    place and are never overwritten, so this is not the end of | ||||
|             #    the world. | ||||
|             # B: if moving the original file failed, nothing has changed anyway. | ||||
|             pass | ||||
|     except DatabaseError as e: | ||||
|         os.rename(new_path, old_path) | ||||
|         os.rename(new_source_path, old_source_path) | ||||
|         if instance.archive_checksum: | ||||
|             os.rename(new_archive_path, old_archive_path) | ||||
|         instance.filename = old_filename | ||||
|  | ||||
|     if not os.path.isfile(old_path): | ||||
|         delete_empty_directories(os.path.dirname(old_path)) | ||||
|     if not os.path.isfile(old_source_path): | ||||
|         delete_empty_directories(os.path.dirname(old_source_path), | ||||
|                                  root=settings.ORIGINALS_DIR) | ||||
|  | ||||
|     if old_archive_path and not os.path.isfile(old_archive_path): | ||||
|         delete_empty_directories(os.path.dirname(old_archive_path), | ||||
|                                  root=settings.ARCHIVE_DIR) | ||||
|  | ||||
|  | ||||
| def set_log_entry(sender, document=None, logging_group=None, **kwargs): | ||||
|   | ||||
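One design note on the rename handler above: persisting the new filename with QuerySet.update() instead of instance.save() matters, because update() issues a single SQL UPDATE and fires no signals, so the post_save/m2m_changed handler is not re-entered:

    # Signal-safe persistence of the new filename; does not recurse.
    Document.objects.filter(pk=instance.pk).update(filename=new_filename)
    # instance.save()  # would re-trigger update_filename_and_move_files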
| @@ -3,15 +3,18 @@ import logging | ||||
| from django.conf import settings | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
| from documents import index | ||||
| from documents import index, sanity_checker | ||||
| from documents.classifier import DocumentClassifier, \ | ||||
|     IncompatibleClassifierVersionError | ||||
| from documents.consumer import Consumer, ConsumerError | ||||
| from documents.models import Document | ||||
| from documents.sanity_checker import SanityFailedError | ||||
|  | ||||
|  | ||||
| def index_optimize(): | ||||
|     index.open_index().optimize() | ||||
|     ix = index.open_index() | ||||
|     writer = AsyncWriter(ix) | ||||
|     writer.commit(optimize=True) | ||||
|  | ||||
|  | ||||
| def index_reindex(): | ||||
| @@ -74,3 +77,12 @@ def consume_file(path, | ||||
|     else: | ||||
|         raise ConsumerError("Unknown error: Returned document was null, but " | ||||
|                             "no error message was given.") | ||||
|  | ||||
|  | ||||
| def sanity_check(): | ||||
|     messages = sanity_checker.check_sanity() | ||||
|  | ||||
|     if len(messages) > 0: | ||||
|         raise SanityFailedError(messages) | ||||
|     else: | ||||
|         return "No issues detected." | ||||
|   | ||||
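On the index_optimize() change above: AsyncWriter transparently waits for the index lock in a background thread when another writer holds it, so the optimize commit no longer fails outright in that case. The pattern, restated as a standalone sketch:

    from whoosh.writing import AsyncWriter
    from documents import index

    ix = index.open_index()
    writer = AsyncWriter(ix)      # defers to a thread if the lock is taken
    writer.commit(optimize=True)  # merges index segments during the commit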
(modified image file: 32 KiB before, 32 KiB after)
src/documents/tests/samples/documents/archive/0000001.pdf (new binary file)
src/documents/tests/samples/documents/originals/0000002.pdf.gpg (new binary file)
src/documents/tests/samples/documents/thumbnails/0000001.png (new binary file, 7.7 KiB)
src/documents/tests/samples/documents/thumbnails/0000002.png.gpg (new binary file)
src/documents/tests/samples/simple.pdf (new binary file)
src/documents/tests/samples/simple.zip (new binary file)
						| @@ -1,40 +1,25 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from unittest import mock | ||||
|  | ||||
| from django.contrib.auth.models import User | ||||
| from django.test import override_settings | ||||
| from pathvalidate import ValidationError | ||||
| from rest_framework.test import APITestCase | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
| from documents import index | ||||
| from documents.models import Document, Correspondent, DocumentType, Tag | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class DocumentApiTest(APITestCase): | ||||
| class TestDocumentApi(DirectoriesMixin, APITestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch_dir = tempfile.mkdtemp() | ||||
|         self.media_dir = tempfile.mkdtemp() | ||||
|         self.originals_dir = os.path.join(self.media_dir, "documents", "originals") | ||||
|         self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails") | ||||
|  | ||||
|         os.makedirs(self.originals_dir, exist_ok=True) | ||||
|         os.makedirs(self.thumbnail_dir, exist_ok=True) | ||||
|  | ||||
|         override_settings( | ||||
|             SCRATCH_DIR=self.scratch_dir, | ||||
|             MEDIA_ROOT=self.media_dir, | ||||
|             ORIGINALS_DIR=self.originals_dir, | ||||
|             THUMBNAIL_DIR=self.thumbnail_dir | ||||
|         ).enable() | ||||
|         super(TestDocumentApi, self).setUp() | ||||
|  | ||||
|         user = User.objects.create_superuser(username="temp_admin") | ||||
|         self.client.force_login(user=user) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch_dir, ignore_errors=True) | ||||
|         shutil.rmtree(self.media_dir, ignore_errors=True) | ||||
|  | ||||
|     def testDocuments(self): | ||||
|  | ||||
|         response = self.client.get("/api/documents/").data | ||||
| @@ -56,20 +41,13 @@ class DocumentApiTest(APITestCase): | ||||
|         returned_doc = response.data['results'][0] | ||||
|         self.assertEqual(returned_doc['id'], doc.id) | ||||
|         self.assertEqual(returned_doc['title'], doc.title) | ||||
|         self.assertEqual(returned_doc['correspondent']['name'], c.name) | ||||
|         self.assertEqual(returned_doc['document_type']['name'], dt.name) | ||||
|         self.assertEqual(returned_doc['correspondent']['id'], c.id) | ||||
|         self.assertEqual(returned_doc['document_type']['id'], dt.id) | ||||
|         self.assertEqual(returned_doc['correspondent']['id'], returned_doc['correspondent_id']) | ||||
|         self.assertEqual(returned_doc['document_type']['id'], returned_doc['document_type_id']) | ||||
|         self.assertEqual(len(returned_doc['tags']), 1) | ||||
|         self.assertEqual(returned_doc['tags'][0]['name'], tag.name) | ||||
|         self.assertEqual(returned_doc['tags'][0]['id'], tag.id) | ||||
|         self.assertListEqual(returned_doc['tags_id'], [tag.id]) | ||||
|         self.assertEqual(returned_doc['correspondent'], c.id) | ||||
|         self.assertEqual(returned_doc['document_type'], dt.id) | ||||
|         self.assertListEqual(returned_doc['tags'], [tag.id]) | ||||
|  | ||||
|         c2 = Correspondent.objects.create(name="c2") | ||||
|  | ||||
|         returned_doc['correspondent_id'] = c2.pk | ||||
|         returned_doc['correspondent'] = c2.pk | ||||
|         returned_doc['title'] = "the new title" | ||||
|  | ||||
|         response = self.client.put('/api/documents/{}/'.format(doc.pk), returned_doc, format='json') | ||||
| @@ -87,7 +65,7 @@ class DocumentApiTest(APITestCase): | ||||
|  | ||||
|     def test_document_actions(self): | ||||
|  | ||||
|         _, filename = tempfile.mkstemp(dir=self.originals_dir) | ||||
|         _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir) | ||||
|  | ||||
|         content = b"This is a test" | ||||
|         content_thumbnail = b"thumbnail content" | ||||
| @@ -97,7 +75,7 @@ class DocumentApiTest(APITestCase): | ||||
|  | ||||
|         doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf") | ||||
|  | ||||
|         with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f: | ||||
|         with open(os.path.join(self.dirs.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f: | ||||
|             f.write(content_thumbnail) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) | ||||
| @@ -115,6 +93,44 @@ class DocumentApiTest(APITestCase): | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content_thumbnail) | ||||
|  | ||||
|     def test_download_with_archive(self): | ||||
|  | ||||
|         _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir) | ||||
|  | ||||
|         content = b"This is a test" | ||||
|         content_archive = b"This is the same test but archived" | ||||
|  | ||||
|         with open(filename, "wb") as f: | ||||
|             f.write(content) | ||||
|  | ||||
|         filename = os.path.basename(filename) | ||||
|  | ||||
|         doc = Document.objects.create(title="none", filename=filename, | ||||
|                                       mime_type="application/pdf") | ||||
|  | ||||
|         with open(doc.archive_path, "wb") as f: | ||||
|             f.write(content_archive) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content_archive) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/download/?original=true'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/preview/'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content_archive) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/preview/?original=true'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content) | ||||
|  | ||||
|     def test_document_actions_not_existing_file(self): | ||||
|  | ||||
|         doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf") | ||||
| @@ -179,6 +195,109 @@ class DocumentApiTest(APITestCase): | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(len(results), 3) | ||||
|  | ||||
|     def test_search_no_query(self): | ||||
|         response = self.client.get("/api/search/") | ||||
|         results = response.data['results'] | ||||
|  | ||||
|         self.assertEqual(len(results), 0) | ||||
|  | ||||
|     def test_search(self): | ||||
|         d1=Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1) | ||||
|         d2=Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B") | ||||
|         d3=Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C") | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             # Note to future self: there is a reason we don't use a model signal handler to update the index: some operations edit many documents at once | ||||
|             # (retagger, renamer) and we don't want to open a writer for each of these, but rather perform the entire operation with one writer. | ||||
|             # That's why we can't open the writer in a model on_save handler or something. | ||||
|             index.update_document(writer, d1) | ||||
|             index.update_document(writer, d2) | ||||
|             index.update_document(writer, d3) | ||||
|         response = self.client.get("/api/search/?query=bank") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 3) | ||||
|         self.assertEqual(response.data['page'], 1) | ||||
|         self.assertEqual(response.data['page_count'], 1) | ||||
|         self.assertEqual(len(results), 3) | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=september") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 1) | ||||
|         self.assertEqual(response.data['page'], 1) | ||||
|         self.assertEqual(response.data['page_count'], 1) | ||||
|         self.assertEqual(len(results), 1) | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=statement") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 2) | ||||
|         self.assertEqual(response.data['page'], 1) | ||||
|         self.assertEqual(response.data['page_count'], 1) | ||||
|         self.assertEqual(len(results), 2) | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=sfegdfg") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 0) | ||||
|         self.assertEqual(response.data['page'], 0) | ||||
|         self.assertEqual(response.data['page_count'], 0) | ||||
|         self.assertEqual(len(results), 0) | ||||
|  | ||||
|     def test_search_multi_page(self): | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             for i in range(55): | ||||
|                 doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content") | ||||
|                 index.update_document(writer, doc) | ||||
|  | ||||
|         # This is here so that we test that no document gets returned twice (might happen if the paging is not working) | ||||
|         seen_ids = [] | ||||
|  | ||||
|         for i in range(1, 6): | ||||
|             response = self.client.get(f"/api/search/?query=content&page={i}") | ||||
|             results = response.data['results'] | ||||
|             self.assertEqual(response.data['count'], 55) | ||||
|             self.assertEqual(response.data['page'], i) | ||||
|             self.assertEqual(response.data['page_count'], 6) | ||||
|             self.assertEqual(len(results), 10) | ||||
|  | ||||
|             for result in results: | ||||
|                 self.assertNotIn(result['id'], seen_ids) | ||||
|                 seen_ids.append(result['id']) | ||||
|  | ||||
|         response = self.client.get(f"/api/search/?query=content&page=6") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 55) | ||||
|         self.assertEqual(response.data['page'], 6) | ||||
|         self.assertEqual(response.data['page_count'], 6) | ||||
|         self.assertEqual(len(results), 5) | ||||
|  | ||||
|         for result in results: | ||||
|             self.assertNotIn(result['id'], seen_ids) | ||||
|             seen_ids.append(result['id']) | ||||
|  | ||||
|         response = self.client.get(f"/api/search/?query=content&page=7") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 55) | ||||
|         self.assertEqual(response.data['page'], 6) | ||||
|         self.assertEqual(response.data['page_count'], 6) | ||||
|         self.assertEqual(len(results), 5) | ||||
|  | ||||
|     def test_search_invalid_page(self): | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             for i in range(15): | ||||
|                 doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content") | ||||
|                 index.update_document(writer, doc) | ||||
|  | ||||
|         first_page = self.client.get(f"/api/search/?query=content&page=1").data | ||||
|         second_page = self.client.get(f"/api/search/?query=content&page=2").data | ||||
|         should_be_first_page_1 = self.client.get(f"/api/search/?query=content&page=0").data | ||||
|         should_be_first_page_2 = self.client.get(f"/api/search/?query=content&page=dgfd").data | ||||
|         should_be_first_page_3 = self.client.get(f"/api/search/?query=content&page=").data | ||||
|         should_be_first_page_4 = self.client.get(f"/api/search/?query=content&page=-7868").data | ||||
|  | ||||
|         self.assertDictEqual(first_page, should_be_first_page_1) | ||||
|         self.assertDictEqual(first_page, should_be_first_page_2) | ||||
|         self.assertDictEqual(first_page, should_be_first_page_3) | ||||
|         self.assertDictEqual(first_page, should_be_first_page_4) | ||||
|         self.assertNotEqual(len(first_page['results']), len(second_page['results'])) | ||||
|  | ||||
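|     # A sketch (not the actual view code) of the page handling the two tests | ||||
|     # above assume: missing, non-numeric, or out-of-range page values resolve | ||||
|     # to the nearest valid page. `clamp_page` is a hypothetical helper named | ||||
|     # here only for illustration: | ||||
|     # | ||||
|     #     def clamp_page(raw, page_count): | ||||
|     #         try: | ||||
|     #             page = int(raw) | ||||
|     #         except (TypeError, ValueError): | ||||
|     #             page = 1 | ||||
|     #         return max(1, min(page, max(page_count, 1))) | ||||
|     # | ||||
|     # clamp_page("0", 6) == clamp_page("dgfd", 6) == clamp_page("-7868", 6) == 1 | ||||
|  | ||||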
|     @mock.patch("documents.index.autocomplete") | ||||
|     def test_search_autocomplete(self, m): | ||||
|         m.side_effect = lambda ix, term, limit: [term for _ in range(limit)] | ||||
| @@ -201,6 +320,22 @@ class DocumentApiTest(APITestCase): | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(len(response.data), 10) | ||||
|  | ||||
|     def test_search_spelling_correction(self): | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             for i in range(55): | ||||
|                 doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content=f"Things document {i+1}") | ||||
|                 index.update_document(writer, doc) | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=thing") | ||||
|         correction = response.data['corrected_query'] | ||||
|  | ||||
|         self.assertEqual(correction, "things") | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=things") | ||||
|         correction = response.data['corrected_query'] | ||||
|  | ||||
|         self.assertIsNone(correction) | ||||
|  | ||||
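|     # A sketch of how `corrected_query` could be produced with Whoosh's query | ||||
|     # correction; the "content" field name and the None-when-unchanged | ||||
|     # behaviour are assumptions based on what the test above asserts: | ||||
|     # | ||||
|     #     q = QueryParser("content", ix.schema).parse(query_string) | ||||
|     #     corrected = searcher.correct_query(q, query_string) | ||||
|     #     if corrected.string != query_string: | ||||
|     #         return corrected.string   # e.g. "thing" -> "things" | ||||
|     #     return None                   # already spelled correctly | ||||
|  | ||||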
|     def test_statistics(self): | ||||
|  | ||||
|         doc1 = Document.objects.create(title="none1", checksum="A") | ||||
| @@ -215,3 +350,128 @@ class DocumentApiTest(APITestCase): | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.data['documents_total'], 3) | ||||
|         self.assertEqual(response.data['documents_inbox'], 1) | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload(self, m): | ||||
|  | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f}) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
|         args, kwargs = m.call_args | ||||
|         self.assertEqual(kwargs['override_filename'], "simple.pdf") | ||||
|         self.assertIsNone(kwargs['override_title']) | ||||
|         self.assertIsNone(kwargs['override_correspondent_id']) | ||||
|         self.assertIsNone(kwargs['override_document_type_id']) | ||||
|         self.assertIsNone(kwargs['override_tag_ids']) | ||||
|  | ||||
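|     # The upload tests here all inspect the same call shape; a sketch, | ||||
|     # assuming the view enqueues consumption roughly like this (the task name | ||||
|     # and temp-file handling are illustrative, not verbatim): | ||||
|     # | ||||
|     #     async_task("documents.tasks.consume_file", | ||||
|     #                temp_path,                       # copy of the upload | ||||
|     #                override_filename="simple.pdf", | ||||
|     #                override_title=None, | ||||
|     #                override_correspondent_id=None, | ||||
|     #                override_document_type_id=None, | ||||
|     #                override_tag_ids=None) | ||||
|  | ||||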
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_invalid_form(self, m): | ||||
|  | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"documenst": f}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|         m.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_invalid_file(self, m): | ||||
|  | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.zip"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|         m.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     @mock.patch("documents.serialisers.validate_filename") | ||||
|     def test_upload_invalid_filename(self, validate_filename, async_task): | ||||
|         validate_filename.side_effect = ValidationError() | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|  | ||||
|         async_task.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_title(self, async_task): | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "title": "my custom title"}) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         async_task.assert_called_once() | ||||
|  | ||||
|         args, kwargs = async_task.call_args | ||||
|  | ||||
|         self.assertEqual(kwargs['override_title'], "my custom title") | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_correspondent(self, async_task): | ||||
|         c = Correspondent.objects.create(name="test-corres") | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": c.id}) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         async_task.assert_called_once() | ||||
|  | ||||
|         args, kwargs = async_task.call_args | ||||
|  | ||||
|         self.assertEqual(kwargs['override_correspondent_id'], c.id) | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_invalid_correspondent(self, async_task): | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": 3456}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|  | ||||
|         async_task.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_document_type(self, async_task): | ||||
|         dt = DocumentType.objects.create(name="invoice") | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": dt.id}) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         async_task.assert_called_once() | ||||
|  | ||||
|         args, kwargs = async_task.call_args | ||||
|  | ||||
|         self.assertEqual(kwargs['override_document_type_id'], dt.id) | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_invalid_document_type(self, async_task): | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": 34578}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|  | ||||
|         async_task.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_tags(self, async_task): | ||||
|         t1 = Tag.objects.create(name="tag1") | ||||
|         t2 = Tag.objects.create(name="tag2") | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post( | ||||
|                 "/api/documents/post_document/", | ||||
|                 {"document": f, "tags": [t2.id, t1.id]}) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         async_task.assert_called_once() | ||||
|  | ||||
|         args, kwargs = async_task.call_args | ||||
|  | ||||
|         self.assertCountEqual(kwargs['override_tag_ids'], [t1.id, t2.id]) | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_invalid_tags(self, async_task): | ||||
|         t1 = Tag.objects.create(name="tag1") | ||||
|         t2 = Tag.objects.create(name="tag2") | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post( | ||||
|                 "/api/documents/post_document/", | ||||
|                 {"document": f, "tags": [t2.id, t1.id, 734563]}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|  | ||||
|         async_task.assert_not_called() | ||||
|   | ||||
| @@ -1,24 +1,29 @@ | ||||
| import tempfile | ||||
| from time import sleep | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.classifier import DocumentClassifier | ||||
| from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||
| from documents.models import Correspondent, Document, Tag, DocumentType | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class TestClassifier(TestCase): | ||||
| class TestClassifier(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|  | ||||
|         super(TestClassifier, self).setUp() | ||||
|         self.classifier = DocumentClassifier() | ||||
|  | ||||
|     def generate_test_data(self): | ||||
|         self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO) | ||||
|         self.c2 = Correspondent.objects.create(name="c2") | ||||
|         self.c3 = Correspondent.objects.create(name="c3", matching_algorithm=Correspondent.MATCH_AUTO) | ||||
|         self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|         self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True) | ||||
|         self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45) | ||||
|         self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO) | ||||
|         self.dt2 = DocumentType.objects.create(name="dt2", matching_algorithm=DocumentType.MATCH_AUTO) | ||||
|  | ||||
|         self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt) | ||||
|         self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B") | ||||
| @@ -59,8 +64,8 @@ class TestClassifier(TestCase): | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk) | ||||
|         self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None) | ||||
|         self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,)) | ||||
|         self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk)) | ||||
|         self.assertListEqual(self.classifier.predict_tags(self.doc1.content), [self.t1.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(self.doc2.content), [self.t1.pk, self.t3.pk]) | ||||
|         self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk) | ||||
|         self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None) | ||||
|  | ||||
| @@ -71,6 +76,44 @@ class TestClassifier(TestCase): | ||||
|         self.assertTrue(self.classifier.train()) | ||||
|         self.assertFalse(self.classifier.train()) | ||||
|  | ||||
|     def testVersionIncreased(self): | ||||
|  | ||||
|         self.generate_test_data() | ||||
|         self.assertTrue(self.classifier.train()) | ||||
|         self.assertFalse(self.classifier.train()) | ||||
|  | ||||
|         self.classifier.save_classifier() | ||||
|  | ||||
|         classifier2 = DocumentClassifier() | ||||
|  | ||||
|         current_ver = DocumentClassifier.FORMAT_VERSION | ||||
|         with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver+1): | ||||
|             # ensure that we won't load old classifiers. | ||||
|             self.assertRaises(IncompatibleClassifierVersionError, classifier2.reload) | ||||
|  | ||||
|             self.classifier.save_classifier() | ||||
|  | ||||
|             # ensure that we can load the classifier after saving it. | ||||
|             classifier2.reload() | ||||
|  | ||||
|     def testReload(self): | ||||
|  | ||||
|         self.generate_test_data() | ||||
|         self.assertTrue(self.classifier.train()) | ||||
|         self.classifier.save_classifier() | ||||
|  | ||||
|         classifier2 = DocumentClassifier() | ||||
|         classifier2.reload() | ||||
|         v1 = classifier2.classifier_version | ||||
|  | ||||
|         # save the classifier again after a moment so that the file's mtime changes. | ||||
|         sleep(1) | ||||
|         self.classifier.save_classifier() | ||||
|  | ||||
|         classifier2.reload() | ||||
|         v2 = classifier2.classifier_version | ||||
|         self.assertNotEqual(v1, v2) | ||||
|  | ||||
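|     # testReload hinges on the model file's mtime changing between saves; a | ||||
|     # sketch of the reload guard, given that the classifier remembers the | ||||
|     # mtime it last loaded (see the classifier diff; names are illustrative): | ||||
|     # | ||||
|     #     mtime = os.path.getmtime(model_file) | ||||
|     #     if mtime != self.classifier_version: | ||||
|     #         ...  # reload pickled state, then self.classifier_version = mtime | ||||
|  | ||||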
|     @override_settings(DATA_DIR=tempfile.mkdtemp()) | ||||
|     def testSaveClassifier(self): | ||||
|  | ||||
| @@ -83,3 +126,112 @@ class TestClassifier(TestCase): | ||||
|         new_classifier = DocumentClassifier() | ||||
|         new_classifier.reload() | ||||
|         self.assertFalse(new_classifier.train()) | ||||
|  | ||||
|     def test_one_correspondent_predict(self): | ||||
|         c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO) | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A") | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk) | ||||
|  | ||||
|     def test_one_correspondent_predict_manydocs(self): | ||||
|         c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO) | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A") | ||||
|         doc2 = Document.objects.create(title="doc2", content="this is a document from noone", checksum="B") | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk) | ||||
|         self.assertIsNone(self.classifier.predict_correspondent(doc2.content)) | ||||
|  | ||||
|     def test_one_type_predict(self): | ||||
|         dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", | ||||
|                                             checksum="A", document_type=dt) | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk) | ||||
|  | ||||
|     def test_one_type_predict_manydocs(self): | ||||
|         dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", | ||||
|                                             checksum="A", document_type=dt) | ||||
|  | ||||
|         doc2 = Document.objects.create(title="doc1", content="this is a document from c2", | ||||
|                                             checksum="B") | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk) | ||||
|         self.assertIsNone(self.classifier.predict_document_type(doc2.content)) | ||||
|  | ||||
|     def test_one_tag_predict(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|  | ||||
|         doc1.tags.add(t1) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk]) | ||||
|  | ||||
|     def test_one_tag_predict_unassigned(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), []) | ||||
|  | ||||
|     def test_two_tags_predict_singledoc(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|         t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121) | ||||
|  | ||||
|         doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D") | ||||
|  | ||||
|         doc4.tags.add(t1) | ||||
|         doc4.tags.add(t2) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk]) | ||||
|  | ||||
|     def test_two_tags_predict(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|         t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|         doc2 = Document.objects.create(title="doc1", content="this is a document from c2", checksum="B") | ||||
|         doc3 = Document.objects.create(title="doc1", content="this is a document from c3", checksum="C") | ||||
|         doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D") | ||||
|  | ||||
|         doc1.tags.add(t1) | ||||
|         doc2.tags.add(t2) | ||||
|  | ||||
|         doc4.tags.add(t1) | ||||
|         doc4.tags.add(t2) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc2.content), [t2.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc3.content), []) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk]) | ||||
|  | ||||
|     def test_one_tag_predict_multi(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|         doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B") | ||||
|  | ||||
|         doc1.tags.add(t1) | ||||
|         doc2.tags.add(t1) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc2.content), [t1.pk]) | ||||
|  | ||||
|     def test_one_tag_predict_multi_2(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|         doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B") | ||||
|  | ||||
|         doc1.tags.add(t1) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc2.content), []) | ||||
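|  | ||||
|     # With a single auto tag, training falls back to binary labels (the lone | ||||
|     # tag's pk vs. -1 for "no tag", see the classifier diff); a sketch of | ||||
|     # mapping such a prediction back to a tag list under that assumption: | ||||
|     # | ||||
|     #     def tags_from_binary_prediction(pred): | ||||
|     #         return [] if pred == -1 else [pred] | ||||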
|   | ||||
| @@ -7,6 +7,7 @@ from unittest.mock import MagicMock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from .utils import DirectoriesMixin | ||||
| from ..consumer import Consumer, ConsumerError | ||||
| from ..models import FileInfo, Tag, Correspondent, DocumentType, Document | ||||
| from ..parsers import DocumentParser, ParseError | ||||
| @@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase): | ||||
|  | ||||
| class DummyParser(DocumentParser): | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         # not important during tests | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def __init__(self, path, logging_group, scratch_dir): | ||||
|         super(DummyParser, self).__init__(path, logging_group) | ||||
|     def __init__(self, logging_group, scratch_dir, archive_path): | ||||
|         super(DummyParser, self).__init__(logging_group) | ||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||
|         self.archive_path = archive_path | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|     def get_optimised_thumbnail(self, document_path, mime_type): | ||||
|         return self.fake_thumb | ||||
|  | ||||
|     def get_text(self): | ||||
|         return "The Text" | ||||
|     def parse(self, document_path, mime_type): | ||||
|         self.text = "The Text" | ||||
|  | ||||
|  | ||||
| class FaultyParser(DocumentParser): | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         # not important during tests | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def __init__(self, path, logging_group, scratch_dir): | ||||
|         super(FaultyParser, self).__init__(path, logging_group) | ||||
|     def __init__(self, logging_group, scratch_dir): | ||||
|         super(FaultyParser, self).__init__(logging_group) | ||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|     def get_optimised_thumbnail(self, document_path, mime_type): | ||||
|         return self.fake_thumb | ||||
|  | ||||
|     def get_text(self): | ||||
|     def parse(self, document_path, mime_type): | ||||
|         raise ParseError("Does not compute.") | ||||
|  | ||||
|  | ||||
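| # Inferred from the two test doubles above: the reworked parser interface | ||||
| # takes (document_path, mime_type) per call instead of binding a path in | ||||
| # __init__. A sketch of the assumed base-class surface (not the real one): | ||||
| # | ||||
| #     class DocumentParser: | ||||
| #         def parse(self, document_path, mime_type): ...        # sets self.text | ||||
| #         def get_thumbnail(self, document_path, mime_type): ... | ||||
| #         def get_optimised_thumbnail(self, document_path, mime_type): ... | ||||
|  | ||||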
| @@ -408,32 +410,22 @@ def fake_magic_from_file(file, mime=False): | ||||
|  | ||||
|  | ||||
| @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) | ||||
| class TestConsumer(TestCase): | ||||
| class TestConsumer(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_dummy_parser(self, path, logging_group): | ||||
|         return DummyParser(path, logging_group, self.scratch_dir) | ||||
|     def make_dummy_parser(self, logging_group): | ||||
|         return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file()) | ||||
|  | ||||
|     def make_faulty_parser(self, path, logging_group): | ||||
|         return FaultyParser(path, logging_group, self.scratch_dir) | ||||
|     def make_faulty_parser(self, logging_group): | ||||
|         return FaultyParser(logging_group, self.dirs.scratch_dir) | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch_dir = tempfile.mkdtemp() | ||||
|         self.media_dir = tempfile.mkdtemp() | ||||
|         self.consumption_dir = tempfile.mkdtemp() | ||||
|  | ||||
|         override_settings( | ||||
|             SCRATCH_DIR=self.scratch_dir, | ||||
|             MEDIA_ROOT=self.media_dir, | ||||
|             ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"), | ||||
|             THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"), | ||||
|             CONSUMPTION_DIR=self.consumption_dir | ||||
|         ).enable() | ||||
|         super(TestConsumer, self).setUp() | ||||
|  | ||||
|         patcher = mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|         m = patcher.start() | ||||
|         m.return_value = [(None, { | ||||
|             "parser": self.make_dummy_parser, | ||||
|             "mime_types": ["application/pdf"], | ||||
|             "mime_types": {"application/pdf": ".pdf"}, | ||||
|             "weight": 0 | ||||
|         })] | ||||
|  | ||||
| @@ -441,15 +433,19 @@ class TestConsumer(TestCase): | ||||
|  | ||||
|         self.consumer = Consumer() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch_dir, ignore_errors=True) | ||||
|         shutil.rmtree(self.media_dir, ignore_errors=True) | ||||
|         shutil.rmtree(self.consumption_dir, ignore_errors=True) | ||||
|  | ||||
|     def get_test_file(self): | ||||
|         fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir) | ||||
|         return f | ||||
|         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf") | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "sample.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|         return dst | ||||
|  | ||||
|     def get_test_archive_file(self): | ||||
|         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf") | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|         return dst | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||
|     def testNormalOperation(self): | ||||
|  | ||||
|         filename = self.get_test_file() | ||||
| @@ -469,6 +465,13 @@ class TestConsumer(TestCase): | ||||
|             document.thumbnail_path | ||||
|         )) | ||||
|  | ||||
|         self.assertTrue(os.path.isfile( | ||||
|             document.archive_path | ||||
|         )) | ||||
|  | ||||
|         self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1") | ||||
|         self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b") | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(filename)) | ||||
|  | ||||
|     def testOverrideFilename(self): | ||||
| @@ -516,27 +519,7 @@ class TestConsumer(TestCase): | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR=None) | ||||
|     def testConsumptionDirUnset(self): | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.") | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR="asd") | ||||
|     def testNoConsumptionDir(self): | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertEqual(str(e), "Consumption directory asd does not exist") | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates(self): | ||||
|     def testDuplicates1(self): | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|         try: | ||||
| @@ -547,6 +530,21 @@ class TestConsumer(TestCase): | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates2(self): | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_archive_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertTrue(str(e).endswith("It is a duplicate.")) | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates3(self): | ||||
|         self.consumer.try_consume_file(self.get_test_archive_file()) | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def testNoParsers(self, m): | ||||
|         m.return_value = [] | ||||
| @@ -554,7 +552,7 @@ class TestConsumer(TestCase): | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertTrue(str(e).startswith("No parsers abvailable")) | ||||
|             self.assertTrue("No parsers abvailable for" in str(e)) | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
| @@ -563,7 +561,7 @@ class TestConsumer(TestCase): | ||||
|     def testFaultyParser(self, m): | ||||
|         m.return_value = [(None, { | ||||
|             "parser": self.make_faulty_parser, | ||||
|             "mime_types": ["application/pdf"], | ||||
|             "mime_types": {"application/pdf": ".pdf"}, | ||||
|             "weight": 0 | ||||
|         })] | ||||
|  | ||||
| @@ -598,12 +596,33 @@ class TestConsumer(TestCase): | ||||
|  | ||||
|         document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs") | ||||
|  | ||||
|         self.assertEqual(document.title, "new docs") | ||||
|         self.assertEqual(document.correspondent.name, "Bank") | ||||
|         self.assertEqual(document.filename, "Bank/new docs-0000001.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     @mock.patch("documents.signals.handlers.generate_filename") | ||||
|     def testFilenameHandlingUnstableFormat(self, m): | ||||
|  | ||||
|         filenames = ["this", "that", "now this", "i cant decide"] | ||||
|  | ||||
|         def get_filename(): | ||||
|             f = filenames.pop() | ||||
|             filenames.insert(0, f) | ||||
|             return f | ||||
|  | ||||
|         m.side_effect = lambda f: get_filename() | ||||
|  | ||||
|         filename = self.get_test_file() | ||||
|  | ||||
|         Tag.objects.create(name="test", is_inbox_tag=True) | ||||
|  | ||||
|         document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs") | ||||
|  | ||||
|         self.assertEqual(document.title, "new docs") | ||||
|         self.assertEqual(document.correspondent.name, "Bank") | ||||
|         self.assertEqual(document.filename, "bank/new-docs-0000001.pdf") | ||||
|         self.assertTrue(os.path.isfile(document.source_path)) | ||||
|  | ||||
|     @mock.patch("documents.consumer.DocumentClassifier") | ||||
|     def testClassifyDocument(self, m): | ||||
|   | ||||
src/documents/tests/test_date_parsing.py (new file, 140 lines)
						| @@ -0,0 +1,140 @@ | ||||
| import datetime | ||||
| import os | ||||
| import shutil | ||||
| from unittest import mock | ||||
| from uuid import uuid4 | ||||
|  | ||||
| from dateutil import tz | ||||
| from django.conf import settings | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.parsers import parse_date | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples") | ||||
|     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||
|  | ||||
|     def setUp(self): | ||||
|         os.makedirs(self.SCRATCH, exist_ok=True) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.SCRATCH) | ||||
|  | ||||
|     def test_date_format_1(self): | ||||
|         text = "lorem ipsum 130218 lorem ipsum" | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_2(self): | ||||
|         text = "lorem ipsum 2018 lorem ipsum" | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_3(self): | ||||
|         text = "lorem ipsum 20180213 lorem ipsum" | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_4(self): | ||||
|         text = "lorem ipsum 13.02.2018 lorem ipsum" | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_date_format_5(self): | ||||
|         text = ( | ||||
|             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " | ||||
|             "ipsum" | ||||
|         ) | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_date_format_6(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_7(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "März 2019\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2019, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_date_format_8(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum\n" | ||||
|             "März 2020" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             parse_date("", text), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_9(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "27. Nullmonth 2020\n" | ||||
|             "März 2020\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             parse_date("", text), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_crazy_date_past(self, *args): | ||||
|         self.assertIsNone(parse_date("", "01-07-0590 00:00:00")) | ||||
|  | ||||
|     def test_crazy_date_future(self, *args): | ||||
|         self.assertIsNone(parse_date("", "01-07-2350 00:00:00")) | ||||
|  | ||||
|     def test_crazy_date_with_spaces(self, *args): | ||||
|         self.assertIsNone(parse_date("", "20 408000l 2475")) | ||||
|  | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     def test_filename_date_parse_invalid(self, *args): | ||||
|         self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) | ||||
| @@ -1,12 +1,29 @@ | ||||
| import shutil | ||||
| import tempfile | ||||
| from datetime import datetime | ||||
| from pathlib import Path | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from ..models import Document, Correspondent | ||||
|  | ||||
|  | ||||
| class TestDocument(TestCase): | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|         self.originals_dir = tempfile.mkdtemp() | ||||
|         self.thumb_dir = tempfile.mkdtemp() | ||||
|  | ||||
|         override_settings( | ||||
|             ORIGINALS_DIR=self.originals_dir, | ||||
|             THUMBNAIL_DIR=self.thumb_dir, | ||||
|         ).enable() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         shutil.rmtree(self.originals_dir) | ||||
|         shutil.rmtree(self.thumb_dir) | ||||
|  | ||||
|     def test_file_deletion(self): | ||||
|         document = Document.objects.create( | ||||
|             correspondent=Correspondent.objects.create(name="Test0"), | ||||
| @@ -19,8 +36,31 @@ class TestDocument(TestCase): | ||||
|         file_path = document.source_path | ||||
|         thumb_path = document.thumbnail_path | ||||
|  | ||||
|         Path(file_path).touch() | ||||
|         Path(thumb_path).touch() | ||||
|  | ||||
|         with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: | ||||
|             document.delete() | ||||
|             mock_unlink.assert_any_call(file_path) | ||||
|             mock_unlink.assert_any_call(thumb_path) | ||||
|             self.assertEqual(mock_unlink.call_count, 2) | ||||
|  | ||||
|     def test_file_name(self): | ||||
|  | ||||
|         doc = Document(mime_type="application/pdf", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test.pdf") | ||||
|  | ||||
|     def test_file_name_jpg(self): | ||||
|  | ||||
|         doc = Document(mime_type="image/jpeg", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test.jpg") | ||||
|  | ||||
|     def test_file_name_unknown(self): | ||||
|  | ||||
|         doc = Document(mime_type="application/zip", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test.zip") | ||||
|  | ||||
|     def test_file_name_invalid_type(self): | ||||
|  | ||||
|         doc = Document(mime_type="image/jpegasd", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test") | ||||
|   | ||||
| @@ -1,32 +1,18 @@ | ||||
| import datetime | ||||
| import os | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from uuid import uuid4 | ||||
| from unittest import mock | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.db import DatabaseError | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from .utils import DirectoriesMixin | ||||
| from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories | ||||
| from ..models import Document, Correspondent | ||||
| from ..signals.handlers import update_filename_and_move_files | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
|     deletion_list = [] | ||||
|  | ||||
|     def add_to_deletion_list(self, dirname): | ||||
|         self.deletion_list.append(dirname) | ||||
|  | ||||
|     def setUp(self): | ||||
|         folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||
|         os.makedirs(folder + "/documents/originals") | ||||
|         override_settings(MEDIA_ROOT=folder).enable() | ||||
|         override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable() | ||||
|         self.add_to_deletion_list(folder) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         for dirname in self.deletion_list: | ||||
|             shutil.rmtree(dirname, ignore_errors=True) | ||||
| class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="") | ||||
|     def test_generate_source_filename(self): | ||||
| @@ -103,7 +89,7 @@ class TestDate(TestCase): | ||||
|         document.save() | ||||
|  | ||||
|         # Check proper handling of files | ||||
|         self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|  | ||||
|         os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) | ||||
| @@ -133,18 +119,14 @@ class TestDate(TestCase): | ||||
|         document.correspondent = Correspondent.objects.get_or_create( | ||||
|             name="test")[0] | ||||
|  | ||||
|         # This will cause save() to fail. | ||||
|         document.checksum = document1.checksum | ||||
|         with mock.patch("documents.signals.handlers.Document.objects.filter") as m: | ||||
|             m.side_effect = DatabaseError() | ||||
|             document.save() | ||||
|  | ||||
|         # Assume saving the document initially works, this gets called. | ||||
|         # After renaming, an error occurs, and filename is not saved: | ||||
|         # document should still be available at document.filename. | ||||
|         update_filename_and_move_files(None, document) | ||||
|  | ||||
|         # Check proper handling of files | ||||
|         self.assertTrue(os.path.isfile(document.source_path)) | ||||
|         self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|             # Check proper handling of files | ||||
|             self.assertTrue(os.path.isfile(document.source_path)) | ||||
|             self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|             self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") | ||||
|     def test_document_delete(self): | ||||
| @@ -199,8 +181,8 @@ class TestDate(TestCase): | ||||
|         document.save() | ||||
|  | ||||
|         # Check proper handling of files | ||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True) | ||||
|         self.assertTrue(os.path.isfile(important_file)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") | ||||
| @@ -318,13 +300,12 @@ class TestDate(TestCase): | ||||
|         # Create our working directory | ||||
|         tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") | ||||
|         os.makedirs(tmp) | ||||
|         self.add_to_deletion_list(tmp) | ||||
|  | ||||
|         os.makedirs(os.path.join(tmp, "notempty")) | ||||
|         Path(os.path.join(tmp, "notempty", "file")).touch() | ||||
|         os.makedirs(os.path.join(tmp, "notempty", "empty")) | ||||
|  | ||||
|         delete_empty_directories(os.path.join(tmp, "notempty", "empty")) | ||||
|         delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR) | ||||
|         self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) | ||||
|         self.assertEqual(os.path.isfile( | ||||
|             os.path.join(tmp, "notempty", "file")), True) | ||||
| @@ -348,3 +329,179 @@ class TestDate(TestCase): | ||||
|         document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|         self.assertEqual(generate_filename(document), "0000001.pdf") | ||||
|  | ||||
|  | ||||
| class TestFileHandlingWithArchive(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||
|     def test_create_no_format(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_create_with_format(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(original)) | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|         self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf")) | ||||
|         self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_archive_gone(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         # Path(archive).touch() is deliberately omitted: the archive file is gone. | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_archive_exists(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none")) | ||||
|         Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     @mock.patch("documents.signals.handlers.os.rename") | ||||
|     def test_move_archive_error(self, m): | ||||
|  | ||||
|         def fake_rename(src, dst): | ||||
|             if "archive" in src: | ||||
|                 raise OSError() | ||||
|             else: | ||||
|                 os.remove(src) | ||||
|                 Path(dst).touch() | ||||
|  | ||||
|         m.side_effect = fake_rename | ||||
|  | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_file_gone(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         # Path(original).touch() is deliberately omitted: the original file is gone. | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertFalse(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     @mock.patch("documents.signals.handlers.os.rename") | ||||
|     def test_move_file_error(self, m): | ||||
|  | ||||
|         def fake_rename(src, dst): | ||||
|             if "original" in src: | ||||
|                 raise OSError() | ||||
|             else: | ||||
|                 os.remove(src) | ||||
|                 Path(dst).touch() | ||||
|  | ||||
|         m.side_effect = fake_rename | ||||
|  | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     def test_archive_deleted(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|         doc.delete() | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(original)) | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertFalse(os.path.isfile(doc.source_path)) | ||||
|         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_database_error(self): | ||||
|  | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|         with mock.patch("documents.signals.handlers.Document.objects.filter") as m: | ||||
|             m.side_effect = DatabaseError() | ||||
|             doc.save() | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
| class TestFilenameGeneration(TestCase): | ||||
|  | ||||
|     @override_settings( | ||||
|         PAPERLESS_FILENAME_FORMAT="{title}" | ||||
|     ) | ||||
|     def test_invalid_characters(self): | ||||
|  | ||||
|         doc = Document.objects.create(title="This. is the title.", mime_type="application/pdf", pk=1, checksum="1") | ||||
|         self.assertEqual(generate_filename(doc), "This. is the title-0000001.pdf") | ||||
|  | ||||
|         doc = Document.objects.create(title="my\\invalid/../title:yay", mime_type="application/pdf", pk=2, checksum="2") | ||||
|         self.assertEqual(generate_filename(doc), "my-invalid-..-title-yay-0000002.pdf") | ||||
|  | ||||
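|     # A sketch of the sanitisation the test above asserts: backslashes, | ||||
|     # slashes and colons become dashes while dots survive (the exact | ||||
|     # character set is inferred from the expected filenames): | ||||
|     # | ||||
|     #     re.sub(r"[\\/:]", "-", "my\\invalid/../title:yay") | ||||
|     #     # -> "my-invalid-..-title-yay" | ||||
|  | ||||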
|     @override_settings( | ||||
|         PAPERLESS_FILENAME_FORMAT="{created}" | ||||
|     ) | ||||
|     def test_date(self): | ||||
|         doc = Document.objects.create(title="does not matter", created=datetime.datetime(2020,5,21, 7,36,51, 153), mime_type="application/pdf", pk=2, checksum="2") | ||||
|         self.assertEqual(generate_filename(doc), "2020-05-21-0000002.pdf") | ||||
|   | ||||
| @@ -2,7 +2,7 @@ import logging | ||||
| import uuid | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from ..models import Log | ||||
|  | ||||
| @@ -14,6 +14,7 @@ class TestPaperlessLog(TestCase): | ||||
|         self.logger = logging.getLogger( | ||||
|             "documents.management.commands.document_consumer") | ||||
|  | ||||
|     @override_settings(DISABLE_DBHANDLER=False) | ||||
|     def test_that_it_saves_at_all(self): | ||||
|  | ||||
|         kw = {"group": uuid.uuid4()} | ||||
| @@ -38,6 +39,7 @@ class TestPaperlessLog(TestCase): | ||||
|             self.logger.critical("This is a critical message", extra=kw) | ||||
|             self.assertEqual(Log.objects.all().count(), 5) | ||||
|  | ||||
|     @override_settings(DISABLE_DBHANDLER=False) | ||||
|     def test_groups(self): | ||||
|  | ||||
|         kw1 = {"group": uuid.uuid4()} | ||||
|   | ||||
src/documents/tests/test_management_archiver.py (new file, 42 lines)
						| @@ -0,0 +1,42 @@ | ||||
| import filecmp | ||||
| import os | ||||
| import shutil | ||||
|  | ||||
| from django.core.management import call_command | ||||
| from django.test import TestCase | ||||
|  | ||||
| from documents.management.commands.document_archiver import handle_document | ||||
| from documents.models import Document | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||
|  | ||||
|  | ||||
| class TestArchiver(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_models(self): | ||||
|         self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf") | ||||
|  | ||||
|     def test_archiver(self): | ||||
|  | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         self.make_models() | ||||
|  | ||||
|         call_command('document_archiver') | ||||
|  | ||||
|     def test_handle_document(self): | ||||
|  | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         self.make_models() | ||||
|  | ||||
|         handle_document(self.d1.pk) | ||||
|  | ||||
|         doc = Document.objects.get(id=self.d1.id) | ||||
|  | ||||
|         self.assertIsNotNone(doc.checksum) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(filecmp.cmp(sample_file, doc.source_path)) | ||||
src/documents/tests/test_management_consumer.py (new file, 262 lines)
						| @@ -0,0 +1,262 @@ | ||||
| import filecmp | ||||
| import os | ||||
| import shutil | ||||
| from threading import Thread | ||||
| from time import sleep | ||||
| from unittest import mock | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.management import call_command, CommandError | ||||
| from django.test import override_settings, TransactionTestCase | ||||
|  | ||||
| from documents.models import Tag | ||||
| from documents.consumer import ConsumerError | ||||
| from documents.management.commands import document_consumer | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class ConsumerThread(Thread): | ||||
|  | ||||
|     def __init__(self): | ||||
|         super().__init__() | ||||
|         self.cmd = document_consumer.Command() | ||||
|  | ||||
|     def run(self) -> None: | ||||
|         self.cmd.handle(directory=settings.CONSUMPTION_DIR, oneshot=False) | ||||
|  | ||||
|     def stop(self): | ||||
|         # Consumer checks this every second. | ||||
|         self.cmd.stop_flag = True | ||||
|  | ||||
|  | ||||
| def chunked(size, source): | ||||
|     for i in range(0, len(source), size): | ||||
|         yield source[i:i+size] | ||||
|  | ||||
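The chunked helper above yields consecutive fixed-size slices of a sequence, which the slow-write tests below use to drip a PDF onto disk; for example:

    # The last chunk is whatever remains, so it may be shorter than `size`.
    assert list(chunked(3, b"abcdefgh")) == [b"abc", b"def", b"gh"]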
|  | ||||
| class ConsumerMixin: | ||||
|  | ||||
|     sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|         super(ConsumerMixin, self).setUp() | ||||
|         self.t = None | ||||
|         patcher = mock.patch("documents.management.commands.document_consumer.async_task") | ||||
|         self.task_mock = patcher.start() | ||||
|         self.addCleanup(patcher.stop) | ||||
|  | ||||
|     def t_start(self): | ||||
|         self.t = ConsumerThread() | ||||
|         self.t.start() | ||||
|         # give the consumer some time to do initial work | ||||
|         sleep(1) | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         if self.t: | ||||
|             # set the stop flag | ||||
|             self.t.stop() | ||||
|             # wait for the consumer to exit. | ||||
|             self.t.join() | ||||
|  | ||||
|         super(ConsumerMixin, self).tearDown() | ||||
|  | ||||
|     def wait_for_task_mock_call(self): | ||||
|         n = 0 | ||||
|         while n < 100: | ||||
|             if self.task_mock.call_count > 0: | ||||
|                 # give task_mock some time to finish and raise errors | ||||
|                 sleep(1) | ||||
|                 return | ||||
|             n += 1 | ||||
|             sleep(0.1) | ||||
|  | ||||
|     # A bogus async_task that will simply check the file for | ||||
|     # completeness and raise an exception otherwise. | ||||
|     def bogus_task(self, func, filename, **kwargs): | ||||
|         eq = filecmp.cmp(filename, self.sample_file, shallow=False) | ||||
|         if not eq: | ||||
|             print("Consumed an INVALID file.") | ||||
|             raise ConsumerError("Incomplete File READ FAILED") | ||||
|         else: | ||||
|             print("Consumed a perfectly valid file.") | ||||
|  | ||||
|     def slow_write_file(self, target, incomplete=False): | ||||
|         with open(self.sample_file, 'rb') as f: | ||||
|             pdf_bytes = f.read() | ||||
|  | ||||
|         if incomplete: | ||||
|             pdf_bytes = pdf_bytes[:len(pdf_bytes) - 100] | ||||
|  | ||||
|         with open(target, 'wb') as f: | ||||
|             # this will take 2 seconds, since the file is about 20k. | ||||
|             print("Start writing file.") | ||||
|             for b in chunked(1000, pdf_bytes): | ||||
|                 f.write(b) | ||||
|                 sleep(0.1) | ||||
|             print("file completed.") | ||||
|  | ||||
|  | ||||
| class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase): | ||||
|  | ||||
|     def test_consume_file(self): | ||||
|         self.t_start() | ||||
|  | ||||
|         f = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|         shutil.copy(self.sample_file, f) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], f) | ||||
|  | ||||
|     def test_consume_file_invalid_ext(self): | ||||
|         self.t_start() | ||||
|  | ||||
|         f = os.path.join(self.dirs.consumption_dir, "my_file.wow") | ||||
|         shutil.copy(self.sample_file, f) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_not_called() | ||||
|  | ||||
|     def test_consume_existing_file(self): | ||||
|         f = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|         shutil.copy(self.sample_file, f) | ||||
|  | ||||
|         self.t_start() | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], f) | ||||
|  | ||||
|     @mock.patch("documents.management.commands.document_consumer.logger.error") | ||||
|     def test_slow_write_pdf(self, error_logger): | ||||
|  | ||||
|         self.task_mock.side_effect = self.bogus_task | ||||
|  | ||||
|         self.t_start() | ||||
|  | ||||
|         fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|  | ||||
|         self.slow_write_file(fname) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         error_logger.assert_not_called() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], fname) | ||||
|  | ||||
|     @mock.patch("documents.management.commands.document_consumer.logger.error") | ||||
|     def test_slow_write_and_move(self, error_logger): | ||||
|  | ||||
|         self.task_mock.side_effect = self.bogus_task | ||||
|  | ||||
|         self.t_start() | ||||
|  | ||||
|         fname = os.path.join(self.dirs.consumption_dir, "my_file.~df") | ||||
|         fname2 = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|  | ||||
|         self.slow_write_file(fname) | ||||
|         shutil.move(fname, fname2) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], fname2) | ||||
|  | ||||
|         error_logger.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.management.commands.document_consumer.logger.error") | ||||
|     def test_slow_write_incomplete(self, error_logger): | ||||
|  | ||||
|         self.task_mock.side_effect = self.bogus_task | ||||
|  | ||||
|         self.t_start() | ||||
|  | ||||
|         fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|         self.slow_write_file(fname, incomplete=True) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], fname) | ||||
|  | ||||
|         # assert that an error was logged for this invalid file. | ||||
|         error_logger.assert_called_once() | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR="does_not_exist") | ||||
|     def test_consumption_directory_invalid(self): | ||||
|  | ||||
|         self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot') | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR="") | ||||
|     def test_consumption_directory_unset(self): | ||||
|  | ||||
|         self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot') | ||||
|  | ||||
|  | ||||
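The subclasses below rerun the entire TestConsumer suite under polling and recursive modes simply by overriding settings. The same pattern extends to any other mode worth covering, e.g. a slower polling interval (hypothetical example):

    @override_settings(CONSUMER_POLLING=2)
    class TestConsumerSlowPolling(TestConsumer):
        # Reruns every TestConsumer test with a 2-second polling interval.
        pass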
| @override_settings(CONSUMER_POLLING=1) | ||||
| class TestConsumerPolling(TestConsumer): | ||||
|     # just do all the tests with polling | ||||
|     pass | ||||
|  | ||||
|  | ||||
| @override_settings(CONSUMER_RECURSIVE=True) | ||||
| class TestConsumerRecursive(TestConsumer): | ||||
|     # just do all the tests with recursive | ||||
|     pass | ||||
|  | ||||
|  | ||||
| @override_settings(CONSUMER_RECURSIVE=True) | ||||
| @override_settings(CONSUMER_POLLING=1) | ||||
| class TestConsumerRecursivePolling(TestConsumer): | ||||
|     # just do all the tests with polling and recursive | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase): | ||||
|  | ||||
|     @override_settings(CONSUMER_RECURSIVE=True) | ||||
|     @override_settings(CONSUMER_SUBDIRS_AS_TAGS=True) | ||||
|     def test_consume_file_with_path_tags(self): | ||||
|  | ||||
|         tag_names = ("existingTag", "Space Tag") | ||||
|         # Create a Tag prior to consuming a file using it in path | ||||
|         tag_ids = [Tag.objects.create(name=tag_names[0]).pk,] | ||||
|  | ||||
|         self.t_start() | ||||
|  | ||||
|         path = os.path.join(self.dirs.consumption_dir, *tag_names) | ||||
|         os.makedirs(path, exist_ok=True) | ||||
|         f = os.path.join(path, "my_file.pdf") | ||||
|         # Wait at least inotify read_delay for recursive watchers | ||||
|         # to be created for the new directories | ||||
|         sleep(1) | ||||
|         shutil.copy(self.sample_file, f) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         # Add the pk of the Tag created by _consume() | ||||
|         tag_ids.append(Tag.objects.get(name=tag_names[1]).pk) | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], f) | ||||
|  | ||||
|         # assertCountEqual has a bad name, but it tests that the first | ||||
|         # sequence contains the same elements as the second, regardless | ||||
|         # of their order. | ||||
|         self.assertCountEqual(kwargs["override_tag_ids"], tag_ids) | ||||
|  | ||||
|     @override_settings(CONSUMER_POLLING=1) | ||||
|     def test_consume_file_with_path_tags_polling(self): | ||||
|         self.test_consume_file_with_path_tags() | ||||
src/documents/tests/test_management_decrypt.py (new file, 57 lines)
						| @@ -0,0 +1,57 @@ | ||||
| import hashlib | ||||
| import json | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from unittest import mock | ||||
|  | ||||
| from django.core.management import call_command | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.management.commands import document_exporter | ||||
| from documents.models import Document, Tag, DocumentType, Correspondent | ||||
|  | ||||
|  | ||||
| class TestDecryptDocuments(TestCase): | ||||
|  | ||||
|     @override_settings( | ||||
|         ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"), | ||||
|         THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"), | ||||
|         PASSPHRASE="test", | ||||
|         PAPERLESS_FILENAME_FORMAT=None | ||||
|     ) | ||||
|     @mock.patch("documents.management.commands.decrypt_documents.input") | ||||
|     def test_decrypt(self, m): | ||||
|  | ||||
|         media_dir = tempfile.mkdtemp() | ||||
|         originals_dir = os.path.join(media_dir, "documents", "originals") | ||||
|         thumb_dir = os.path.join(media_dir, "documents", "thumbnails") | ||||
|         os.makedirs(originals_dir, exist_ok=True) | ||||
|         os.makedirs(thumb_dir, exist_ok=True) | ||||
|  | ||||
|         override_settings( | ||||
|             ORIGINALS_DIR=originals_dir, | ||||
|             THUMBNAIL_DIR=thumb_dir, | ||||
|             PASSPHRASE="test" | ||||
|         ).enable() | ||||
|  | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg")) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000002.png.gpg"), os.path.join(thumb_dir, "0000002.png.gpg")) | ||||
|  | ||||
|         Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||
|  | ||||
|         call_command('decrypt_documents') | ||||
|  | ||||
|         doc = Document.objects.get(id=2) | ||||
|  | ||||
|         self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED) | ||||
|         self.assertEqual(doc.filename, "0000002.pdf") | ||||
|         self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf"))) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(os.path.join(thumb_dir, "0000002.png"))) | ||||
|         self.assertTrue(os.path.isfile(doc.thumbnail_path)) | ||||
|  | ||||
|         with doc.source_file as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|             self.assertEqual(checksum, doc.checksum) | ||||
|  | ||||
src/documents/tests/test_management_exporter.py (new file, 74 lines)
						| @@ -0,0 +1,74 @@ | ||||
| import hashlib | ||||
| import json | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
|  | ||||
| from django.core.management import call_command | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.management.commands import document_exporter | ||||
| from documents.models import Document, Tag, DocumentType, Correspondent | ||||
| from documents.sanity_checker import check_sanity | ||||
| from documents.tests.utils import DirectoriesMixin, paperless_environment | ||||
|  | ||||
|  | ||||
| class TestExportImport(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     @override_settings( | ||||
|         PASSPHRASE="test" | ||||
|     ) | ||||
|     def test_exporter(self): | ||||
|         shutil.rmtree(os.path.join(self.dirs.media_dir, "documents")) | ||||
|         shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents")) | ||||
|  | ||||
|         file = os.path.join(self.dirs.originals_dir, "0000001.pdf") | ||||
|  | ||||
|         Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") | ||||
|         Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||
|         Tag.objects.create(name="t") | ||||
|         DocumentType.objects.create(name="dt") | ||||
|         Correspondent.objects.create(name="c") | ||||
|  | ||||
|         target = tempfile.mkdtemp() | ||||
|  | ||||
|         call_command('document_exporter', target) | ||||
|  | ||||
|         with open(os.path.join(target, "manifest.json")) as f: | ||||
|             manifest = json.load(f) | ||||
|  | ||||
|         self.assertEqual(len(manifest), 5) | ||||
|  | ||||
|         for element in manifest: | ||||
|             if element['model'] == 'documents.document': | ||||
|                 fname = os.path.join(target, element[document_exporter.EXPORTER_FILE_NAME]) | ||||
|                 self.assertTrue(os.path.exists(fname)) | ||||
|                 self.assertTrue(os.path.exists(os.path.join(target, element[document_exporter.EXPORTER_THUMBNAIL_NAME]))) | ||||
|  | ||||
|                 with open(fname, "rb") as f: | ||||
|                     checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                 self.assertEqual(checksum, element['fields']['checksum']) | ||||
|  | ||||
|                 if document_exporter.EXPORTER_ARCHIVE_NAME in element: | ||||
|                     fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME]) | ||||
|                     self.assertTrue(os.path.exists(fname)) | ||||
|  | ||||
|                     with open(fname, "rb") as f: | ||||
|                         checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                     self.assertEqual(checksum, element['fields']['archive_checksum']) | ||||
|  | ||||
|         with paperless_environment() as dirs: | ||||
|             call_command('document_importer', target) | ||||
|             messages = check_sanity() | ||||
|             # everything is alright after the test | ||||
|             self.assertEqual(len(messages), 0, str([str(m) for m in messages])) | ||||
|  | ||||
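A manifest produced by document_exporter can be post-processed with the same keys the test relies on; a minimal sketch, assuming only what the assertions above exercise:

    import json
    import os

    from documents.management.commands import document_exporter

    def list_exported_files(target):
        # Print where each exported document's original file landed, using
        # the same manifest keys the test above reads.
        with open(os.path.join(target, "manifest.json")) as f:
            manifest = json.load(f)
        for element in manifest:
            if element["model"] == "documents.document":
                print(element["fields"]["checksum"],
                      element[document_exporter.EXPORTER_FILE_NAME])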
|     def test_export_missing_files(self): | ||||
|  | ||||
|         target = tempfile.mkdtemp() | ||||
|         Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") | ||||
|         self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target) | ||||
|  | ||||
|     def test_duplicate_titles(self): | ||||
|         # TODO | ||||
|         pass | ||||
src/documents/tests/test_management_retagger.py (new file, 58 lines)
						| @@ -0,0 +1,58 @@ | ||||
| from django.core.management import call_command | ||||
| from django.test import TestCase | ||||
|  | ||||
| from documents.models import Document, Tag, Correspondent, DocumentType | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class TestRetagger(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_models(self): | ||||
|         self.d1 = Document.objects.create(checksum="A", title="A", content="first document") | ||||
|         self.d2 = Document.objects.create(checksum="B", title="B", content="second document") | ||||
|         self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document") | ||||
|  | ||||
|         self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY) | ||||
|         self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY) | ||||
|  | ||||
|         self.correspondent_first = Correspondent.objects.create( | ||||
|             name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY) | ||||
|         self.correspondent_second = Correspondent.objects.create( | ||||
|             name="c2", match="second", matching_algorithm=Correspondent.MATCH_ANY) | ||||
|  | ||||
|         self.doctype_first = DocumentType.objects.create( | ||||
|             name="dt1", match="first", matching_algorithm=DocumentType.MATCH_ANY) | ||||
|         self.doctype_second = DocumentType.objects.create( | ||||
|             name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY) | ||||
|  | ||||
|     def get_updated_docs(self): | ||||
|         return Document.objects.get(title="A"), Document.objects.get(title="B"), Document.objects.get(title="C") | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|         super(TestRetagger, self).setUp() | ||||
|         self.make_models() | ||||
|  | ||||
|     def test_add_tags(self): | ||||
|         call_command('document_retagger', '--tags') | ||||
|         d_first, d_second, d_unrelated = self.get_updated_docs() | ||||
|  | ||||
|         self.assertEqual(d_first.tags.count(), 1) | ||||
|         self.assertEqual(d_second.tags.count(), 1) | ||||
|         self.assertEqual(d_unrelated.tags.count(), 0) | ||||
|  | ||||
|         self.assertEqual(d_first.tags.first(), self.tag_first) | ||||
|         self.assertEqual(d_second.tags.first(), self.tag_second) | ||||
|  | ||||
|     def test_add_type(self): | ||||
|         call_command('document_retagger', '--document_type') | ||||
|         d_first, d_second, d_unrelated = self.get_updated_docs() | ||||
|  | ||||
|         self.assertEqual(d_first.document_type, self.doctype_first) | ||||
|         self.assertEqual(d_second.document_type, self.doctype_second) | ||||
|  | ||||
|     def test_add_correspondent(self): | ||||
|         call_command('document_retagger', '--correspondent') | ||||
|         d_first, d_second, d_unrelated = self.get_updated_docs() | ||||
|  | ||||
|         self.assertEqual(d_first.correspondent, self.correspondent_first) | ||||
|         self.assertEqual(d_second.correspondent, self.correspondent_second) | ||||
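Each test above passes a single flag to document_retagger; presumably the flags can also be combined to reassign everything in one pass (an assumption, since the tests only exercise them individually):

    from django.core.management import call_command

    # Hypothetical combined invocation; flag names are taken from the tests above.
    call_command('document_retagger', '--tags', '--correspondent', '--document_type')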
| @@ -1,3 +1,5 @@ | ||||
| import shutil | ||||
| import tempfile | ||||
| from random import randint | ||||
|  | ||||
| from django.contrib.admin.models import LogEntry | ||||
| @@ -215,6 +217,13 @@ class TestDocumentConsumptionFinishedSignal(TestCase): | ||||
|         self.doc_contains = Document.objects.create( | ||||
|             content="I contain the keyword.", mime_type="application/pdf") | ||||
|  | ||||
|         self.index_dir = tempfile.mkdtemp() | ||||
|         # TODO: we should not need the index here. | ||||
|         override_settings(INDEX_DIR=self.index_dir).enable() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         shutil.rmtree(self.index_dir, ignore_errors=True) | ||||
|  | ||||
|     def test_tag_applied_any(self): | ||||
|         t1 = Tag.objects.create( | ||||
|             name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY) | ||||
|   | ||||
| @@ -1,10 +1,15 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from tempfile import TemporaryDirectory | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.parsers import get_parser_class | ||||
| from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \ | ||||
|     get_parser_class_for_mime_type, DocumentParser, is_file_ext_supported | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
| from paperless_text.parsers import TextDocumentParser | ||||
|  | ||||
|  | ||||
| def fake_magic_from_file(file, mime=False): | ||||
| @@ -27,7 +32,7 @@ class TestParserDiscovery(TestCase): | ||||
|             pass | ||||
|  | ||||
|         m.return_value = ( | ||||
|             (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}), | ||||
|             (None, {"weight": 0, "parser": DummyParser, "mime_types": {"application/pdf": ".pdf"}}), | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual( | ||||
| @@ -45,8 +50,8 @@ class TestParserDiscovery(TestCase): | ||||
|             pass | ||||
|  | ||||
|         m.return_value = ( | ||||
|             (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}), | ||||
|             (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}), | ||||
|             (None, {"weight": 0, "parser": DummyParser1, "mime_types": {"application/pdf": ".pdf"}}), | ||||
|             (None, {"weight": 1, "parser": DummyParser2, "mime_types": {"application/pdf": ".pdf"}}), | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual( | ||||
| @@ -61,3 +66,57 @@ class TestParserDiscovery(TestCase): | ||||
|             self.assertIsNone( | ||||
|                 get_parser_class("doc.pdf") | ||||
|             ) | ||||
|  | ||||
|  | ||||
| def fake_get_thumbnail(self, path, mimetype): | ||||
|     return os.path.join(os.path.dirname(__file__), "examples", "no-text.png") | ||||
|  | ||||
|  | ||||
| class TestBaseParser(TestCase): | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|  | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|         override_settings( | ||||
|             SCRATCH_DIR=self.scratch | ||||
|         ).enable() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         shutil.rmtree(self.scratch) | ||||
|  | ||||
|     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||
|     @override_settings(OPTIMIZE_THUMBNAILS=True) | ||||
|     def test_get_optimised_thumbnail(self): | ||||
|         parser = DocumentParser(None) | ||||
|  | ||||
|         parser.get_optimised_thumbnail("any", "not important") | ||||
|  | ||||
|     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||
|     @override_settings(OPTIMIZE_THUMBNAILS=False) | ||||
|     def test_get_optimised_thumb_disabled(self): | ||||
|         parser = DocumentParser(None) | ||||
|  | ||||
|         path = parser.get_optimised_thumbnail("any", "not important") | ||||
|         self.assertEqual(path, fake_get_thumbnail(None, None, None)) | ||||
|  | ||||
|  | ||||
| class TestParserAvailability(TestCase): | ||||
|  | ||||
|     def test_file_extensions(self): | ||||
|  | ||||
|         for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]: | ||||
|             self.assertIn(ext, get_supported_file_extensions()) | ||||
|         self.assertEqual(get_default_file_extension('application/pdf'), ".pdf") | ||||
|         self.assertEqual(get_default_file_extension('image/png'), ".png") | ||||
|         self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg") | ||||
|         self.assertEqual(get_default_file_extension('text/plain'), ".txt") | ||||
|         self.assertEqual(get_default_file_extension('text/csv'), ".csv") | ||||
|         self.assertEqual(get_default_file_extension('application/zip'), ".zip") | ||||
|         self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "") | ||||
|  | ||||
|         self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser) | ||||
|         self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser) | ||||
|         self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None) | ||||
|  | ||||
|         self.assertTrue(is_file_ext_supported('.pdf')) | ||||
|         self.assertFalse(is_file_ext_supported('.hsdfh')) | ||||
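The parser declarations above changed mime_types from a list into a mapping of MIME type to default extension, which is what makes get_default_file_extension possible. A sketch of how such a lookup could work under that assumption (not the actual documents.parsers code):

    def default_extension(parsers, mime_type):
        # `parsers` is assumed to be a list of dicts shaped like the
        # declarations above: {"weight": ..., "parser": ..., "mime_types": {mime: ext}}.
        for p in parsers:
            if mime_type in p["mime_types"]:
                return p["mime_types"][mime_type]
        return ""

    assert default_extension(
        [{"weight": 0, "parser": None, "mime_types": {"application/pdf": ".pdf"}}],
        "application/pdf") == ".pdf"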
|   | ||||
src/documents/tests/test_post_consume_handlers.py (new file, 56 lines)
						| @@ -0,0 +1,56 @@ | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.models import Document, Tag, Correspondent | ||||
| from documents.signals.handlers import run_post_consume_script | ||||
|  | ||||
|  | ||||
| class PostConsumeTestCase(TestCase): | ||||
|  | ||||
|     @mock.patch("documents.signals.handlers.Popen") | ||||
|     @override_settings(POST_CONSUME_SCRIPT=None) | ||||
|     def test_no_post_consume_script(self, m): | ||||
|         doc = Document.objects.create(title="Test", mime_type="application/pdf") | ||||
|         tag1 = Tag.objects.create(name="a") | ||||
|         tag2 = Tag.objects.create(name="b") | ||||
|         doc.tags.add(tag1) | ||||
|         doc.tags.add(tag2) | ||||
|  | ||||
|         run_post_consume_script(None, doc) | ||||
|  | ||||
|         m.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.signals.handlers.Popen") | ||||
|     @override_settings(POST_CONSUME_SCRIPT="script") | ||||
|     def test_post_consume_script_simple(self, m): | ||||
|         doc = Document.objects.create(title="Test", mime_type="application/pdf") | ||||
|  | ||||
|         run_post_consume_script(None, doc) | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
|     @mock.patch("documents.signals.handlers.Popen") | ||||
|     @override_settings(POST_CONSUME_SCRIPT="script") | ||||
|     def test_post_consume_script_with_correspondent(self, m): | ||||
|         c = Correspondent.objects.create(name="my_bank") | ||||
|         doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c) | ||||
|         tag1 = Tag.objects.create(name="a") | ||||
|         tag2 = Tag.objects.create(name="b") | ||||
|         doc.tags.add(tag1) | ||||
|         doc.tags.add(tag2) | ||||
|  | ||||
|         run_post_consume_script(None, doc) | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
|         args, kwargs = m.call_args | ||||
|  | ||||
|         command = args[0] | ||||
|  | ||||
|         self.assertEqual(command[0], "script") | ||||
|         self.assertEqual(command[1], str(doc.pk)) | ||||
|         self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/") | ||||
|         self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/") | ||||
|         self.assertEqual(command[7], "my_bank") | ||||
|         self.assertCountEqual(command[8].split(","), ["a", "b"]) | ||||
src/documents/tests/test_sanity_check.py (new file, 87 lines)
						| @@ -0,0 +1,87 @@ | ||||
| import os | ||||
| import shutil | ||||
| from pathlib import Path | ||||
|  | ||||
| from django.test import TestCase | ||||
|  | ||||
| from documents.models import Document | ||||
| from documents.sanity_checker import check_sanity, SanityFailedError | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class TestSanityCheck(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_test_data(self): | ||||
|  | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf"), os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"), os.path.join(self.dirs.archive_dir, "0000001.pdf")) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.png")) | ||||
|  | ||||
|         return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf") | ||||
|  | ||||
|     def test_no_docs(self): | ||||
|         self.assertEqual(len(check_sanity()), 0) | ||||
|  | ||||
|     def test_success(self): | ||||
|         self.make_test_data() | ||||
|         self.assertEqual(len(check_sanity()), 0) | ||||
|  | ||||
|     def test_no_thumbnail(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.remove(doc.thumbnail_path) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_thumbnail_no_access(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.chmod(doc.thumbnail_path, 0o000) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|         os.chmod(doc.thumbnail_path, 0o777) | ||||
|  | ||||
|     def test_no_original(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.remove(doc.source_path) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_original_no_access(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.chmod(doc.source_path, 0o000) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|         os.chmod(doc.source_path, 0o777) | ||||
|  | ||||
|     def test_original_checksum_mismatch(self): | ||||
|         doc = self.make_test_data() | ||||
|         doc.checksum = "WOW" | ||||
|         doc.save() | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_no_archive(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.remove(doc.archive_path) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_archive_no_access(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.chmod(doc.archive_path, 0o000) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|         os.chmod(doc.archive_path, 0o777) | ||||
|  | ||||
|     def test_archive_checksum_mismatch(self): | ||||
|         doc = self.make_test_data() | ||||
|         doc.archive_checksum = "WOW" | ||||
|         doc.save() | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_empty_content(self): | ||||
|         doc = self.make_test_data() | ||||
|         doc.content = "" | ||||
|         doc.save() | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_orphaned_file(self): | ||||
|         doc = self.make_test_data() | ||||
|         Path(self.dirs.originals_dir, "orphaned").touch() | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
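Outside the tests, the checker can be run the same way; a minimal sketch using only the behavior asserted above (a list of messages, each convertible to str):

    from documents.sanity_checker import check_sanity

    for message in check_sanity():
        print(str(message))  # one line per detected problem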
|     def test_all(self): | ||||
|         Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf") | ||||
|         string = str(SanityFailedError(check_sanity())) | ||||
src/documents/tests/test_tasks.py (new file, 24 lines)
						| @@ -0,0 +1,24 @@ | ||||
| from datetime import datetime | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.utils import timezone | ||||
|  | ||||
| from documents import tasks | ||||
| from documents.models import Document | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class TestTasks(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def test_index_reindex(self): | ||||
|         Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now()) | ||||
|  | ||||
|         tasks.index_reindex() | ||||
|  | ||||
|     def test_index_optimize(self): | ||||
|         Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now()) | ||||
|  | ||||
|         tasks.index_optimize() | ||||
|  | ||||
|     def test_train_classifier(self): | ||||
|         tasks.train_classifier() | ||||
src/documents/tests/utils.py (new file, 76 lines)
						| @@ -0,0 +1,76 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from collections import namedtuple | ||||
| from contextlib import contextmanager | ||||
|  | ||||
| from django.test import override_settings | ||||
|  | ||||
|  | ||||
| def setup_directories(): | ||||
|  | ||||
|     dirs = namedtuple("Dirs", ()) | ||||
|  | ||||
|     dirs.data_dir = tempfile.mkdtemp() | ||||
|     dirs.scratch_dir = tempfile.mkdtemp() | ||||
|     dirs.media_dir = tempfile.mkdtemp() | ||||
|     dirs.consumption_dir = tempfile.mkdtemp() | ||||
|     dirs.index_dir = os.path.join(dirs.data_dir, "index") | ||||
|     dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals") | ||||
|     dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails") | ||||
|     dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive") | ||||
|  | ||||
|     os.makedirs(dirs.index_dir, exist_ok=True) | ||||
|     os.makedirs(dirs.originals_dir, exist_ok=True) | ||||
|     os.makedirs(dirs.thumbnail_dir, exist_ok=True) | ||||
|     os.makedirs(dirs.archive_dir, exist_ok=True) | ||||
|  | ||||
|     dirs.settings_override = override_settings( | ||||
|         DATA_DIR=dirs.data_dir, | ||||
|         SCRATCH_DIR=dirs.scratch_dir, | ||||
|         MEDIA_ROOT=dirs.media_dir, | ||||
|         ORIGINALS_DIR=dirs.originals_dir, | ||||
|         THUMBNAIL_DIR=dirs.thumbnail_dir, | ||||
|         ARCHIVE_DIR=dirs.archive_dir, | ||||
|         CONSUMPTION_DIR=dirs.consumption_dir, | ||||
|         INDEX_DIR=dirs.index_dir, | ||||
|         MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle") | ||||
|  | ||||
|     ) | ||||
|     dirs.settings_override.enable() | ||||
|  | ||||
|     return dirs | ||||
|  | ||||
|  | ||||
| def remove_dirs(dirs): | ||||
|     shutil.rmtree(dirs.media_dir, ignore_errors=True) | ||||
|     shutil.rmtree(dirs.data_dir, ignore_errors=True) | ||||
|     shutil.rmtree(dirs.scratch_dir, ignore_errors=True) | ||||
|     shutil.rmtree(dirs.consumption_dir, ignore_errors=True) | ||||
|     dirs.settings_override.disable() | ||||
|  | ||||
|  | ||||
| @contextmanager | ||||
| def paperless_environment(): | ||||
|     dirs = None | ||||
|     try: | ||||
|         dirs = setup_directories() | ||||
|         yield dirs | ||||
|     finally: | ||||
|         if dirs: | ||||
|             remove_dirs(dirs) | ||||
|  | ||||
|  | ||||
| class DirectoriesMixin: | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         super().__init__(*args, **kwargs) | ||||
|         self.dirs = None | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|         self.dirs = setup_directories() | ||||
|         super(DirectoriesMixin, self).setUp() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         super(DirectoriesMixin, self).tearDown() | ||||
|         remove_dirs(self.dirs) | ||||
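For code that cannot use the mixin, paperless_environment offers the same throwaway directory setup as a context manager; for example:

    from documents.tests.utils import paperless_environment

    with paperless_environment() as dirs:
        # All paperless directory settings point at temporary directories
        # here; everything is removed again when the block exits.
        print(dirs.originals_dir)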
| @@ -1,8 +1,16 @@ | ||||
| import os | ||||
| import tempfile | ||||
| from datetime import datetime | ||||
| from time import mktime | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.db.models import Count, Max | ||||
| from django.http import HttpResponse, HttpResponseBadRequest, Http404 | ||||
| from django.views.decorators.cache import cache_control | ||||
| from django.views.generic import TemplateView | ||||
| from django_filters.rest_framework import DjangoFilterBackend | ||||
| from django_q.tasks import async_task | ||||
| from rest_framework import parsers | ||||
| from rest_framework.decorators import action | ||||
| from rest_framework.filters import OrderingFilter, SearchFilter | ||||
| from rest_framework.mixins import ( | ||||
| @@ -30,14 +38,14 @@ from .filters import ( | ||||
|     DocumentTypeFilterSet, | ||||
|     LogFilterSet | ||||
| ) | ||||
| from .forms import UploadForm | ||||
| from .models import Correspondent, Document, Log, Tag, DocumentType | ||||
| from .serialisers import ( | ||||
|     CorrespondentSerializer, | ||||
|     DocumentSerializer, | ||||
|     LogSerializer, | ||||
|     TagSerializer, | ||||
|     DocumentTypeSerializer | ||||
|     DocumentTypeSerializer, | ||||
|     PostDocumentSerializer | ||||
| ) | ||||
|  | ||||
|  | ||||
| @@ -126,36 +134,54 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|         index.remove_document_from_index(self.get_object()) | ||||
|         return super(DocumentViewSet, self).destroy(request, *args, **kwargs) | ||||
|  | ||||
|     def file_response(self, pk, disposition): | ||||
|     @staticmethod | ||||
|     def original_requested(request): | ||||
|         return ( | ||||
|             'original' in request.query_params and | ||||
|             request.query_params['original'] == 'true' | ||||
|         ) | ||||
|  | ||||
|     def file_response(self, pk, request, disposition): | ||||
|         doc = Document.objects.get(id=pk) | ||||
|  | ||||
|         if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: | ||||
|             file_handle = doc.source_file | ||||
|         if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501 | ||||
|             file_handle = doc.archive_file | ||||
|             filename = doc.get_public_filename(archive=True) | ||||
|             mime_type = 'application/pdf' | ||||
|         else: | ||||
|             file_handle = GnuPG.decrypted(doc.source_file) | ||||
|             file_handle = doc.source_file | ||||
|             filename = doc.get_public_filename() | ||||
|             mime_type = doc.mime_type | ||||
|  | ||||
|         response = HttpResponse(file_handle, content_type=doc.mime_type) | ||||
|         if doc.storage_type == Document.STORAGE_TYPE_GPG: | ||||
|             file_handle = GnuPG.decrypted(file_handle) | ||||
|  | ||||
|         response = HttpResponse(file_handle, content_type=mime_type) | ||||
|         response["Content-Disposition"] = '{}; filename="{}"'.format( | ||||
|             disposition, doc.file_name) | ||||
|             disposition, filename) | ||||
|         return response | ||||
|  | ||||
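With this change, download and preview serve the archived PDF whenever one exists; clients that want the file exactly as consumed pass original=true, per original_requested above. A sketch (host, port, and document id are placeholders; authentication omitted):

    import requests

    base = "http://localhost:8000/api/documents/17/download/"
    archived = requests.get(base)                               # archived PDF if present
    original = requests.get(base, params={"original": "true"})  # file as consumed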
|     @action(methods=['post'], detail=False) | ||||
|     def post_document(self, request, pk=None): | ||||
|         # TODO: is this a good implementation? | ||||
|         form = UploadForm(data=request.POST, files=request.FILES) | ||||
|         if form.is_valid(): | ||||
|             form.save() | ||||
|             return Response("OK") | ||||
|         else: | ||||
|             return HttpResponseBadRequest(str(form.errors)) | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def metadata(self, request, pk=None): | ||||
|         try: | ||||
|             doc = Document.objects.get(pk=pk) | ||||
|             return Response({ | ||||
|                 "paperless__checksum": doc.checksum, | ||||
|                 "paperless__mime_type": doc.mime_type, | ||||
|                 "paperless__filename": doc.filename, | ||||
|                 "paperless__has_archive_version": | ||||
|                     os.path.isfile(doc.archive_path) | ||||
|             }) | ||||
|         except Document.DoesNotExist: | ||||
|             raise Http404() | ||||
|  | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def preview(self, request, pk=None): | ||||
|         try: | ||||
|             response = self.file_response(pk, "inline") | ||||
|             response = self.file_response( | ||||
|                 pk, request, "inline") | ||||
|             return response | ||||
|         except FileNotFoundError: | ||||
|             raise Http404("Document source file does not exist") | ||||
|         except (FileNotFoundError, Document.DoesNotExist): | ||||
|             raise Http404() | ||||
|  | ||||
|     @action(methods=['get'], detail=True) | ||||
|     @cache_control(public=False, max_age=315360000) | ||||
| @@ -163,15 +189,16 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|         try: | ||||
|             return HttpResponse(Document.objects.get(id=pk).thumbnail_file, | ||||
|                                 content_type='image/png') | ||||
|         except FileNotFoundError: | ||||
|             raise Http404("Document thumbnail does not exist") | ||||
|         except (FileNotFoundError, Document.DoesNotExist): | ||||
|             raise Http404() | ||||
|  | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def download(self, request, pk=None): | ||||
|         try: | ||||
|             return self.file_response(pk, "attachment") | ||||
|         except FileNotFoundError: | ||||
|             raise Http404("Document source file does not exist") | ||||
|             return self.file_response( | ||||
|                 pk, request, "attachment") | ||||
|         except (FileNotFoundError, Document.DoesNotExist): | ||||
|             raise Http404() | ||||
|  | ||||
|  | ||||
| class LogViewSet(ReadOnlyModelViewSet): | ||||
| @@ -186,11 +213,62 @@ class LogViewSet(ReadOnlyModelViewSet): | ||||
|     ordering_fields = ("created",) | ||||
|  | ||||
|  | ||||
| class PostDocumentView(APIView): | ||||
|  | ||||
|     permission_classes = (IsAuthenticated,) | ||||
|     serializer_class = PostDocumentSerializer | ||||
|     parser_classes = (parsers.MultiPartParser,) | ||||
|  | ||||
|     def get_serializer_context(self): | ||||
|         return { | ||||
|             'request': self.request, | ||||
|             'format': self.format_kwarg, | ||||
|             'view': self | ||||
|         } | ||||
|  | ||||
|     def get_serializer(self, *args, **kwargs): | ||||
|         kwargs['context'] = self.get_serializer_context() | ||||
|         return self.serializer_class(*args, **kwargs) | ||||
|  | ||||
|     def post(self, request, *args, **kwargs): | ||||
|  | ||||
|         serializer = self.get_serializer(data=request.data) | ||||
|         serializer.is_valid(raise_exception=True) | ||||
|  | ||||
|         doc_name, doc_data = serializer.validated_data.get('document') | ||||
|         correspondent_id = serializer.validated_data.get('correspondent') | ||||
|         document_type_id = serializer.validated_data.get('document_type') | ||||
|         tag_ids = serializer.validated_data.get('tags') | ||||
|         title = serializer.validated_data.get('title') | ||||
|  | ||||
|         t = int(mktime(datetime.now().timetuple())) | ||||
|  | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|  | ||||
|         with tempfile.NamedTemporaryFile(prefix="paperless-upload-", | ||||
|                                          dir=settings.SCRATCH_DIR, | ||||
|                                          delete=False) as f: | ||||
|             f.write(doc_data) | ||||
|             os.utime(f.name, times=(t, t)) | ||||
|  | ||||
|             async_task("documents.tasks.consume_file", | ||||
|                        f.name, | ||||
|                        override_filename=doc_name, | ||||
|                        override_title=title, | ||||
|                        override_correspondent_id=correspondent_id, | ||||
|                        override_document_type_id=document_type_id, | ||||
|                        override_tag_ids=tag_ids, | ||||
|                        task_name=os.path.basename(doc_name)[:100]) | ||||
|         return Response("OK") | ||||
|  | ||||
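A sketch of uploading a document through this endpoint, assuming the API is mounted under /api/ as in the URL configuration further down; host and token are placeholders:

    import requests

    response = requests.post(
        "http://localhost:8000/api/documents/post_document/",
        headers={"Authorization": "Token <token>"},
        files={"document": open("invoice.pdf", "rb")},  # field name from the serializer
        data={"title": "Invoice"},  # optional override, per validated_data above
    )
    print(response.text)  # "OK" on success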
|  | ||||
| class SearchView(APIView): | ||||
|  | ||||
|     permission_classes = (IsAuthenticated,) | ||||
|  | ||||
|     ix = index.open_index() | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         super(SearchView, self).__init__(*args, **kwargs) | ||||
|         self.ix = index.open_index() | ||||
|  | ||||
|     def add_infos_to_hit(self, r): | ||||
|         doc = Document.objects.get(id=r['id']) | ||||
| @@ -203,33 +281,42 @@ class SearchView(APIView): | ||||
|                 } | ||||
|  | ||||
|     def get(self, request, format=None): | ||||
|         if 'query' in request.query_params: | ||||
|             query = request.query_params['query'] | ||||
|             try: | ||||
|                 page = int(request.query_params.get('page', 1)) | ||||
|             except (ValueError, TypeError): | ||||
|                 page = 1 | ||||
|  | ||||
|             with index.query_page(self.ix, query, page) as result_page: | ||||
|                 return Response( | ||||
|                     {'count': len(result_page), | ||||
|                      'page': result_page.pagenum, | ||||
|                      'page_count': result_page.pagecount, | ||||
|                      'results': list(map(self.add_infos_to_hit, result_page))}) | ||||
|  | ||||
|         else: | ||||
|         if 'query' not in request.query_params: | ||||
|             return Response({ | ||||
|                 'count': 0, | ||||
|                 'page': 0, | ||||
|                 'page_count': 0, | ||||
|                 'results': []}) | ||||
|  | ||||
|         query = request.query_params['query'] | ||||
|         try: | ||||
|             page = int(request.query_params.get('page', 1)) | ||||
|         except (ValueError, TypeError): | ||||
|             page = 1 | ||||
|  | ||||
|         if page < 1: | ||||
|             page = 1 | ||||
|  | ||||
|         try: | ||||
|             with index.query_page(self.ix, query, page) as (result_page, | ||||
|                                                             corrected_query): | ||||
|                 return Response( | ||||
|                     {'count': len(result_page), | ||||
|                      'page': result_page.pagenum, | ||||
|                      'page_count': result_page.pagecount, | ||||
|                      'corrected_query': corrected_query, | ||||
|                      'results': list(map(self.add_infos_to_hit, result_page))}) | ||||
|         except Exception as e: | ||||
|             return HttpResponseBadRequest(str(e)) | ||||
|  | ||||
|  | ||||
| class SearchAutoCompleteView(APIView): | ||||
|  | ||||
|     permission_classes = (IsAuthenticated,) | ||||
|  | ||||
|     ix = index.open_index() | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         super(SearchAutoCompleteView, self).__init__(*args, **kwargs) | ||||
|         self.ix = index.open_index() | ||||
|  | ||||
|     def get(self, request, format=None): | ||||
|         if 'term' in request.query_params: | ||||
|   | ||||
| @@ -1,8 +1,19 @@ | ||||
| from django.conf import settings | ||||
| from django.contrib.auth.models import User | ||||
| from django.utils.deprecation import MiddlewareMixin | ||||
| from rest_framework import authentication | ||||
|  | ||||
|  | ||||
| class AutoLoginMiddleware(MiddlewareMixin): | ||||
|  | ||||
|     def process_request(self, request): | ||||
|         try: | ||||
|             request.user = User.objects.get( | ||||
|                 username=settings.AUTO_LOGIN_USERNAME) | ||||
|         except User.DoesNotExist: | ||||
|             pass | ||||
|  | ||||
|  | ||||
| class AngularApiAuthenticationOverride(authentication.BaseAuthentication): | ||||
|     """ This class is here to provide authentication to the angular dev server | ||||
|         during development. This is disabled in production. | ||||
|   | ||||
| @@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs): | ||||
|     binaries = ( | ||||
|         settings.CONVERT_BINARY, | ||||
|         settings.OPTIPNG_BINARY, | ||||
|         settings.UNPAPER_BINARY, | ||||
|         "tesseract" | ||||
|     ) | ||||
|  | ||||
|   | ||||
| @@ -17,16 +17,3 @@ class GnuPG: | ||||
|             passphrase = settings.PASSPHRASE | ||||
|  | ||||
|         return cls.gpg.decrypt_file(file_handle, passphrase=passphrase).data | ||||
|  | ||||
|     @classmethod | ||||
|     def encrypted(cls, file_handle, passphrase=None): | ||||
|  | ||||
|         if not passphrase: | ||||
|             passphrase = settings.PASSPHRASE | ||||
|  | ||||
|         return cls.gpg.encrypt_file( | ||||
|             file_handle, | ||||
|             recipients=None, | ||||
|             passphrase=passphrase, | ||||
|             symmetric=True | ||||
|         ).data | ||||
|   | ||||
| @@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta | ||||
|  | ||||
| MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) | ||||
| ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") | ||||
| ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive") | ||||
| THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") | ||||
|  | ||||
| DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) | ||||
| @@ -85,6 +86,7 @@ INSTALLED_APPS = [ | ||||
|     "django.contrib.admin", | ||||
|  | ||||
|     "rest_framework", | ||||
|     "rest_framework.authtoken", | ||||
|     "django_filters", | ||||
|  | ||||
|     "django_q", | ||||
| @@ -96,7 +98,8 @@ INSTALLED_APPS = [ | ||||
| REST_FRAMEWORK = { | ||||
|     'DEFAULT_AUTHENTICATION_CLASSES': [ | ||||
|         'rest_framework.authentication.BasicAuthentication', | ||||
|         'rest_framework.authentication.SessionAuthentication' | ||||
|         'rest_framework.authentication.SessionAuthentication', | ||||
|         'rest_framework.authentication.TokenAuthentication' | ||||
|     ] | ||||
| } | ||||
|  | ||||
| @@ -156,6 +159,15 @@ CHANNEL_LAYERS = { | ||||
| # Security                                                                    # | ||||
| ############################################################################### | ||||
|  | ||||
| AUTO_LOGIN_USERNAME = os.getenv("PAPERLESS_AUTO_LOGIN_USERNAME") | ||||
|  | ||||
| if AUTO_LOGIN_USERNAME: | ||||
|     _index = MIDDLEWARE.index('django.contrib.auth.middleware.AuthenticationMiddleware') | ||||
|     # This overrides everything the auth middleware is doing but still allows | ||||
|     # regular login in case the provided user does not exist. | ||||
|     MIDDLEWARE.insert(_index+1, 'paperless.auth.AutoLoginMiddleware') | ||||
|  | ||||
|  | ||||
| if DEBUG: | ||||
|     X_FRAME_OPTIONS = '' | ||||
|     # this should really be 'allow-from uri' but it's not supported in any major | ||||
| @@ -253,29 +265,48 @@ USE_TZ = True | ||||
| # Logging                                                                     # | ||||
| ############################################################################### | ||||
|  | ||||
| DISABLE_DBHANDLER = __get_boolean("PAPERLESS_DISABLE_DBHANDLER") | ||||
|  | ||||
| LOGGING = { | ||||
|     "version": 1, | ||||
|     "disable_existing_loggers": False, | ||||
|     'formatters': { | ||||
|         'verbose': { | ||||
|             'format': '{levelname} {asctime} {module} {message}', | ||||
|             'style': '{', | ||||
|         }, | ||||
|         'simple': { | ||||
|             'format': '{levelname} {message}', | ||||
|             'style': '{', | ||||
|         }, | ||||
|     }, | ||||
|     "handlers": { | ||||
|         "dbhandler": { | ||||
|         "db": { | ||||
|             "level": "DEBUG", | ||||
|             "class": "documents.loggers.PaperlessHandler", | ||||
|         }, | ||||
|         "streamhandler": { | ||||
|             "class": "logging.StreamHandler" | ||||
|         "console": { | ||||
|             "level": "INFO", | ||||
|             "class": "logging.StreamHandler", | ||||
|             "formatter": "verbose", | ||||
|         } | ||||
|     }, | ||||
|     "root": { | ||||
|         "handlers": ["console"], | ||||
|         "level": "DEBUG", | ||||
|     }, | ||||
|     "loggers": { | ||||
|         "documents": { | ||||
|             "handlers": ["dbhandler", "streamhandler"], | ||||
|             "level": "DEBUG" | ||||
|             "handlers": ["db"], | ||||
|             "propagate": True, | ||||
|         }, | ||||
|         "paperless_mail": { | ||||
|             "handlers": ["dbhandler", "streamhandler"], | ||||
|             "level": "DEBUG" | ||||
|             "handlers": ["db"], | ||||
|             "propagate": True, | ||||
|         }, | ||||
|         "paperless_tesseract": { | ||||
|             "handlers": ["dbhandler", "streamhandler"], | ||||
|             "level": "DEBUG" | ||||
|             "handlers": ["db"], | ||||
|             "propagate": True, | ||||
|         }, | ||||
|     }, | ||||
| } | ||||
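With propagate set to True and no level on the named loggers, a record emitted in the documents app reaches both the database handler and, via the root logger, the console:

    import logging

    # Stored as a Log row by the "db" handler and printed by the root
    # "console" handler (which filters at INFO).
    logging.getLogger("documents").info("consumed a file")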
| @@ -332,6 +363,10 @@ CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0)) | ||||
|  | ||||
| CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES") | ||||
|  | ||||
| CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE") | ||||
|  | ||||
| CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS") | ||||
|  | ||||
| OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") | ||||
|  | ||||
| OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) | ||||
| @@ -340,9 +375,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) | ||||
| # documents.  It should be a 3-letter language code consistent with ISO 639. | ||||
| OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||
|  | ||||
| # OCRmyPDF --output-type options are available. | ||||
| # TODO: validate this setting. | ||||
| OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") | ||||
|  | ||||
| # OCR all documents? | ||||
| OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") | ||||
| # skip, skip_noarchive, redo, force | ||||
| # TODO: validate this. | ||||
| OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | ||||
|  | ||||
| OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | ||||
|  | ||||
| OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | ||||
|  | ||||
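
OCR_USER_ARGS is kept as a raw JSON string here and parsed later in this diff, where it is merged into the OCRmyPDF keyword arguments. A hypothetical value (deskew and rotate_pages are documented OCRmyPDF options):

    import json

    # e.g. PAPERLESS_OCR_USER_ARGS='{"deskew": true, "rotate_pages": true}'
    user_args = json.loads('{"deskew": true, "rotate_pages": true}')
    # -> {'deskew': True, 'rotate_pages': True}
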
| # GNUPG needs a home directory for some reason | ||||
| GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||
| @@ -351,11 +394,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||
| CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert") | ||||
| CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | ||||
| CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | ||||
| CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300)) | ||||
|  | ||||
| GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") | ||||
|  | ||||
| OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") | ||||
| UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") | ||||
|  | ||||
|  | ||||
| # Pre-2.x versions of Paperless stored your documents locally with GPG | ||||
|   | ||||
| @@ -4,6 +4,7 @@ from django.contrib.auth.decorators import login_required | ||||
| from django.urls import path, re_path | ||||
| from django.views.decorators.csrf import csrf_exempt | ||||
| from django.views.generic import RedirectView | ||||
| from rest_framework.authtoken import views | ||||
| from rest_framework.routers import DefaultRouter | ||||
|  | ||||
| from paperless.consumers import StatusConsumer | ||||
| @@ -16,7 +17,8 @@ from documents.views import ( | ||||
|     SearchView, | ||||
|     IndexView, | ||||
|     SearchAutoCompleteView, | ||||
|     StatisticsView | ||||
|     StatisticsView, | ||||
|     PostDocumentView | ||||
| ) | ||||
| from paperless.views import FaviconView | ||||
|  | ||||
| @@ -46,6 +48,11 @@ urlpatterns = [ | ||||
|                 StatisticsView.as_view(), | ||||
|                 name="statistics"), | ||||
|  | ||||
|         re_path(r"^documents/post_document/", PostDocumentView.as_view(), | ||||
|                 name="post_document"), | ||||
|  | ||||
|         path('token/', views.obtain_auth_token) | ||||
|  | ||||
|     ] + api_router.urls)), | ||||
|  | ||||
|     re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"), | ||||
|   | ||||
| @@ -1 +1 @@ | ||||
| __version__ = (0, 9, 1) | ||||
| __version__ = (0, 9, 5) | ||||
|   | ||||
| @@ -4,6 +4,7 @@ from datetime import timedelta, date | ||||
|  | ||||
| import magic | ||||
| from django.conf import settings | ||||
| from django.db import DatabaseError | ||||
| from django.utils.text import slugify | ||||
| from django_q.tasks import async_task | ||||
| from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \ | ||||
| @@ -86,46 +87,6 @@ def make_criterias(rule): | ||||
|     return {**criterias, **get_rule_action(rule).get_criteria()} | ||||
|  | ||||
|  | ||||
| def get_title(message, att, rule): | ||||
|     if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT: | ||||
|         title = message.subject | ||||
|     elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME: | ||||
|         title = os.path.splitext(os.path.basename(att.filename))[0] | ||||
|     else: | ||||
|         raise ValueError("Unknown title selector.") | ||||
|  | ||||
|     return title | ||||
|  | ||||
|  | ||||
| def get_correspondent(message, rule): | ||||
|     if rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NOTHING: | ||||
|         correspondent = None | ||||
|     elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_EMAIL: | ||||
|         correspondent_name = message.from_ | ||||
|         correspondent = Correspondent.objects.get_or_create( | ||||
|             name=correspondent_name, defaults={ | ||||
|                 "slug": slugify(correspondent_name) | ||||
|             })[0] | ||||
|     elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NAME: | ||||
|         if message.from_values and \ | ||||
|            'name' in message.from_values \ | ||||
|            and message.from_values['name']: | ||||
|             correspondent_name = message.from_values['name'] | ||||
|         else: | ||||
|             correspondent_name = message.from_ | ||||
|  | ||||
|         correspondent = Correspondent.objects.get_or_create( | ||||
|             name=correspondent_name, defaults={ | ||||
|                 "slug": slugify(correspondent_name) | ||||
|             })[0] | ||||
|     elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_CUSTOM: | ||||
|         correspondent = rule.assign_correspondent | ||||
|     else: | ||||
|         raise ValueError("Unknwown correspondent selector") | ||||
|  | ||||
|     return correspondent | ||||
|  | ||||
|  | ||||
| def get_mailbox(server, port, security): | ||||
|     if security == MailAccount.IMAP_SECURITY_NONE: | ||||
|         mailbox = MailBoxUnencrypted(server, port) | ||||
| @@ -140,6 +101,51 @@ def get_mailbox(server, port, security): | ||||
|  | ||||
| class MailAccountHandler(LoggingMixin): | ||||
|  | ||||
|     def _correspondent_from_name(self, name): | ||||
|         try: | ||||
|             return Correspondent.objects.get_or_create( | ||||
|                 name=name, defaults={ | ||||
|                     "slug": slugify(name) | ||||
|                 })[0] | ||||
|         except DatabaseError as e: | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 f"Error while retrieving correspondent {name}: {e}" | ||||
|             ) | ||||
|             return None | ||||
|  | ||||
|     def get_title(self, message, att, rule): | ||||
|         if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT: | ||||
|             return message.subject | ||||
|  | ||||
|         elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME: | ||||
|             return os.path.splitext(os.path.basename(att.filename))[0] | ||||
|  | ||||
|         else: | ||||
|             raise ValueError("Unknown title selector.") | ||||
|  | ||||
|     def get_correspondent(self, message, rule): | ||||
|         c_from = rule.assign_correspondent_from | ||||
|  | ||||
|         if c_from == MailRule.CORRESPONDENT_FROM_NOTHING: | ||||
|             return None | ||||
|  | ||||
|         elif c_from == MailRule.CORRESPONDENT_FROM_EMAIL: | ||||
|             return self._correspondent_from_name(message.from_) | ||||
|  | ||||
|         elif c_from == MailRule.CORRESPONDENT_FROM_NAME: | ||||
|             if message.from_values and 'name' in message.from_values and message.from_values['name']:  # NOQA: E501 | ||||
|                 return self._correspondent_from_name( | ||||
|                     message.from_values['name']) | ||||
|             else: | ||||
|                 return self._correspondent_from_name(message.from_) | ||||
|  | ||||
|         elif c_from == MailRule.CORRESPONDENT_FROM_CUSTOM: | ||||
|             return rule.assign_correspondent | ||||
|  | ||||
|         else: | ||||
|             raise ValueError("Unknwown correspondent selector") | ||||
|  | ||||
|     def handle_mail_account(self, account): | ||||
|  | ||||
|         self.renew_logging_group() | ||||
| @@ -156,79 +162,89 @@ class MailAccountHandler(LoggingMixin): | ||||
|                 M.login(account.username, account.password) | ||||
|             except Exception: | ||||
|                 raise MailError( | ||||
|                     f"Error while authenticating account {account.name}") | ||||
|                     f"Error while authenticating account {account}") | ||||
|  | ||||
|             self.log('debug', f"Account {account}: Processing " | ||||
|                               f"{account.rules.count()} rule(s)") | ||||
|  | ||||
|             for rule in account.rules.order_by('order'): | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Account {account}: Processing rule {rule.name}") | ||||
|  | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {account}.{rule}: Selecting folder {rule.folder}") | ||||
|  | ||||
|                 try: | ||||
|                     M.folder.set(rule.folder) | ||||
|                 except MailboxFolderSelectError: | ||||
|                     raise MailError( | ||||
|                         f"Rule {rule.name}: Folder {rule.folder} " | ||||
|                         f"does not exist in account {account.name}") | ||||
|                     total_processed_files += self.handle_mail_rule(M, rule) | ||||
|                 except Exception as e: | ||||
|                     self.log( | ||||
|                         "error", | ||||
|                         f"Rule {rule}: Error while processing rule: {e}", | ||||
|                         exc_info=True | ||||
|                     ) | ||||
|  | ||||
|                 criterias = make_criterias(rule) | ||||
|         return total_processed_files | ||||
|  | ||||
|     def handle_mail_rule(self, M, rule): | ||||
|  | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule}: Selecting folder {rule.folder}") | ||||
|  | ||||
|         try: | ||||
|             M.folder.set(rule.folder) | ||||
|         except MailboxFolderSelectError: | ||||
|             raise MailError( | ||||
|                 f"Rule {rule}: Folder {rule.folder} " | ||||
|                 f"does not exist in account {rule.account}") | ||||
|  | ||||
|         criterias = make_criterias(rule) | ||||
|  | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule}: Searching folder with criteria " | ||||
|             f"{str(AND(**criterias))}") | ||||
|  | ||||
|         try: | ||||
|             messages = M.fetch(criteria=AND(**criterias), | ||||
|                                mark_seen=False) | ||||
|         except Exception: | ||||
|             raise MailError( | ||||
|                 f"Rule {rule}: Error while fetching folder {rule.folder}") | ||||
|  | ||||
|         post_consume_messages = [] | ||||
|  | ||||
|         mails_processed = 0 | ||||
|         total_processed_files = 0 | ||||
|  | ||||
|         for message in messages: | ||||
|             try: | ||||
|                 processed_files = self.handle_message(message, rule) | ||||
|                 if processed_files > 0: | ||||
|                     post_consume_messages.append(message.uid) | ||||
|  | ||||
|                 total_processed_files += processed_files | ||||
|                 mails_processed += 1 | ||||
|             except Exception as e: | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {account}.{rule}: Searching folder with criteria " | ||||
|                     f"{str(AND(**criterias))}") | ||||
|                     "error", | ||||
|                     f"Rule {rule}: Error while processing mail " | ||||
|                     f"{message.uid}: {e}", | ||||
|                     exc_info=True) | ||||
|  | ||||
|                 try: | ||||
|                     messages = M.fetch(criteria=AND(**criterias), | ||||
|                                        mark_seen=False) | ||||
|                 except Exception: | ||||
|                     raise MailError( | ||||
|                         f"Rule {rule.name}: Error while fetching folder " | ||||
|                         f"{rule.folder} of account {account.name}") | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule}: Processed {mails_processed} matching mail(s)") | ||||
|  | ||||
|                 post_consume_messages = [] | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule}: Running mail actions on " | ||||
|             f"{len(post_consume_messages)} mails") | ||||
|  | ||||
|                 mails_processed = 0 | ||||
|         try: | ||||
|             get_rule_action(rule).post_consume( | ||||
|                 M, | ||||
|                 post_consume_messages, | ||||
|                 rule.action_parameter) | ||||
|  | ||||
|                 for message in messages: | ||||
|                     try: | ||||
|                         processed_files = self.handle_message(message, rule) | ||||
|                     except Exception: | ||||
|                         raise MailError( | ||||
|                             f"Rule {rule.name}: Error while processing mail " | ||||
|                             f"{message.uid} of account {account.name}") | ||||
|                     if processed_files > 0: | ||||
|                         post_consume_messages.append(message.uid) | ||||
|  | ||||
|                     total_processed_files += processed_files | ||||
|                     mails_processed += 1 | ||||
|  | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {account}.{rule}: Processed {mails_processed} " | ||||
|                     f"matching mail(s)") | ||||
|  | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {account}.{rule}: Running mail actions on " | ||||
|                     f"{len(post_consume_messages)} mails") | ||||
|  | ||||
|                 try: | ||||
|                     get_rule_action(rule).post_consume( | ||||
|                         M, | ||||
|                         post_consume_messages, | ||||
|                         rule.action_parameter) | ||||
|  | ||||
|                 except Exception: | ||||
|                     raise MailError( | ||||
|                         f"Rule {rule.name}: Error while processing " | ||||
|                         f"post-consume actions for account {account.name}") | ||||
|         except Exception as e: | ||||
|             raise MailError( | ||||
|                 f"Rule {rule}: Error while processing post-consume actions: " | ||||
|                 f"{e}") | ||||
|  | ||||
|         return total_processed_files | ||||
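
The refactor splits the previous monolithic loop into three levels, each of which catches errors itself, so one bad rule or mail no longer aborts the whole account. The resulting structure, read off the diff:

    # handle_mail_account(account)       # logs in, iterates rules, logs per-rule errors
    #   handle_mail_rule(M, rule)        # selects folder, fetches, logs per-mail errors
    #     handle_message(message, rule)  # consumes attachments, returns file count
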
|  | ||||
| @@ -238,11 +254,11 @@ class MailAccountHandler(LoggingMixin): | ||||
|  | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule.account}.{rule}: " | ||||
|             f"Rule {rule}: " | ||||
|             f"Processing mail {message.subject} from {message.from_} with " | ||||
|             f"{len(message.attachments)} attachment(s)") | ||||
|  | ||||
|         correspondent = get_correspondent(message, rule) | ||||
|         correspondent = self.get_correspondent(message, rule) | ||||
|         tag = rule.assign_tag | ||||
|         doc_type = rule.assign_document_type | ||||
|  | ||||
| @@ -253,12 +269,12 @@ class MailAccountHandler(LoggingMixin): | ||||
|             if not att.content_disposition == "attachment": | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {rule.account}.{rule}: " | ||||
|                     f"Rule {rule}: " | ||||
|                     f"Skipping attachment {att.filename} " | ||||
|                     f"with content disposition inline") | ||||
|                     f"with content disposition {att.content_disposition}") | ||||
|                 continue | ||||
|  | ||||
|             title = get_title(message, att, rule) | ||||
|             title = self.get_title(message, att, rule) | ||||
|  | ||||
|             # don't trust the content type of the attachment. Could be | ||||
|             # generic application/octet-stream. | ||||
| @@ -274,7 +290,7 @@ class MailAccountHandler(LoggingMixin): | ||||
|  | ||||
|                 self.log( | ||||
|                     'info', | ||||
|                     f"Rule {rule.account}.{rule}: " | ||||
|                     f"Rule {rule}: " | ||||
|                     f"Consuming attachment {att.filename} from mail " | ||||
|                     f"{message.subject} from {message.from_}") | ||||
|  | ||||
| @@ -293,7 +309,7 @@ class MailAccountHandler(LoggingMixin): | ||||
|             else: | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {rule.account}.{rule}: " | ||||
|                     f"Rule {rule}: " | ||||
|                     f"Skipping attachment {att.filename} " | ||||
|                     f"since guessed mime type {mime_type} is not supported " | ||||
|                     f"by paperless") | ||||
|   | ||||
| @@ -139,4 +139,4 @@ class MailRule(models.Model): | ||||
|     ) | ||||
|  | ||||
|     def __str__(self): | ||||
|         return self.name | ||||
|         return f"{self.account.name}.{self.name}" | ||||
|   | ||||
| @@ -1,14 +1,20 @@ | ||||
| import logging | ||||
|  | ||||
| from paperless_mail.mail import MailAccountHandler | ||||
| from paperless_mail.mail import MailAccountHandler, MailError | ||||
| from paperless_mail.models import MailAccount | ||||
|  | ||||
|  | ||||
| def process_mail_accounts(): | ||||
|     total_new_documents = 0 | ||||
|     for account in MailAccount.objects.all(): | ||||
|         total_new_documents += MailAccountHandler().handle_mail_account( | ||||
|             account) | ||||
|         try: | ||||
|             total_new_documents += MailAccountHandler().handle_mail_account( | ||||
|                 account) | ||||
|         except MailError as e: | ||||
|             logging.getLogger(__name__).error( | ||||
|                 f"Error while processing mail account {account}: {e}", | ||||
|                 exc_info=True | ||||
|             ) | ||||
|  | ||||
|     if total_new_documents > 0: | ||||
|         return f"Added {total_new_documents} document(s)." | ||||
| @@ -17,8 +23,8 @@ def process_mail_accounts(): | ||||
|  | ||||
|  | ||||
| def process_mail_account(name): | ||||
|     account = MailAccount.objects.find(name=name) | ||||
|     if account: | ||||
|     try: | ||||
|         account = MailAccount.objects.get(name=name) | ||||
|         MailAccountHandler().handle_mail_account(account) | ||||
|     else: | ||||
|         logging.error("Unknown mail acccount: {}".format(name)) | ||||
|     except MailAccount.DoesNotExist: | ||||
|         logging.getLogger(__name__).error(f"Unknown mail account: {name}") | ||||
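
This rewrite also fixes a latent bug: the old code called MailAccount.objects.find(), which is not a Django manager method and would have raised AttributeError. Usage stays the same:

    from paperless_mail import tasks

    tasks.process_mail_accounts()              # all accounts; errors are logged
    tasks.process_mail_account("my-account")   # single account; name is a placeholder
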
|   | ||||
| @@ -3,11 +3,14 @@ from collections import namedtuple | ||||
| from typing import ContextManager | ||||
| from unittest import mock | ||||
|  | ||||
| from django.core.management import call_command | ||||
| from django.db import DatabaseError | ||||
| from django.test import TestCase | ||||
| from imap_tools import MailMessageFlags, MailboxFolderSelectError | ||||
|  | ||||
| from documents.models import Correspondent | ||||
| from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title | ||||
| from paperless_mail import tasks | ||||
| from paperless_mail.mail import MailError, MailAccountHandler | ||||
| from paperless_mail.models import MailRule, MailAccount | ||||
|  | ||||
|  | ||||
| @@ -163,28 +166,30 @@ class TestMail(TestCase): | ||||
|         me_localhost = Correspondent.objects.create(name=message2.from_) | ||||
|         someone_else = Correspondent.objects.create(name="someone else") | ||||
|  | ||||
|         handler = MailAccountHandler() | ||||
|  | ||||
|         rule = MailRule(name="a", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING) | ||||
|         self.assertIsNone(get_correspondent(message, rule)) | ||||
|         self.assertIsNone(handler.get_correspondent(message, rule)) | ||||
|  | ||||
|         rule = MailRule(name="b", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL) | ||||
|         c = get_correspondent(message, rule) | ||||
|         c = handler.get_correspondent(message, rule) | ||||
|         self.assertIsNotNone(c) | ||||
|         self.assertEqual(c.name, "someone@somewhere.com") | ||||
|         c = get_correspondent(message2, rule) | ||||
|         c = handler.get_correspondent(message2, rule) | ||||
|         self.assertIsNotNone(c) | ||||
|         self.assertEqual(c.name, "me@localhost.com") | ||||
|         self.assertEqual(c.id, me_localhost.id) | ||||
|  | ||||
|         rule = MailRule(name="c", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME) | ||||
|         c = get_correspondent(message, rule) | ||||
|         c = handler.get_correspondent(message, rule) | ||||
|         self.assertIsNotNone(c) | ||||
|         self.assertEqual(c.name, "Someone!") | ||||
|         c = get_correspondent(message2, rule) | ||||
|         c = handler.get_correspondent(message2, rule) | ||||
|         self.assertIsNotNone(c) | ||||
|         self.assertEqual(c.id, me_localhost.id) | ||||
|  | ||||
|         rule = MailRule(name="d", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else) | ||||
|         c = get_correspondent(message, rule) | ||||
|         c = handler.get_correspondent(message, rule) | ||||
|         self.assertEqual(c, someone_else) | ||||
|  | ||||
|     def test_get_title(self): | ||||
| @@ -192,10 +197,13 @@ class TestMail(TestCase): | ||||
|         message.subject = "the message title" | ||||
|         att = namedtuple('Attachment', []) | ||||
|         att.filename = "this_is_the_file.pdf" | ||||
|  | ||||
|         handler = MailAccountHandler() | ||||
|  | ||||
|         rule = MailRule(name="a", assign_title_from=MailRule.TITLE_FROM_FILENAME) | ||||
|         self.assertEqual(get_title(message, att, rule), "this_is_the_file") | ||||
|         self.assertEqual(handler.get_title(message, att, rule), "this_is_the_file") | ||||
|         rule = MailRule(name="b", assign_title_from=MailRule.TITLE_FROM_SUBJECT) | ||||
|         self.assertEqual(get_title(message, att, rule), "the message title") | ||||
|         self.assertEqual(handler.get_title(message, att, rule), "the message title") | ||||
|  | ||||
|     def test_handle_message(self): | ||||
|         message = create_message(subject="the message title", from_="Myself", num_attachments=2) | ||||
| @@ -317,7 +325,7 @@ class TestMail(TestCase): | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 2) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) | ||||
|  | ||||
|     def test_errors(self): | ||||
|     def test_error_login(self): | ||||
|         account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong") | ||||
|  | ||||
|         try: | ||||
| @@ -327,26 +335,84 @@ class TestMail(TestCase): | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|     def test_error_skip_account(self): | ||||
|         account_faulty = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wroasdng") | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") | ||||
|         rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh") | ||||
|         rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, | ||||
|                                        action_parameter="spam", filter_subject="Claim") | ||||
|  | ||||
|         tasks.process_mail_accounts() | ||||
|         self.assertEqual(self.async_task.call_count, 1) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 2) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) | ||||
|  | ||||
|     def test_error_skip_rule(self): | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") | ||||
|         rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, | ||||
|                                        action_parameter="spam", filter_subject="Claim", order=1, folder="uuuhhhh") | ||||
|         rule2 = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, | ||||
|                                        action_parameter="spam", filter_subject="Claim", order=2) | ||||
|  | ||||
|         self.mail_account_handler.handle_mail_account(account) | ||||
|         self.assertEqual(self.async_task.call_count, 1) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 2) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) | ||||
|  | ||||
|  | ||||
|     @mock.patch("paperless_mail.mail.MailAccountHandler.get_correspondent") | ||||
|     def test_error_skip_mail(self, m): | ||||
|  | ||||
|         def get_correspondent_fake(message, rule): | ||||
|             if message.from_ == 'amazon@amazon.de': | ||||
|                 raise ValueError("Does not compute.") | ||||
|             else: | ||||
|                 return None | ||||
|  | ||||
|         m.side_effect = get_correspondent_fake | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") | ||||
|         rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam") | ||||
|  | ||||
|         self.mail_account_handler.handle_mail_account(account) | ||||
|  | ||||
|         # test that we still consume mail even if some mails throw errors. | ||||
|         self.assertEqual(self.async_task.call_count, 2) | ||||
|  | ||||
|         # faulty mail still in inbox, untouched | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 1) | ||||
|         self.assertEqual(self.bogus_mailbox.messages[0].from_, 'amazon@amazon.de') | ||||
|  | ||||
|     def test_error_create_correspondent(self): | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") | ||||
|         rule = MailRule.objects.create( | ||||
|             name="testrule", filter_from="amazon@amazon.de", | ||||
|             account=account, action=MailRule.ACTION_MOVE, action_parameter="spam", | ||||
|             assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL) | ||||
|  | ||||
|         self.mail_account_handler.handle_mail_account(account) | ||||
|  | ||||
|         self.async_task.assert_called_once() | ||||
|         args, kwargs = self.async_task.call_args | ||||
|  | ||||
|         c = Correspondent.objects.get(name="amazon@amazon.de") | ||||
|         # should work | ||||
|         self.assertEquals(kwargs['override_correspondent_id'], c.id) | ||||
|  | ||||
|         self.async_task.reset_mock() | ||||
|         self.reset_bogus_mailbox() | ||||
|  | ||||
|         with mock.patch("paperless_mail.mail.Correspondent.objects.get_or_create") as m: | ||||
|             m.side_effect = DatabaseError() | ||||
|  | ||||
|         try: | ||||
|             self.mail_account_handler.handle_mail_account(account) | ||||
|         except MailError as e: | ||||
|             self.assertTrue("uuuh does not exist" in str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret") | ||||
|         args, kwargs = self.async_task.call_args | ||||
|         self.async_task.assert_called_once() | ||||
|         self.assertEquals(kwargs['override_correspondent_id'], None) | ||||
|  | ||||
|         rule = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim") | ||||
|  | ||||
|         try: | ||||
|             self.mail_account_handler.handle_mail_account(account) | ||||
|         except MailError as e: | ||||
|             self.assertTrue("Error while processing post-consume actions" in str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|     def test_filters(self): | ||||
|  | ||||
| @@ -390,3 +456,43 @@ class TestMail(TestCase): | ||||
|         self.mail_account_handler.handle_mail_account(account) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 2) | ||||
|         self.assertEqual(self.async_task.call_count, 5) | ||||
|  | ||||
| class TestManagementCommand(TestCase): | ||||
|  | ||||
|     @mock.patch("paperless_mail.management.commands.mail_fetcher.tasks.process_mail_accounts") | ||||
|     def test_mail_fetcher(self, m): | ||||
|  | ||||
|         call_command("mail_fetcher") | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
| class TestTasks(TestCase): | ||||
|  | ||||
|     @mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account") | ||||
|     def test_all_accounts(self, m): | ||||
|         m.side_effect = lambda account: 6 | ||||
|  | ||||
|         MailAccount.objects.create(name="A", imap_server="A", username="A", password="A") | ||||
|         MailAccount.objects.create(name="B", imap_server="A", username="A", password="A") | ||||
|  | ||||
|         result = tasks.process_mail_accounts() | ||||
|  | ||||
|         self.assertEqual(m.call_count, 2) | ||||
|         self.assertIn("Added 12", result) | ||||
|  | ||||
|         m.side_effect = lambda account: 0 | ||||
|         result = tasks.process_mail_accounts() | ||||
|         self.assertIn("No new", result) | ||||
|  | ||||
|     @mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account") | ||||
|     def test_single_accounts(self, m): | ||||
|  | ||||
|         MailAccount.objects.create(name="A", imap_server="A", username="A", password="A") | ||||
|  | ||||
|         tasks.process_mail_account("A") | ||||
|  | ||||
|         m.assert_called_once() | ||||
|         m.reset_mock() | ||||
|  | ||||
|         tasks.process_mail_account("B") | ||||
|         m.assert_not_called() | ||||
|   | ||||
| @@ -0,0 +1,2 @@ | ||||
| # this is here so that django finds the checks. | ||||
| from .checks import * | ||||
|   | ||||
src/paperless_tesseract/checks.py (new file, 34 lines)
						| @@ -0,0 +1,34 @@ | ||||
| import subprocess | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.checks import Error, Warning, register | ||||
|  | ||||
|  | ||||
| def get_tesseract_langs(): | ||||
|     with subprocess.Popen(['tesseract', '--list-langs'], | ||||
|                           stdout=subprocess.PIPE) as p: | ||||
|         stdout, stderr = p.communicate() | ||||
|  | ||||
|     return stdout.decode().strip().split("\n")[1:] | ||||
|  | ||||
|  | ||||
| @register() | ||||
| def check_default_language_available(app_configs, **kwargs): | ||||
|     installed_langs = get_tesseract_langs() | ||||
|  | ||||
|     if not settings.OCR_LANGUAGE: | ||||
|         return [Warning( | ||||
|             "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " | ||||
|             "This means that tesseract will fallback to english." | ||||
|         )] | ||||
|  | ||||
|     specified_langs = settings.OCR_LANGUAGE.split("+") | ||||
|  | ||||
|     for lang in specified_langs: | ||||
|         if lang not in installed_langs: | ||||
|             return [Error( | ||||
|                 f"The selected ocr language {lang} is " | ||||
|                 f"not installed. Paperless cannot OCR your documents " | ||||
|                 f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")] | ||||
|  | ||||
|     return [] | ||||
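
The @register() decorator hooks this into Django's system check framework, so it runs automatically with most manage.py commands; it can also be triggered explicitly:

    from django.core.management import call_command

    call_command("check")  # runs all registered checks, including the one above
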
| @@ -1,23 +1,15 @@ | ||||
| import itertools | ||||
| import json | ||||
| import os | ||||
| import re | ||||
| import subprocess | ||||
| from multiprocessing.pool import ThreadPool | ||||
|  | ||||
| import langdetect | ||||
| import ocrmypdf | ||||
| import pdftotext | ||||
| import pyocr | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
| from pyocr import PyocrException | ||||
| from ocrmypdf import InputFileError, EncryptedPdfError | ||||
|  | ||||
| from documents.parsers import DocumentParser, ParseError, run_unpaper, \ | ||||
|     run_convert | ||||
| from .languages import ISO639 | ||||
|  | ||||
|  | ||||
| class OCRError(Exception): | ||||
|     pass | ||||
| from documents.parsers import DocumentParser, ParseError, run_convert | ||||
|  | ||||
|  | ||||
| class RasterisedDocumentParser(DocumentParser): | ||||
| @@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path, logging_group, progress_callback): | ||||
|         super().__init__(path, logging_group, progress_callback) | ||||
|         self._text = None | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|         """ | ||||
| @@ -43,8 +31,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                         scale="500x5000>", | ||||
|                         alpha="remove", | ||||
|                         strip=True, | ||||
|                         trim=True, | ||||
|                         input_file="{}[0]".format(self.document_path), | ||||
|                         trim=False, | ||||
|                         input_file="{}[0]".format(document_path), | ||||
|                         output_file=out_path, | ||||
|                         logging_group=self.logging_group) | ||||
|         except ParseError: | ||||
| @@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                    "-q", | ||||
|                    "-sDEVICE=pngalpha", | ||||
|                    "-o", gs_out_path, | ||||
|                    self.document_path] | ||||
|                    document_path] | ||||
|             if not subprocess.Popen(cmd).wait() == 0: | ||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) | ||||
|             # then run convert on the output from gs | ||||
| @@ -67,187 +55,160 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                         scale="500x5000>", | ||||
|                         alpha="remove", | ||||
|                         strip=True, | ||||
|                         trim=True, | ||||
|                         trim=False, | ||||
|                         input_file=gs_out_path, | ||||
|                         output_file=out_path, | ||||
|                         logging_group=self.logging_group) | ||||
|  | ||||
|         return out_path | ||||
|  | ||||
|     def _is_ocred(self): | ||||
|  | ||||
|         # Extract text from PDF using pdftotext | ||||
|         text = get_text_from_pdf(self.document_path) | ||||
|  | ||||
|         # We assume, that a PDF with at least 50 characters contains text | ||||
|         # (so no OCR required) | ||||
|         return len(text) > 50 | ||||
|  | ||||
|     def get_text(self): | ||||
|  | ||||
|         if self._text is not None: | ||||
|             return self._text | ||||
|  | ||||
|         if not settings.OCR_ALWAYS and self._is_ocred(): | ||||
|             self.log("debug", "Skipping OCR, using Text from PDF") | ||||
|             self._text = get_text_from_pdf(self.document_path) | ||||
|             return self._text | ||||
|  | ||||
|         self.progress_callback(0, 1, "Making greyscale images.") | ||||
|         images = self._get_greyscale() | ||||
|  | ||||
|         if not images: | ||||
|             raise ParseError("Empty document, nothing to do.") | ||||
|     def is_image(self, mime_type): | ||||
|         return mime_type in [ | ||||
|             "image/png", | ||||
|             "image/jpeg", | ||||
|             "image/tiff", | ||||
|             "image/bmp", | ||||
|             "image/gif", | ||||
|         ] | ||||
|  | ||||
|     def get_dpi(self, image): | ||||
|         try: | ||||
|  | ||||
|             sample_page_index = int(len(images) / 2) | ||||
|             self.log( | ||||
|                 "debug", | ||||
|                 f"Attempting language detection on page " | ||||
|                 f"{sample_page_index + 1} of {len(images)}...") | ||||
|             self.progress_callback(0.4, 1, "Language Detection.") | ||||
|             sample_page_text = self._ocr([images[sample_page_index]], | ||||
|                                          settings.OCR_LANGUAGE)[0] | ||||
|             guessed_language = self._guess_language(sample_page_text) | ||||
|             self.progress_callback(0.6, 1, "OCR all the pages.") | ||||
|  | ||||
|             if not guessed_language or guessed_language not in ISO639: | ||||
|                 self.log("warning", "Language detection failed.") | ||||
|                 ocr_pages = self._complete_ocr_default_language( | ||||
|                     images, sample_page_index, sample_page_text) | ||||
|  | ||||
|             elif ISO639[guessed_language] == settings.OCR_LANGUAGE: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
|                     f"Detected language: {guessed_language} " | ||||
|                     f"(default language)") | ||||
|                 ocr_pages = self._complete_ocr_default_language( | ||||
|                     images, sample_page_index, sample_page_text) | ||||
|  | ||||
|             elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501 | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     f"Detected language {guessed_language} is not available " | ||||
|                     f"on this system.") | ||||
|                 ocr_pages = self._complete_ocr_default_language( | ||||
|                     images, sample_page_index, sample_page_text) | ||||
|  | ||||
|             else: | ||||
|                 self.log("debug", f"Detected language: {guessed_language}") | ||||
|                 ocr_pages = self._ocr( | ||||
|                     images, ISO639[guessed_language], report_progress=True) | ||||
|  | ||||
|             self.log("debug", "OCR completed.") | ||||
|             self._text = strip_excess_whitespace(" ".join(ocr_pages)) | ||||
|             return self._text | ||||
|  | ||||
|         except OCRError as e: | ||||
|             raise ParseError(e) | ||||
|  | ||||
|     def _get_greyscale(self): | ||||
|         """ | ||||
|         Greyscale images are easier for Tesseract to OCR | ||||
|         """ | ||||
|  | ||||
|         # Convert PDF to multiple PNMs | ||||
|         input_file = self.document_path | ||||
|  | ||||
|         if settings.OCR_PAGES == 1: | ||||
|             input_file += "[0]" | ||||
|         elif settings.OCR_PAGES > 1: | ||||
|             input_file += f"[0-{settings.OCR_PAGES - 1}]" | ||||
|  | ||||
|         self.log( | ||||
|             "debug", | ||||
|             f"Converting document {input_file} into greyscale images") | ||||
|  | ||||
|         output_files = os.path.join(self.tempdir, "convert-%04d.pnm") | ||||
|  | ||||
|         run_convert(density=settings.CONVERT_DENSITY, | ||||
|                     depth="8", | ||||
|                     type="grayscale", | ||||
|                     input_file=input_file, | ||||
|                     output_file=output_files, | ||||
|                     logging_group=self.logging_group) | ||||
|  | ||||
|         # Get a list of converted images | ||||
|         pnms = [] | ||||
|         for f in os.listdir(self.tempdir): | ||||
|             if f.endswith(".pnm"): | ||||
|                 pnms.append(os.path.join(self.tempdir, f)) | ||||
|  | ||||
|         self.log("debug", f"Running unpaper on {len(pnms)} pages...") | ||||
|  | ||||
|         self.progress_callback(0.2,1, "Running unpaper on {} pages...".format(len(pnms))) | ||||
|  | ||||
|         # Run unpaper in parallel on converted images | ||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: | ||||
|             pnms = pool.map(run_unpaper, pnms) | ||||
|  | ||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||
|  | ||||
|     def _guess_language(self, text): | ||||
|         try: | ||||
|             guess = langdetect.detect(text) | ||||
|             return guess | ||||
|             with Image.open(image) as im: | ||||
|                 x, y = im.info['dpi'] | ||||
|                 return x | ||||
|         except Exception as e: | ||||
|             self.log('warning', f"Language detection failed with: {e}") | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Error while getting DPI from image {image}: {e}") | ||||
|             return None | ||||
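
get_dpi relies on Pillow exposing DPI metadata, when present, through Image.info. A minimal sketch (the file name is hypothetical):

    from PIL import Image

    with Image.open("scan.png") as im:
        # e.g. (300, 300); the key is absent for files without DPI metadata
        print(im.info.get("dpi"))
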
|  | ||||
|     def _ocr(self, imgs, lang, report_progress=False): | ||||
|         self.log( | ||||
|             "debug", | ||||
|             f"Performing OCR on {len(imgs)} page(s) with language {lang}") | ||||
|         r = [] | ||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: | ||||
|             # r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||
|             for i, page in enumerate(pool.imap(image_to_string, itertools.product(imgs, [lang]))): | ||||
|                 if report_progress: | ||||
|                     self.progress_callback(0.6 + (i / len(imgs)) * 0.4, 1, "OCR'ed {} pages".format(i+1)) | ||||
|                 r += [page] | ||||
|         return r | ||||
|     def parse(self, document_path, mime_type): | ||||
|         mode = settings.OCR_MODE | ||||
|  | ||||
|     def _complete_ocr_default_language(self, | ||||
|                                        images, | ||||
|                                        sample_page_index, | ||||
|                                        sample_page): | ||||
|         images_copy = list(images) | ||||
|         del images_copy[sample_page_index] | ||||
|         if images_copy: | ||||
|             self.log('debug', "Continuing ocr with default language.") | ||||
|             ocr_pages = self._ocr( | ||||
|                 images_copy, settings.OCR_LANGUAGE, report_progress=True) | ||||
|             ocr_pages.insert(sample_page_index, sample_page) | ||||
|             return ocr_pages | ||||
|         text_original = get_text_from_pdf(document_path) | ||||
|         has_text = text_original and len(text_original) > 50 | ||||
|  | ||||
|         if mode == "skip_noarchive" and has_text: | ||||
|             self.log("debug", | ||||
|                      "Document has text, skipping OCRmyPDF entirely.") | ||||
|             self.text = text_original | ||||
|             return | ||||
|  | ||||
|         if mode in ['skip', 'skip_noarchive'] and not has_text: | ||||
|             # upgrade to redo, since there appears to be no text in the | ||||
|             # document. This happens to some weird encrypted documents or | ||||
|             # documents with failed OCR attempts for which OCRmyPDF will | ||||
|             # still report that there actually is text in them. | ||||
|             self.log("debug", | ||||
|                      "No text was found in the document and skip is " | ||||
|                      "specified. Upgrading OCR mode to redo.") | ||||
|             mode = "redo" | ||||
|  | ||||
|         archive_path = os.path.join(self.tempdir, "archive.pdf") | ||||
|  | ||||
|         ocr_args = { | ||||
|             'input_file': document_path, | ||||
|             'output_file': archive_path, | ||||
|             'use_threads': True, | ||||
|             'jobs': settings.THREADS_PER_WORKER, | ||||
|             'language': settings.OCR_LANGUAGE, | ||||
|             'output_type': settings.OCR_OUTPUT_TYPE, | ||||
|             'progress_bar': False, | ||||
|             'clean': True | ||||
|         } | ||||
|  | ||||
|         if settings.OCR_PAGES > 0: | ||||
|             ocr_args['pages'] = f"1-{settings.OCR_PAGES}" | ||||
|  | ||||
|         # Mode selection. | ||||
|  | ||||
|         if mode in ['skip', 'skip_noarchive']: | ||||
|             ocr_args['skip_text'] = True | ||||
|         elif mode == 'redo': | ||||
|             ocr_args['redo_ocr'] = True | ||||
|         elif mode == 'force': | ||||
|             ocr_args['force_ocr'] = True | ||||
|         else: | ||||
|             return [sample_page] | ||||
|             raise ParseError( | ||||
|                 f"Invalid ocr mode: {mode}") | ||||
|  | ||||
|         if self.is_image(mime_type): | ||||
|             dpi = self.get_dpi(document_path) | ||||
|             if dpi: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
|                     f"Detected DPI for image {document_path}: {dpi}" | ||||
|                 ) | ||||
|                 ocr_args['image_dpi'] = dpi | ||||
|             elif settings.OCR_IMAGE_DPI: | ||||
|                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||
|             else: | ||||
|                 raise ParseError( | ||||
|                     f"Cannot produce archive PDF for image {document_path}, " | ||||
|                     f"no DPI information is present in this image and " | ||||
|                     f"OCR_IMAGE_DPI is not set.") | ||||
|  | ||||
|         if settings.OCR_USER_ARGS: | ||||
|             try: | ||||
|                 user_args = json.loads(settings.OCR_USER_ARGS) | ||||
|                 ocr_args = {**ocr_args, **user_args} | ||||
|             except Exception as e: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " | ||||
|                     f"they will not be used: {e}") | ||||
|  | ||||
|         # This forces tesseract to use one core per page. | ||||
|         os.environ['OMP_THREAD_LIMIT'] = "1" | ||||
|  | ||||
|         try: | ||||
|             self.log("debug", | ||||
|                      f"Calling OCRmyPDF with {str(ocr_args)}") | ||||
|             ocrmypdf.ocr(**ocr_args) | ||||
|             # success! announce results | ||||
|             self.archive_path = archive_path | ||||
|             self.text = get_text_from_pdf(archive_path) | ||||
|  | ||||
|         except (InputFileError, EncryptedPdfError) as e: | ||||
|  | ||||
|             self.log("debug", | ||||
|                      f"Encountered an error: {e}. Trying to use text from " | ||||
|                      f"original.") | ||||
|             # This happens with some PDFs when used with the redo_ocr option. | ||||
|             # This is not the end of the world, we'll just use what we already | ||||
|             # have in the document. | ||||
|             self.text = text_original | ||||
|             # Also, no archived file. | ||||
|             if not self.text: | ||||
|                 # However, if we don't have anything, fail: | ||||
|                 raise ParseError(e) | ||||
|  | ||||
|         except Exception as e: | ||||
|             # Anything else is probably serious. | ||||
|             raise ParseError(e) | ||||
|  | ||||
|         if not self.text: | ||||
|             # This may happen for files that don't have any text. | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Document {document_path} does not have any text." | ||||
|                 f"This is probably an error or you tried to add an image " | ||||
|                 f"without text, or something is wrong with this document.") | ||||
|             self.text = "" | ||||
|  | ||||
|  | ||||
| def strip_excess_whitespace(text): | ||||
|     if not text: | ||||
|         return None | ||||
|  | ||||
|     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) | ||||
|     no_leading_whitespace = re.sub( | ||||
|         r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) | ||||
|     no_trailing_whitespace = re.sub( | ||||
|         r"([^\S\n\r]+)$", '', no_leading_whitespace) | ||||
|     return no_trailing_whitespace | ||||
|  | ||||
|  | ||||
| def image_to_string(args): | ||||
|     img, lang = args | ||||
|     ocr = pyocr.get_available_tools()[0] | ||||
|     with Image.open(img) as f: | ||||
|         if ocr.can_detect_orientation(): | ||||
|             try: | ||||
|                 orientation = ocr.detect_orientation(f, lang=lang) | ||||
|                 f = f.rotate(orientation["angle"], expand=1) | ||||
|             except Exception: | ||||
|                 # Rotation not possible, ignore | ||||
|                 pass | ||||
|         try: | ||||
|             return ocr.image_to_string(f, lang=lang) | ||||
|         except PyocrException as e: | ||||
|             raise OCRError(e) | ||||
|     # TODO: this needs a rework | ||||
|     return no_trailing_whitespace.strip() | ||||
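
Assuming the regexes above behave as written: interior runs of spaces and tabs collapse to one space, indentation after line breaks is dropped, and the result is stripped:

    strip_excess_whitespace("one \t two\n   three\n")
    # -> "one two\nthree"
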
|  | ||||
|  | ||||
| def get_text_from_pdf(pdf_file): | ||||
| @@ -256,6 +217,9 @@ def get_text_from_pdf(pdf_file): | ||||
|         try: | ||||
|             pdf = pdftotext.PDF(f) | ||||
|         except pdftotext.Error: | ||||
|             return "" | ||||
|             # might not be a PDF file | ||||
|             return None | ||||
|  | ||||
|     return "\n".join(pdf) | ||||
|     text = "\n".join(pdf) | ||||
|  | ||||
|     return strip_excess_whitespace(text) | ||||
|   | ||||
| @@ -5,9 +5,12 @@ def tesseract_consumer_declaration(sender, **kwargs): | ||||
|     return { | ||||
|         "parser": RasterisedDocumentParser, | ||||
|         "weight": 0, | ||||
|         "mime_types": [ | ||||
|             "application/pdf", | ||||
|             "image/jpeg", | ||||
|             "image/png" | ||||
|         ] | ||||
|         "mime_types": { | ||||
|             "application/pdf": ".pdf", | ||||
|             "image/jpeg": ".jpg", | ||||
|             "image/png": ".png", | ||||
|             "image/tiff": ".tif", | ||||
|             "image/gif": ".gif", | ||||
|             "image/bmp": ".bmp", | ||||
|         } | ||||
|     } | ||||
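
The mime_types value changed from a plain list to a mapping from mime type to a default file extension, so consumers can look up both. A small sketch:

    declaration = tesseract_consumer_declaration(sender=None)
    print(declaration["mime_types"]["image/tiff"])  # -> ".tif"
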
|   | ||||
Binary sample files added (src/paperless_tesseract/tests/samples/):
  multi-page-digital.pdf (new), multi-page-images.pdf (new),
  no-text-alpha.png (new, 32 KiB), simple-alpha.png (new, 8.2 KiB),
  simple-digital.pdf (new), simple-no-dpi.png (new, 6.8 KiB),
  simple.bmp (new, 1.7 MiB), simple.gif (new, 18 KiB),
  simple.jpg (new, 19 KiB), one modified image (7.7 KiB before, 7.2 KiB after),
  simple.tif (new), with-form.pdf (new)
						| @@ -1,193 +0,0 @@ | ||||
| import datetime | ||||
| import os | ||||
| import shutil | ||||
| from unittest import mock | ||||
| from uuid import uuid4 | ||||
|  | ||||
| from dateutil import tz | ||||
| from django.conf import settings | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from ..parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||
|  | ||||
|     def setUp(self): | ||||
|         os.makedirs(self.SCRATCH, exist_ok=True) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.SCRATCH) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_1(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 130218 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_2(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 2018 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_3(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 20180213 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_4(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 13.02.2018 lorem ipsum" | ||||
|         date = document.get_date() | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_5(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " | ||||
|             "ipsum" | ||||
|         ) | ||||
|         date = document.get_date() | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_6(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_7(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "März 2019\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         date = document.get_date() | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2019, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_8(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum\n" | ||||
|             "März 2020" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             document.get_date(), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_9(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "27. Nullmonth 2020\n" | ||||
|             "März 2020\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             document.get_date(), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="01-07-0590 00:00:00" | ||||
|     ) | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_crazy_date_past(self, *args): | ||||
|         document = RasterisedDocumentParser("/dev/null", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="01-07-2350 00:00:00" | ||||
|     ) | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_crazy_date_future(self, *args): | ||||
|         document = RasterisedDocumentParser("/dev/null", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="20 408000l 2475" | ||||
|     ) | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_crazy_date_with_spaces(self, *args): | ||||
|         document = RasterisedDocumentParser("/dev/null", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="No date in here" | ||||
|     ) | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_filename_date_parse_invalid(self, *args): | ||||
|         document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
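The deleted tests above pin down the date-detection contract: bare digit runs
like "130218" or "20180213" never match, dotted dates and German month names
do, and implausible years (0590, 2350) are rejected. A minimal sketch of that
filtering, covering only the dd.mm.yyyy case (the regex and function name are
illustrative assumptions, not the project's actual implementation):

    import re
    from datetime import datetime

    # Only one of the several formats the real parser understands.
    DATE_PATTERN = re.compile(r"\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b")

    def find_date(text, earliest=1900):
        for day, month, year in DATE_PATTERN.findall(text):
            try:
                candidate = datetime(int(year), int(month), int(day))
            except ValueError:
                continue  # e.g. 32.13.2018 is not a real date
            # reject "crazy" dates such as the years 0590 or 2350 above
            if earliest <= candidate.year <= datetime.now().year:
                return candidate
        return None

    assert find_date("lorem ipsum 13.02.2018 lorem ipsum").year == 2018
    assert find_date("lorem ipsum 130218 lorem ipsum") is None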
| @@ -1,76 +0,0 @@ | ||||
| import os | ||||
| from unittest import mock, skipIf | ||||
|  | ||||
| import pyocr | ||||
| from django.test import TestCase | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from ..parsers import image_to_string, strip_excess_whitespace | ||||
|  | ||||
|  | ||||
| class FakeTesseract(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def can_detect_orientation(): | ||||
|         return True | ||||
|  | ||||
|     @staticmethod | ||||
|     def detect_orientation(file_handle, lang): | ||||
|         raise OtherTesseractError("arbitrary status", "message") | ||||
|  | ||||
|     @staticmethod | ||||
|     def image_to_string(file_handle, lang): | ||||
|         return "This is test text" | ||||
|  | ||||
|  | ||||
| class FakePyOcr(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_tools(): | ||||
|         return [FakeTesseract] | ||||
|  | ||||
|  | ||||
| class TestOCR(TestCase): | ||||
|  | ||||
|     text_cases = [ | ||||
|         ("simple     string", "simple string"), | ||||
|         ( | ||||
|             "simple    newline\n   testing string", | ||||
|             "simple newline\ntesting string" | ||||
|         ), | ||||
|         ( | ||||
|             "utf-8   строка с пробелами в конце  ", | ||||
|             "utf-8 строка с пробелами в конце" | ||||
|         ) | ||||
|     ] | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) | ||||
|  | ||||
|     def test_strip_excess_whitespace(self): | ||||
|         for source, result in self.text_cases: | ||||
|             actual_result = strip_excess_whitespace(source) | ||||
|             self.assertEqual( | ||||
|                 result, | ||||
|                 actual_result, | ||||
|                 "strip_excess_whitespace({}) != '{}', but '{}'".format( | ||||
|                     source, | ||||
|                     result, | ||||
|                     actual_result | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") | ||||
|     @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | ||||
|     def test_image_to_string_with_text_free_page(self): | ||||
|         """ | ||||
|         This test is sort of silly, since it's really just reproducing an odd | ||||
|         exception thrown by pyocr when it encounters a page with no text. | ||||
|         Actually running this test against an installation of Tesseract results | ||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I | ||||
|         don't care to dig.  Regardless, if you run the consumer normally, | ||||
|         text-free pages are now handled correctly so long as we work around | ||||
|         this weird exception. | ||||
|         """ | ||||
|         image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) | ||||
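For reference, the whitespace behaviour the text_cases above encode (collapse
runs of spaces within a line, drop spaces around newlines, trim the ends) can
be reproduced with two regular expressions. A sketch under those assumptions,
not necessarily the project's own implementation:

    import re

    def strip_excess_whitespace_sketch(text):
        # collapse runs of spaces/tabs (but not newlines) to one space
        collapsed = re.sub(r"[^\S\n]+", " ", text)
        # drop the spaces hugging a newline, then trim both ends
        return re.sub(r" ?\n ?", "\n", collapsed).strip()

    assert strip_excess_whitespace_sketch("simple     string") == "simple string"
    assert (strip_excess_whitespace_sketch("simple    newline\n   testing string")
            == "simple newline\ntesting string")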
| @@ -1,46 +1,17 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| import uuid | ||||
| from typing import ContextManager | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
| from pyocr.error import TesseractError | ||||
|  | ||||
| from documents.parsers import ParseError, run_convert | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, strip_excess_whitespace | ||||
|  | ||||
| image_to_string_calls = [] | ||||
|  | ||||
|  | ||||
| class FakeTesseract(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def can_detect_orientation(): | ||||
|         return True | ||||
|  | ||||
|     @staticmethod | ||||
|     def detect_orientation(file_handle, lang): | ||||
|         raise TesseractError("arbitrary status", "message") | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_languages(): | ||||
|         return ['eng', 'deu'] | ||||
|  | ||||
|     @staticmethod | ||||
|     def image_to_string(file_handle, lang): | ||||
|         image_to_string_calls.append((file_handle.name, lang)) | ||||
|         return file_handle.read() | ||||
|  | ||||
|  | ||||
| class FakePyOcr(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_tools(): | ||||
|         return [FakeTesseract] | ||||
|  | ||||
|  | ||||
| def fake_convert(input_file, output_file, **kwargs): | ||||
|     with open(input_file) as f: | ||||
|         lines = f.readlines() | ||||
| @@ -50,12 +21,6 @@ def fake_convert(input_file, output_file, **kwargs): | ||||
|             f2.write(line.strip()) | ||||
|  | ||||
|  | ||||
| def fake_unpaper(pnm): | ||||
|     output = pnm + ".unpaper.pnm" | ||||
|     shutil.copy(pnm, output) | ||||
|     return output | ||||
|  | ||||
|  | ||||
| class FakeImageFile(ContextManager): | ||||
|     def __init__(self, fname): | ||||
|         self.fname = fname | ||||
| @@ -67,142 +32,50 @@ class FakeImageFile(ContextManager): | ||||
|         return os.path.basename(self.fname) | ||||
|  | ||||
|  | ||||
| fake_image = FakeImageFile | ||||
|  | ||||
|  | ||||
| @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | ||||
| @mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) | ||||
| @mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) | ||||
| @mock.patch("paperless_tesseract.parsers.Image.open", open) | ||||
| class TestRasterisedDocumentParser(TestCase): | ||||
| class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|     def assertContainsStrings(self, content, strings): | ||||
|         # Asserts that all strings appear in content, in the given order. | ||||
|         indices = [content.index(s) for s in strings] | ||||
|         self.assertListEqual(indices, sorted(indices)) | ||||
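Stand-alone illustration of the ordered-substring check this helper performs
(hypothetical values; content.index() raises ValueError for a missing string,
and the sorted() comparison enforces the order of appearance):

    content = "page 1 ... page 2 ... page 3"
    indices = [content.index(s) for s in ["page 1", "page 3"]]
    assert indices == sorted(indices)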
|  | ||||
|         global image_to_string_calls | ||||
|     text_cases = [ | ||||
|         ("simple     string", "simple string"), | ||||
|         ( | ||||
|             "simple    newline\n   testing string", | ||||
|             "simple newline\ntesting string" | ||||
|         ), | ||||
|         ( | ||||
|             "utf-8   строка с пробелами в конце  ", | ||||
|             "utf-8 строка с пробелами в конце" | ||||
|         ) | ||||
|     ] | ||||
|  | ||||
|         image_to_string_calls = [] | ||||
|  | ||||
|         override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch) | ||||
|  | ||||
|     def get_input_file(self, pages): | ||||
|         _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) | ||||
|         with open(fname, "w") as f: | ||||
|             f.writelines([f"line {p}\n" for p in range(pages)]) | ||||
|         return fname | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_simple_language_match(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_2_pages(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_3_pages(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) | ||||
|     def test_parse_text_lang_detect_failed(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") | ||||
|     def test_parse_text_lang_not_installed(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2 line 3") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") | ||||
|     def test_parse_text_lang_mismatch(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") | ||||
|     def test_parse_empty_doc(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) | ||||
|         try: | ||||
|             parser.get_text() | ||||
|         except ParseError as e: | ||||
|             self.assertEqual("Empty document, nothing to do.", str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
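The deleted language tests above fix a fallback policy: OCR in the configured
language first, then switch only when detection succeeds and the detected
language is actually installed. A sketch of that decision (the function name
and the ISO-639 mapping are assumptions for illustration):

    def choose_ocr_language(detected, installed, default="eng"):
        if not detected:
            return default                    # detection failed
        iso3 = {"en": "eng", "de": "deu", "it": "ita"}.get(detected)
        if iso3 and iso3 in installed and iso3 != default:
            return iso3                       # re-OCR in the detected language
        return default                        # e.g. "it" without "ita" installed

    assert choose_ocr_language("de", ["eng", "deu"]) == "deu"
    assert choose_ocr_language("it", ["eng", "deu"]) == "eng"
    assert choose_ocr_language(None, ["eng", "deu"]) == "eng"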
|  | ||||
| class TestAuxilliaryFunctions(TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|  | ||||
|         override_settings(SCRATCH_DIR=self.scratch).enable() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch) | ||||
|     def test_strip_excess_whitespace(self): | ||||
|         for source, result in self.text_cases: | ||||
|             actual_result = strip_excess_whitespace(source) | ||||
|             self.assertEqual( | ||||
|                 result, | ||||
|                 actual_result, | ||||
|                 "strip_excess_whitespace({}) != '{}', but '{}'".format( | ||||
|                     source, | ||||
|                     result, | ||||
|                     actual_result | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|  | ||||
|     def test_get_text_from_pdf(self): | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf')) | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')) | ||||
|  | ||||
|         self.assertEqual(text.strip(), "This is a test document.") | ||||
|  | ||||
|     def test_get_text_from_pdf_error(self): | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) | ||||
|  | ||||
|         self.assertEqual(text.strip(), "") | ||||
|  | ||||
|     def test_image_to_string(self): | ||||
|         text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) | ||||
|  | ||||
|         self.assertEqual(text, "This is a test document.") | ||||
|  | ||||
|     def test_image_to_string_language_unavailable(self): | ||||
|         try: | ||||
|             image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) | ||||
|         except OCRError as e: | ||||
|             self.assertTrue("Failed loading language" in str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|     @override_settings(OCR_ALWAYS=False) | ||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") | ||||
|     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") | ||||
|     def test_is_ocred(self, m2, m): | ||||
|         parser = RasterisedDocumentParser("", uuid.uuid4()) | ||||
|         m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ | ||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " \ | ||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " | ||||
|         parser.get_text() | ||||
|         self.assertEqual(m.call_count, 2) | ||||
|         self.assertEqual(m2.call_count, 0) | ||||
|         self.assertContainsStrings(text.strip(), ["This is a test document."]) | ||||
|  | ||||
|     def test_thumbnail(self): | ||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) | ||||
|         parser.get_thumbnail() | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||
|         # No reliable way to inspect the thumbnail; just call it and assert that it does not raise. | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.run_convert") | ||||
| @@ -216,6 +89,191 @@ class TestAuxilliaryFunctions(TestCase): | ||||
|  | ||||
|         m.side_effect = call_convert | ||||
|  | ||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) | ||||
|         parser.get_thumbnail() | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||
|         # No reliable way to inspect the thumbnail; just call it and assert that it does not raise. | ||||
|  | ||||
|     def test_get_dpi(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")) | ||||
|         self.assertEqual(dpi, None) | ||||
|  | ||||
|         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png")) | ||||
|         self.assertEqual(dpi, 72) | ||||
|  | ||||
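A minimal sketch of what test_get_dpi exercises, assuming Pillow (im.info
carries a "dpi" tuple only when the image declares one; the function name
here is illustrative):

    from PIL import Image

    def get_dpi_sketch(image_path):
        with Image.open(image_path) as im:
            dpi = im.info.get("dpi")   # absent for simple-no-dpi.png
            return round(dpi[0]) if dpi else None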
|     def test_simple_digital(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||
|  | ||||
|     def test_with_form(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="redo") | ||||
|     def test_with_form_error(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="redo") | ||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None) | ||||
|     def test_with_form_error_notext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|     @override_settings(OCR_MODE="force") | ||||
|     def test_with_form_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|  | ||||
|     def test_image_simple(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||
|  | ||||
|     def test_image_simple_alpha_fail(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png") | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|  | ||||
|     def test_image_no_dpi_fail(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|     @override_settings(OCR_IMAGE_DPI=72) | ||||
|     def test_image_no_dpi_default(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["this is a test document."]) | ||||
|  | ||||
|     def test_multi_page(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="skip") | ||||
|     def test_multi_page_pages_skip(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||
|     def test_multi_page_pages_redo(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="force") | ||||
|     def test_multi_page_pages_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip") | ||||
|     def test_multi_page_analog_pages_skip(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||
|     def test_multi_page_analog_pages_redo(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"]) | ||||
|         self.assertFalse("page 3" in parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=1, OCR_MODE="force") | ||||
|     def test_multi_page_analog_pages_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1"]) | ||||
|         self.assertFalse("page 2" in parser.get_text().lower()) | ||||
|         self.assertFalse("page 3" in parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_skip_noarchive_withtext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_skip_noarchive_notext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
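Taken together, the OCR_MODE/OCR_PAGES tests above describe a small decision
table: "skip" keeps an existing text layer untouched, "redo" and "force"
re-run OCR, and OCR_PAGES then caps how many pages are processed. A sketch of
that logic (names assumed, not the parser's actual code):

    def pages_to_ocr(mode, has_text_layer, total_pages, ocr_pages=0):
        if mode == "skip" and has_text_layer:
            return []                         # keep the digital text as-is
        limit = ocr_pages or total_pages
        return list(range(min(limit, total_pages)))

    assert pages_to_ocr("skip", True, 3) == []
    assert pages_to_ocr("redo", False, 3, ocr_pages=2) == [0, 1]
    assert pages_to_ocr("force", False, 3, ocr_pages=1) == [0]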
|  | ||||
| class TestParserFileTypes(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|  | ||||
|     def test_bmp(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertTrue("this is a test document" in parser.get_text().lower()) | ||||
|  | ||||
|     def test_jpg(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertTrue("this is a test document" in parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_IMAGE_DPI=200) | ||||
|     def test_gif(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertTrue("this is a test document" in parser.get_text().lower()) | ||||
|  | ||||
|     def test_tiff(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertTrue("this is a test document" in parser.get_text().lower()) | ||||
|   | ||||
| @@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser): | ||||
|     This parser directly parses a text document (.txt, .md, or .csv) | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path, logging_group): | ||||
|         super().__init__(path, logging_group) | ||||
|         self._text = None | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         The thumbnail of a text file is just a 500px wide image of the text | ||||
|         rendered onto a letter-sized page. | ||||
| @@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser): | ||||
|             ) | ||||
|  | ||||
|         def read_text(): | ||||
|             with open(self.document_path, 'r') as src: | ||||
|             with open(document_path, 'r') as src: | ||||
|                 lines = [line.strip() for line in src.readlines()] | ||||
|                 text = "\n".join([line for line in lines[:n_lines]]) | ||||
|                 return text.replace('"', "'") | ||||
| @@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser): | ||||
|  | ||||
|         return out_path | ||||
|  | ||||
|     def get_text(self): | ||||
|  | ||||
|         if self._text is not None: | ||||
|             return self._text | ||||
|  | ||||
|         with open(self.document_path, 'r') as f: | ||||
|             self._text = f.read() | ||||
|  | ||||
|         return self._text | ||||
|     def parse(self, document_path, mime_type): | ||||
|         with open(document_path, 'r') as f: | ||||
|             self.text = f.read() | ||||
|  | ||||
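Under the reworked contract shown in this hunk, callers invoke parse() with
the path and mime type and read the result from the text attribute. A hedged
usage sketch (the import path and a file at that location are assumptions):

    from paperless_text.parsers import TextDocumentParser

    parser = TextDocumentParser(logging_group=None)
    parser.parse("/tmp/example.txt", "text/plain")
    print(parser.text)   # parse() stores the raw file contents here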
|  | ||||
| def run_command(*args): | ||||
|   | ||||
| @@ -5,8 +5,8 @@ def text_consumer_declaration(sender, **kwargs): | ||||
|     return { | ||||
|         "parser": TextDocumentParser, | ||||
|         "weight": 10, | ||||
|         "mime_types": [ | ||||
|             "text/plain", | ||||
|             "text/comma-separated-values" | ||||
|         ] | ||||
|         "mime_types": { | ||||
|             "text/plain": ".txt", | ||||
|             "text/csv": ".csv", | ||||
|         } | ||||
|     } | ||||
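With mime_types now a dict, a consumer can resolve the default file extension
for a detected mime type directly. An illustrative lookup (the import path is
an assumption):

    from paperless_text.signals import text_consumer_declaration

    declaration = text_consumer_declaration(sender=None)
    assert declaration["mime_types"]["text/csv"] == ".csv"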
|   | ||||
| @@ -3,10 +3,9 @@ exclude = migrations, paperless/settings.py, .tox, */tests/* | ||||
|  | ||||
| [tool:pytest] | ||||
| DJANGO_SETTINGS_MODULE=paperless.settings | ||||
| addopts = --pythonwarnings=all | ||||
| addopts = --pythonwarnings=all --cov --cov-report=html -n auto | ||||
| env = | ||||
|   PAPERLESS_SECRET=paperless | ||||
|   PAPERLESS_EMAIL_SECRET=paperless | ||||
|   PAPERLESS_DISABLE_DBHANDLER=true | ||||
|  | ||||
|  | ||||
| [coverage:run] | ||||
|   | ||||
jonaswinkler