Merge branch 'dev' into l10n_dev

This commit is contained in:
Michael Shamoon
2022-11-09 13:43:29 -08:00
140 changed files with 5617 additions and 16361 deletions

View File

@@ -9,7 +9,9 @@ from typing import Tuple
import magic
from django.conf import settings
from pdf2image import convert_from_path
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from pikepdf import PdfImage
from PIL import Image
@@ -19,6 +21,10 @@ from pyzbar import pyzbar
logger = logging.getLogger("paperless.barcodes")
class BarcodeImageFormatError(Exception):
pass
@lru_cache(maxsize=8)
def supported_file_type(mime_type) -> bool:
"""
@@ -93,7 +99,7 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
images[0].save(newpath)
else:
images[0].save(newpath, save_all=True, append_images=images[1:])
except OSError as e:
except OSError as e: # pragma: no cover
logger.warning(
f"Could not save the file as pdf. Error: {str(e)}",
)
@@ -108,6 +114,38 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
which separate the file into new files
"""
def _pikepdf_barcode_scan(pdf_filepath: str):
with Pdf.open(pdf_filepath) as pdf:
for page_num, page in enumerate(pdf.pages):
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
# This type is known to have issues:
# https://github.com/pikepdf/pikepdf/issues/401
if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError(
"Unable to decode CCITTFaxDecode images",
)
# Not all images can be transcoded to a PIL image, which
# is what pyzbar expects to receive, so this may
# raise an exception, triggering fallback
pillow_img = pdfimage.as_pil_image()
detected_barcodes = barcode_reader(pillow_img)
if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
separator_page_numbers.append(page_num)
def _pdf2image_barcode_scan(pdf_filepath: str):
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
for current_page_number, page in enumerate(pages_from_path):
current_barcodes = barcode_reader(page)
if settings.CONSUMER_BARCODE_STRING in current_barcodes:
separator_page_numbers.append(current_page_number)
separator_page_numbers = []
pdf_filepath = None
@@ -118,17 +156,32 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)
pdf = Pdf.open(pdf_filepath)
# Always try pikepdf first; it's usually fine, faster, and
# uses less memory
try:
_pikepdf_barcode_scan(pdf_filepath)
# Password protected files can't be checked
except PasswordError as e:
logger.warning(
f"File is likely password protected, not checking for barcodes: {e}",
)
# Handle pikepdf related image decoding issues with a fallback to page
# by page conversion to images in a temporary directory
except Exception as e:
logger.warning(
f"Falling back to pdf2image because: {e}",
)
try:
# Clear the list in case some processing worked
separator_page_numbers = []
_pdf2image_barcode_scan(pdf_filepath)
# This file is really borked; allow the consumption to continue,
# but it may fail further on
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
for page_num, page in enumerate(pdf.pages):
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
pillow_img = pdfimage.as_pil_image()
detected_barcodes = barcode_reader(pillow_img)
if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
separator_page_numbers.append(page_num)
else:
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",

View File

@@ -1,11 +1,12 @@
import itertools
from django.db.models import Q
from django_q.tasks import async_task
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.tasks import bulk_update_documents
from documents.tasks import update_document_archive_file
def set_correspondent(doc_ids, correspondent):
@@ -16,7 +17,7 @@ def set_correspondent(doc_ids, correspondent):
affected_docs = [doc.id for doc in qs]
qs.update(correspondent=correspondent)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -31,8 +32,7 @@ def set_storage_path(doc_ids, storage_path):
affected_docs = [doc.id for doc in qs]
qs.update(storage_path=storage_path)
async_task(
"documents.tasks.bulk_update_documents",
bulk_update_documents.delay(
document_ids=affected_docs,
)
@@ -47,7 +47,7 @@ def set_document_type(doc_ids, document_type):
affected_docs = [doc.id for doc in qs]
qs.update(document_type=document_type)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -63,7 +63,7 @@ def add_tag(doc_ids, tag):
[DocumentTagRelationship(document_id=doc, tag_id=tag) for doc in affected_docs],
)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -79,7 +79,7 @@ def remove_tag(doc_ids, tag):
Q(document_id__in=affected_docs) & Q(tag_id=tag),
).delete()
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -103,7 +103,7 @@ def modify_tags(doc_ids, add_tags, remove_tags):
ignore_conflicts=True,
)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -123,8 +123,7 @@ def delete(doc_ids):
def redo_ocr(doc_ids):
for document_id in doc_ids:
async_task(
"documents.tasks.update_document_archive_file",
update_document_archive_file.delay(
document_id=document_id,
)

View File

@@ -5,12 +5,15 @@ import pickle
import re
import shutil
import warnings
from typing import List
from typing import Optional
from django.conf import settings
from documents.models import Document
from documents.models import MatchingModel
logger = logging.getLogger("paperless.classifier")
class IncompatibleClassifierVersionError(Exception):
pass
@@ -20,15 +23,6 @@ class ClassifierModelCorruptError(Exception):
pass
logger = logging.getLogger("paperless.classifier")
def preprocess_content(content: str) -> str:
content = content.lower().strip()
content = re.sub(r"\s+", " ", content)
return content
def load_classifier() -> Optional["DocumentClassifier"]:
if not os.path.isfile(settings.MODEL_FILE):
logger.debug(
@@ -81,6 +75,9 @@ class DocumentClassifier:
self.document_type_classifier = None
self.storage_path_classifier = None
self._stemmer = None
self._stop_words = None
def load(self):
# Catch warnings for processing
with warnings.catch_warnings(record=True) as w:
@@ -139,11 +136,11 @@ class DocumentClassifier:
def train(self):
data = list()
labels_tags = list()
labels_correspondent = list()
labels_document_type = list()
labels_storage_path = list()
data = []
labels_tags = []
labels_correspondent = []
labels_document_type = []
labels_storage_path = []
# Step 1: Extract and preprocess training data from the database.
logger.debug("Gathering data from database...")
@@ -151,7 +148,7 @@ class DocumentClassifier:
for doc in Document.objects.order_by("pk").exclude(
tags__is_inbox_tag=True,
):
preprocessed_content = preprocess_content(doc.content)
preprocessed_content = self.preprocess_content(doc.content)
m.update(preprocessed_content.encode("utf-8"))
data.append(preprocessed_content)
@@ -231,6 +228,11 @@ class DocumentClassifier:
)
data_vectorized = self.data_vectorizer.fit_transform(data)
# See the notes here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html # noqa: 501
# This attribute isn't needed to function and can be large
self.data_vectorizer.stop_words_ = None
# Step 3: train the classifiers
if num_tags > 0:
logger.debug("Training tags classifier...")
@@ -296,9 +298,52 @@ class DocumentClassifier:
return True
def preprocess_content(self, content: str) -> str:
"""
Process the contents of a document, distilling it down into
words which are meaningful to the content
"""
# Lower case the document
content = content.lower().strip()
# Reduce spaces
content = re.sub(r"\s+", " ", content)
# Get only the letters
content = re.sub(r"[^\w\s]", " ", content)
# If the NLTK language is supported, do further processing
if settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Not really hacky, since it isn't private and is documented, but
# set the search path for NLTK data to the single location it should be in
nltk.data.path = [settings.NLTK_DIR]
# Do some one time setup
if self._stemmer is None:
self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
if self._stop_words is None:
self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
# Tokenize
words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
# Remove stop words
meaningful_words = [w for w in words if w not in self._stop_words]
# Stem words
meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
return " ".join(meaningful_words)
return content
def predict_correspondent(self, content):
if self.correspondent_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
X = self.data_vectorizer.transform([self.preprocess_content(content)])
correspondent_id = self.correspondent_classifier.predict(X)
if correspondent_id != -1:
return correspondent_id
@@ -309,7 +354,7 @@ class DocumentClassifier:
def predict_document_type(self, content):
if self.document_type_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
X = self.data_vectorizer.transform([self.preprocess_content(content)])
document_type_id = self.document_type_classifier.predict(X)
if document_type_id != -1:
return document_type_id
@@ -322,7 +367,7 @@ class DocumentClassifier:
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
X = self.data_vectorizer.transform([self.preprocess_content(content)])
y = self.tags_classifier.predict(X)
tags_ids = self.tags_binarizer.inverse_transform(y)[0]
if type_of_target(y).startswith("multilabel"):
@@ -341,7 +386,7 @@ class DocumentClassifier:
def predict_storage_path(self, content):
if self.storage_path_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
X = self.data_vectorizer.transform([self.preprocess_content(content)])
storage_path_id = self.storage_path_classifier.predict(X)
if storage_path_id != -1:
return storage_path_id
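The NLTK branch of preprocess_content is easiest to see in isolation. A standalone sketch of the same pipeline, assuming the NLTK "punkt" and "stopwords" data sets are already downloaded to the default search path:

# Standalone sketch of the preprocessing pipeline above; assumes the NLTK
# "punkt" and "stopwords" corpora have already been downloaded.
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

def preprocess(content: str, language: str = "english") -> str:
    content = re.sub(r"\s+", " ", content.lower().strip())  # lower case, reduce spaces
    content = re.sub(r"[^\w\s]", " ", content)  # keep only word characters
    stop_words = set(stopwords.words(language))
    stemmer = SnowballStemmer(language)
    words = word_tokenize(content, language=language)
    return " ".join(stemmer.stem(w) for w in words if w not in stop_words)

print(preprocess("The invoices were paid."))  # -> "invoic paid"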

View File

@@ -111,14 +111,16 @@ class Consumer(LoggingMixin):
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
if Document.objects.filter(
existing_doc = Document.objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum),
).exists():
)
if existing_doc.exists():
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
self._fail(
MESSAGE_DOCUMENT_ALREADY_EXISTS,
f"Not consuming {self.filename}: It is a duplicate.",
f"Not consuming {self.filename}: It is a duplicate of"
f" {existing_doc.get().title} (#{existing_doc.get().pk})",
)
def pre_check_directories(self):
@@ -403,6 +405,7 @@ class Consumer(LoggingMixin):
# Don't save with the lock active. Saving will cause the file
# renaming logic to acquire the lock as well.
# This triggers things like file renaming
document.save()
# Delete the file only if it was successfully consumed
@@ -436,6 +439,9 @@ class Consumer(LoggingMixin):
self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id)
# Return the most up to date fields
document.refresh_from_db()
return document
def _store(self, text, date, mime_type) -> Document:

View File

@@ -1,4 +1,3 @@
import datetime
import logging
import os
from collections import defaultdict
@@ -172,7 +171,7 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
else:
asn = "-none-"
# Convert UTC database date to localized date
# Convert UTC database datetime to localized date
local_added = timezone.localdate(doc.added)
local_created = timezone.localdate(doc.created)
@@ -180,14 +179,20 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(local_created),
created_year=local_created.year,
created_month=f"{local_created.month:02}",
created_day=f"{local_created.day:02}",
added=datetime.date.isoformat(local_added),
added_year=local_added.year,
added_month=f"{local_added.month:02}",
added_day=f"{local_added.day:02}",
created=local_created.isoformat(),
created_year=local_created.strftime("%Y"),
created_year_short=local_created.strftime("%y"),
created_month=local_created.strftime("%m"),
created_month_name=local_created.strftime("%B"),
created_month_name_short=local_created.strftime("%b"),
created_day=local_created.strftime("%d"),
added=local_added.isoformat(),
added_year=local_added.strftime("%Y"),
added_year_short=local_added.strftime("%y"),
added_month=local_added.strftime("%m"),
added_month_name=local_added.strftime("%B"),
added_month_name_short=local_added.strftime("%b"),
added_day=local_added.strftime("%d"),
asn=asn,
tags=tags,
tag_list=tag_list,
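The new placeholders map directly onto strftime codes, so their rendered values are easy to predict. An illustration for a document created on 1989-12-21, matching the filename test further down (English locale assumed):

# Illustration of the new filename placeholders for a date of 1989-12-21.
import datetime

local_created = datetime.date(1989, 12, 21)
print(local_created.strftime("%y"))  # "89"       -> {created_year_short}
print(local_created.strftime("%B"))  # "December" -> {created_month_name}
print(local_created.strftime("%b"))  # "Dec"      -> {created_month_name_short}

# A format such as "{created_year_short}/{created_month_name_short}/{title}"
# would therefore render as "89/Dec/The Title.pdf"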

View File

@@ -11,9 +11,9 @@ from typing import Final
from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django_q.tasks import async_task
from documents.models import Tag
from documents.parsers import is_file_ext_supported
from documents.tasks import consume_file
from watchdog.events import FileSystemEventHandler
from watchdog.observers.polling import PollingObserver
@@ -92,11 +92,9 @@ def _consume(filepath):
try:
logger.info(f"Adding {filepath} to the task queue.")
async_task(
"documents.tasks.consume_file",
consume_file.delay(
filepath,
override_tag_ids=tag_ids if tag_ids else None,
task_name=os.path.basename(filepath)[:100],
override_tag_ids=list(tag_ids) if tag_ids else None,
)
except Exception:
# Catch all so that the consumer won't crash.

View File

@@ -142,14 +142,14 @@ def matches(matching_model, document):
return bool(match)
elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
from fuzzywuzzy import fuzz
from rapidfuzz import fuzz
match = re.sub(r"[^\w\s]", "", matching_model.match)
text = re.sub(r"[^\w\s]", "", document_content)
if matching_model.is_insensitive:
match = match.lower()
text = text.lower()
if fuzz.partial_ratio(match, text) >= 90:
if fuzz.partial_ratio(match, text, score_cutoff=90):
# TODO: make this better
log_reason(
matching_model,
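The score_cutoff form works as a boolean test because rapidfuzz returns 0.0 whenever the score falls below the cutoff, replacing the explicit ">= 90" comparison that fuzzywuzzy required. A small sketch (the example strings are illustrative):

# rapidfuzz returns 0.0 below the cutoff, so truthiness means "score >= 90".
from rapidfuzz import fuzz

print(fuzz.partial_ratio("invoice", "your invoice 2022"))                     # 100.0
print(fuzz.partial_ratio("invoice", "your invoice 2022", score_cutoff=90))    # 100.0
print(fuzz.partial_ratio("invoice", "something unrelated", score_cutoff=90))  # 0.0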

View File

@@ -1,34 +1,14 @@
# Generated by Django 3.1.3 on 2020-11-09 16:36
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule(
"documents.tasks.train_classifier",
name="Train the classifier",
schedule_type=Schedule.HOURLY,
)
schedule(
"documents.tasks.index_optimize",
name="Optimize the index",
schedule_type=Schedule.DAILY,
)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(func="documents.tasks.train_classifier").delete()
Schedule.objects.filter(func="documents.tasks.index_optimize").delete()
class Migration(migrations.Migration):
dependencies = [
("documents", "1000_update_paperless_all"),
("django_q", "0013_task_attempt_count"),
]
operations = [RunPython(add_schedules, remove_schedules)]
operations = [
migrations.RunPython(migrations.RunPython.noop, migrations.RunPython.noop)
]

View File

@@ -2,27 +2,12 @@
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule(
"documents.tasks.sanity_check",
name="Perform sanity check",
schedule_type=Schedule.WEEKLY,
)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(func="documents.tasks.sanity_check").delete()
class Migration(migrations.Migration):
dependencies = [
("documents", "1003_mime_types"),
("django_q", "0013_task_attempt_count"),
]
operations = [RunPython(add_schedules, remove_schedules)]
operations = [RunPython(migrations.RunPython.noop, migrations.RunPython.noop)]

View File

@@ -4,28 +4,9 @@ from django.db import migrations, models
import django.db.models.deletion
def init_paperless_tasks(apps, schema_editor):
PaperlessTask = apps.get_model("documents", "PaperlessTask")
Task = apps.get_model("django_q", "Task")
for task in Task.objects.filter(func="documents.tasks.consume_file"):
if not hasattr(task, "paperlesstask"):
paperlesstask = PaperlessTask.objects.create(
attempted_task=task,
task_id=task.id,
name=task.name,
created=task.started,
started=task.started,
acknowledged=True,
)
task.paperlesstask = paperlesstask
task.save()
class Migration(migrations.Migration):
dependencies = [
("django_q", "0014_schedule_cluster"),
("documents", "1021_webp_thumbnail_conversion"),
]
@@ -60,10 +41,12 @@ class Migration(migrations.Migration):
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="attempted_task",
to="django_q.task",
# This is a dummy field; migration 1026 will fix up the column
# This manual change is required, as django doesn't really support
# removing an app which has migration deps like this
to="documents.document",
),
),
],
),
migrations.RunPython(init_paperless_tasks, migrations.RunPython.noop),
)
]

View File

@@ -0,0 +1,57 @@
# Generated by Django 4.1.1 on 2022-09-27 19:31
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
("django_celery_results", "0011_taskresult_periodic_task_name"),
("documents", "1025_alter_savedviewfilterrule_rule_type"),
]
operations = [
migrations.RemoveField(
model_name="paperlesstask",
name="created",
),
migrations.RemoveField(
model_name="paperlesstask",
name="name",
),
migrations.RemoveField(
model_name="paperlesstask",
name="started",
),
# Remove the field from the model
migrations.RemoveField(
model_name="paperlesstask",
name="attempted_task",
),
# Add the field back, pointing to the correct model
# This resolves a problem where the temporary change in 1022
# results in a type mismatch
migrations.AddField(
model_name="paperlesstask",
name="attempted_task",
field=models.OneToOneField(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="attempted_task",
to="django_celery_results.taskresult",
),
),
# Drop the django-q tables entirely
# Must be done last or there could be references here
migrations.RunSQL(
"DROP TABLE IF EXISTS django_q_ormq", reverse_sql=migrations.RunSQL.noop
),
migrations.RunSQL(
"DROP TABLE IF EXISTS django_q_schedule", reverse_sql=migrations.RunSQL.noop
),
migrations.RunSQL(
"DROP TABLE IF EXISTS django_q_task", reverse_sql=migrations.RunSQL.noop
),
]

View File

@@ -0,0 +1,134 @@
# Generated by Django 4.1.2 on 2022-10-17 16:31
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
("documents", "1026_transition_to_celery"),
]
operations = [
migrations.RemoveField(
model_name="paperlesstask",
name="attempted_task",
),
migrations.AddField(
model_name="paperlesstask",
name="date_created",
field=models.DateTimeField(
default=django.utils.timezone.now,
help_text="Datetime field when the task result was created in UTC",
null=True,
verbose_name="Created DateTime",
),
),
migrations.AddField(
model_name="paperlesstask",
name="date_done",
field=models.DateTimeField(
default=None,
help_text="Datetime field when the task was completed in UTC",
null=True,
verbose_name="Completed DateTime",
),
),
migrations.AddField(
model_name="paperlesstask",
name="date_started",
field=models.DateTimeField(
default=None,
help_text="Datetime field when the task was started in UTC",
null=True,
verbose_name="Started DateTime",
),
),
migrations.AddField(
model_name="paperlesstask",
name="result",
field=models.TextField(
default=None,
help_text="The data returned by the task",
null=True,
verbose_name="Result Data",
),
),
migrations.AddField(
model_name="paperlesstask",
name="status",
field=models.CharField(
choices=[
("FAILURE", "FAILURE"),
("PENDING", "PENDING"),
("RECEIVED", "RECEIVED"),
("RETRY", "RETRY"),
("REVOKED", "REVOKED"),
("STARTED", "STARTED"),
("SUCCESS", "SUCCESS"),
],
default="PENDING",
help_text="Current state of the task being run",
max_length=30,
verbose_name="Task State",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_args",
field=models.JSONField(
help_text="JSON representation of the positional arguments used with the task",
null=True,
verbose_name="Task Positional Arguments",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_file_name",
field=models.CharField(
help_text="Name of the file which the Task was run for",
max_length=255,
null=True,
verbose_name="Task Name",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_kwargs",
field=models.JSONField(
help_text="JSON representation of the named arguments used with the task",
null=True,
verbose_name="Task Named Arguments",
),
),
migrations.AddField(
model_name="paperlesstask",
name="task_name",
field=models.CharField(
help_text="Name of the Task which was run",
max_length=255,
null=True,
verbose_name="Task Name",
),
),
migrations.AlterField(
model_name="paperlesstask",
name="acknowledged",
field=models.BooleanField(
default=False,
help_text="If the task is acknowledged via the frontend or API",
verbose_name="Acknowledged",
),
),
migrations.AlterField(
model_name="paperlesstask",
name="task_id",
field=models.CharField(
help_text="Celery ID for the Task that was run",
max_length=255,
unique=True,
verbose_name="Task ID",
),
),
]

View File

@@ -7,14 +7,17 @@ from typing import Optional
import dateutil.parser
import pathvalidate
from celery import states
from django.conf import settings
from django.contrib.auth.models import User
from django.db import models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
from django_q.tasks import Task
from documents.parsers import get_default_file_extension
ALL_STATES = sorted(states.ALL_STATES)
TASK_STATE_CHOICES = sorted(zip(ALL_STATES, ALL_STATES))
class MatchingModel(models.Model):
@@ -527,19 +530,80 @@ class UiSettings(models.Model):
class PaperlessTask(models.Model):
task_id = models.CharField(max_length=128)
name = models.CharField(max_length=256)
created = models.DateTimeField(_("created"), auto_now=True)
started = models.DateTimeField(_("started"), null=True)
attempted_task = models.OneToOneField(
Task,
on_delete=models.CASCADE,
related_name="attempted_task",
null=True,
blank=True,
task_id = models.CharField(
max_length=255,
unique=True,
verbose_name=_("Task ID"),
help_text=_("Celery ID for the Task that was run"),
)
acknowledged = models.BooleanField(
default=False,
verbose_name=_("Acknowledged"),
help_text=_("If the task is acknowledged via the frontend or API"),
)
task_file_name = models.CharField(
null=True,
max_length=255,
verbose_name=_("Task Name"),
help_text=_("Name of the file which the Task was run for"),
)
task_name = models.CharField(
null=True,
max_length=255,
verbose_name=_("Task Name"),
help_text=_("Name of the Task which was run"),
)
task_args = models.JSONField(
null=True,
verbose_name=_("Task Positional Arguments"),
help_text=_(
"JSON representation of the positional arguments used with the task",
),
)
task_kwargs = models.JSONField(
null=True,
verbose_name=_("Task Named Arguments"),
help_text=_(
"JSON representation of the named arguments used with the task",
),
)
status = models.CharField(
max_length=30,
default=states.PENDING,
choices=TASK_STATE_CHOICES,
verbose_name=_("Task State"),
help_text=_("Current state of the task being run"),
)
date_created = models.DateTimeField(
null=True,
default=timezone.now,
verbose_name=_("Created DateTime"),
help_text=_("Datetime field when the task result was created in UTC"),
)
date_started = models.DateTimeField(
null=True,
default=None,
verbose_name=_("Started DateTime"),
help_text=_("Datetime field when the task was started in UTC"),
)
date_done = models.DateTimeField(
null=True,
default=None,
verbose_name=_("Completed DateTime"),
help_text=_("Datetime field when the task was completed in UTC"),
)
result = models.TextField(
null=True,
default=None,
verbose_name=_("Result Data"),
help_text=_(
"The data returned by the task",
),
)
acknowledged = models.BooleanField(default=False)
class Comment(models.Model):

View File

@@ -2,6 +2,8 @@ import datetime
import math
import re
from celery import states
try:
import zoneinfo
except ImportError:
@@ -18,12 +20,12 @@ from .models import Correspondent
from .models import Document
from .models import DocumentType
from .models import MatchingModel
from .models import PaperlessTask
from .models import SavedView
from .models import SavedViewFilterRule
from .models import StoragePath
from .models import Tag
from .models import UiSettings
from .models import PaperlessTask
from .parsers import is_mime_type_supported
@@ -608,6 +610,15 @@ class UiSettingsViewSerializer(serializers.ModelSerializer):
"settings",
]
def validate_settings(self, settings):
# we never save the update checking backend setting
if "update_checking" in settings:
try:
settings["update_checking"].pop("backend_setting")
except KeyError:
pass
return settings
def create(self, validated_data):
ui_settings = UiSettings.objects.update_or_create(
user=validated_data.get("user"),
@@ -620,7 +631,18 @@ class TasksViewSerializer(serializers.ModelSerializer):
class Meta:
model = PaperlessTask
depth = 1
fields = "__all__"
fields = (
"id",
"task_id",
"task_file_name",
"date_created",
"date_done",
"type",
"status",
"result",
"acknowledged",
"related_document",
)
type = serializers.SerializerMethodField()
@@ -628,29 +650,19 @@ class TasksViewSerializer(serializers.ModelSerializer):
# just file tasks, for now
return "file"
result = serializers.SerializerMethodField()
related_document = serializers.SerializerMethodField()
related_doc_re = re.compile(r"New document id (\d+) created")
def get_related_document(self, obj):
result = None
if obj.status is not None and obj.status == states.SUCCESS:
try:
result = self.related_doc_re.search(obj.result).group(1)
except Exception:
pass
def get_result(self, obj):
result = ""
if hasattr(obj, "attempted_task") and obj.attempted_task:
result = obj.attempted_task.result
return result
status = serializers.SerializerMethodField()
def get_status(self, obj):
if obj.attempted_task is None:
if obj.started:
return "started"
else:
return "queued"
elif obj.attempted_task.success:
return "complete"
elif not obj.attempted_task.success:
return "failed"
else:
return "unknown"
class AcknowledgeTasksViewSerializer(serializers.Serializer):

View File

@@ -1,8 +1,13 @@
import logging
import os
import shutil
from ast import literal_eval
from pathlib import Path
import django_q
from celery import states
from celery.signals import before_task_publish
from celery.signals import task_postrun
from celery.signals import task_prerun
from django.conf import settings
from django.contrib.admin.models import ADDITION
from django.contrib.admin.models import LogEntry
@@ -25,7 +30,6 @@ from ..models import MatchingModel
from ..models import PaperlessTask
from ..models import Tag
logger = logging.getLogger("paperless.handlers")
@@ -396,6 +400,13 @@ def update_filename_and_move_files(sender, instance, **kwargs):
with FileLock(settings.MEDIA_LOCK):
try:
# If this was waiting for the lock, the filename or archive_filename
# of this document may have been updated. This happens if multiple updates
# get queued from the UI for the same document
# So freshen up the data before doing anything
instance.refresh_from_db()
old_filename = instance.filename
old_source_path = instance.source_path
@@ -503,47 +514,94 @@ def add_to_index(sender, document, **kwargs):
index.add_or_update_document(document)
@receiver(django_q.signals.pre_enqueue)
def init_paperless_task(sender, task, **kwargs):
if task["func"] == "documents.tasks.consume_file":
try:
paperless_task, created = PaperlessTask.objects.get_or_create(
task_id=task["id"],
)
paperless_task.name = task["name"]
paperless_task.created = task["started"]
paperless_task.save()
except Exception as e:
# Don't let an exception in the signal handlers prevent
# a document from being consumed.
logger.error(f"Creating PaperlessTask failed: {e}")
@before_task_publish.connect
def before_task_publish_handler(sender=None, headers=None, body=None, **kwargs):
"""
Creates the PaperlessTask object in a pending state. This is sent before
the task reaches the broker.
https://docs.celeryq.dev/en/stable/userguide/signals.html#before-task-publish
"""
if "task" not in headers or headers["task"] != "documents.tasks.consume_file":
# Assumption: this is only ever a v2 message
return
@receiver(django_q.signals.pre_execute)
def paperless_task_started(sender, task, **kwargs):
try:
if task["func"] == "documents.tasks.consume_file":
paperless_task, created = PaperlessTask.objects.get_or_create(
task_id=task["id"],
)
paperless_task.started = timezone.now()
paperless_task.save()
except PaperlessTask.DoesNotExist:
pass
except Exception as e:
task_file_name = ""
if headers["kwargsrepr"] is not None:
task_kwargs = literal_eval(headers["kwargsrepr"])
if "override_filename" in task_kwargs:
task_file_name = task_kwargs["override_filename"]
else:
task_kwargs = None
task_args = literal_eval(headers["argsrepr"])
# Nothing was found, report the task's first argument
if not len(task_file_name):
# There are always some arguments to consume_file; the first is always the filename
filepath = Path(task_args[0])
task_file_name = filepath.name
PaperlessTask.objects.create(
task_id=headers["id"],
status=states.PENDING,
task_file_name=task_file_name,
task_name=headers["task"],
task_args=task_args,
task_kwargs=task_kwargs,
result=None,
date_created=timezone.now(),
date_started=None,
date_done=None,
)
except Exception as e: # pragma: no cover
# Don't let an exception in the signal handlers prevent
# a document from being consumed.
logger.error(f"Creating PaperlessTask failed: {e}")
@receiver(models.signals.post_save, sender=django_q.models.Task)
def update_paperless_task(sender, instance, **kwargs):
@task_prerun.connect
def task_prerun_handler(sender=None, task_id=None, task=None, **kwargs):
"""
Updates the PaperlessTask to be started. Sent before the task begins execution
on a worker.
https://docs.celeryq.dev/en/stable/userguide/signals.html#task-prerun
"""
try:
if instance.func == "documents.tasks.consume_file":
paperless_task, created = PaperlessTask.objects.get_or_create(
task_id=instance.id,
)
paperless_task.attempted_task = instance
paperless_task.save()
except PaperlessTask.DoesNotExist:
pass
except Exception as e:
logger.error(f"Creating PaperlessTask failed: {e}")
task_instance = PaperlessTask.objects.filter(task_id=task_id).first()
if task_instance is not None:
task_instance.status = states.STARTED
task_instance.date_started = timezone.now()
task_instance.save()
except Exception as e: # pragma: no cover
# Don't let an exception in the signal handlers prevent
# a document from being consumed.
logger.error(f"Setting PaperlessTask started failed: {e}")
@task_postrun.connect
def task_postrun_handler(
sender=None, task_id=None, task=None, retval=None, state=None, **kwargs
):
"""
Updates the result of the PaperlessTask.
https://docs.celeryq.dev/en/stable/userguide/signals.html#task-postrun
"""
try:
task_instance = PaperlessTask.objects.filter(task_id=task_id).first()
if task_instance is not None:
task_instance.status = state
task_instance.result = retval
task_instance.date_done = timezone.now()
task_instance.save()
except Exception as e: # pragma: no cover
# Don't let an exception in the signal handlers prevent
# a document from being consumed.
logger.error(f"Updating PaperlessTask failed: {e}")

View File

@@ -8,6 +8,7 @@ from typing import Type
import tqdm
from asgiref.sync import async_to_sync
from celery import shared_task
from channels.layers import get_channel_layer
from django.conf import settings
from django.db import transaction
@@ -30,12 +31,14 @@ from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.sanity_checker import SanityCheckFailedException
from filelock import FileLock
from redis.exceptions import ConnectionError
from whoosh.writing import AsyncWriter
logger = logging.getLogger("paperless.tasks")
@shared_task
def index_optimize():
ix = index.open_index()
writer = AsyncWriter(ix)
@@ -52,6 +55,7 @@ def index_reindex(progress_bar_disable=False):
index.update_document(writer, document)
@shared_task
def train_classifier():
if (
not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
@@ -80,6 +84,7 @@ def train_classifier():
logger.warning("Classifier error: " + str(e))
@shared_task
def consume_file(
path,
override_filename=None,
@@ -112,10 +117,22 @@ def consume_file(
newname = f"{str(n)}_" + override_filename
else:
newname = None
# If the file is an upload, it's in the scratch directory
# Move it to consume directory to be picked up
# Otherwise, use the current parent to keep possible tags
# from subdirectories
try:
# is_relative_to would be nicer, but new in 3.9
_ = path.relative_to(settings.SCRATCH_DIR)
save_to_dir = settings.CONSUMPTION_DIR
except ValueError:
save_to_dir = path.parent
barcodes.save_to_dir(
document,
newname=newname,
target_dir=path.parent,
target_dir=save_to_dir,
)
# Delete the PDF file which was split
@@ -141,11 +158,8 @@ def consume_file(
"status_updates",
{"type": "status_update", "data": payload},
)
except OSError as e:
logger.warning(
"OSError. It could be, the broker cannot be reached.",
)
logger.warning(str(e))
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {str(e)}")
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
return "File successfully split"
@@ -171,6 +185,7 @@ def consume_file(
)
@shared_task
def sanity_check():
messages = sanity_checker.check_sanity()
@@ -186,6 +201,7 @@ def sanity_check():
return "No issues detected."
@shared_task
def bulk_update_documents(document_ids):
documents = Document.objects.filter(id__in=document_ids)
@@ -199,6 +215,7 @@ def bulk_update_documents(document_ids):
index.update_document(writer, doc)
@shared_task
def update_document_archive_file(document_id):
"""
Re-creates the archive file of a document, including new OCR content and thumbnail
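The scratch-directory check in consume_file leans on relative_to raising ValueError, since Path.is_relative_to only arrived in Python 3.9. A standalone sketch of the same test (is_upload is an illustrative name, not part of the module):

# Sketch of the SCRATCH_DIR containment check: relative_to() raises
# ValueError when the path is outside the directory, standing in for
# the Python 3.9+ Path.is_relative_to().
from pathlib import Path

def is_upload(path: Path, scratch_dir: Path) -> bool:
    try:
        path.relative_to(scratch_dir)
        return True
    except ValueError:
        return False

print(is_upload(Path("/tmp/paperless/upload.pdf"), Path("/tmp/paperless")))  # True
print(is_upload(Path("/consume/doc.pdf"), Path("/tmp/paperless")))           # False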

Binary file not shown.

View File

@@ -10,6 +10,8 @@ import zipfile
from unittest import mock
from unittest.mock import MagicMock
import celery
try:
import zoneinfo
except ImportError:
@@ -20,7 +22,6 @@ from django.conf import settings
from django.contrib.auth.models import User
from django.test import override_settings
from django.utils import timezone
from django_q.models import Task
from documents import bulk_edit
from documents import index
from documents.models import Correspondent
@@ -31,7 +32,6 @@ from documents.models import PaperlessTask
from documents.models import SavedView
from documents.models import StoragePath
from documents.models import Tag
from documents.models import UiSettings
from documents.models import Comment
from documents.models import StoragePath
from documents.tests.utils import DirectoriesMixin
@@ -790,7 +790,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data["documents_inbox"], None)
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload(self, m):
with open(
@@ -813,7 +813,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertIsNone(kwargs["override_document_type_id"])
self.assertIsNone(kwargs["override_tag_ids"])
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_empty_metadata(self, m):
with open(
@@ -836,7 +836,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertIsNone(kwargs["override_document_type_id"])
self.assertIsNone(kwargs["override_tag_ids"])
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_invalid_form(self, m):
with open(
@@ -850,7 +850,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400)
m.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_invalid_file(self, m):
with open(
@@ -864,7 +864,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400)
m.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_title(self, async_task):
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@@ -882,7 +882,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(kwargs["override_title"], "my custom title")
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_correspondent(self, async_task):
c = Correspondent.objects.create(name="test-corres")
with open(
@@ -901,7 +901,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(kwargs["override_correspondent_id"], c.id)
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_invalid_correspondent(self, async_task):
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@@ -915,7 +915,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
async_task.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_document_type(self, async_task):
dt = DocumentType.objects.create(name="invoice")
with open(
@@ -934,7 +934,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(kwargs["override_document_type_id"], dt.id)
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_invalid_document_type(self, async_task):
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@@ -948,7 +948,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
async_task.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_tags(self, async_task):
t1 = Tag.objects.create(name="tag1")
t2 = Tag.objects.create(name="tag2")
@@ -968,7 +968,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertCountEqual(kwargs["override_tag_ids"], [t1.id, t2.id])
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_invalid_tags(self, async_task):
t1 = Tag.objects.create(name="tag1")
t2 = Tag.objects.create(name="tag2")
@@ -984,7 +984,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
async_task.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_created(self, async_task):
created = datetime.datetime(
2022,
@@ -1581,7 +1581,11 @@ class TestApiUiSettings(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertDictEqual(
response.data["settings"],
{},
{
"update_checking": {
"backend_setting": "default",
},
},
)
def test_api_set_ui_settings(self):
@@ -1615,7 +1619,7 @@ class TestBulkEdit(DirectoriesMixin, APITestCase):
user = User.objects.create_superuser(username="temp_admin")
self.client.force_authenticate(user=user)
patcher = mock.patch("documents.bulk_edit.async_task")
patcher = mock.patch("documents.bulk_edit.bulk_update_documents.delay")
self.async_task = patcher.start()
self.addCleanup(patcher.stop)
self.c1 = Correspondent.objects.create(name="c1")
@@ -2542,38 +2546,6 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
def setUp(self):
super().setUp()
def test_remote_version_default(self):
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertDictEqual(
response.data,
{
"version": "0.0.0",
"update_available": False,
"feature_is_set": False,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=False,
)
def test_remote_version_disabled(self):
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertDictEqual(
response.data,
{
"version": "0.0.0",
"update_available": False,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_enabled_no_update_prefix(self, urlopen_mock):
@@ -2591,13 +2563,9 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": "1.6.0",
"update_available": False,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_enabled_no_update_no_prefix(self, urlopen_mock):
@@ -2617,13 +2585,9 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": version.__full_version_str__,
"update_available": False,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_enabled_update(self, urlopen_mock):
@@ -2650,13 +2614,9 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": new_version_str,
"update_available": True,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_bad_json(self, urlopen_mock):
@@ -2674,13 +2634,9 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": "0.0.0",
"update_available": False,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_exception(self, urlopen_mock):
@@ -2698,7 +2654,6 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": "0.0.0",
"update_available": False,
"feature_is_set": True,
},
)
@@ -2783,7 +2738,7 @@ class TestApiStoragePaths(DirectoriesMixin, APITestCase):
class TestTasks(APITestCase):
ENDPOINT = "/api/tasks/"
ENDPOINT_ACKOWLEDGE = "/api/acknowledge_tasks/"
ENDPOINT_ACKNOWLEDGE = "/api/acknowledge_tasks/"
def setUp(self):
super().setUp()
@@ -2792,16 +2747,24 @@ class TestTasks(APITestCase):
self.client.force_authenticate(user=self.user)
def test_get_tasks(self):
task_id1 = str(uuid.uuid4())
PaperlessTask.objects.create(task_id=task_id1)
Task.objects.create(
id=task_id1,
started=timezone.now() - datetime.timedelta(seconds=30),
stopped=timezone.now(),
func="documents.tasks.consume_file",
"""
GIVEN:
- Attempted celery tasks
WHEN:
- API call is made to get tasks
THEN:
- Attempted and pending tasks are serialized and provided
"""
task1 = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf",
)
task2 = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_two.pdf",
)
task_id2 = str(uuid.uuid4())
PaperlessTask.objects.create(task_id=task_id2)
response = self.client.get(self.ENDPOINT)
@@ -2809,25 +2772,160 @@ class TestTasks(APITestCase):
self.assertEqual(len(response.data), 2)
returned_task1 = response.data[1]
returned_task2 = response.data[0]
self.assertEqual(returned_task1["task_id"], task_id1)
self.assertEqual(returned_task1["status"], "complete")
self.assertIsNotNone(returned_task1["attempted_task"])
self.assertEqual(returned_task2["task_id"], task_id2)
self.assertEqual(returned_task2["status"], "queued")
self.assertIsNone(returned_task2["attempted_task"])
self.assertEqual(returned_task1["task_id"], task1.task_id)
self.assertEqual(returned_task1["status"], celery.states.PENDING)
self.assertEqual(returned_task1["task_file_name"], task1.task_file_name)
self.assertEqual(returned_task2["task_id"], task2.task_id)
self.assertEqual(returned_task2["status"], celery.states.PENDING)
self.assertEqual(returned_task2["task_file_name"], task2.task_file_name)
def test_acknowledge_tasks(self):
task_id = str(uuid.uuid4())
task = PaperlessTask.objects.create(task_id=task_id)
"""
GIVEN:
- Attempted celery tasks
WHEN:
- API call is made to mark the task as acknowledged
THEN:
- Task is marked as acknowledged
"""
task = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf",
)
response = self.client.get(self.ENDPOINT)
self.assertEqual(len(response.data), 1)
response = self.client.post(
self.ENDPOINT_ACKOWLEDGE,
self.ENDPOINT_ACKNOWLEDGE,
{"tasks": [task.id]},
)
self.assertEqual(response.status_code, 200)
response = self.client.get(self.ENDPOINT)
self.assertEqual(len(response.data), 0)
def test_task_result_no_error(self):
"""
GIVEN:
- A celery task completed without error
WHEN:
- API call is made to get tasks
THEN:
- The returned data includes the task result
"""
task = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf",
status=celery.states.SUCCESS,
result="Success. New document id 1 created",
)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 1)
returned_data = response.data[0]
self.assertEqual(returned_data["result"], "Success. New document id 1 created")
self.assertEqual(returned_data["related_document"], "1")
def test_task_result_with_error(self):
"""
GIVEN:
- A celery task completed with an exception
WHEN:
- API call is made to get tasks
THEN:
- The returned result is the exception info
"""
task = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf",
status=celery.states.FAILURE,
result="test.pdf: Not consuming test.pdf: It is a duplicate.",
)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 1)
returned_data = response.data[0]
self.assertEqual(
returned_data["result"],
"test.pdf: Not consuming test.pdf: It is a duplicate.",
)
def test_task_name_webui(self):
"""
GIVEN:
- Attempted celery task
- Task was created through the webui
WHEN:
- API call is made to get tasks
THEN:
- Returned data includes the filename
"""
task = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="test.pdf",
task_name="documents.tasks.some_task",
status=celery.states.SUCCESS,
task_args=("/tmp/paperless/paperless-upload-5iq7skzc",),
task_kwargs={
"override_filename": "test.pdf",
"override_title": None,
"override_correspondent_id": None,
"override_document_type_id": None,
"override_tag_ids": None,
"task_id": "466e8fe7-7193-4698-9fff-72f0340e2082",
"override_created": None,
},
)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 1)
returned_data = response.data[0]
self.assertEqual(returned_data["task_file_name"], "test.pdf")
def test_task_name_consume_folder(self):
"""
GIVEN:
- Attempted celery task
- Task was created through the consume folder
WHEN:
- API call is made to get tasks
THEN:
- Returned data includes the filename
"""
task = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="anothertest.pdf",
task_name="documents.tasks.some_task",
status=celery.states.SUCCESS,
task_args=("/consume/anothertest.pdf",),
task_kwargs={"override_tag_ids": None},
)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 1)
returned_data = response.data[0]
self.assertEqual(returned_data["task_file_name"], "anothertest.pdf")

View File

@@ -3,6 +3,7 @@ import shutil
import tempfile
from unittest import mock
import pikepdf
from django.conf import settings
from django.test import override_settings
from django.test import TestCase
@@ -173,7 +174,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_barcodes2(self):
def test_scan_file_for_separating_barcodes_none_present(self):
test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
@@ -218,6 +219,86 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_barcodes_pillow_transcode_error(self):
"""
GIVEN:
- A PDF containing an image which cannot be transcoded to a PIL image
WHEN:
- An attempt is made to transcode the image to a PIL image, and it fails
THEN:
- The barcode reader is still called
"""
def _build_device_n_pdf(self, save_path: str):
# Based on the pikepdf tests
# https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py
pdf = pikepdf.new()
pdf.add_blank_page(page_size=(72, 72))
imobj = pikepdf.Stream(
pdf,
bytes(range(0, 256)),
BitsPerComponent=8,
ColorSpace=pikepdf.Array(
[
pikepdf.Name.DeviceN,
pikepdf.Array([pikepdf.Name.Black]),
pikepdf.Name.DeviceCMYK,
pikepdf.Stream(
pdf,
b"{0 0 0 4 -1 roll}", # Colorspace conversion function
FunctionType=4,
Domain=[0.0, 1.0],
Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
),
],
),
Width=16,
Height=16,
Type=pikepdf.Name.XObject,
Subtype=pikepdf.Name.Image,
)
pim = pikepdf.PdfImage(imobj)
self.assertEqual(pim.mode, "DeviceN")
self.assertTrue(pim.is_device_n)
pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do")
pdf.pages[0].Resources = pikepdf.Dictionary(
XObject=pikepdf.Dictionary(Im0=imobj),
)
pdf.save(save_path)
with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf:
# Build an offending file
_build_device_n_pdf(self, str(device_n_pdf.name))
with mock.patch("documents.barcodes.barcode_reader") as reader:
reader.return_value = list()
_, _ = barcodes.scan_file_for_separating_barcodes(
str(device_n_pdf.name),
)
reader.assert_called()
def test_scan_file_for_separating_barcodes_fax_decode(self):
"""
GIVEN:
- A PDF containing an image encoded as CCITT Group 4 encoding
WHEN:
- Barcode processing happens with the file
THEN:
- The barcode is still detected
"""
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"barcode-fax-image.pdf",
)
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_qr_barcodes(self):
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
@@ -397,7 +478,8 @@ class TestBarcode(DirectoriesMixin, TestCase):
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
@@ -411,7 +493,8 @@ class TestBarcode(DirectoriesMixin, TestCase):
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
@@ -465,4 +548,23 @@ class TestBarcode(DirectoriesMixin, TestCase):
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")
def test_scan_file_for_separating_barcodes_password(self):
"""
GIVEN:
- Password protected PDF
- pikepdf based scanning
WHEN:
- File is scanned for barcode
THEN:
- Scanning handles the password error without raising an exception
"""
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])

View File

@@ -1,9 +1,9 @@
import os
import re
import tempfile
from pathlib import Path
from unittest import mock
import documents
import pytest
from django.conf import settings
from django.test import override_settings
@@ -20,10 +20,19 @@ from documents.models import Tag
from documents.tests.utils import DirectoriesMixin
def dummy_preprocess(content: str):
content = content.lower().strip()
content = re.sub(r"\s+", " ", content)
return content
class TestClassifier(DirectoriesMixin, TestCase):
def setUp(self):
super().setUp()
self.classifier = DocumentClassifier()
self.classifier.preprocess_content = mock.MagicMock(
side_effect=dummy_preprocess,
)
def generate_test_data(self):
self.c1 = Correspondent.objects.create(
@@ -192,6 +201,8 @@ class TestClassifier(DirectoriesMixin, TestCase):
new_classifier = DocumentClassifier()
new_classifier.load()
new_classifier.preprocess_content = mock.MagicMock(side_effect=dummy_preprocess)
self.assertFalse(new_classifier.train())
# @override_settings(
@@ -215,6 +226,7 @@ class TestClassifier(DirectoriesMixin, TestCase):
new_classifier = DocumentClassifier()
new_classifier.load()
new_classifier.preprocess_content = mock.MagicMock(side_effect=dummy_preprocess)
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])

View File

@@ -14,6 +14,7 @@ except ImportError:
import backports.zoneinfo as zoneinfo
from django.conf import settings
from django.utils import timezone
from django.test import override_settings
from django.test import TestCase
@@ -326,6 +327,12 @@ class TestConsumer(DirectoriesMixin, TestCase):
def testNormalOperation(self):
filename = self.get_test_file()
# Get the local time, as an aware datetime
# Roughly equal to file modification time
rough_create_date_local = timezone.localtime(timezone.now())
# Consume the file
document = self.consumer.try_consume_file(filename)
self.assertEqual(document.content, "The Text")
@@ -351,7 +358,20 @@ class TestConsumer(DirectoriesMixin, TestCase):
self._assert_first_last_send_progress()
self.assertEqual(document.created.tzinfo, zoneinfo.ZoneInfo("America/Chicago"))
# Convert UTC time from DB to local time
document_date_local = timezone.localtime(document.created)
self.assertEqual(
document_date_local.tzinfo,
zoneinfo.ZoneInfo("America/Chicago"),
)
self.assertEqual(document_date_local.tzinfo, rough_create_date_local.tzinfo)
self.assertEqual(document_date_local.year, rough_create_date_local.year)
self.assertEqual(document_date_local.month, rough_create_date_local.month)
self.assertEqual(document_date_local.day, rough_create_date_local.day)
self.assertEqual(document_date_local.hour, rough_create_date_local.hour)
self.assertEqual(document_date_local.minute, rough_create_date_local.minute)
# Skipping seconds and anything more precise
@override_settings(FILENAME_FORMAT=None)
def testDeleteMacFiles(self):

View File

@@ -1036,6 +1036,34 @@ class TestFilenameGeneration(TestCase):
self.assertEqual(generate_filename(doc_a), "0000002.pdf")
self.assertEqual(generate_filename(doc_b), "SomeImportantNone/2020-07-25.pdf")
@override_settings(
FILENAME_FORMAT="{created_year_short}/{created_month_name_short}/{created_month_name}/{title}",
)
def test_short_names_created(self):
doc = Document.objects.create(
title="The Title",
created=timezone.make_aware(
datetime.datetime(1989, 12, 21, 7, 36, 51, 153),
),
mime_type="application/pdf",
pk=2,
checksum="2",
)
self.assertEqual(generate_filename(doc), "89/Dec/December/The Title.pdf")
@override_settings(
FILENAME_FORMAT="{added_year_short}/{added_month_name}/{added_month_name_short}/{title}",
)
def test_short_names_added(self):
doc = Document.objects.create(
title="The Title",
added=timezone.make_aware(datetime.datetime(1984, 8, 21, 7, 36, 51, 153)),
mime_type="application/pdf",
pk=2,
checksum="2",
)
self.assertEqual(generate_filename(doc), "84/August/Aug/The Title.pdf")
def run():
doc = Document.objects.create(

View File

@@ -43,7 +43,7 @@ class ConsumerMixin:
super().setUp()
self.t = None
patcher = mock.patch(
"documents.management.commands.document_consumer.async_task",
"documents.tasks.consume_file.delay",
)
self.task_mock = patcher.start()
self.addCleanup(patcher.stop)
@@ -76,7 +76,7 @@ class ConsumerMixin:
# A bogus consume task that will simply check the file for
# completeness and raise an exception otherwise.
def bogus_task(self, func, filename, **kwargs):
def bogus_task(self, filename, **kwargs):
eq = filecmp.cmp(filename, self.sample_file, shallow=False)
if not eq:
print("Consumed an INVALID file.")
@@ -115,7 +115,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], f)
self.assertEqual(args[0], f)
def test_consume_file_invalid_ext(self):
self.t_start()
@@ -135,7 +135,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], f)
self.assertEqual(args[0], f)
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_pdf(self, error_logger):
@@ -155,7 +155,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], fname)
self.assertEqual(args[0], fname)
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_and_move(self, error_logger):
@@ -175,7 +175,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], fname2)
self.assertEqual(args[0], fname2)
error_logger.assert_not_called()
@@ -193,7 +193,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], fname)
self.assertEqual(args[0], fname)
# assert that we have an error logged with this invalid file.
error_logger.assert_called_once()
@@ -241,7 +241,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.assertEqual(2, self.task_mock.call_count)
fnames = [
os.path.basename(args[1]) for args, _ in self.task_mock.call_args_list
os.path.basename(args[0]) for args, _ in self.task_mock.call_args_list
]
self.assertCountEqual(fnames, ["my_file.pdf", "my_second_file.pdf"])
@@ -338,7 +338,7 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
tag_ids.append(Tag.objects.get(name=tag_names[1]).pk)
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], f)
self.assertEqual(args[0], f)
# assertCountEqual has a bad name, but test that the first
# sequence contains the same elements as second, regardless of

View File

@@ -0,0 +1,126 @@
import celery
from django.test import TestCase
from documents.models import PaperlessTask
from documents.signals.handlers import before_task_publish_handler
from documents.signals.handlers import task_postrun_handler
from documents.signals.handlers import task_prerun_handler
from documents.tests.utils import DirectoriesMixin
class TestTaskSignalHandler(DirectoriesMixin, TestCase):
HEADERS_CONSUME = {
"lang": "py",
"task": "documents.tasks.consume_file",
"id": "52d31e24-9dcc-4c32-9e16-76007e9add5e",
"shadow": None,
"eta": None,
"expires": None,
"group": None,
"group_index": None,
"retries": 0,
"timelimit": [None, None],
"root_id": "52d31e24-9dcc-4c32-9e16-76007e9add5e",
"parent_id": None,
"argsrepr": "('/consume/hello-999.pdf',)",
"kwargsrepr": "{'override_tag_ids': None}",
"origin": "gen260@paperless-ngx-dev-webserver",
"ignore_result": False,
}
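# Note: argsrepr and kwargsrepr above are repr() strings, as Celery's
# before_task_publish signal delivers them in the message headers; the
# handler under test is presumably expected to turn them back into
# values, e.g. via ast.literal_eval:
#
#   import ast
#   ast.literal_eval("('/consume/hello-999.pdf',)")
#   # -> ('/consume/hello-999.pdf',)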
HEADERS_WEB_UI = {
"lang": "py",
"task": "documents.tasks.consume_file",
"id": "6e88a41c-e5f8-4631-9972-68c314512498",
"shadow": None,
"eta": None,
"expires": None,
"group": None,
"group_index": None,
"retries": 0,
"timelimit": [None, None],
"root_id": "6e88a41c-e5f8-4631-9972-68c314512498",
"parent_id": None,
"argsrepr": "('/tmp/paperless/paperless-upload-st9lmbvx',)",
"kwargsrepr": "{'override_filename': 'statement.pdf', 'override_title': None, 'override_correspondent_id': None, 'override_document_type_id': None, 'override_tag_ids': None, 'task_id': 'f5622ca9-3707-4ed0-b418-9680b912572f', 'override_created': None}",
"origin": "gen342@paperless-ngx-dev-webserver",
"ignore_result": False,
}
def util_call_before_task_publish_handler(self, headers_to_use):
self.assertEqual(PaperlessTask.objects.all().count(), 0)
before_task_publish_handler(headers=headers_to_use)
self.assertEqual(PaperlessTask.objects.all().count(), 1)
def test_before_task_publish_handler_consume(self):
"""
GIVEN:
- A celery task is about to be published to consume a file
WHEN:
- The before_task_publish signal handler runs
THEN:
- The task is recorded as a pending PaperlessTask
"""
self.util_call_before_task_publish_handler(headers_to_use=self.HEADERS_CONSUME)
task = PaperlessTask.objects.get()
self.assertIsNotNone(task)
self.assertEqual(self.HEADERS_CONSUME["id"], task.task_id)
self.assertListEqual(["/consume/hello-999.pdf"], task.task_args)
self.assertDictEqual({"override_tag_ids": None}, task.task_kwargs)
self.assertEqual("hello-999.pdf", task.task_file_name)
self.assertEqual("documents.tasks.consume_file", task.task_name)
self.assertEqual(celery.states.PENDING, task.status)
def test_before_task_publish_handler_webui(self):
self.util_call_before_task_publish_handler(headers_to_use=self.HEADERS_WEB_UI)
task = PaperlessTask.objects.get()
self.assertIsNotNone(task)
self.assertEqual(self.HEADERS_WEB_UI["id"], task.task_id)
self.assertListEqual(
["/tmp/paperless/paperless-upload-st9lmbvx"],
task.task_args,
)
self.assertDictEqual(
{
"override_filename": "statement.pdf",
"override_title": None,
"override_correspondent_id": None,
"override_document_type_id": None,
"override_tag_ids": None,
"task_id": "f5622ca9-3707-4ed0-b418-9680b912572f",
"override_created": None,
},
task.task_kwargs,
)
self.assertEqual("statement.pdf", task.task_file_name)
self.assertEqual("documents.tasks.consume_file", task.task_name)
self.assertEqual(celery.states.PENDING, task.status)
def test_task_prerun_handler(self):
self.util_call_before_task_publish_handler(headers_to_use=self.HEADERS_CONSUME)
task_prerun_handler(task_id=self.HEADERS_CONSUME["id"])
task = PaperlessTask.objects.get()
self.assertEqual(celery.states.STARTED, task.status)
def test_task_postrun_handler(self):
self.util_call_before_task_publish_handler(headers_to_use=self.HEADERS_CONSUME)
task_postrun_handler(
task_id=self.HEADERS_CONSUME["id"],
retval="Success. New document id 1 created",
state=celery.states.SUCCESS,
)
task = PaperlessTask.objects.get()
self.assertEqual(celery.states.SUCCESS, task.status)

View File

@@ -11,6 +11,7 @@ from documents.models import DocumentType
from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from documents.sanity_checker import SanityCheckMessages
from documents.tests.test_classifier import dummy_preprocess
from documents.tests.utils import DirectoriesMixin
@@ -75,21 +76,26 @@ class TestClassifier(DirectoriesMixin, TestCase):
doc = Document.objects.create(correspondent=c, content="test", title="test")
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime = os.stat(settings.MODEL_FILE).st_mtime
with mock.patch(
"documents.classifier.DocumentClassifier.preprocess_content",
) as pre_proc_mock:
pre_proc_mock.side_effect = dummy_preprocess
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime2 = os.stat(settings.MODEL_FILE).st_mtime
self.assertEqual(mtime, mtime2)
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime = os.stat(settings.MODEL_FILE).st_mtime
doc.content = "test2"
doc.save()
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime3 = os.stat(settings.MODEL_FILE).st_mtime
self.assertNotEqual(mtime2, mtime3)
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime2 = os.stat(settings.MODEL_FILE).st_mtime
self.assertEqual(mtime, mtime2)
doc.content = "test2"
doc.save()
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime3 = os.stat(settings.MODEL_FILE).st_mtime
self.assertNotEqual(mtime2, mtime3)
class TestSanityCheck(DirectoriesMixin, TestCase):

View File

@@ -28,7 +28,7 @@ from django.utils.translation import get_language
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django_q.tasks import async_task
from documents.tasks import consume_file
from packaging import version as packaging_version
from paperless import version
from paperless.db import GnuPG
@@ -261,6 +261,9 @@ class DocumentViewSet(
file_handle = doc.source_file
filename = doc.get_public_filename()
mime_type = doc.mime_type
# Support browser previewing csv files by using text mime type
if mime_type in {"application/csv", "text/csv"} and disposition == "inline":
mime_type = "text/plain"
if doc.storage_type == Document.STORAGE_TYPE_GPG:
file_handle = GnuPG.decrypted(file_handle)
@@ -612,8 +615,7 @@ class PostDocumentView(GenericAPIView):
task_id = str(uuid.uuid4())
async_task(
"documents.tasks.consume_file",
consume_file.delay(
temp_filename,
override_filename=doc_name,
override_title=title,
@@ -621,7 +623,6 @@ class PostDocumentView(GenericAPIView):
override_document_type_id=document_type_id,
override_tag_ids=tag_ids,
task_id=task_id,
task_name=os.path.basename(doc_name)[:100],
override_created=created,
)
@@ -780,42 +781,38 @@ class RemoteVersionView(GenericAPIView):
remote_version = "0.0.0"
is_greater_than_current = False
current_version = packaging_version.parse(version.__full_version_str__)
# TODO: this can likely be removed when frontend settings are saved to DB
feature_is_set = settings.ENABLE_UPDATE_CHECK != "default"
if feature_is_set and settings.ENABLE_UPDATE_CHECK:
try:
req = urllib.request.Request(
"https://api.github.com/repos/paperless-ngx/"
"paperless-ngx/releases/latest",
)
# Ensure a JSON response
req.add_header("Accept", "application/json")
with urllib.request.urlopen(req) as response:
remote = response.read().decode("utf-8")
try:
remote_json = json.loads(remote)
remote_version = remote_json["tag_name"]
# Basically str.removeprefix() (PEP 616), but that only landed in 3.9
if remote_version.startswith("ngx-"):
remote_version = remote_version[len("ngx-") :]
except ValueError:
logger.debug("An error occurred parsing remote version json")
except urllib.error.URLError:
logger.debug("An error occurred checking for available updates")
is_greater_than_current = (
packaging_version.parse(
remote_version,
)
> current_version
try:
req = urllib.request.Request(
"https://api.github.com/repos/paperless-ngx/"
"paperless-ngx/releases/latest",
)
# Ensure a JSON response
req.add_header("Accept", "application/json")
with urllib.request.urlopen(req) as response:
remote = response.read().decode("utf-8")
try:
remote_json = json.loads(remote)
remote_version = remote_json["tag_name"]
# Basically str.removeprefix() (PEP 616), but that only landed in 3.9
if remote_version.startswith("ngx-"):
remote_version = remote_version[len("ngx-") :]
except ValueError:
logger.debug("An error occurred parsing remote version json")
except urllib.error.URLError:
logger.debug("An error occurred checking for available updates")
is_greater_than_current = (
packaging_version.parse(
remote_version,
)
> current_version
)
return Response(
{
"version": remote_version,
"update_available": is_greater_than_current,
"feature_is_set": feature_is_set,
},
)
@@ -848,15 +845,23 @@ class UiSettingsView(GenericAPIView):
displayname = user.username
if user.first_name or user.last_name:
displayname = " ".join([user.first_name, user.last_name])
settings = {}
ui_settings = {}
if hasattr(user, "ui_settings"):
settings = user.ui_settings.settings
ui_settings = user.ui_settings.settings
if "update_checking" in ui_settings:
ui_settings["update_checking"][
"backend_setting"
] = settings.ENABLE_UPDATE_CHECK
else:
ui_settings["update_checking"] = {
"backend_setting": settings.ENABLE_UPDATE_CHECK,
}
return Response(
{
"user_id": user.id,
"username": user.username,
"display_name": displayname,
"settings": settings,
"settings": ui_settings,
},
)
@@ -882,7 +887,7 @@ class TasksViewSet(ReadOnlyModelViewSet):
PaperlessTask.objects.filter(
acknowledged=False,
)
.order_by("created")
.order_by("date_created")
.reverse()
)

View File

@@ -5,15 +5,15 @@ msgstr ""
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-10-20 23:30\n"
"Last-Translator: \n"
"Language-Team: Arabic\n"
"Language: ar_SA\n"
"Language-Team: Arabic, Arabic\n"
"Language: ar_AR\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=6; plural=(n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5);\n"
"X-Crowdin-Project: paperless-ngx\n"
"X-Crowdin-Project-ID: 500308\n"
"X-Crowdin-Language: ar\n"
"X-Crowdin-Language: ar-AR\n"
"X-Crowdin-File: /dev/src/locale/en_US/LC_MESSAGES/django.po\n"
"X-Crowdin-File-ID: 14\n"

View File

@@ -1,5 +1,11 @@
from .celery import app as celery_app
from .checks import binaries_check
from .checks import paths_check
from .checks import settings_values_check
__all__ = ["binaries_check", "paths_check", "settings_values_check"]
__all__ = [
"celery_app",
"binaries_check",
"paths_check",
"settings_values_check",
]
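# Importing celery_app here (and re-exporting it in __all__) follows the
# documented Celery/Django pattern: it ensures the Celery app is created
# whenever the paperless package is imported, so @shared_task decorators
# bind to it.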

src/paperless/celery.py Normal file
View File

@@ -0,0 +1,17 @@
import os
from celery import Celery
# Set the default Django settings module for the 'celery' program.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
app = Celery("paperless")
# Using a string here means the worker doesn't have to serialize
# the configuration object to child processes.
# - namespace='CELERY' means all celery-related configuration keys
# should have a `CELERY_` prefix.
app.config_from_object("django.conf:settings", namespace="CELERY")
# Load task modules from all registered Django apps.
app.autodiscover_tasks()
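# As a minimal illustration (not part of this module): autodiscover_tasks()
# looks for a tasks.py in every installed Django app, so a task only needs
# the shared_task decorator to be registered with this app:
#
#   from celery import shared_task
#
#   @shared_task
#   def ping():
#       return "pong"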

View File

@@ -10,6 +10,7 @@ from typing import Optional
from typing import Set
from urllib.parse import urlparse
from celery.schedules import crontab
from concurrent_log_handler.queue import setup_logging_queues
from django.utils.translation import gettext_lazy as _
from dotenv import load_dotenv
@@ -83,6 +84,8 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
NLTK_DIR = os.path.join(DATA_DIR, "nltk")
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
# Lock file for synchronizing changes to the MEDIA directory across multiple
@@ -128,7 +131,7 @@ INSTALLED_APPS = [
"rest_framework",
"rest_framework.authtoken",
"django_filters",
"django_q",
"django_celery_results",
] + env_apps
if DEBUG:
@@ -179,6 +182,8 @@ ASGI_APPLICATION = "paperless.asgi.application"
STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", BASE_URL + "static/")
WHITENOISE_STATIC_PREFIX = "/static/"
_REDIS_URL = os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
# TODO: what is this used for?
TEMPLATES = [
{
@@ -200,7 +205,7 @@ CHANNEL_LAYERS = {
"default": {
"BACKEND": "channels_redis.core.RedisChannelLayer",
"CONFIG": {
"hosts": [os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")],
"hosts": [_REDIS_URL],
"capacity": 2000, # default 100
"expiry": 15, # default 60
},
@@ -342,6 +347,13 @@ if os.getenv("PAPERLESS_DBHOST"):
if os.getenv("PAPERLESS_DBENGINE") == "mariadb":
engine = "django.db.backends.mysql"
options = {"read_default_file": "/etc/mysql/my.cnf", "charset": "utf8mb4"}
# Silence Django error on old MariaDB versions.
# VARCHAR can support > 255 in modern versions
# https://docs.djangoproject.com/en/4.1/ref/checks/#database
# https://mariadb.com/kb/en/innodb-system-variables/#innodb_large_prefix
SILENCED_SYSTEM_CHECKS = ["mysql.W003"]
else: # Default to PostgresDB
engine = "django.db.backends.postgresql_psycopg2"
options = {"sslmode": os.getenv("PAPERLESS_DBSSLMODE", "prefer")}
@@ -456,24 +468,53 @@ TASK_WORKERS = __get_int("PAPERLESS_TASK_WORKERS", 1)
WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)
# Per django-q docs, timeout must be smaller than retry
# We default retry to 10s more than the timeout to silence the
# warning, as retry functionality isn't used.
WORKER_RETRY: Final[int] = __get_int(
"PAPERLESS_WORKER_RETRY",
WORKER_TIMEOUT + 10,
)
CELERY_BROKER_URL = _REDIS_URL
CELERY_TIMEZONE = TIME_ZONE
Q_CLUSTER = {
"name": "paperless",
"guard_cycle": 5,
"catch_up": False,
"recycle": 1,
"retry": WORKER_RETRY,
"timeout": WORKER_TIMEOUT,
"workers": TASK_WORKERS,
"redis": os.getenv("PAPERLESS_REDIS", "redis://localhost:6379"),
"log_level": "DEBUG" if DEBUG else "INFO",
CELERY_WORKER_HIJACK_ROOT_LOGGER = False
CELERY_WORKER_CONCURRENCY = TASK_WORKERS
CELERY_WORKER_MAX_TASKS_PER_CHILD = 1
CELERY_WORKER_SEND_TASK_EVENTS = True
CELERY_SEND_TASK_SENT_EVENT = True
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = WORKER_TIMEOUT
CELERY_RESULT_EXTENDED = True
CELERY_RESULT_BACKEND = "django-db"
CELERY_CACHE_BACKEND = "default"
CELERY_BEAT_SCHEDULE = {
# Every ten minutes
"Check all e-mail accounts": {
"task": "paperless_mail.tasks.process_mail_accounts",
"schedule": crontab(minute="*/10"),
},
# Hourly at 5 minutes past the hour
"Train the classifier": {
"task": "documents.tasks.train_classifier",
"schedule": crontab(minute="5", hour="*/1"),
},
# Daily at midnight
"Optimize the index": {
"task": "documents.tasks.index_optimize",
"schedule": crontab(minute=0, hour=0),
},
# Weekly, Sunday at 00:30
"Perform sanity check": {
"task": "documents.tasks.sanity_check",
"schedule": crontab(minute=30, hour=0, day_of_week="sun"),
},
}
CELERY_BEAT_SCHEDULE_FILENAME = os.path.join(DATA_DIR, "celerybeat-schedule.db")
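# crontab() follows cron field semantics, so the entries above read like
# crontab lines; the classifier entry, for instance, is equivalent to the
# cron expression "5 * * * *":
#
#   from celery.schedules import crontab
#   crontab(minute="5", hour="*/1")  # minute 5 of every hour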
# django setting.
CACHES = {
"default": {
"BACKEND": "django.core.cache.backends.redis.RedisCache",
"LOCATION": _REDIS_URL,
},
}
@@ -524,15 +565,18 @@ CONSUMER_IGNORE_PATTERNS = list(
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
CONSUMER_ENABLE_BARCODES = __get_boolean(
CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_BARCODES",
)
CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
CONSUMER_BARCODE_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_BARCODE_TIFF_SUPPORT",
)
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_BARCODE_STRING",
"PATCHT",
)
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
@@ -674,3 +718,40 @@ if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
if ENABLE_UPDATE_CHECK != "default":
ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")
###############################################################################
# Machine Learning #
###############################################################################
def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
"""
Maps an ISO 639-2 (three-letter) language code supported by Tesseract
into an optional NLTK language name, covering the set of languages
commonly supported by all the NLTK data used.
Assumption: The primary language is first
"""
ocr_lang = ocr_lang.split("+")[0]
iso_code_to_nltk = {
"dan": "danish",
"nld": "dutch",
"eng": "english",
"fin": "finnish",
"fra": "french",
"deu": "german",
"ita": "italian",
"nor": "norwegian",
"por": "portuguese",
"rus": "russian",
"spa": "spanish",
"swe": "swedish",
"tur": "turkish",
}
return iso_code_to_nltk.get(ocr_lang, None)
NLTK_ENABLED: Final[bool] = __get_boolean("PAPERLESS_ENABLE_NLTK", "yes")
NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)
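# A quick sketch of the mapping behaviour (calls are illustrative): the
# primary (first) language wins, and codes without NLTK data map to None,
# which disables NLTK processing:
#
#   _get_nltk_language_setting("deu+eng")  # -> "german"
#   _get_nltk_language_setting("jpn")      # -> None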

View File

@@ -4,15 +4,16 @@ import tempfile
from datetime import date
from datetime import timedelta
from fnmatch import fnmatch
from typing import Dict
import magic
import pathvalidate
from django.conf import settings
from django.db import DatabaseError
from django_q.tasks import async_task
from documents.loggers import LoggingMixin
from documents.models import Correspondent
from documents.parsers import is_mime_type_supported
from documents.tasks import consume_file
from imap_tools import AND
from imap_tools import MailBox
from imap_tools import MailboxFolderSelectError
@@ -30,7 +31,7 @@ class MailError(Exception):
class BaseMailAction:
def get_criteria(self):
def get_criteria(self) -> Dict:
return {}
def post_consume(self, M, message_uids, parameter):
@@ -78,7 +79,7 @@ class TagMailAction(BaseMailAction):
M.flag(message_uids, [self.keyword], True)
def get_rule_action(rule):
def get_rule_action(rule) -> BaseMailAction:
if rule.action == MailRule.MailAction.FLAG:
return FlagMailAction()
elif rule.action == MailRule.MailAction.DELETE:
@@ -108,7 +109,7 @@ def make_criterias(rule):
return {**criterias, **get_rule_action(rule).get_criteria()}
def get_mailbox(server, port, security):
def get_mailbox(server, port, security) -> MailBox:
if security == MailAccount.ImapSecurity.NONE:
mailbox = MailBoxUnencrypted(server, port)
elif security == MailAccount.ImapSecurity.STARTTLS:
@@ -167,7 +168,7 @@ class MailAccountHandler(LoggingMixin):
"Unknown correspondent selector",
) # pragma: nocover
def handle_mail_account(self, account):
def handle_mail_account(self, account: MailAccount):
self.renew_logging_group()
@@ -181,7 +182,14 @@ class MailAccountHandler(LoggingMixin):
account.imap_security,
) as M:
supports_gmail_labels = "X-GM-EXT-1" in M.client.capabilities
supports_auth_plain = "AUTH=PLAIN" in M.client.capabilities
self.log("debug", f"GMAIL Label Support: {supports_gmail_labels}")
self.log("debug", f"AUTH=PLAIN Support: {supports_auth_plain}")
try:
M.login(account.username, account.password)
except UnicodeEncodeError:
@@ -215,7 +223,11 @@ class MailAccountHandler(LoggingMixin):
for rule in account.rules.order_by("order"):
try:
total_processed_files += self.handle_mail_rule(M, rule)
total_processed_files += self.handle_mail_rule(
M,
rule,
supports_gmail_labels,
)
except Exception as e:
self.log(
"error",
@@ -233,7 +245,12 @@ class MailAccountHandler(LoggingMixin):
return total_processed_files
def handle_mail_rule(self, M: MailBox, rule):
def handle_mail_rule(
self,
M: MailBox,
rule: MailRule,
supports_gmail_labels: bool = False,
):
self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}")
@@ -261,11 +278,19 @@ class MailAccountHandler(LoggingMixin):
) from err
criterias = make_criterias(rule)
criterias_imap = AND(**criterias)
# Deal with the Gmail label extension
if "gmail_label" in criterias:
gmail_label = criterias["gmail_label"]
del criterias["gmail_label"]
criterias_imap = AND(NOT(gmail_label=gmail_label), **criterias)
if not supports_gmail_labels:
criterias_imap = AND(**criterias)
else:
criterias_imap = AND(NOT(gmail_label=gmail_label), **criterias)
else:
criterias_imap = AND(**criterias)
self.log(
"debug",
@@ -389,8 +414,7 @@ class MailAccountHandler(LoggingMixin):
f"{message.subject} from {message.from_}",
)
async_task(
"documents.tasks.consume_file",
consume_file.delay(
path=temp_filename,
override_filename=pathvalidate.sanitize_filename(
att.filename,
@@ -401,7 +425,6 @@ class MailAccountHandler(LoggingMixin):
else None,
override_document_type_id=doc_type.id if doc_type else None,
override_tag_ids=tag_ids,
task_name=att.filename[:100],
)
processed_attachments += 1

View File

@@ -2,28 +2,12 @@
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule(
"paperless_mail.tasks.process_mail_accounts",
name="Check all e-mail accounts",
schedule_type=Schedule.MINUTES,
minutes=10,
)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(func="paperless_mail.tasks.process_mail_accounts").delete()
class Migration(migrations.Migration):
dependencies = [
("paperless_mail", "0001_initial"),
("django_q", "0013_task_attempt_count"),
]
operations = [RunPython(add_schedules, remove_schedules)]
operations = [RunPython(migrations.RunPython.noop, migrations.RunPython.noop)]

View File

@@ -1,13 +1,14 @@
import logging
from celery import shared_task
from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailError
from paperless_mail.models import MailAccount
logger = logging.getLogger("paperless.mail.tasks")
@shared_task
def process_mail_accounts():
total_new_documents = 0
for account in MailAccount.objects.all():
@@ -20,11 +21,3 @@ def process_mail_accounts():
return f"Added {total_new_documents} document(s)."
else:
return "No new documents were added."
def process_mail_account(name):
try:
account = MailAccount.objects.get(name=name)
MailAccountHandler().handle_mail_account(account)
except MailAccount.DoesNotExist:
logger.error(f"Unknown mail acccount: {name}")

View File

@@ -0,0 +1,70 @@
import os
import pytest
from django.test import TestCase
from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailError
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
# Only run if the environment is set up
# and the host variable is not empty (as it may be in forks)
@pytest.mark.skipif(
"PAPERLESS_MAIL_TEST_HOST" not in os.environ
or not len(os.environ["PAPERLESS_MAIL_TEST_HOST"]),
reason="Live server testing not enabled",
)
class TestMailLiveServer(TestCase):
def setUp(self) -> None:
self.mail_account_handler = MailAccountHandler()
self.account = MailAccount.objects.create(
name="test",
imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"],
username=os.environ["PAPERLESS_MAIL_TEST_USER"],
password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"],
imap_port=993,
)
return super().setUp()
def tearDown(self) -> None:
self.account.delete()
return super().tearDown()
def test_process_non_gmail_server_flag(self):
try:
rule1 = MailRule.objects.create(
name="testrule",
account=self.account,
action=MailRule.MailAction.FLAG,
)
self.mail_account_handler.handle_mail_account(self.account)
rule1.delete()
except MailError as e:
self.fail(f"Failure: {e}")
except Exception as e:
pass
def test_process_non_gmail_server_tag(self):
try:
rule2 = MailRule.objects.create(
name="testrule",
account=self.account,
action=MailRule.MailAction.TAG,
)
self.mail_account_handler.handle_mail_account(self.account)
rule2.delete()
except MailError as e:
self.fail(f"Failure: {e}")
except Exception as e:
pass

View File

@@ -47,15 +47,16 @@ class BogusFolderManager:
class BogusClient:
def __init__(self, messages):
self.messages: List[MailMessage] = messages
self.capabilities: List[str] = []
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def __init__(self, messages):
self.messages: List[MailMessage] = messages
def authenticate(self, mechanism, authobject):
# authobject must be a callable object
auth_bytes = authobject(None)
@@ -80,12 +81,6 @@ class BogusMailBox(ContextManager):
# Note the non-ascii characters here
UTF_PASSWORD: str = "w57äöüw4b6huwb6nhu"
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def __init__(self):
self.messages: List[MailMessage] = []
self.messages_spam: List[MailMessage] = []
@@ -93,6 +88,12 @@ class BogusMailBox(ContextManager):
self.client = BogusClient(self.messages)
self._host = ""
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def updateClient(self):
self.client = BogusClient(self.messages)
@@ -247,7 +248,7 @@ class TestMail(DirectoriesMixin, TestCase):
m.return_value = self.bogus_mailbox
self.addCleanup(patcher.stop)
patcher = mock.patch("paperless_mail.mail.async_task")
patcher = mock.patch("paperless_mail.mail.consume_file.delay")
self.async_task = patcher.start()
self.addCleanup(patcher.stop)
@@ -648,6 +649,7 @@ class TestMail(DirectoriesMixin, TestCase):
def test_handle_mail_account_tag_gmail(self):
self.bogus_mailbox._host = "imap.gmail.com"
self.bogus_mailbox.client.capabilities = ["X-GM-EXT-1"]
account = MailAccount.objects.create(
name="test",
@@ -1030,20 +1032,3 @@ class TestTasks(TestCase):
m.side_effect = lambda account: 0
result = tasks.process_mail_accounts()
self.assertIn("No new", result)
@mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account")
def test_single_accounts(self, m):
MailAccount.objects.create(
name="A",
imap_server="A",
username="A",
password="A",
)
tasks.process_mail_account("A")
m.assert_called_once()
m.reset_mock()
tasks.process_mail_account("B")
m.assert_not_called()

View File

@@ -249,16 +249,22 @@ class RasterisedDocumentParser(DocumentParser):
if mime_type == "application/pdf":
text_original = self.extract_text(None, document_path)
original_has_text = text_original and len(text_original) > 50
original_has_text = text_original is not None and len(text_original) > 50
else:
text_original = None
original_has_text = False
# If the original has text, and the user doesn't want an archive,
# we're done here
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
# Either no text was in the original or there should be an archive
# file created, so OCR the file and create an archive with any
# text located via OCR
import ocrmypdf
from ocrmypdf import InputFileError, EncryptedPdfError
@@ -276,9 +282,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
# Only create archive file if archiving isn't being skipped
if settings.OCR_MODE != "skip_noarchive":
self.archive_path = archive_path
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)

View File

@@ -341,6 +341,17 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
def test_multi_page_analog_pages_redo(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR of only pages 1 and 2 requested
- OCR mode set to redo
WHEN:
- Document is parsed
THEN:
- Text of page 1 and 2 extracted
- An archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
@@ -352,6 +363,17 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_PAGES=1, OCR_MODE="force")
def test_multi_page_analog_pages_force(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR of only page 1 requested
- OCR mode set to force
WHEN:
- Document is parsed
THEN:
- Only text of page 1 is extracted
- An archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
@@ -395,7 +417,7 @@ class TestParser(DirectoriesMixin, TestCase):
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
- An archive file is created with the OCRd text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
@@ -408,15 +430,26 @@ class TestParser(DirectoriesMixin, TestCase):
["page 1", "page 2", "page 3"],
)
self.assertIsNone(parser.archive_path)
self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
"""
GIVEN:
- File with some text contained in images and some in text layer
- OCR mode set to skip
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- An archive file is created with the OCRd text and the original text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(
parser.get_text().lower(),
@@ -438,7 +471,7 @@ class TestParser(DirectoriesMixin, TestCase):
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
- No archive file is created as original file contains text
"""
parser = RasterisedDocumentParser(None)
parser.parse(

View File

@@ -11,5 +11,6 @@ def text_consumer_declaration(sender, **kwargs):
"mime_types": {
"text/plain": ".txt",
"text/csv": ".csv",
"application/csv": ".csv",
},
}

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,117 @@
import os
import time
from pathlib import Path
from typing import Final
import pytest
from django.test import TestCase
from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
class TestTikaParserAgainstServer(TestCase):
"""
Tests Tika parsing against a live Tika server, if the environment
contains the variable indicating such a server is available.
"""
SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve()
def setUp(self) -> None:
self.parser = TikaDocumentParser(logging_group=None)
def tearDown(self) -> None:
self.parser.cleanup()
def try_parse_with_wait(self, test_file, mime_type):
"""
For whatever reason, the Tika image started during the test pipeline
sometimes segfaults, even when run with the exact files that usually pass.
So this function retries the parsing up to 3 times, with a larger backoff
period between each attempt, in the hope the issue resolves itself during
one of the attempts.
This will wait the following:
- Attempt 1 - 20s following failure
- Attempt 2 - 40s following failure
- Attempt 3 - 80s following failure
"""
succeeded = False
retry_time = 20.0
retry_count = 0
max_retry_count = 3
while retry_count < max_retry_count and not succeeded:
try:
self.parser.parse(test_file, mime_type)
succeeded = True
except Exception as e:
print(f"{e} during try #{retry_count}", flush=True)
retry_count = retry_count + 1
time.sleep(retry_time)
retry_time = retry_time * 2.0
self.assertTrue(
succeeded,
"Continued Tika server errors after multiple retries",
)
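# The retry sleep doubles after each failure, giving the 20s, 40s and 80s
# schedule from the docstring:
#
#   [20.0 * 2**n for n in range(3)]  # -> [20.0, 40.0, 80.0]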
def test_basic_parse_odt(self):
"""
GIVEN:
- An input ODT format document
WHEN:
- The document is parsed
THEN:
- Document content is correct
- Document date is correct
"""
test_file = self.SAMPLE_DIR / Path("sample.odt")
self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text")
self.assertEqual(
self.parser.text,
"This is an ODT test document, created September 14, 2022",
)
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
# PDF files begin with the bytes %PDF-x.y
self.assertTrue(b"PDF-" in f.read()[:10])
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_docx(self):
"""
GIVEN:
- An input DOCX format document
WHEN:
- The document is parsed
THEN:
- Document content is correct
- Document date is correct
"""
test_file = self.SAMPLE_DIR / Path("sample.docx")
self.try_parse_with_wait(
test_file,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)
self.assertEqual(
self.parser.text,
"This is an DOCX test document, also made September 14, 2022",
)
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10])
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))