From d6fedbec5263e0453d3f756a8e7039a9bbe47506 Mon Sep 17 00:00:00 2001 From: Joshua Taillon Date: Thu, 30 Aug 2018 23:32:41 -0400 Subject: [PATCH 01/21] first stab at text consumer --- src/paperless/settings.py | 1 + src/paperless_text/__init__.py | 0 src/paperless_text/apps.py | 16 +++++ src/paperless_text/parsers.py | 113 +++++++++++++++++++++++++++++++++ src/paperless_text/signals.py | 23 +++++++ 5 files changed, 153 insertions(+) create mode 100644 src/paperless_text/__init__.py create mode 100644 src/paperless_text/apps.py create mode 100644 src/paperless_text/parsers.py create mode 100644 src/paperless_text/signals.py diff --git a/src/paperless/settings.py b/src/paperless/settings.py index e40af01d1..7fdcfec76 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -67,6 +67,7 @@ INSTALLED_APPS = [ "documents.apps.DocumentsConfig", "reminders.apps.RemindersConfig", "paperless_tesseract.apps.PaperlessTesseractConfig", + "paperless_text.apps.PaperlessTextConfig", "flat_responsive", # TODO: Remove as of Django 2.x "django.contrib.admin", diff --git a/src/paperless_text/__init__.py b/src/paperless_text/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/paperless_text/apps.py b/src/paperless_text/apps.py new file mode 100644 index 000000000..389167368 --- /dev/null +++ b/src/paperless_text/apps.py @@ -0,0 +1,16 @@ +from django.apps import AppConfig + + +class PaperlessTextConfig(AppConfig): + + name = "paperless_text" + + def ready(self): + + from documents.signals import document_consumer_declaration + + from .signals import ConsumerDeclaration + + document_consumer_declaration.connect(ConsumerDeclaration.handle) + + AppConfig.ready(self) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py new file mode 100644 index 000000000..9a399da71 --- /dev/null +++ b/src/paperless_text/parsers.py @@ -0,0 +1,113 @@ +import os +import re +import subprocess + +import dateparser +from django.conf import settings 
+ +from documents.parsers import DocumentParser, ParseError + + +class TextDocumentParser(DocumentParser): + """ + This parser directly parses a text document (.txt or .md) + """ + + + CONVERT = settings.CONVERT_BINARY + THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None + UNPAPER = settings.UNPAPER_BINARY + DATE_ORDER = settings.DATE_ORDER + DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE + OCR_ALWAYS = settings.OCR_ALWAYS + + def __init__(self, path): + super().__init__(path) + self._text = None + + def get_thumbnail(self): + """ + The thumbnail of a txt is just a 500px wide image of the text + rendered onto a letter-sized page. + """ + + run_convert( + self.CONVERT, + "-size", "500x647", + "xc:white", + "-pointsize", "12", + "-fill", "black", + "-draw", "\"text 0,12 \'$(cat {})\'\"".format(self.document_path), + os.path.join(self.tempdir, "convert-txt.png") + ) + + return os.path.join(self.tempdir, "convert-txt.png") + + def get_text(self): + + if self._text is not None: + return self._text + + with open(self.document_path, 'r') as f: + self._text = f.read() + + return self._text + + def get_date(self): + date = None + datestring = None + + try: + text = self.get_text() + except ParseError as e: + return None + + # This regular expression will try to find dates in the document at + # hand and will match the following formats: + # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits + # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits + # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits + # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits + # - MONTH ZZZZ, with ZZZZ being 4 digits + # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits + pattern = re.compile( + r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + + r'\b([0-9]{1,2}[\. 
]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{4})\b') + + # Iterate through all regex matches and try to parse the date + for m in re.finditer(pattern, text): + datestring = m.group(0) + + try: + date = dateparser.parse( + datestring, + settings={'DATE_ORDER': self.DATE_ORDER, + 'PREFER_DAY_OF_MONTH': 'first', + 'RETURN_AS_TIMEZONE_AWARE': True}) + except TypeError: + # Skip all matches that do not parse to a proper date + continue + + if date is not None: + break + + if date is not None: + self.log("info", "Detected document date " + date.isoformat() + + " based on string " + datestring) + else: + self.log("info", "Unable to detect date for document") + + return date + + +def run_convert(*args): + environment = os.environ.copy() + if settings.CONVERT_MEMORY_LIMIT: + environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT + if settings.CONVERT_TMPDIR: + environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR + + if not subprocess.Popen(args, env=environment).wait() == 0: + raise ParseError("Convert failed at {}".format(args)) \ No newline at end of file diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py new file mode 100644 index 000000000..2f7e5465f --- /dev/null +++ b/src/paperless_text/signals.py @@ -0,0 +1,23 @@ +import re + +from .parsers import TextDocumentParser + + +class ConsumerDeclaration: + + MATCHING_FILES = re.compile("^.*\.(txt|md)$") + + @classmethod + def handle(cls, sender, **kwargs): + return cls.test + + @classmethod + def test(cls, doc): + + if cls.MATCHING_FILES.match(doc.lower()): + return { + "parser": TextDocumentParser, + "weight": 10 + } + + return None From 4849249d8659fc74bbfa30bf32fc9680c990f2c9 Mon Sep 17 00:00:00 2001 From: Joshua Taillon Date: Mon, 3 Sep 2018 23:46:13 -0400 Subject: [PATCH 02/21] explicitly add txt, md, and csv types for consumer and viewer; fix thumbnail generation --- src/documents/models.py | 21 ++++++++----- 
src/documents/views.py | 3 ++ src/paperless_text/parsers.py | 59 +++++++++++++++++++++++++++-------- src/paperless_text/signals.py | 2 +- 4 files changed, 63 insertions(+), 22 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index 7390c1d3c..b97eebc72 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -188,7 +188,11 @@ class Document(models.Model): TYPE_JPG = "jpg" TYPE_GIF = "gif" TYPE_TIF = "tiff" - TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) + TYPE_TXT = "txt" + TYPE_CSV = "csv" + TYPE_MD = "md" + TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF, + TYPE_TXT, TYPE_CSV, TYPE_MD) STORAGE_TYPE_UNENCRYPTED = "unencrypted" STORAGE_TYPE_GPG = "gpg" @@ -361,51 +365,52 @@ class FileInfo: ) ) + formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv" REGEXES = OrderedDict([ ("created-correspondent-title-tags", re.compile( r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P.*) - " r"(?P.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("created-title-tags", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("created-correspondent-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<correspondent>.*) - " r"(?P<title>.*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("created-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("correspondent-title-tags", re.compile( r"(?P<correspondent>.*) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + 
r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("correspondent-title", re.compile( r"(?P<correspondent>.*) - " r"(?P<title>.*)?" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("title", re.compile( r"(?P<title>.*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )) ]) diff --git a/src/documents/views.py b/src/documents/views.py index e297e0984..d7e900147 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -48,6 +48,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView): Document.TYPE_JPG: "image/jpeg", Document.TYPE_GIF: "image/gif", Document.TYPE_TIF: "image/tiff", + Document.TYPE_CSV: "text/csv", + Document.TYPE_MD: "text/markdown", + Document.TYPE_TXT: "text/plain" } if self.kwargs["kind"] == "thumb": diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 9a399da71..77f7a4118 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -10,7 +10,7 @@ from documents.parsers import DocumentParser, ParseError class TextDocumentParser(DocumentParser): """ - This parser directly parses a text document (.txt or .md) + This parser directly parses a text document (.txt, .md, or .csv) """ @@ -30,18 +30,50 @@ class TextDocumentParser(DocumentParser): The thumbnail of a txt is just a 500px wide image of the text rendered onto a letter-sized page. 
""" + # The below is heavily cribbed from https://askubuntu.com/a/590951 - run_convert( - self.CONVERT, - "-size", "500x647", - "xc:white", - "-pointsize", "12", - "-fill", "black", - "-draw", "\"text 0,12 \'$(cat {})\'\"".format(self.document_path), - os.path.join(self.tempdir, "convert-txt.png") - ) + bg_color = "white" # bg color + text_color = "black" # text color + psize = [500, 647] # icon size + n_lines = 50 # number of lines to show + output_file = os.path.join(self.tempdir, "convert-txt.png") - return os.path.join(self.tempdir, "convert-txt.png") + temp_bg = os.path.join(self.tempdir, "bg.png") + temp_txlayer = os.path.join(self.tempdir, "tx.png") + picsize = "x".join([str(n) for n in psize]) + txsize = "x".join([str(n - 8) for n in psize]) + + def create_bg(): + work_size = ",".join([str(n - 1) for n in psize]) + r = str(round(psize[0] / 10)); + rounded = ",".join([r, r]) + run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ', + '"fill ', bg_color, ' roundrectangle 0,0,', + work_size, ",", rounded, '" ', temp_bg) + + def read_text(): + with open(self.document_path, 'r') as src: + lines = [l.strip() for l in src.readlines()] + text = "\n".join([l for l in lines[:n_lines]]) + return text.replace('"', "'") + + def create_txlayer(): + run_command(self.CONVERT, + "-background none", + "-fill", + text_color, + "-pointsize", "12", + "-border 4 -bordercolor none", + "-size ", txsize, + ' caption:"', read_text(), '" ', + temp_txlayer) + + create_txlayer() + create_bg() + run_command(self.CONVERT, temp_bg, temp_txlayer, + "-background None -layers merge ", output_file) + + return output_file def get_text(self): @@ -102,12 +134,13 @@ class TextDocumentParser(DocumentParser): return date -def run_convert(*args): +def run_command(*args): environment = os.environ.copy() if settings.CONVERT_MEMORY_LIMIT: environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT if settings.CONVERT_TMPDIR: environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR - if not 
subprocess.Popen(args, env=environment).wait() == 0: + if not subprocess.Popen(' '.join(args), env=environment, + shell=True).wait() == 0: raise ParseError("Convert failed at {}".format(args)) \ No newline at end of file diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py index 2f7e5465f..598641e19 100644 --- a/src/paperless_text/signals.py +++ b/src/paperless_text/signals.py @@ -5,7 +5,7 @@ from .parsers import TextDocumentParser class ConsumerDeclaration: - MATCHING_FILES = re.compile("^.*\.(txt|md)$") + MATCHING_FILES = re.compile("^.*\.(te?xt|md|csv)$") @classmethod def handle(cls, sender, **kwargs): From 2308d5a613343699ca19ebdc9ac9aa0224910f2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Niederpr=C3=BCm?= <niederp@physik.uni-kl.de> Date: Tue, 4 Sep 2018 16:16:32 +0200 Subject: [PATCH 03/21] Catch ProgrammingError in Document checks. When running PostgreSQL or MariaDB/MySQL backends, a query to a non-existent table will raise a "ProgrammingError". This patch properly catches this error. Without this patch all management calls to manage.py will lead to an error when running PostgreSQL or MariaDB as a backend. 
--- src/documents/checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documents/checks.py b/src/documents/checks.py index c80b63863..3310b1806 100644 --- a/src/documents/checks.py +++ b/src/documents/checks.py @@ -2,7 +2,7 @@ import textwrap from django.conf import settings from django.core.checks import Error, register -from django.db.utils import OperationalError +from django.db.utils import OperationalError, ProgrammingError @register() @@ -14,7 +14,7 @@ def changed_password_check(app_configs, **kwargs): try: encrypted_doc = Document.objects.filter( storage_type=Document.STORAGE_TYPE_GPG).first() - except OperationalError: + except (OperationalError, ProgrammingError): return [] # No documents table yet if encrypted_doc: From 6b447628ed0565d0855e7f27ca3137f0094979e0 Mon Sep 17 00:00:00 2001 From: David Martin <david_martin@fastmail.com> Date: Wed, 5 Sep 2018 13:03:42 +1000 Subject: [PATCH 04/21] Bump required version for Pyocr to support the latest tesseract 4. This recently changed in the official tesseract engine [0]. -psm is not allowed as an option anymore and --psm has to be used instead. The latest pyocr enables support for this [1]. 
[0] tesseract-ocr/tesseract@ee201e1 [1] https://gitlab.gnome.org/World/OpenPaperwork/pyocr/commit/5abd0a566a0518bea00cb4247c16e67d0d3c2d65 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 247d9993a..0476efef1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,7 +29,7 @@ pillow==5.2.0 pluggy==0.7.1; python_version != '3.1.*' py==1.6.0; python_version != '3.1.*' pycodestyle==2.4.0 -pyocr==0.5.2 +pyocr==0.5.3 pytest-cov==2.5.1 pytest-django==3.4.2 pytest-env==0.6.2 From 01a358d2b07bcc45c8a620a44edd13972ec54950 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Wed, 5 Sep 2018 10:58:41 +0100 Subject: [PATCH 05/21] Re-flow text to keep it <80c wide --- paperless.conf.example | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paperless.conf.example b/paperless.conf.example index 8aa33216f..d47e4d453 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -89,9 +89,10 @@ PAPERLESS_EMAIL_SECRET="" # as is "example.com,www.example.com", but NOT " example.com" or "example.com," #PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com" -# If you decide to use Paperless APIs in an ajax calls, you need to add your -# servers to the allowed hosts that can do CORS calls. By default Paperless allows -# calls from localhost:8080. The same rules as above how the list should look like. +# If you decide to use the Paperless API in an ajax call, you need to add your +# servers to the list of allowed hosts that can do CORS calls. By default +# Paperless allows calls from localhost:8080, but if you'd like to change that, +# you can set this value to a comma-separated list.
#PAPERLESS_CORS_ALLOWED_HOSTS="localhost:8080,example.com,localhost:8000" # To host paperless under a subpath url like example.com/paperless you set From fbc6a58f5a4460d3d2d2a7accae32fa6245dee10 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Wed, 5 Sep 2018 10:59:06 +0100 Subject: [PATCH 06/21] Add credits for 2.2.0 that I forgot --- docs/changelog.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/changelog.rst b/docs/changelog.rst index f80445dde..4d78532f5 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -19,6 +19,10 @@ Changelog easier on those of us with lots of different tags: `#391`_. * `Kilian Koeltzsch`_ noticed a bug in how we capture & automatically create tags, so that's fixed now too: `#384`_. +* `erikarvstedt`_ tweaked the behaviour of the test suite to be better behaved + for packaging environments: `#383`_. +* `Lukasz Soluch`_ added CORS support to make building a new Javascript-based front-end + cleaner & easier: `#387`_. 2.1.0 @@ -476,6 +480,7 @@ bulk of the work on this big change. .. _Tim Brooks: https://github.com/brookst .. _Stéphane Brunner: https://github.com/sbrunner .. _Kilian Koeltzsch: https://github.com/kiliankoe +.. _Lukasz Soluch: https://github.com/LukaszSolo .. _#20: https://github.com/danielquinn/paperless/issues/20 .. _#44: https://github.com/danielquinn/paperless/issues/44 @@ -550,8 +555,10 @@ bulk of the work on this big change. .. _#374: https://github.com/danielquinn/paperless/pull/374 .. _#375: https://github.com/danielquinn/paperless/pull/375 .. _#376: https://github.com/danielquinn/paperless/pull/376 +.. _#383: https://github.com/danielquinn/paperless/pull/383 .. _#384: https://github.com/danielquinn/paperless/issues/384 .. _#386: https://github.com/danielquinn/paperless/issues/386 +.. _#387: https://github.com/danielquinn/paperless/pull/387 .. _#391: https://github.com/danielquinn/paperless/pull/391 .. _#390: https://github.com/danielquinn/paperless/pull/390 .. 
_#392: https://github.com/danielquinn/paperless/issues/392 From 939a67bd4b8d35c7db7fb219e6687d8afb6fe9b5 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Wed, 5 Sep 2018 11:16:42 +0100 Subject: [PATCH 07/21] Add empty requirements for rtd to reference --- docs/requirements.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/requirements.txt diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..e69de29bb From cac63494f04b8ab6278db2f4ce862e11efde2712 Mon Sep 17 00:00:00 2001 From: Joshua Taillon <jat255@gmail.com> Date: Wed, 5 Sep 2018 15:18:35 -0400 Subject: [PATCH 08/21] change tesseract parser to only convert first page to save (potentially) massive amounts of work --- src/paperless_tesseract/parsers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index add65985a..4216ec230 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -50,10 +50,11 @@ class RasterisedDocumentParser(DocumentParser): self.CONVERT, "-scale", "500x5000", "-alpha", "remove", - self.document_path, os.path.join(self.tempdir, "convert-%04d.png") + "{}[0]".format(self.document_path), + os.path.join(self.tempdir, "convert.png") ) - return os.path.join(self.tempdir, "convert-0000.png") + return os.path.join(self.tempdir, "convert.png") def _is_ocred(self): From 72c828170e2dfea2c7be90ad79e6fa415b39c4cd Mon Sep 17 00:00:00 2001 From: Joshua Taillon <jat255@gmail.com> Date: Wed, 5 Sep 2018 21:13:36 -0400 Subject: [PATCH 09/21] move date-matching regex pattern to base parser module for use by all subclasses --- src/documents/parsers.py | 15 +++++++++++++++ src/paperless_tesseract/parsers.py | 16 +--------------- src/paperless_text/parsers.py | 17 +---------------- 3 files changed, 17 insertions(+), 31 deletions(-) diff --git a/src/documents/parsers.py 
b/src/documents/parsers.py index c44e4c5bf..c28b31a6b 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -1,9 +1,24 @@ import logging import shutil import tempfile +import re from django.conf import settings +# This regular expression will try to find dates in the document at +# hand and will match the following formats: +# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits +# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits +# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits +# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits +# - MONTH ZZZZ, with ZZZZ being 4 digits +# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits +pattern = re.compile( + r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + + r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{4})\b') + class ParseError(Exception): pass diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index add65985a..f0690e5bc 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \ from pyocr.tesseract import TesseractError import pdftotext -from documents.parsers import DocumentParser, ParseError +from documents.parsers import DocumentParser, ParseError, pattern from .languages import ISO639 @@ -210,20 +210,6 @@ class RasterisedDocumentParser(DocumentParser): except ParseError as e: return None - # This regular expression will try to find dates in the document at - # hand and will match the following formats: - # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX. 
MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits - # - MONTH ZZZZ, with ZZZZ being 4 digits - # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits - pattern = re.compile( - r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + - r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{4})\b') - # Iterate through all regex matches and try to parse the date for m in re.finditer(pattern, text): datestring = m.group(0) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 77f7a4118..50c341769 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -5,7 +5,7 @@ import subprocess import dateparser from django.conf import settings -from documents.parsers import DocumentParser, ParseError +from documents.parsers import DocumentParser, ParseError, pattern class TextDocumentParser(DocumentParser): @@ -13,7 +13,6 @@ class TextDocumentParser(DocumentParser): This parser directly parses a text document (.txt, .md, or .csv) """ - CONVERT = settings.CONVERT_BINARY THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None UNPAPER = settings.UNPAPER_BINARY @@ -94,20 +93,6 @@ class TextDocumentParser(DocumentParser): except ParseError as e: return None - # This regular expression will try to find dates in the document at - # hand and will match the following formats: - # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits - # - MONTH ZZZZ, with ZZZZ being 4 digits - # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits - pattern = re.compile( - r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + - r'\b([0-9]{1,2}[\. 
]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{4})\b') - # Iterate through all regex matches and try to parse the date for m in re.finditer(pattern, text): datestring = m.group(0) From 22378789e2a112bf71d499236a5c1da11fc7b0e7 Mon Sep 17 00:00:00 2001 From: Joshua Taillon <jat255@gmail.com> Date: Wed, 5 Sep 2018 22:58:38 -0400 Subject: [PATCH 10/21] add option for inline vs. attachment for document rendering --- paperless.conf.example | 4 ++++ src/documents/views.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/paperless.conf.example b/paperless.conf.example index d47e4d453..996b816f8 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -117,6 +117,10 @@ PAPERLESS_EMAIL_SECRET="" # http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process #PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh" +# By default, when clicking on a document within the web interface, the +# browser will prompt the user to save the document to disk. By uncommenting +# the below, the document will instead be opened in the browser, if possible. 
+#PAPERLESS_INLINE_DOC="true" # # The following values use sensible defaults for modern systems, but if you're diff --git a/src/documents/views.py b/src/documents/views.py index e297e0984..4f71deb56 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,6 +1,8 @@ from django.http import HttpResponse, HttpResponseBadRequest from django.views.generic import DetailView, FormView, TemplateView from django_filters.rest_framework import DjangoFilterBackend +from django.conf import settings + from paperless.db import GnuPG from paperless.mixins import SessionOrBasicAuthMixin from paperless.views import StandardPagination @@ -60,8 +62,12 @@ class FetchView(SessionOrBasicAuthMixin, DetailView): self._get_raw_data(self.object.source_file), content_type=content_types[self.object.file_type] ) - response["Content-Disposition"] = 'attachment; filename="{}"'.format( - self.object.file_name) + + print("OPEN_DOCUMENT", settings.INLINE_DOC) + DISPOSITION = 'inline' if settings.INLINE_DOC else 'attachment' + + response["Content-Disposition"] = '{}; filename="{}"'.format( + DISPOSITION, self.object.file_name) return response From be9757894a385963479e9a85abb35697b6497c44 Mon Sep 17 00:00:00 2001 From: Joshua Taillon <jat255@gmail.com> Date: Wed, 5 Sep 2018 23:03:30 -0400 Subject: [PATCH 11/21] add INLINE_DOC to settings.py --- src/paperless/settings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 06cc1807f..56360d5d5 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -270,6 +270,9 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE") PRE_CONSUME_SCRIPT = os.getenv("PAPERLESS_PRE_CONSUME_SCRIPT") POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT") +# Whether to display a selected document inline, or download it as attachment: +INLINE_DOC = os.getenv("PAPERLESS_INLINE_DOC") + # The number of items on each page in the web UI. 
This value must be a # positive integer, but if you don't define one in paperless.conf, a default of # 100 will be used. From 652ead2f5ccfbeb15c5d743c2e6159b3d07fbd78 Mon Sep 17 00:00:00 2001 From: Joshua Taillon <jat255@gmail.com> Date: Wed, 5 Sep 2018 23:05:37 -0400 Subject: [PATCH 12/21] remove debugging print statement --- src/documents/views.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/documents/views.py b/src/documents/views.py index 4f71deb56..6e4eeafd2 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -63,7 +63,6 @@ class FetchView(SessionOrBasicAuthMixin, DetailView): content_type=content_types[self.object.file_type] ) - print("OPEN_DOCUMENT", settings.INLINE_DOC) DISPOSITION = 'inline' if settings.INLINE_DOC else 'attachment' response["Content-Disposition"] = '{}; filename="{}"'.format( From ed0e40d3e626fb0ed1a30801c3519475f5fe3f3e Mon Sep 17 00:00:00 2001 From: ahyear <kevindelouya@gmail.com> Date: Thu, 6 Sep 2018 15:32:41 +0200 Subject: [PATCH 13/21] add migrate commande to docker update process --- docs/migrating.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/migrating.rst b/docs/migrating.rst index d97d3d4bf..45646f058 100644 --- a/docs/migrating.rst +++ b/docs/migrating.rst @@ -101,6 +101,7 @@ is similar: $ cd /path/to/project $ git pull $ docker build -t paperless . 
+ $ docker-compose run --rm consumer migrate $ docker-compose up -d If ``git pull`` doesn't report any changes, there is no need to continue with From 5c39fff51b7394dcb7a735677c116ecb2dfaaaf5 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 9 Sep 2018 19:59:47 +0100 Subject: [PATCH 14/21] Add tox to dev dependencies --- Pipfile | 2 ++ Pipfile.lock | 83 +++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 61 insertions(+), 24 deletions(-) diff --git a/Pipfile b/Pipfile index a9331f134..b1c30698d 100644 --- a/Pipfile +++ b/Pipfile @@ -36,3 +36,5 @@ pytest-xdist = "*" [dev-packages] ipython = "*" sphinx = "*" +tox = "*" + diff --git a/Pipfile.lock b/Pipfile.lock index 614ee0e78..71a46d37f 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e20c2294bcafd346ee57901df94a515a12976ed192dc37df848b39b56bdd1f4b" + "sha256": "6d8bad24aa5d0c102b13b5ae27acba04836cd5a07a4003cb2763de1e0a3406b7" }, "pipfile-spec": 6, "requires": {}, @@ -19,7 +19,7 @@ "sha256:37228cda29411948b422fae072f57e31d3396d2ee1c9783775980ee9c9990af6", "sha256:58587dd4dc3daefad0487f6d9ae32b4542b185e1c36db6993290e7c41ca2b47c" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==1.5" }, "atomicwrites": { @@ -27,7 +27,7 @@ "sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0", "sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version
!= '3.1.*' and python_version != '3.3.*'", "version": "==1.2.1" }, "attrs": { @@ -85,7 +85,7 @@ "sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6", "sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.2.*' and python_version < '4' and python_version != '3.1.*'", + "markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4'", "version": "==4.5.1" }, "coveralls": { @@ -163,7 +163,7 @@ "sha256:a7a84d5fa07a089186a329528f127c9d73b9de57f1a1131b82bb5320ee651f6a", "sha256:fc155a6b553c66c838d1a22dba1dc9f5f505c43285a878c6f74a79c024750b83" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==1.5.0" }, "factory-boy": { @@ -179,6 +179,7 @@ "sha256:ea7cfd3aeb1544732d08bd9cfba40c5b78e3a91e17b1a0698ab81bfc5554c628", "sha256:f6d67f04abfb2b4bea7afc7fa6c18cf4c523a67956e455668be9ae42bccc21ad" ], + "markers": "python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.2.*' and python_version >= '2.7'", "version": "==0.9.0" }, "filemagic": { @@ -282,7 +283,7 @@ "sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1", "sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==0.7.1" }, "py": { @@ 
-290,7 +291,7 @@ "sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1", "sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==1.6.0" }, "pycodestyle": { @@ -303,26 +304,26 @@ }, "pyocr": { "hashes": [ - "sha256:bdc4d43bf9b63c2a9a4b2c9a1a623a0e63c8e6600eede5dbe866b31f3a5f2207" + "sha256:b6ba6263fd92da56627dff6d263d991a2246aacd117d1788f11b93f419ca395f" ], "index": "pypi", - "version": "==0.5.2" + "version": "==0.5.3" }, "pytest": { "hashes": [ - "sha256:2d7c49e931316cc7d1638a3e5f54f5d7b4e5225972b3c9838f3584788d27f349", - "sha256:ad0c7db7b5d4081631e0155f5c61b80ad76ce148551aaafe3a718d65a7508b18" + "sha256:453cbbbe5ce6db38717d282b758b917de84802af4288910c12442984bde7b823", + "sha256:a8a07f84e680482eb51e244370aaf2caa6301ef265f37c2bdefb3dd3b663f99d" ], "index": "pypi", - "version": "==3.7.4" + "version": "==3.8.0" }, "pytest-cov": { "hashes": [ - "sha256:03aa752cf11db41d281ea1d807d954c4eda35cfa1b21d6971966cc041bbf6e2d", - "sha256:890fe5565400902b0c78b5357004aab1c814115894f4f21370e2433256a3eeec" + "sha256:513c425e931a0344944f84ea47f3956be0e416d95acbd897a44970c8d926d5d7", + "sha256:e360f048b7dae3f2f2a9a4d067b2dd6b6a015d384d1577c994a43f3f7cbad762" ], "index": "pypi", - "version": "==2.5.1" + "version": "==2.6.0" }, "pytest-django": { "hashes": [ @@ -344,6 +345,7 @@ "sha256:e4500cd0509ec4a26535f7d4112a8cc0f17d3a41c29ffd4eab479d2a55b30805", "sha256:f275cb48a73fc61a6710726348e1da6d68a978f0ec0c54ece5a5fae5977e5a08" ], + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==0.2" }, 
"pytest-sugar": { @@ -457,7 +459,7 @@ "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" ], - "markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'", + "markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'", "version": "==1.23" } }, @@ -521,10 +523,11 @@ }, "imagesize": { "hashes": [ - "sha256:3620cc0cadba3f7475f9940d22431fc4d407269f1be59ec9b8edcca26440cf18", - "sha256:5b326e4678b6925158ccc66a9fa3122b6106d7c876ee32d7de6ce59385b96315" + "sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8", + "sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5" ], - "version": "==1.0.0" + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", + "version": "==1.1.0" }, "ipython": { "hashes": [ @@ -590,6 +593,14 @@ ], "version": "==0.7.4" }, + "pluggy": { + "hashes": [ + "sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1", + "sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1" + ], + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", + "version": "==0.7.1" + }, "prompt-toolkit": { "hashes": [ "sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381", @@ -605,6 +616,14 @@ ], "version": "==0.6.0" }, + "py": { + "hashes": [ + "sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1", + "sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6" + ], + "markers": "python_version != '3.2.*' and python_version 
>= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", + "version": "==1.6.0" + }, "pygments": { "hashes": [ "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", @@ -656,20 +675,28 @@ }, "sphinx": { "hashes": [ - "sha256:a07050845cc9a2f4026a6035cc8ed795a5ce7be6528bbc82032385c10807dfe7", - "sha256:d719de667218d763e8fd144b7fcfeefd8d434a6201f76bf9f0f0c1fa6f47fcdb" + "sha256:217a7705adcb573da5bbe1e0f5cab4fa0bd89fd9342c9159121746f593c2d5a4", + "sha256:a602513f385f1d5785ff1ca420d9c7eb1a1b63381733b2f0ea8188a391314a86" ], "index": "pypi", - "version": "==1.7.8" + "version": "==1.7.9" }, "sphinxcontrib-websupport": { "hashes": [ "sha256:68ca7ff70785cbe1e7bccc71a48b5b6d965d79ca50629606c7861a21b206d9dd", "sha256:9de47f375baf1ea07cdb3436ff39d7a9c76042c10a769c52353ec46e4e8fc3b9" ], - "markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==1.1.0" }, + "tox": { + "hashes": [ + "sha256:37cf240781b662fb790710c6998527e65ca6851eace84d1595ee71f7af4e85f7", + "sha256:eb61aa5bcce65325538686f09848f04ef679b5cd9b83cc491272099b28739600" + ], + "index": "pypi", + "version": "==3.2.1" + }, "traitlets": { "hashes": [ "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", @@ -682,9 +709,17 @@ "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" ], - "markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'", + "markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version 
!= '3.1.*' and python_version < '4' and python_version != '3.3.*'", "version": "==1.23" }, + "virtualenv": { + "hashes": [ + "sha256:2ce32cd126117ce2c539f0134eb89de91a8413a29baac49cbab3eb50e2026669", + "sha256:ca07b4c0b54e14a91af9f34d0919790b016923d157afda5efdde55c96718f752" + ], + "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "version": "==16.0.0" + }, "wcwidth": { "hashes": [ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", From 5342db6adab8cdb92bca76eac38960a7aaaa0613 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 9 Sep 2018 20:00:12 +0100 Subject: [PATCH 15/21] Fix pycodestyle complaints Apparently, pycodestyle updated itself to now check for invalid escape sequences, which only complain if the regex in use isn't a raw string (r""). --- src/documents/models.py | 2 +- src/documents/tests/test_matchables.py | 2 +- src/paperless_tesseract/parsers.py | 5 +++-- src/paperless_tesseract/signals.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index 36466bbac..9a8e6003d 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -135,7 +135,7 @@ class MatchingModel(models.Model): Example: ' some random words "with quotes " and spaces' ==> - ["some", "random", "words", "with\s+quotes", "and", "spaces"] + ["some", "random", "words", "with+quotes", "and", "spaces"] """ findterms = re.compile(r'"([^"]+)"|(\S+)').findall normspace = re.compile(r"\s+").sub diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index 55d25598a..e592237b6 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -166,7 +166,7 @@ class TestMatching(TestCase): def test_match_regex(self): self._test_matching( - "alpha\w+gamma", + r"alpha\w+gamma", "MATCH_REGEX", ( "I have alpha_and_gamma in me", 
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index add65985a..bd1ce8ffb 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -272,8 +272,9 @@ def run_unpaper(args): def strip_excess_whitespace(text): collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) no_leading_whitespace = re.sub( - "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) - no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) + r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) + no_trailing_whitespace = re.sub( + r"([^\S\n\r]+)$", '', no_leading_whitespace) return no_trailing_whitespace diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py index 2fa54f5d5..237f15c52 100644 --- a/src/paperless_tesseract/signals.py +++ b/src/paperless_tesseract/signals.py @@ -5,7 +5,7 @@ from .parsers import RasterisedDocumentParser class ConsumerDeclaration: - MATCHING_FILES = re.compile("^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") + MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") @classmethod def handle(cls, sender, **kwargs): From ef302abed7ac064b97754a67cf794cd8d59f4def Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 9 Sep 2018 20:55:37 +0100 Subject: [PATCH 16/21] Fix pycodestyle complaints --- src/documents/models.py | 2 +- src/paperless_text/parsers.py | 4 ++-- src/paperless_text/signals.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index bf53cd857..0fffcc87e 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -194,7 +194,7 @@ class Document(models.Model): TYPE_TIF = "tiff" TYPE_TXT = "txt" TYPE_CSV = "csv" - TYPE_MD = "md" + TYPE_MD = "md" TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF, TYPE_TXT, TYPE_CSV, TYPE_MD) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 50c341769..1b97d0ea1 100644 --- 
a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -44,7 +44,7 @@ class TextDocumentParser(DocumentParser): def create_bg(): work_size = ",".join([str(n - 1) for n in psize]) - r = str(round(psize[0] / 10)); + r = str(round(psize[0] / 10)) rounded = ",".join([r, r]) run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ', '"fill ', bg_color, ' roundrectangle 0,0,', @@ -128,4 +128,4 @@ def run_command(*args): if not subprocess.Popen(' '.join(args), env=environment, shell=True).wait() == 0: - raise ParseError("Convert failed at {}".format(args)) \ No newline at end of file + raise ParseError("Convert failed at {}".format(args)) diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py index 598641e19..ae5a005e1 100644 --- a/src/paperless_text/signals.py +++ b/src/paperless_text/signals.py @@ -5,7 +5,7 @@ from .parsers import TextDocumentParser class ConsumerDeclaration: - MATCHING_FILES = re.compile("^.*\.(te?xt|md|csv)$") + MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$") @classmethod def handle(cls, sender, **kwargs): From c99f5923d5a529dc59e5ce15b69f4d35245f9079 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 9 Sep 2018 21:02:30 +0100 Subject: [PATCH 17/21] Rename `parsers` to `DATE_REGEX` In moving the `parsers` variable into the package-level, it lost the context, so a more descriptive name was needed. --- src/documents/parsers.py | 5 +++-- src/paperless_tesseract/parsers.py | 4 ++-- src/paperless_text/parsers.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index c28b31a6b..884f91ae4 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -13,11 +13,12 @@ from django.conf import settings # - XX. 
MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits -pattern = re.compile( +DATE_REGEX = re.compile( r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{4})\b') + r'\b([^\W\d_]{3,9} [0-9]{4})\b' +) class ParseError(Exception): diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 1ecf36906..1aa4513cb 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \ from pyocr.tesseract import TesseractError import pdftotext -from documents.parsers import DocumentParser, ParseError, pattern +from documents.parsers import DocumentParser, ParseError, DATE_REGEX from .languages import ISO639 @@ -211,7 +211,7 @@ class RasterisedDocumentParser(DocumentParser): return None # Iterate through all regex matches and try to parse the date - for m in re.finditer(pattern, text): + for m in re.finditer(DATE_REGEX, text): datestring = m.group(0) try: diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 1b97d0ea1..f02ba3ef8 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -5,7 +5,7 @@ import subprocess import dateparser from django.conf import settings -from documents.parsers import DocumentParser, ParseError, pattern +from documents.parsers import DocumentParser, ParseError, DATE_REGEX class TextDocumentParser(DocumentParser): @@ -94,7 +94,7 @@ class TextDocumentParser(DocumentParser): return None # Iterate through all regex matches and try to parse the date - for m in re.finditer(pattern, text): + for m in re.finditer(DATE_REGEX, text): datestring = m.group(0) try: From 0472fe4e9e07c39237046711a3d0a277ff0164af Mon Sep 17 00:00:00 2001 
From: Daniel Quinn <code@danielquinn.org> Date: Sun, 9 Sep 2018 21:03:37 +0100 Subject: [PATCH 18/21] Reorder imports --- src/documents/models.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index 0fffcc87e..c66bb5b0f 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,24 +1,24 @@ # coding=utf-8 -import dateutil.parser import logging import os import re import uuid - from collections import OrderedDict + +import dateutil.parser +from django.conf import settings +from django.db import models +from django.template.defaultfilters import slugify +from django.utils import timezone from fuzzywuzzy import fuzz -from django.conf import settings +from .managers import LogManager + try: from django.core.urlresolvers import reverse except ImportError: from django.urls import reverse -from django.db import models -from django.template.defaultfilters import slugify -from django.utils import timezone - -from .managers import LogManager class MatchingModel(models.Model): From a86a20ef0f50184aad724ba6bddfd426faa27b65 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 9 Sep 2018 21:16:53 +0100 Subject: [PATCH 19/21] Make the example file contain the default value --- paperless.conf.example | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paperless.conf.example b/paperless.conf.example index 996b816f8..15498a26a 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -118,9 +118,9 @@ PAPERLESS_EMAIL_SECRET="" #PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh" # By default, when clicking on a document within the web interface, the -# browser will prompt the user to save the document to disk. By uncommenting -# the below, the document will instead be opened in the browser, if possible. -#PAPERLESS_INLINE_DOC="true" +# browser will prompt the user to save the document to disk. 
By setting this to +# "true", the document will instead be opened in the browser, if possible. +#PAPERLESS_INLINE_DOC="false" # # The following values use sensible defaults for modern systems, but if you're From 7cef1087851de36a4a1c156224257f544d0f62d1 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 9 Sep 2018 21:22:07 +0100 Subject: [PATCH 20/21] Streamline how we handle boolean values in settings.py --- src/paperless/settings.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 280d737e9..956b90a7f 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -22,6 +22,14 @@ elif os.path.exists("/usr/local/etc/paperless.conf"): load_dotenv("/usr/local/etc/paperless.conf") +def __get_boolean(key): + """ + Return a boolean value based on whatever the user has supplied in the + environment based on whether the value "looks like" it's True or not. + """ + return bool(os.getenv(key, "NO").lower() in ("yes", "y", "1", "t", "true")) + + # Build paths inside the project like this: os.path.join(BASE_DIR, ...) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -222,12 +230,12 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") # OCR all documents? -OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true")) # NOQA +OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS") # If this is true, any failed attempts to OCR a PDF will result in the PDF # being indexed anyway, with whatever we could get. If it's False, the file # will simply be left in the CONSUMPTION_DIR. 
-FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true")) # NOQA +FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR") # GNUPG needs a home directory for some reason GNUPG_HOME = os.getenv("HOME", "/tmp") @@ -272,7 +280,7 @@ PRE_CONSUME_SCRIPT = os.getenv("PAPERLESS_PRE_CONSUME_SCRIPT") POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT") # Whether to display a selected document inline, or download it as attachment: -INLINE_DOC = os.getenv("PAPERLESS_INLINE_DOC") +INLINE_DOC = __get_boolean("PAPERLESS_INLINE_DOC") # The number of items on each page in the web UI. This value must be a # positive integer, but if you don't define one in paperless.conf, a default of From 2edf65dd1e439ae99af2f1e00762c10229fbf1ef Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 9 Sep 2018 21:51:44 +0100 Subject: [PATCH 21/21] Bump to 2.3.0 --- docs/changelog.rst | 25 +++++++++++++++++++++++++ src/paperless/version.py | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 4d78532f5..804447855 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,23 @@ Changelog ######### +2.3.0 +===== + +* Support for consuming plain text & markdown documents was added by + `Joshua Taillon`_! This was a long-requested feature, and its addition is + likely to be greatly appreciated by the community: `#395`_. Thanks also to + `David Martin`_ for his assistance on the issue. +* `dubit0`_ found & fixed a bug that prevented management commands from running + before we had an operational database: `#396`_ +* Joshua also added a simple update to the thumbnail generation process to + improve performance: `#399`_ +* As his last bit of effort on this release, Joshua also added some code to + allow you to view the documents inline rather than download them as an + attachment.
`#400`_ +* Finally, `ahyear`_ found a slip in the Docker documentation and patched it. `#401`_ + + 2.2.1 ===== @@ -481,6 +498,9 @@ bulk of the work on this big change. .. _Stéphane Brunner: https://github.com/sbrunner .. _Kilian Koeltzsch: https://github.com/kiliankoe .. _Lukasz Soluch: https://github.com/LukaszSolo +.. _Joshua Taillon: https://github.com/jat255 +.. _dubit0: https://github.com/dubit0 +.. _ahyear: https://github.com/ahyear .. _#20: https://github.com/danielquinn/paperless/issues/20 .. _#44: https://github.com/danielquinn/paperless/issues/44 @@ -562,6 +582,11 @@ bulk of the work on this big change. .. _#391: https://github.com/danielquinn/paperless/pull/391 .. _#390: https://github.com/danielquinn/paperless/pull/390 .. _#392: https://github.com/danielquinn/paperless/issues/392 +.. _#395: https://github.com/danielquinn/paperless/pull/395 +.. _#396: https://github.com/danielquinn/paperless/pull/396 +.. _#399: https://github.com/danielquinn/paperless/pull/399 +.. _#400: https://github.com/danielquinn/paperless/pull/400 +.. _#401: https://github.com/danielquinn/paperless/pull/401 .. _pipenv: https://docs.pipenv.org/ .. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/ diff --git a/src/paperless/version.py b/src/paperless/version.py index 0fbece706..c1b36d9c1 100644 --- a/src/paperless/version.py +++ b/src/paperless/version.py @@ -1 +1 @@ -__version__ = (2, 2, 1) +__version__ = (2, 3, 0)