From d6fedbec5263e0453d3f756a8e7039a9bbe47506 Mon Sep 17 00:00:00 2001 From: Joshua Taillon <jat255@gmail.com> Date: Thu, 30 Aug 2018 23:32:41 -0400 Subject: [PATCH 1/3] first stab at text consumer --- src/paperless/settings.py | 1 + src/paperless_text/__init__.py | 0 src/paperless_text/apps.py | 16 +++++ src/paperless_text/parsers.py | 113 +++++++++++++++++++++++++++++++++ src/paperless_text/signals.py | 23 +++++++ 5 files changed, 153 insertions(+) create mode 100644 src/paperless_text/__init__.py create mode 100644 src/paperless_text/apps.py create mode 100644 src/paperless_text/parsers.py create mode 100644 src/paperless_text/signals.py diff --git a/src/paperless/settings.py b/src/paperless/settings.py index e40af01d1..7fdcfec76 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -67,6 +67,7 @@ INSTALLED_APPS = [ "documents.apps.DocumentsConfig", "reminders.apps.RemindersConfig", "paperless_tesseract.apps.PaperlessTesseractConfig", + "paperless_text.apps.PaperlessTextConfig", "flat_responsive", # TODO: Remove as of Django 2.x "django.contrib.admin", diff --git a/src/paperless_text/__init__.py b/src/paperless_text/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/paperless_text/apps.py b/src/paperless_text/apps.py new file mode 100644 index 000000000..389167368 --- /dev/null +++ b/src/paperless_text/apps.py @@ -0,0 +1,16 @@ +from django.apps import AppConfig + + +class PaperlessTextConfig(AppConfig): + + name = "paperless_text" + + def ready(self): + + from documents.signals import document_consumer_declaration + + from .signals import ConsumerDeclaration + + document_consumer_declaration.connect(ConsumerDeclaration.handle) + + AppConfig.ready(self) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py new file mode 100644 index 000000000..9a399da71 --- /dev/null +++ b/src/paperless_text/parsers.py @@ -0,0 +1,113 @@ +import os +import re +import subprocess + +import dateparser +from django.conf import settings + 
+from documents.parsers import DocumentParser, ParseError + + +class TextDocumentParser(DocumentParser): + """ + This parser directly parses a text document (.txt or .md) + """ + + + CONVERT = settings.CONVERT_BINARY + THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None + UNPAPER = settings.UNPAPER_BINARY + DATE_ORDER = settings.DATE_ORDER + DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE + OCR_ALWAYS = settings.OCR_ALWAYS + + def __init__(self, path): + super().__init__(path) + self._text = None + + def get_thumbnail(self): + """ + The thumbnail of a txt is just a 500px wide image of the text + rendered onto a letter-sized page. + """ + + run_convert( + self.CONVERT, + "-size", "500x647", + "xc:white", + "-pointsize", "12", + "-fill", "black", + "-draw", "\"text 0,12 \'$(cat {})\'\"".format(self.document_path), + os.path.join(self.tempdir, "convert-txt.png") + ) + + return os.path.join(self.tempdir, "convert-txt.png") + + def get_text(self): + + if self._text is not None: + return self._text + + with open(self.document_path, 'r') as f: + self._text = f.read() + + return self._text + + def get_date(self): + date = None + datestring = None + + try: + text = self.get_text() + except ParseError as e: + return None + + # This regular expression will try to find dates in the document at + # hand and will match the following formats: + # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits + # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits + # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits + # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits + # - MONTH ZZZZ, with ZZZZ being 4 digits + # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits + pattern = re.compile( + r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + + r'\b([0-9]{1,2}[\. 
]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{4})\b') + + # Iterate through all regex matches and try to parse the date + for m in re.finditer(pattern, text): + datestring = m.group(0) + + try: + date = dateparser.parse( + datestring, + settings={'DATE_ORDER': self.DATE_ORDER, + 'PREFER_DAY_OF_MONTH': 'first', + 'RETURN_AS_TIMEZONE_AWARE': True}) + except TypeError: + # Skip all matches that do not parse to a proper date + continue + + if date is not None: + break + + if date is not None: + self.log("info", "Detected document date " + date.isoformat() + + " based on string " + datestring) + else: + self.log("info", "Unable to detect date for document") + + return date + + +def run_convert(*args): + environment = os.environ.copy() + if settings.CONVERT_MEMORY_LIMIT: + environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT + if settings.CONVERT_TMPDIR: + environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR + + if not subprocess.Popen(args, env=environment).wait() == 0: + raise ParseError("Convert failed at {}".format(args)) \ No newline at end of file diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py new file mode 100644 index 000000000..2f7e5465f --- /dev/null +++ b/src/paperless_text/signals.py @@ -0,0 +1,23 @@ +import re + +from .parsers import TextDocumentParser + + +class ConsumerDeclaration: + + MATCHING_FILES = re.compile("^.*\.(txt|md)$") + + @classmethod + def handle(cls, sender, **kwargs): + return cls.test + + @classmethod + def test(cls, doc): + + if cls.MATCHING_FILES.match(doc.lower()): + return { + "parser": TextDocumentParser, + "weight": 10 + } + + return None From 4849249d8659fc74bbfa30bf32fc9680c990f2c9 Mon Sep 17 00:00:00 2001 From: Joshua Taillon <jat255@gmail.com> Date: Mon, 3 Sep 2018 23:46:13 -0400 Subject: [PATCH 2/3] explicitly add txt, md, and csv types for consumer and viewer; fix thumbnail generation --- src/documents/models.py | 21 ++++++++----- 
src/documents/views.py | 3 ++ src/paperless_text/parsers.py | 59 +++++++++++++++++++++++++++-------- src/paperless_text/signals.py | 2 +- 4 files changed, 63 insertions(+), 22 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index 7390c1d3c..b97eebc72 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -188,7 +188,11 @@ class Document(models.Model): TYPE_JPG = "jpg" TYPE_GIF = "gif" TYPE_TIF = "tiff" - TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) + TYPE_TXT = "txt" + TYPE_CSV = "csv" + TYPE_MD = "md" + TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF, + TYPE_TXT, TYPE_CSV, TYPE_MD) STORAGE_TYPE_UNENCRYPTED = "unencrypted" STORAGE_TYPE_GPG = "gpg" @@ -361,51 +365,52 @@ class FileInfo: ) ) + formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv" REGEXES = OrderedDict([ ("created-correspondent-title-tags", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<correspondent>.*) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("created-title-tags", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("created-correspondent-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<correspondent>.*) - " r"(?P<title>.*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("created-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("correspondent-title-tags", re.compile( r"(?P<correspondent>.*) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + 
r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("correspondent-title", re.compile( r"(?P<correspondent>.*) - " r"(?P<title>.*)?" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("title", re.compile( r"(?P<title>.*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )) ]) diff --git a/src/documents/views.py b/src/documents/views.py index e297e0984..d7e900147 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -48,6 +48,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView): Document.TYPE_JPG: "image/jpeg", Document.TYPE_GIF: "image/gif", Document.TYPE_TIF: "image/tiff", + Document.TYPE_CSV: "text/csv", + Document.TYPE_MD: "text/markdown", + Document.TYPE_TXT: "text/plain" } if self.kwargs["kind"] == "thumb": diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 9a399da71..77f7a4118 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -10,7 +10,7 @@ from documents.parsers import DocumentParser, ParseError class TextDocumentParser(DocumentParser): """ - This parser directly parses a text document (.txt or .md) + This parser directly parses a text document (.txt, .md, or .csv) """ @@ -30,18 +30,50 @@ class TextDocumentParser(DocumentParser): The thumbnail of a txt is just a 500px wide image of the text rendered onto a letter-sized page. 
""" + # The below is heavily cribbed from https://askubuntu.com/a/590951 - run_convert( - self.CONVERT, - "-size", "500x647", - "xc:white", - "-pointsize", "12", - "-fill", "black", - "-draw", "\"text 0,12 \'$(cat {})\'\"".format(self.document_path), - os.path.join(self.tempdir, "convert-txt.png") - ) + bg_color = "white" # bg color + text_color = "black" # text color + psize = [500, 647] # icon size + n_lines = 50 # number of lines to show + output_file = os.path.join(self.tempdir, "convert-txt.png") - return os.path.join(self.tempdir, "convert-txt.png") + temp_bg = os.path.join(self.tempdir, "bg.png") + temp_txlayer = os.path.join(self.tempdir, "tx.png") + picsize = "x".join([str(n) for n in psize]) + txsize = "x".join([str(n - 8) for n in psize]) + + def create_bg(): + work_size = ",".join([str(n - 1) for n in psize]) + r = str(round(psize[0] / 10)); + rounded = ",".join([r, r]) + run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ', + '"fill ', bg_color, ' roundrectangle 0,0,', + work_size, ",", rounded, '" ', temp_bg) + + def read_text(): + with open(self.document_path, 'r') as src: + lines = [l.strip() for l in src.readlines()] + text = "\n".join([l for l in lines[:n_lines]]) + return text.replace('"', "'") + + def create_txlayer(): + run_command(self.CONVERT, + "-background none", + "-fill", + text_color, + "-pointsize", "12", + "-border 4 -bordercolor none", + "-size ", txsize, + ' caption:"', read_text(), '" ', + temp_txlayer) + + create_txlayer() + create_bg() + run_command(self.CONVERT, temp_bg, temp_txlayer, + "-background None -layers merge ", output_file) + + return output_file def get_text(self): @@ -102,12 +134,13 @@ class TextDocumentParser(DocumentParser): return date -def run_convert(*args): +def run_command(*args): environment = os.environ.copy() if settings.CONVERT_MEMORY_LIMIT: environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT if settings.CONVERT_TMPDIR: environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR - if not 
subprocess.Popen(args, env=environment).wait() == 0: + if not subprocess.Popen(' '.join(args), env=environment, + shell=True).wait() == 0: raise ParseError("Convert failed at {}".format(args)) \ No newline at end of file diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py index 2f7e5465f..598641e19 100644 --- a/src/paperless_text/signals.py +++ b/src/paperless_text/signals.py @@ -5,7 +5,7 @@ from .parsers import TextDocumentParser class ConsumerDeclaration: - MATCHING_FILES = re.compile("^.*\.(txt|md)$") + MATCHING_FILES = re.compile("^.*\.(te?xt|md|csv)$") @classmethod def handle(cls, sender, **kwargs): From 72c828170e2dfea2c7be90ad79e6fa415b39c4cd Mon Sep 17 00:00:00 2001 From: Joshua Taillon <jat255@gmail.com> Date: Wed, 5 Sep 2018 21:13:36 -0400 Subject: [PATCH 3/3] move date-matching regex pattern to base parser module for use by all subclasses --- src/documents/parsers.py | 15 +++++++++++++++ src/paperless_tesseract/parsers.py | 16 +--------------- src/paperless_text/parsers.py | 17 +---------------- 3 files changed, 17 insertions(+), 31 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index c44e4c5bf..c28b31a6b 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -1,9 +1,24 @@ import logging import shutil import tempfile +import re from django.conf import settings +# This regular expression will try to find dates in the document at +# hand and will match the following formats: +# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits +# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits +# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits +# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits +# - MONTH ZZZZ, with ZZZZ being 4 digits +# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits +pattern = re.compile( + r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + + r'\b([0-9]{1,2}[\. 
]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{4})\b') + class ParseError(Exception): pass diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index add65985a..f0690e5bc 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \ from pyocr.tesseract import TesseractError import pdftotext -from documents.parsers import DocumentParser, ParseError +from documents.parsers import DocumentParser, ParseError, pattern from .languages import ISO639 @@ -210,20 +210,6 @@ class RasterisedDocumentParser(DocumentParser): except ParseError as e: return None - # This regular expression will try to find dates in the document at - # hand and will match the following formats: - # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits - # - MONTH ZZZZ, with ZZZZ being 4 digits - # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits - pattern = re.compile( - r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + - r'\b([0-9]{1,2}[\. 
]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{4})\b') - # Iterate through all regex matches and try to parse the date for m in re.finditer(pattern, text): datestring = m.group(0) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 77f7a4118..50c341769 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -5,7 +5,7 @@ import subprocess import dateparser from django.conf import settings -from documents.parsers import DocumentParser, ParseError +from documents.parsers import DocumentParser, ParseError, pattern class TextDocumentParser(DocumentParser): @@ -13,7 +13,6 @@ class TextDocumentParser(DocumentParser): This parser directly parses a text document (.txt, .md, or .csv) """ - CONVERT = settings.CONVERT_BINARY THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None UNPAPER = settings.UNPAPER_BINARY @@ -94,20 +93,6 @@ class TextDocumentParser(DocumentParser): except ParseError as e: return None - # This regular expression will try to find dates in the document at - # hand and will match the following formats: - # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits - # - MONTH ZZZZ, with ZZZZ being 4 digits - # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits - pattern = re.compile( - r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + - r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{4})\b') - # Iterate through all regex matches and try to parse the date for m in re.finditer(pattern, text): datestring = m.group(0)