Merge branch 'ENH_text_consumer' of git://github.com/jat255/paperless into jat255-ENH_text_consumer

This commit is contained in:
Daniel Quinn 2018-09-09 20:52:59 +01:00
commit 2dc35cc856
9 changed files with 203 additions and 23 deletions

View File

@ -192,7 +192,11 @@ class Document(models.Model):
TYPE_JPG = "jpg" TYPE_JPG = "jpg"
TYPE_GIF = "gif" TYPE_GIF = "gif"
TYPE_TIF = "tiff" TYPE_TIF = "tiff"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) TYPE_TXT = "txt"
TYPE_CSV = "csv"
TYPE_MD = "md"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
TYPE_TXT, TYPE_CSV, TYPE_MD)
STORAGE_TYPE_UNENCRYPTED = "unencrypted" STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg" STORAGE_TYPE_GPG = "gpg"
@ -365,51 +369,52 @@ class FileInfo:
) )
) )
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
REGEXES = OrderedDict([ REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile( ("created-correspondent-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - " r"(?P<correspondent>.*) - "
r"(?P<title>.*) - " r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)" r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE flags=re.IGNORECASE
)), )),
("created-title-tags", re.compile( ("created-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*) - " r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)" r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE flags=re.IGNORECASE
)), )),
("created-correspondent-title", re.compile( ("created-correspondent-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - " r"(?P<correspondent>.*) - "
r"(?P<title>.*)" r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE flags=re.IGNORECASE
)), )),
("created-title", re.compile( ("created-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)" r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE flags=re.IGNORECASE
)), )),
("correspondent-title-tags", re.compile( ("correspondent-title-tags", re.compile(
r"(?P<correspondent>.*) - " r"(?P<correspondent>.*) - "
r"(?P<title>.*) - " r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)" r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE flags=re.IGNORECASE
)), )),
("correspondent-title", re.compile( ("correspondent-title", re.compile(
r"(?P<correspondent>.*) - " r"(?P<correspondent>.*) - "
r"(?P<title>.*)?" r"(?P<title>.*)?"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE flags=re.IGNORECASE
)), )),
("title", re.compile( ("title", re.compile(
r"(?P<title>.*)" r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE flags=re.IGNORECASE
)) ))
]) ])

View File

@ -1,9 +1,24 @@
import logging import logging
import shutil import shutil
import tempfile import tempfile
import re
from django.conf import settings from django.conf import settings
# Shared date-detection regex used to look for dates in a document's text.
# It is an alternation of four forms, tried in this order:
#   1. DD<sep>MM<sep>YYYY  - 1-2 digits, 1-2 digits, then 4 or 2 digits,
#                            with <sep> being ".", "/" or "-"
#   2. DD. MONTH YYYY      - 1-2 digits, dots and/or spaces, a 3-9 character
#                            run of non-space characters, then 4 or 2 digits
#   3. MONTH DD, YYYY      - a 3-9 letter word, 1-2 digits, a comma, 4 digits
#   4. MONTH YYYY          - a 3-9 letter word followed by 4 digits
# Each alternative is anchored on word boundaries at both ends.
pattern = re.compile("|".join((
    r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b',
    r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b',
    r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b',
    r'\b([^\W\d_]{3,9} [0-9]{4})\b',
)))
class ParseError(Exception): class ParseError(Exception):
pass pass

View File

@ -48,6 +48,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
Document.TYPE_JPG: "image/jpeg", Document.TYPE_JPG: "image/jpeg",
Document.TYPE_GIF: "image/gif", Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff", Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
} }
if self.kwargs["kind"] == "thumb": if self.kwargs["kind"] == "thumb":

View File

@ -67,6 +67,7 @@ INSTALLED_APPS = [
"documents.apps.DocumentsConfig", "documents.apps.DocumentsConfig",
"reminders.apps.RemindersConfig", "reminders.apps.RemindersConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"django.contrib.admin", "django.contrib.admin",

View File

@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \
from pyocr.tesseract import TesseractError from pyocr.tesseract import TesseractError
import pdftotext import pdftotext
from documents.parsers import DocumentParser, ParseError from documents.parsers import DocumentParser, ParseError, pattern
from .languages import ISO639 from .languages import ISO639
@ -210,20 +210,6 @@ class RasterisedDocumentParser(DocumentParser):
except ParseError as e: except ParseError as e:
return None return None
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
pattern = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
# Iterate through all regex matches and try to parse the date # Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text): for m in re.finditer(pattern, text):
datestring = m.group(0) datestring = m.group(0)

View File

View File

@ -0,0 +1,16 @@
from django.apps import AppConfig
class PaperlessTextConfig(AppConfig):
    """App configuration for the ``paperless_text`` application."""

    name = "paperless_text"

    def ready(self):
        """
        Connect ``ConsumerDeclaration.handle`` to the
        ``document_consumer_declaration`` signal, then run the base
        class's ``ready``.
        """
        # Imports are kept local to this method, as in the original;
        # presumably so they only run once the app registry is set up
        # (TODO confirm).
        from documents.signals import document_consumer_declaration
        from .signals import ConsumerDeclaration

        handler = ConsumerDeclaration.handle
        document_consumer_declaration.connect(handler)

        super().ready()

View File

@ -0,0 +1,131 @@
import os
import re
import subprocess
import dateparser
from django.conf import settings
from documents.parsers import DocumentParser, ParseError, pattern
class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt, .md, or .csv)
    """

    CONVERT = settings.CONVERT_BINARY
    DATE_ORDER = settings.DATE_ORDER

    # The attributes below are not read anywhere in this class; they are
    # kept so that any external attribute access keeps working.
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

    def __init__(self, path):
        super().__init__(path)
        # Cache for get_text() so the file is only read once.
        self._text = None

    def get_thumbnail(self):
        """
        The thumbnail of a txt is just a 500px wide image of the text
        rendered onto a letter-sized page.

        Returns the path of the generated PNG inside ``self.tempdir``.
        Raises ``ParseError`` (via ``run_command``) if a convert call
        fails.
        """
        # Local import so this block does not depend on a change to the
        # module's import section.
        import shlex

        # The below is heavily cribbed from https://askubuntu.com/a/590951

        bg_color = "white"  # bg color
        text_color = "black"  # text color
        psize = [500, 647]  # icon size
        n_lines = 50  # number of lines to show

        # Paths are quoted because run_command joins its arguments into a
        # single command line.
        output_file = os.path.join(self.tempdir, "convert-txt.png")
        temp_bg = os.path.join(self.tempdir, "bg.png")
        temp_txlayer = os.path.join(self.tempdir, "tx.png")
        picsize = "x".join(str(n) for n in psize)
        txsize = "x".join(str(n - 8) for n in psize)

        def create_bg():
            """Draw the rounded-rectangle background layer."""
            work_size = ",".join(str(n - 1) for n in psize)
            r = str(round(psize[0] / 10))
            rounded = ",".join([r, r])
            draw = "fill {} roundrectangle 0,0,{},{}".format(
                bg_color, work_size, rounded)
            run_command(self.CONVERT, "-size", picsize, "xc:none",
                        "-draw", shlex.quote(draw), shlex.quote(temp_bg))

        def read_text():
            """Return the first ``n_lines`` lines, each stripped."""
            with open(self.document_path, 'r') as src:
                # zip() against range() stops after n_lines lines, so a
                # large document is not read in full for a thumbnail.
                lines = [line.strip() for _, line in zip(range(n_lines), src)]
            return "\n".join(lines)

        def create_txlayer():
            """Render the document text onto a transparent layer."""
            # The document text is untrusted input: quote it as one shell
            # word so quotes, backticks, "$(...)" and backslashes in the
            # file cannot alter the command. The surrounding spaces
            # reproduce the argument the previous string-built command
            # yielded.
            # NOTE(review): ImageMagick is believed to treat a leading "@"
            # after "caption:" as a file reference; the leading space
            # avoids that too - confirm.
            caption = "caption: " + read_text() + " "
            run_command(self.CONVERT,
                        "-background none",
                        "-fill",
                        text_color,
                        "-pointsize", "12",
                        "-border 4 -bordercolor none",
                        "-size", txsize,
                        shlex.quote(caption),
                        shlex.quote(temp_txlayer))

        create_txlayer()
        create_bg()
        run_command(self.CONVERT,
                    shlex.quote(temp_bg), shlex.quote(temp_txlayer),
                    "-background None -layers merge",
                    shlex.quote(output_file))

        return output_file

    def get_text(self):
        """Return the document's full text, reading the file on first use."""
        if self._text is not None:
            return self._text

        with open(self.document_path, 'r') as f:
            self._text = f.read()

        return self._text

    def get_date(self):
        """
        Return the first date found in the document text, or ``None``.

        Every match of the shared ``pattern`` regex is handed to
        ``dateparser.parse``; the first one that parses wins.
        """
        date = None
        datestring = None

        try:
            text = self.get_text()
        except ParseError:
            return None

        # Iterate through all regex matches and try to parse the date
        for m in re.finditer(pattern, text):
            datestring = m.group(0)

            try:
                date = dateparser.parse(
                    datestring,
                    settings={'DATE_ORDER': self.DATE_ORDER,
                              'PREFER_DAY_OF_MONTH': 'first',
                              'RETURN_AS_TIMEZONE_AWARE': True})
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None:
                break

        if date is not None:
            self.log("info", "Detected document date " + date.isoformat() +
                     " based on string " + datestring)
        else:
            self.log("info", "Unable to detect date for document")

        return date
def run_command(*args):
    """
    Run an external command assembled from ``args``.

    The arguments are joined with single spaces and split into words
    using POSIX shell quoting rules, so callers may still pass several
    options in one string (e.g. ``"-background none"``) and may quote a
    word that contains spaces. The command is executed directly, not via
    a shell, so shell syntax such as ``$(...)``, backticks, globs or
    pipes is passed through literally instead of being interpreted.

    ``MAGICK_MEMORY_LIMIT`` and ``MAGICK_TMPDIR`` are added to the child's
    environment when the corresponding settings are set.

    Raises:
        ParseError: if the command line cannot be tokenised, the program
            cannot be started, or it exits with a non-zero status.
    """
    # Local import so this block does not depend on a change to the
    # module's import section.
    import shlex

    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
    if settings.CONVERT_TMPDIR:
        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

    try:
        argv = shlex.split(' '.join(args))
    except ValueError as e:
        # Unbalanced quoting; a shell would have rejected this as well.
        raise ParseError("Convert failed at {}".format(args)) from e

    if not argv:
        # An empty command line was a successful no-op under the shell.
        return

    try:
        exit_code = subprocess.call(argv, env=environment)
    except OSError as e:
        # E.g. binary missing; the shell reported this as a non-zero exit.
        raise ParseError("Convert failed at {}".format(args)) from e

    if exit_code != 0:
        raise ParseError("Convert failed at {}".format(args))

View File

@ -0,0 +1,23 @@
import re
from .parsers import TextDocumentParser
class ConsumerDeclaration:
    """
    Declares ``TextDocumentParser`` as the parser for plain-text files.

    ``handle`` is the signal receiver; it hands back ``test``, which
    decides for a file name whether this parser applies.
    """

    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence. The compiled pattern itself is unchanged.
    MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """Return the ``test`` callable; ``sender`` and kwargs are unused."""
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return a parser declaration for ``doc`` or ``None``.

        ``doc`` is a file name or path string; it is lower-cased before
        matching, so the extension check is case-insensitive. On a match
        the result is a dict with the keys ``"parser"`` and ``"weight"``.
        """
        if cls.MATCHING_FILES.match(doc.lower()):
            return {
                "parser": TextDocumentParser,
                "weight": 10
            }

        return None