mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge branch 'jat255-ENH_text_consumer'
This commit is contained in:
commit
75648cc74b
@ -1,24 +1,24 @@
|
||||
# coding=utf-8
|
||||
|
||||
import dateutil.parser
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
import dateutil.parser
|
||||
from django.conf import settings
|
||||
from django.db import models
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.utils import timezone
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
from django.conf import settings
|
||||
from .managers import LogManager
|
||||
|
||||
try:
|
||||
from django.core.urlresolvers import reverse
|
||||
except ImportError:
|
||||
from django.urls import reverse
|
||||
from django.db import models
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.utils import timezone
|
||||
|
||||
from .managers import LogManager
|
||||
|
||||
|
||||
class MatchingModel(models.Model):
|
||||
@ -192,7 +192,11 @@ class Document(models.Model):
|
||||
TYPE_JPG = "jpg"
|
||||
TYPE_GIF = "gif"
|
||||
TYPE_TIF = "tiff"
|
||||
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
|
||||
TYPE_TXT = "txt"
|
||||
TYPE_CSV = "csv"
|
||||
TYPE_MD = "md"
|
||||
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
|
||||
TYPE_TXT, TYPE_CSV, TYPE_MD)
|
||||
|
||||
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
|
||||
STORAGE_TYPE_GPG = "gpg"
|
||||
@ -365,51 +369,52 @@ class FileInfo:
|
||||
)
|
||||
)
|
||||
|
||||
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
|
||||
REGEXES = OrderedDict([
|
||||
("created-correspondent-title-tags", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*) - "
|
||||
r"(?P<tags>[a-z0-9\-,]*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
r"\.(?P<extension>{})$".format(formats),
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("created-title-tags", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<title>.*) - "
|
||||
r"(?P<tags>[a-z0-9\-,]*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
r"\.(?P<extension>{})$".format(formats),
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("created-correspondent-title", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
r"\.(?P<extension>{})$".format(formats),
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("created-title", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<title>.*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
r"\.(?P<extension>{})$".format(formats),
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("correspondent-title-tags", re.compile(
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*) - "
|
||||
r"(?P<tags>[a-z0-9\-,]*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
r"\.(?P<extension>{})$".format(formats),
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("correspondent-title", re.compile(
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*)?"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
r"\.(?P<extension>{})$".format(formats),
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("title", re.compile(
|
||||
r"(?P<title>.*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
r"\.(?P<extension>{})$".format(formats),
|
||||
flags=re.IGNORECASE
|
||||
))
|
||||
])
|
||||
|
@ -1,9 +1,25 @@
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
import re
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||
DATE_REGEX = re.compile(
|
||||
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
|
||||
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{4})\b'
|
||||
)
|
||||
|
||||
|
||||
class ParseError(Exception):
|
||||
pass
|
||||
|
@ -48,6 +48,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
|
||||
Document.TYPE_JPG: "image/jpeg",
|
||||
Document.TYPE_GIF: "image/gif",
|
||||
Document.TYPE_TIF: "image/tiff",
|
||||
Document.TYPE_CSV: "text/csv",
|
||||
Document.TYPE_MD: "text/markdown",
|
||||
Document.TYPE_TXT: "text/plain"
|
||||
}
|
||||
|
||||
if self.kwargs["kind"] == "thumb":
|
||||
|
@ -67,6 +67,7 @@ INSTALLED_APPS = [
|
||||
"documents.apps.DocumentsConfig",
|
||||
"reminders.apps.RemindersConfig",
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
|
||||
"django.contrib.admin",
|
||||
|
||||
|
@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
import pdftotext
|
||||
from documents.parsers import DocumentParser, ParseError
|
||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
|
||||
|
||||
from .languages import ISO639
|
||||
|
||||
@ -210,22 +210,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
except ParseError as e:
|
||||
return None
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||
pattern = re.compile(
|
||||
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
|
||||
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
|
||||
|
||||
# Iterate through all regex matches and try to parse the date
|
||||
for m in re.finditer(pattern, text):
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
datestring = m.group(0)
|
||||
|
||||
try:
|
||||
|
0
src/paperless_text/__init__.py
Normal file
0
src/paperless_text/__init__.py
Normal file
16
src/paperless_text/apps.py
Normal file
16
src/paperless_text/apps.py
Normal file
@ -0,0 +1,16 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class PaperlessTextConfig(AppConfig):
|
||||
|
||||
name = "paperless_text"
|
||||
|
||||
def ready(self):
|
||||
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
from .signals import ConsumerDeclaration
|
||||
|
||||
document_consumer_declaration.connect(ConsumerDeclaration.handle)
|
||||
|
||||
AppConfig.ready(self)
|
131
src/paperless_text/parsers.py
Normal file
131
src/paperless_text/parsers.py
Normal file
@ -0,0 +1,131 @@
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
import dateparser
|
||||
from django.conf import settings
|
||||
|
||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
|
||||
|
||||
|
||||
class TextDocumentParser(DocumentParser):
|
||||
"""
|
||||
This parser directly parses a text document (.txt, .md, or .csv)
|
||||
"""
|
||||
|
||||
CONVERT = settings.CONVERT_BINARY
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
DATE_ORDER = settings.DATE_ORDER
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||
|
||||
def __init__(self, path):
|
||||
super().__init__(path)
|
||||
self._text = None
|
||||
|
||||
def get_thumbnail(self):
|
||||
"""
|
||||
The thumbnail of a txt is just a 500px wide image of the text
|
||||
rendered onto a letter-sized page.
|
||||
"""
|
||||
# The below is heavily cribbed from https://askubuntu.com/a/590951
|
||||
|
||||
bg_color = "white" # bg color
|
||||
text_color = "black" # text color
|
||||
psize = [500, 647] # icon size
|
||||
n_lines = 50 # number of lines to show
|
||||
output_file = os.path.join(self.tempdir, "convert-txt.png")
|
||||
|
||||
temp_bg = os.path.join(self.tempdir, "bg.png")
|
||||
temp_txlayer = os.path.join(self.tempdir, "tx.png")
|
||||
picsize = "x".join([str(n) for n in psize])
|
||||
txsize = "x".join([str(n - 8) for n in psize])
|
||||
|
||||
def create_bg():
|
||||
work_size = ",".join([str(n - 1) for n in psize])
|
||||
r = str(round(psize[0] / 10))
|
||||
rounded = ",".join([r, r])
|
||||
run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
|
||||
'"fill ', bg_color, ' roundrectangle 0,0,',
|
||||
work_size, ",", rounded, '" ', temp_bg)
|
||||
|
||||
def read_text():
|
||||
with open(self.document_path, 'r') as src:
|
||||
lines = [l.strip() for l in src.readlines()]
|
||||
text = "\n".join([l for l in lines[:n_lines]])
|
||||
return text.replace('"', "'")
|
||||
|
||||
def create_txlayer():
|
||||
run_command(self.CONVERT,
|
||||
"-background none",
|
||||
"-fill",
|
||||
text_color,
|
||||
"-pointsize", "12",
|
||||
"-border 4 -bordercolor none",
|
||||
"-size ", txsize,
|
||||
' caption:"', read_text(), '" ',
|
||||
temp_txlayer)
|
||||
|
||||
create_txlayer()
|
||||
create_bg()
|
||||
run_command(self.CONVERT, temp_bg, temp_txlayer,
|
||||
"-background None -layers merge ", output_file)
|
||||
|
||||
return output_file
|
||||
|
||||
def get_text(self):
|
||||
|
||||
if self._text is not None:
|
||||
return self._text
|
||||
|
||||
with open(self.document_path, 'r') as f:
|
||||
self._text = f.read()
|
||||
|
||||
return self._text
|
||||
|
||||
def get_date(self):
|
||||
date = None
|
||||
datestring = None
|
||||
|
||||
try:
|
||||
text = self.get_text()
|
||||
except ParseError as e:
|
||||
return None
|
||||
|
||||
# Iterate through all regex matches and try to parse the date
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
datestring = m.group(0)
|
||||
|
||||
try:
|
||||
date = dateparser.parse(
|
||||
datestring,
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None:
|
||||
break
|
||||
|
||||
if date is not None:
|
||||
self.log("info", "Detected document date " + date.isoformat() +
|
||||
" based on string " + datestring)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
|
||||
|
||||
def run_command(*args):
|
||||
environment = os.environ.copy()
|
||||
if settings.CONVERT_MEMORY_LIMIT:
|
||||
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
|
||||
if settings.CONVERT_TMPDIR:
|
||||
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
|
||||
|
||||
if not subprocess.Popen(' '.join(args), env=environment,
|
||||
shell=True).wait() == 0:
|
||||
raise ParseError("Convert failed at {}".format(args))
|
23
src/paperless_text/signals.py
Normal file
23
src/paperless_text/signals.py
Normal file
@ -0,0 +1,23 @@
|
||||
import re
|
||||
|
||||
from .parsers import TextDocumentParser
|
||||
|
||||
|
||||
class ConsumerDeclaration:
|
||||
|
||||
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
|
||||
|
||||
@classmethod
|
||||
def handle(cls, sender, **kwargs):
|
||||
return cls.test
|
||||
|
||||
@classmethod
|
||||
def test(cls, doc):
|
||||
|
||||
if cls.MATCHING_FILES.match(doc.lower()):
|
||||
return {
|
||||
"parser": TextDocumentParser,
|
||||
"weight": 10
|
||||
}
|
||||
|
||||
return None
|
Loading…
x
Reference in New Issue
Block a user