mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Merge branch 'master' of https://github.com/danielquinn/paperless into disable_login
This commit is contained in:
@@ -118,12 +118,14 @@ class Consumer(object):
|
||||
|
||||
parsed_document = parser_class(doc)
|
||||
thumbnail = parsed_document.get_thumbnail()
|
||||
date = parsed_document.get_date()
|
||||
|
||||
try:
|
||||
document = self._store(
|
||||
parsed_document.get_text(),
|
||||
doc,
|
||||
thumbnail
|
||||
thumbnail,
|
||||
date
|
||||
)
|
||||
except ParseError as e:
|
||||
|
||||
@@ -174,7 +176,7 @@ class Consumer(object):
|
||||
return sorted(
|
||||
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||
|
||||
def _store(self, text, doc, thumbnail):
|
||||
def _store(self, text, doc, thumbnail, date):
|
||||
|
||||
file_info = FileInfo.from_path(doc)
|
||||
|
||||
@@ -182,7 +184,7 @@ class Consumer(object):
|
||||
|
||||
self.log("debug", "Saving record to database")
|
||||
|
||||
created = file_info.created or timezone.make_aware(
|
||||
created = file_info.created or date or timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
||||
|
||||
with open(doc, "rb") as f:
|
||||
|
@@ -135,8 +135,10 @@ class MatchingModel(models.Model):
|
||||
"""
|
||||
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
|
||||
normspace = re.compile(r"\s+").sub
|
||||
return [normspace(r"\s+", (t[0] or t[1]).strip())
|
||||
for t in findterms(self.match)]
|
||||
return [
|
||||
normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
|
||||
for t in findterms(self.match)
|
||||
]
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
|
||||
|
@@ -9,7 +9,7 @@ class ParseError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class DocumentParser(object):
|
||||
class DocumentParser:
|
||||
"""
|
||||
Subclass this to make your own parser. Have a look at
|
||||
`paperless_tesseract.parsers` for inspiration.
|
||||
@@ -19,7 +19,7 @@ class DocumentParser(object):
|
||||
|
||||
def __init__(self, path):
|
||||
self.document_path = path
|
||||
self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=self.SCRATCH)
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.logging_group = None
|
||||
|
||||
@@ -35,6 +35,12 @@ class DocumentParser(object):
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_date(self):
|
||||
"""
|
||||
Returns the date of the document.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def log(self, level, message):
|
||||
getattr(self.logger, level)(message, extra={
|
||||
"group": self.logging_group
|
||||
|
@@ -30,15 +30,8 @@ from .serialisers import (
|
||||
|
||||
|
||||
class IndexView(TemplateView):
|
||||
|
||||
template_name = "documents/index.html"
|
||||
|
||||
def get_context_data(self, **kwargs):
|
||||
print(kwargs)
|
||||
print(self.request.GET)
|
||||
print(self.request.POST)
|
||||
return TemplateView.get_context_data(self, **kwargs)
|
||||
|
||||
|
||||
class FetchView(SessionOrBasicAuthMixin, DetailView):
|
||||
|
||||
|
@@ -270,3 +270,6 @@ PAPERLESS_LIST_PER_PAGE = int(os.getenv("PAPERLESS_LIST_PER_PAGE", 100))
|
||||
|
||||
FY_START = os.getenv("PAPERLESS_FINANCIAL_YEAR_START")
|
||||
FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
|
||||
|
||||
# Specify the default date order (for autodetected dates)
|
||||
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||
|
@@ -3,6 +3,7 @@ import os
|
||||
import re
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
import dateparser
|
||||
import pdftotext
|
||||
|
||||
import langdetect
|
||||
@@ -31,8 +32,10 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
DATE_ORDER = settings.DATE_ORDER
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||
TEXT_CACHE = None
|
||||
|
||||
def get_thumbnail(self):
|
||||
"""
|
||||
@@ -60,15 +63,20 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
return False
|
||||
|
||||
def get_text(self):
|
||||
if self.TEXT_CACHE is not None:
|
||||
return self.TEXT_CACHE
|
||||
|
||||
if not self.OCR_ALWAYS and self._is_ocred():
|
||||
self.log("info", "Skipping OCR, using Text from PDF")
|
||||
return get_text_from_pdf(self.document_path)
|
||||
self.TEXT_CACHE = get_text_from_pdf(self.document_path)
|
||||
return self.TEXT_CACHE
|
||||
|
||||
images = self._get_greyscale()
|
||||
|
||||
try:
|
||||
|
||||
return self._get_ocr(images)
|
||||
self.TEXT_CACHE = self._get_ocr(images)
|
||||
return self.TEXT_CACHE
|
||||
except OCRError as e:
|
||||
raise ParseError(e)
|
||||
|
||||
@@ -191,6 +199,29 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
||||
return text
|
||||
|
||||
def get_date(self):
|
||||
text = self.get_text()
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ
|
||||
m = re.search(
|
||||
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
|
||||
r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
||||
r'\b([^ ]{3,9} [0-9]{4})\b', text)
|
||||
|
||||
if m is None:
|
||||
return None
|
||||
|
||||
return dateparser.parse(m.group(0),
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
|
||||
|
||||
def run_convert(*args):
|
||||
|
||||
@@ -235,6 +266,6 @@ def get_text_from_pdf(pdf_file):
|
||||
try:
|
||||
pdf = pdftotext.PDF(f)
|
||||
except pdftotext.Error:
|
||||
return False
|
||||
return ""
|
||||
|
||||
return "\n".join(pdf)
|
||||
|
@@ -3,7 +3,7 @@ import re
|
||||
from .parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class ConsumerDeclaration(object):
|
||||
class ConsumerDeclaration:
|
||||
|
||||
MATCHING_FILES = re.compile("^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
|
||||
|
||||
|
BIN
src/paperless_tesseract/tests/samples/tests_date_1.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_1.pdf
Normal file
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/tests_date_1.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_1.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 136 KiB |
BIN
src/paperless_tesseract/tests/samples/tests_date_2.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_2.pdf
Normal file
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/tests_date_2.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 135 KiB |
BIN
src/paperless_tesseract/tests/samples/tests_date_3.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_3.pdf
Normal file
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/tests_date_3.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_3.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 138 KiB |
BIN
src/paperless_tesseract/tests/samples/tests_date_4.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_4.pdf
Normal file
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/tests_date_4.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_4.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 138 KiB |
BIN
src/paperless_tesseract/tests/samples/tests_date_5.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_5.pdf
Normal file
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/tests_date_5.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_5.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 136 KiB |
BIN
src/paperless_tesseract/tests/samples/tests_date_6.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_6.pdf
Normal file
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/tests_date_6.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_6.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 136 KiB |
BIN
src/paperless_tesseract/tests/samples/tests_date_7.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_7.pdf
Normal file
Binary file not shown.
215
src/paperless_tesseract/tests/test_date.py
Normal file
215
src/paperless_tesseract/tests/test_date.py
Normal file
@@ -0,0 +1,215 @@
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
from unittest import mock
|
||||
from uuid import uuid4
|
||||
|
||||
from dateutil import tz
|
||||
from django.test import TestCase
|
||||
|
||||
from ..parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class TestDate(TestCase):
|
||||
|
||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
||||
SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
|
||||
|
||||
def setUp(self):
|
||||
os.makedirs(self.SCRATCH, exist_ok=True)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.SCRATCH)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_1_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_1_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_2_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2013, 2, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_2_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2013, 2, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_3_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_3_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_4_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_4_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_5_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_5_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_6_pdf_us(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
document.DATE_ORDER = "MDY"
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_6_png_us(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
document.DATE_ORDER = "MDY"
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_6_pdf_eu(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(), None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_6_png_eu(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(), None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_get_text_7_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_7.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
Reference in New Issue
Block a user