mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
reworked the interface of the parsers.
This commit is contained in:
parent
d3c13f6c93
commit
df801d17e1
@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
|||||||
from .file_handling import generate_filename, create_source_path_directory
|
from .file_handling import generate_filename, create_source_path_directory
|
||||||
from .loggers import LoggingMixin
|
from .loggers import LoggingMixin
|
||||||
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
|
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
|
||||||
from .parsers import ParseError, get_parser_class_for_mime_type
|
from .parsers import ParseError, get_parser_class_for_mime_type, parse_date
|
||||||
from .signals import (
|
from .signals import (
|
||||||
document_consumption_finished,
|
document_consumption_finished,
|
||||||
document_consumption_started
|
document_consumption_started
|
||||||
@ -121,7 +121,7 @@ class Consumer(LoggingMixin):
|
|||||||
|
|
||||||
# This doesn't parse the document yet, but gives us a parser.
|
# This doesn't parse the document yet, but gives us a parser.
|
||||||
|
|
||||||
document_parser = parser_class(self.path, self.logging_group)
|
document_parser = parser_class(self.logging_group)
|
||||||
|
|
||||||
# However, this already created working directories which we have to
|
# However, this already created working directories which we have to
|
||||||
# clean up.
|
# clean up.
|
||||||
@ -129,12 +129,18 @@ class Consumer(LoggingMixin):
|
|||||||
# Parse the document. This may take some time.
|
# Parse the document. This may take some time.
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.log("debug", f"Generating thumbnail for {self.filename}...")
|
|
||||||
thumbnail = document_parser.get_optimised_thumbnail()
|
|
||||||
self.log("debug", "Parsing {}...".format(self.filename))
|
self.log("debug", "Parsing {}...".format(self.filename))
|
||||||
|
document_parser.parse(self.path, mime_type)
|
||||||
|
|
||||||
|
self.log("debug", f"Generating thumbnail for {self.filename}...")
|
||||||
|
thumbnail = document_parser.get_optimised_thumbnail(self.path, mime_type)
|
||||||
|
|
||||||
text = document_parser.get_text()
|
text = document_parser.get_text()
|
||||||
date = document_parser.get_date()
|
date = document_parser.get_date()
|
||||||
|
if not date:
|
||||||
|
date = parse_date(self.filename, text)
|
||||||
archive_path = document_parser.get_archive_path()
|
archive_path = document_parser.get_archive_path()
|
||||||
|
|
||||||
except ParseError as e:
|
except ParseError as e:
|
||||||
document_parser.cleanup()
|
document_parser.cleanup()
|
||||||
raise ConsumerError(e)
|
raise ConsumerError(e)
|
||||||
|
@ -107,59 +107,7 @@ def run_convert(input_file,
|
|||||||
raise ParseError("Convert failed at {}".format(args))
|
raise ParseError("Convert failed at {}".format(args))
|
||||||
|
|
||||||
|
|
||||||
class ParseError(Exception):
|
def parse_date(filename, text):
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentParser(LoggingMixin):
|
|
||||||
"""
|
|
||||||
Subclass this to make your own parser. Have a look at
|
|
||||||
`paperless_tesseract.parsers` for inspiration.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, path, logging_group):
|
|
||||||
super().__init__()
|
|
||||||
self.logging_group = logging_group
|
|
||||||
self.document_path = path
|
|
||||||
self.tempdir = tempfile.mkdtemp(
|
|
||||||
prefix="paperless-", dir=settings.SCRATCH_DIR)
|
|
||||||
|
|
||||||
def get_archive_path(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_thumbnail(self):
|
|
||||||
"""
|
|
||||||
Returns the path to a file we can use as a thumbnail for this document.
|
|
||||||
"""
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def optimise_thumbnail(self, in_path):
|
|
||||||
|
|
||||||
if settings.OPTIMIZE_THUMBNAILS:
|
|
||||||
out_path = os.path.join(self.tempdir, "optipng.png")
|
|
||||||
|
|
||||||
args = (settings.OPTIPNG_BINARY,
|
|
||||||
"-silent", "-o5", in_path, "-out", out_path)
|
|
||||||
|
|
||||||
self.log('debug', f"Execute: {' '.join(args)}")
|
|
||||||
|
|
||||||
if not subprocess.Popen(args).wait() == 0:
|
|
||||||
raise ParseError("Optipng failed at {}".format(args))
|
|
||||||
|
|
||||||
return out_path
|
|
||||||
else:
|
|
||||||
return in_path
|
|
||||||
|
|
||||||
def get_optimised_thumbnail(self):
|
|
||||||
return self.optimise_thumbnail(self.get_thumbnail())
|
|
||||||
|
|
||||||
def get_text(self):
|
|
||||||
"""
|
|
||||||
Returns the text from the document and only the text.
|
|
||||||
"""
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def get_date(self):
|
|
||||||
"""
|
"""
|
||||||
Returns the date of the document.
|
Returns the date of the document.
|
||||||
"""
|
"""
|
||||||
@ -179,15 +127,12 @@ class DocumentParser(LoggingMixin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
date = None
|
date = None
|
||||||
date_string = None
|
|
||||||
|
|
||||||
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
|
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
|
||||||
title = os.path.basename(self.document_path)
|
|
||||||
|
|
||||||
# if filename date parsing is enabled, search there first:
|
# if filename date parsing is enabled, search there first:
|
||||||
if settings.FILENAME_DATE_ORDER:
|
if settings.FILENAME_DATE_ORDER:
|
||||||
self.log("info", "Checking document title for date")
|
for m in re.finditer(DATE_REGEX, filename):
|
||||||
for m in re.finditer(DATE_REGEX, title):
|
|
||||||
date_string = m.group(0)
|
date_string = m.group(0)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -197,21 +142,8 @@ class DocumentParser(LoggingMixin):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if date is not None and next_year > date.year > 1900:
|
if date is not None and next_year > date.year > 1900:
|
||||||
self.log(
|
|
||||||
"info",
|
|
||||||
"Detected document date {} based on string {} "
|
|
||||||
"from document title"
|
|
||||||
"".format(date.isoformat(), date_string)
|
|
||||||
)
|
|
||||||
return date
|
return date
|
||||||
|
|
||||||
try:
|
|
||||||
# getting text after checking filename will save time if only
|
|
||||||
# looking at the filename instead of the whole text
|
|
||||||
text = self.get_text()
|
|
||||||
except ParseError:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Iterate through all regex matches in text and try to parse the date
|
# Iterate through all regex matches in text and try to parse the date
|
||||||
for m in re.finditer(DATE_REGEX, text):
|
for m in re.finditer(DATE_REGEX, text):
|
||||||
date_string = m.group(0)
|
date_string = m.group(0)
|
||||||
@ -227,19 +159,64 @@ class DocumentParser(LoggingMixin):
|
|||||||
else:
|
else:
|
||||||
date = None
|
date = None
|
||||||
|
|
||||||
if date is not None:
|
|
||||||
self.log(
|
|
||||||
"info",
|
|
||||||
"Detected document date {} based on string {}".format(
|
|
||||||
date.isoformat(),
|
|
||||||
date_string
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.log("info", "Unable to detect date for document")
|
|
||||||
|
|
||||||
return date
|
return date
|
||||||
|
|
||||||
|
|
||||||
|
class ParseError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentParser(LoggingMixin):
|
||||||
|
"""
|
||||||
|
Subclass this to make your own parser. Have a look at
|
||||||
|
`paperless_tesseract.parsers` for inspiration.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, logging_group):
|
||||||
|
super().__init__()
|
||||||
|
self.logging_group = logging_group
|
||||||
|
self.tempdir = tempfile.mkdtemp(
|
||||||
|
prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||||
|
|
||||||
|
self.archive_path = None
|
||||||
|
self.text = None
|
||||||
|
self.date = None
|
||||||
|
|
||||||
|
def parse(self, document_path, mime_type):
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def get_archive_path(self):
|
||||||
|
return self.archive_path
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path, mime_type):
|
||||||
|
"""
|
||||||
|
Returns the path to a file we can use as a thumbnail for this document.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def get_optimised_thumbnail(self, document_path, mime_type):
|
||||||
|
thumbnail = self.get_thumbnail(document_path, mime_type)
|
||||||
|
if settings.OPTIMIZE_THUMBNAILS:
|
||||||
|
out_path = os.path.join(self.tempdir, "thumb_optipng.png")
|
||||||
|
|
||||||
|
args = (settings.OPTIPNG_BINARY,
|
||||||
|
"-silent", "-o5", thumbnail, "-out", out_path)
|
||||||
|
|
||||||
|
self.log('debug', f"Execute: {' '.join(args)}")
|
||||||
|
|
||||||
|
if not subprocess.Popen(args).wait() == 0:
|
||||||
|
raise ParseError("Optipng failed at {}".format(args))
|
||||||
|
|
||||||
|
return out_path
|
||||||
|
else:
|
||||||
|
return thumbnail
|
||||||
|
|
||||||
|
def get_text(self):
|
||||||
|
return self.text
|
||||||
|
|
||||||
|
def get_date(self):
|
||||||
|
return self.date
|
||||||
|
|
||||||
def cleanup(self):
|
def cleanup(self):
|
||||||
self.log("debug", "Deleting directory {}".format(self.tempdir))
|
self.log("debug", "Deleting directory {}".format(self.tempdir))
|
||||||
shutil.rmtree(self.tempdir)
|
shutil.rmtree(self.tempdir)
|
||||||
|
@ -2,7 +2,6 @@ import os
|
|||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
import langdetect
|
|
||||||
import ocrmypdf
|
import ocrmypdf
|
||||||
import pdftotext
|
import pdftotext
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
@ -17,12 +16,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, path, logging_group):
|
def get_thumbnail(self, document_path, mime_type):
|
||||||
super().__init__(path, logging_group)
|
|
||||||
self._text = None
|
|
||||||
self._archive_path = None
|
|
||||||
|
|
||||||
def get_thumbnail(self):
|
|
||||||
"""
|
"""
|
||||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||||
"""
|
"""
|
||||||
@ -36,7 +30,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
alpha="remove",
|
alpha="remove",
|
||||||
strip=True,
|
strip=True,
|
||||||
trim=True,
|
trim=True,
|
||||||
input_file="{}[0]".format(self.document_path),
|
input_file="{}[0]".format(document_path),
|
||||||
output_file=out_path,
|
output_file=out_path,
|
||||||
logging_group=self.logging_group)
|
logging_group=self.logging_group)
|
||||||
except ParseError:
|
except ParseError:
|
||||||
@ -51,7 +45,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"-q",
|
"-q",
|
||||||
"-sDEVICE=pngalpha",
|
"-sDEVICE=pngalpha",
|
||||||
"-o", gs_out_path,
|
"-o", gs_out_path,
|
||||||
self.document_path]
|
document_path]
|
||||||
if not subprocess.Popen(cmd).wait() == 0:
|
if not subprocess.Popen(cmd).wait() == 0:
|
||||||
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
|
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
|
||||||
# then run convert on the output from gs
|
# then run convert on the output from gs
|
||||||
@ -71,10 +65,11 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if self._text:
|
if self._text:
|
||||||
return self._text
|
return self._text
|
||||||
|
|
||||||
|
def parse(self, document_path, mime_type):
|
||||||
archive_path = os.path.join(self.tempdir, "archive.pdf")
|
archive_path = os.path.join(self.tempdir, "archive.pdf")
|
||||||
|
|
||||||
ocr_args = {
|
ocr_args = {
|
||||||
'input_file': self.document_path,
|
'input_file': document_path,
|
||||||
'output_file': archive_path,
|
'output_file': archive_path,
|
||||||
'use_threads': True,
|
'use_threads': True,
|
||||||
'jobs': settings.THREADS_PER_WORKER,
|
'jobs': settings.THREADS_PER_WORKER,
|
||||||
@ -96,17 +91,17 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
ocrmypdf.ocr(**ocr_args)
|
ocrmypdf.ocr(**ocr_args)
|
||||||
# success! announce that we have an archive document
|
# success! announce results
|
||||||
self._archive_path = archive_path
|
self.archive_path = archive_path
|
||||||
self._text = get_text_from_pdf(self._archive_path)
|
self.text = get_text_from_pdf(archive_path)
|
||||||
|
|
||||||
except InputFileError as e:
|
except InputFileError as e:
|
||||||
# This happens with some PDFs when used with the redo_ocr option.
|
# This happens with some PDFs when used with the redo_ocr option.
|
||||||
# This is not the end of the world, we'll just use what we already
|
# This is not the end of the world, we'll just use what we already
|
||||||
# have in the document.
|
# have in the document.
|
||||||
self._text = get_text_from_pdf(self.document_path)
|
self.text = get_text_from_pdf(document_path)
|
||||||
# Also, no archived file.
|
# Also, no archived file.
|
||||||
if not self._text:
|
if not self.text:
|
||||||
# However, if we don't have anything, fail:
|
# However, if we don't have anything, fail:
|
||||||
raise ParseError(e)
|
raise ParseError(e)
|
||||||
|
|
||||||
@ -114,27 +109,14 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
# Anything else is probably serious.
|
# Anything else is probably serious.
|
||||||
raise ParseError(e)
|
raise ParseError(e)
|
||||||
|
|
||||||
if not self._text:
|
if not self.text:
|
||||||
# This may happen for files that don't have any text.
|
# This may happen for files that don't have any text.
|
||||||
self.log(
|
self.log(
|
||||||
'warning',
|
'warning',
|
||||||
f"Document {self.document_path} does not have any text."
|
f"Document {document_path} does not have any text."
|
||||||
f"This is probably an error or you tried to add an image "
|
f"This is probably an error or you tried to add an image "
|
||||||
f"without text.")
|
f"without text.")
|
||||||
return ""
|
self.text = ""
|
||||||
|
|
||||||
return self._text
|
|
||||||
|
|
||||||
def get_archive_path(self):
|
|
||||||
return self._archive_path
|
|
||||||
|
|
||||||
def _guess_language(self, text):
|
|
||||||
try:
|
|
||||||
guess = langdetect.detect(text)
|
|
||||||
return guess
|
|
||||||
except Exception as e:
|
|
||||||
self.log('warning', f"Language detection failed with: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def strip_excess_whitespace(text):
|
def strip_excess_whitespace(text):
|
||||||
|
@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser):
|
|||||||
This parser directly parses a text document (.txt, .md, or .csv)
|
This parser directly parses a text document (.txt, .md, or .csv)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, path, logging_group):
|
def get_thumbnail(self, document_path, mime_type):
|
||||||
super().__init__(path, logging_group)
|
|
||||||
self._text = None
|
|
||||||
|
|
||||||
def get_thumbnail(self):
|
|
||||||
"""
|
"""
|
||||||
The thumbnail of a text file is just a 500px wide image of the text
|
The thumbnail of a text file is just a 500px wide image of the text
|
||||||
rendered onto a letter-sized page.
|
rendered onto a letter-sized page.
|
||||||
@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def read_text():
|
def read_text():
|
||||||
with open(self.document_path, 'r') as src:
|
with open(document_path, 'r') as src:
|
||||||
lines = [line.strip() for line in src.readlines()]
|
lines = [line.strip() for line in src.readlines()]
|
||||||
text = "\n".join([line for line in lines[:n_lines]])
|
text = "\n".join([line for line in lines[:n_lines]])
|
||||||
return text.replace('"', "'")
|
return text.replace('"', "'")
|
||||||
@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
return out_path
|
return out_path
|
||||||
|
|
||||||
def get_text(self):
|
def parse(self, document_path, mime_type):
|
||||||
|
with open(document_path, 'r') as f:
|
||||||
if self._text is not None:
|
self.text = f.read()
|
||||||
return self._text
|
|
||||||
|
|
||||||
with open(self.document_path, 'r') as f:
|
|
||||||
self._text = f.read()
|
|
||||||
|
|
||||||
return self._text
|
|
||||||
|
|
||||||
|
|
||||||
def run_command(*args):
|
def run_command(*args):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user