mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
made unpaper and convert a little bit nicer to interact with
This commit is contained in:
parent
c28b636ffa
commit
3a08a2d206
@ -29,6 +29,46 @@ DATE_REGEX = re.compile(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def run_convert(input, output, density=None, scale=None, alpha=None,
                strip=False, trim=False, type=None, depth=None, extra=None,
                logging_group=None):
    """
    Invoke the binary named by ``settings.CONVERT_BINARY`` on ``input``,
    writing to ``output``.

    Each option is only put on the command line when its argument is truthy:

    * ``density``, ``scale``, ``alpha``, ``type`` and ``depth`` become
      ``-<name> <value>`` pairs (the value is passed through ``str()``).
    * ``strip`` and ``trim`` become the bare ``-strip`` / ``-trim`` switches.

    ``extra`` is accepted but is not read anywhere in this function.
    ``logging_group`` is attached to the debug log record as its ``group``
    extra.

    Raises ``ParseError`` when the process exits with a non-zero status.
    """
    # The child inherits our environment, plus the MAGICK_* overrides for
    # whichever of the two settings is set.
    child_env = os.environ.copy()
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, configured in overrides:
        if configured:
            child_env[variable] = configured

    # Options are emitted in a fixed order: density, scale, alpha, strip,
    # trim, type, depth, and finally the input and output paths.
    command = [settings.CONVERT_BINARY]
    for flag, value in (("-density", density),
                        ("-scale", scale),
                        ("-alpha", alpha)):
        if value:
            command.extend((flag, str(value)))
    if strip:
        command.append("-strip")
    if trim:
        command.append("-trim")
    for flag, value in (("-type", type), ("-depth", depth)):
        if value:
            command.extend((flag, str(value)))
    command.extend((input, output))

    logger.debug("Execute: " + " ".join(command),
                 extra={"group": logging_group})

    exit_code = subprocess.Popen(command, env=child_env).wait()
    if exit_code != 0:
        raise ParseError("Convert failed at {}".format(command))
|
||||||
|
|
||||||
|
|
||||||
|
def run_unpaper(pnm, logging_group=None):
    """
    Run the binary named by ``settings.UNPAPER_BINARY`` on ``pnm`` and
    return the path of the file it was asked to write.

    For a path ending in ``.pnm`` the output path is the same path with the
    extension changed to ``.unpaper.pnm``; any other path gets
    ``.unpaper.pnm`` appended.

    ``logging_group`` is attached to the debug log record as its ``group``
    extra.

    Raises ``ParseError`` when the process exits with a non-zero status.
    """
    suffix = ".pnm"
    # Only the trailing extension is rewritten. A plain str.replace() would
    # also rewrite a ".pnm" occurring earlier in the path (e.g. in a
    # directory name), and would leave the output equal to the input when
    # the path contains no ".pnm" at all -- which, with --overwrite, makes
    # the tool write onto the file it is reading.
    if pnm.endswith(suffix):
        pnm_out = pnm[:-len(suffix)] + ".unpaper" + suffix
    else:
        pnm_out = pnm + ".unpaper" + suffix

    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                    pnm_out)

    logger.debug("Execute: " + " ".join(command_args),
                 extra={'group': logging_group})

    if subprocess.Popen(command_args).wait() != 0:
        raise ParseError("Unpaper failed at {}".format(command_args))

    return pnm_out
|
||||||
|
|
||||||
|
|
||||||
class ParseError(Exception):
    """
    Raised by the parsing helpers when an external command they launch
    exits with a non-zero status.
    """
|
||||||
|
|
||||||
@ -56,6 +96,9 @@ class DocumentParser:
|
|||||||
out_path = os.path.join(self.tempdir, "optipng.png")
|
out_path = os.path.join(self.tempdir, "optipng.png")
|
||||||
|
|
||||||
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
|
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
|
||||||
|
|
||||||
|
self.log('debug', 'Execute: ' + " ".join(args))
|
||||||
|
|
||||||
if not subprocess.Popen(args).wait() == 0:
|
if not subprocess.Popen(args).wait() == 0:
|
||||||
raise ParseError("Optipng failed at {}".format(args))
|
raise ParseError("Optipng failed at {}".format(args))
|
||||||
|
|
||||||
|
@ -11,7 +11,8 @@ from PIL import Image
|
|||||||
from pyocr import PyocrException
|
from pyocr import PyocrException
|
||||||
|
|
||||||
import pdftotext
|
import pdftotext
|
||||||
from documents.parsers import DocumentParser, ParseError
|
from documents.parsers import DocumentParser, ParseError, run_unpaper, \
|
||||||
|
run_convert
|
||||||
|
|
||||||
from .languages import ISO639
|
from .languages import ISO639
|
||||||
|
|
||||||
@ -39,15 +40,14 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
# Run convert to get a decent thumbnail
|
# Run convert to get a decent thumbnail
|
||||||
try:
|
try:
|
||||||
run_convert(
|
run_convert(density=300,
|
||||||
settings.CONVERT_BINARY,
|
scale="500x5000>",
|
||||||
"-density", "300",
|
alpha="remove",
|
||||||
"-scale", "500x5000>",
|
strip=True,
|
||||||
"-alpha", "remove",
|
trim=True,
|
||||||
"-strip", "-trim",
|
input="{}[0]".format(self.document_path),
|
||||||
"{}[0]".format(self.document_path),
|
output=out_path,
|
||||||
out_path
|
logging_group=self.logging_group)
|
||||||
)
|
|
||||||
except ParseError:
|
except ParseError:
|
||||||
# if convert fails, fall back to extracting
|
# if convert fails, fall back to extracting
|
||||||
# the first PDF page as a PNG using Ghostscript
|
# the first PDF page as a PNG using Ghostscript
|
||||||
@ -61,15 +61,14 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if not subprocess.Popen(cmd).wait() == 0:
|
if not subprocess.Popen(cmd).wait() == 0:
|
||||||
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
|
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
|
||||||
# then run convert on the output from gs
|
# then run convert on the output from gs
|
||||||
run_convert(
|
run_convert(density=300,
|
||||||
settings.CONVERT_BINARY,
|
scale="500x5000>",
|
||||||
"-density", "300",
|
alpha="remove",
|
||||||
"-scale", "500x5000>",
|
strip=True,
|
||||||
"-alpha", "remove",
|
trim=True,
|
||||||
"-strip", "-trim",
|
input=gs_out_path,
|
||||||
gs_out_path,
|
output=out_path,
|
||||||
out_path
|
logging_group=self.logging_group)
|
||||||
)
|
|
||||||
|
|
||||||
return out_path
|
return out_path
|
||||||
|
|
||||||
@ -107,12 +106,15 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if not guessed_language or guessed_language not in ISO639:
|
if not guessed_language or guessed_language not in ISO639:
|
||||||
self.log("warning", "Language detection failed.")
|
self.log("warning", "Language detection failed.")
|
||||||
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
||||||
|
|
||||||
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
|
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
|
||||||
self.log("info", "Detected language: {} (default language)".format(guessed_language))
|
self.log("info", "Detected language: {} (default language)".format(guessed_language))
|
||||||
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
||||||
|
|
||||||
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
|
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
|
||||||
self.log("warning", "Detected language {} is not available on this system.".format(guessed_language))
|
self.log("warning", "Detected language {} is not available on this system.".format(guessed_language))
|
||||||
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.log("info", "Detected language: {}".format(guessed_language))
|
self.log("info", "Detected language: {}".format(guessed_language))
|
||||||
ocr_pages = self._ocr(images, ISO639[guessed_language])
|
ocr_pages = self._ocr(images, ISO639[guessed_language])
|
||||||
@ -133,13 +135,13 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
# Convert PDF to multiple PNMs
|
# Convert PDF to multiple PNMs
|
||||||
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
|
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
|
||||||
run_convert(
|
|
||||||
settings.CONVERT_BINARY,
|
run_convert(density=settings.CONVERT_DENSITY,
|
||||||
"-density", str(settings.CONVERT_DENSITY),
|
depth="8",
|
||||||
"-depth", "8",
|
type="grayscale",
|
||||||
"-type", "grayscale",
|
input=self.document_path,
|
||||||
self.document_path, pnm,
|
output=pnm,
|
||||||
)
|
logging_group=self.logging_group)
|
||||||
|
|
||||||
# Get a list of converted images
|
# Get a list of converted images
|
||||||
pnms = []
|
pnms = []
|
||||||
@ -187,27 +189,6 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
return [sample_page]
|
return [sample_page]
|
||||||
|
|
||||||
|
|
||||||
def run_convert(*args):
    """
    Run the command given by ``args`` (the full argument vector, program
    first) and raise ``ParseError`` if it exits with a non-zero status.

    The child process inherits the current environment, with
    ``MAGICK_MEMORY_LIMIT`` and ``MAGICK_TMPDIR`` added when
    ``settings.CONVERT_MEMORY_LIMIT`` / ``settings.CONVERT_TMPDIR`` are set.
    """
    child_env = dict(os.environ)

    memory_limit = settings.CONVERT_MEMORY_LIMIT
    if memory_limit:
        child_env["MAGICK_MEMORY_LIMIT"] = memory_limit

    scratch_dir = settings.CONVERT_TMPDIR
    if scratch_dir:
        child_env["MAGICK_TMPDIR"] = scratch_dir

    exit_code = subprocess.Popen(args, env=child_env).wait()
    if exit_code != 0:
        raise ParseError("Convert failed at {}".format(args))
|
|
||||||
|
|
||||||
|
|
||||||
def run_unpaper(pnm):
    """
    Run the binary named by ``settings.UNPAPER_BINARY`` on ``pnm`` and
    return the path of the file it was asked to write.

    For a path ending in ``.pnm`` the output path is the same path with the
    extension changed to ``.unpaper.pnm``; any other path gets
    ``.unpaper.pnm`` appended.

    Raises ``ParseError`` when the process exits with a non-zero status.
    """
    suffix = ".pnm"
    # Rewrite only the trailing extension: str.replace() would also touch a
    # ".pnm" earlier in the path, and would return the input path unchanged
    # (so --overwrite clobbers the source) when there is no ".pnm" at all.
    if pnm.endswith(suffix):
        pnm_out = pnm[:-len(suffix)] + ".unpaper" + suffix
    else:
        pnm_out = pnm + ".unpaper" + suffix

    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                    pnm_out)

    if subprocess.Popen(command_args).wait() != 0:
        raise ParseError("Unpaper failed at {}".format(command_args))

    return pnm_out
|
|
||||||
|
|
||||||
|
|
||||||
def strip_excess_whitespace(text):
|
def strip_excess_whitespace(text):
|
||||||
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
|
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user