mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Add unpaper as another pre-processing step
This commit is contained in:
parent
0e2d8cc0d8
commit
fb36a49c26
@ -5,7 +5,7 @@ MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
|
|||||||
RUN apt-get update \
|
RUN apt-get update \
|
||||||
&& apt-get install -y --no-install-recommends \
|
&& apt-get install -y --no-install-recommends \
|
||||||
sudo \
|
sudo \
|
||||||
tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \
|
tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Install python dependencies
|
# Install python dependencies
|
||||||
|
@ -10,11 +10,13 @@ should work) that has the following software installed on it:
|
|||||||
* `GNU Privacy Guard`_
|
* `GNU Privacy Guard`_
|
||||||
* `Tesseract`_
|
* `Tesseract`_
|
||||||
* `Imagemagick`_
|
* `Imagemagick`_
|
||||||
|
* `unpaper`_
|
||||||
|
|
||||||
.. _Python3: https://python.org/
|
.. _Python3: https://python.org/
|
||||||
.. _GNU Privacy Guard: https://gnupg.org
|
.. _GNU Privacy Guard: https://gnupg.org
|
||||||
.. _Tesseract: https://github.com/tesseract-ocr
|
.. _Tesseract: https://github.com/tesseract-ocr
|
||||||
.. _Imagemagick: http://imagemagick.org/
|
.. _Imagemagick: http://imagemagick.org/
|
||||||
|
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
|
||||||
|
|
||||||
Notably, you should confirm how you access your Python3 installation. Many
|
Notably, you should confirm how you access your Python3 installation. Many
|
||||||
Linux distributions will install Python3 in parallel to Python2, using the names
|
Linux distributions will install Python3 in parallel to Python2, using the names
|
||||||
|
@ -5,7 +5,7 @@ apt-get update
|
|||||||
apt-get build-dep -y python-imaging
|
apt-get build-dep -y python-imaging
|
||||||
apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
|
apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
|
||||||
apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
|
apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
|
||||||
apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick
|
apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper
|
||||||
|
|
||||||
# Python dependencies
|
# Python dependencies
|
||||||
pip3 install -r /opt/paperless/requirements.txt
|
pip3 install -r /opt/paperless/requirements.txt
|
||||||
|
@ -39,8 +39,8 @@ class ConsumerError(Exception):
|
|||||||
class Consumer(object):
|
class Consumer(object):
|
||||||
"""
|
"""
|
||||||
Loop over every file found in CONSUMPTION_DIR and:
|
Loop over every file found in CONSUMPTION_DIR and:
|
||||||
1. Convert it to a greyscale png
|
1. Convert it to a greyscale pnm
|
||||||
2. Use tesseract on the png
|
2. Use tesseract on the pnm
|
||||||
3. Encrypt and store the document in the MEDIA_ROOT
|
3. Encrypt and store the document in the MEDIA_ROOT
|
||||||
4. Store the OCR'd text in the database
|
4. Store the OCR'd text in the database
|
||||||
5. Delete the document and image(s)
|
5. Delete the document and image(s)
|
||||||
@ -48,6 +48,7 @@ class Consumer(object):
|
|||||||
|
|
||||||
SCRATCH = settings.SCRATCH_DIR
|
SCRATCH = settings.SCRATCH_DIR
|
||||||
CONVERT = settings.CONVERT_BINARY
|
CONVERT = settings.CONVERT_BINARY
|
||||||
|
UNPAPER = settings.UNPAPER_BINARY
|
||||||
CONSUME = settings.CONSUMPTION_DIR
|
CONSUME = settings.CONSUMPTION_DIR
|
||||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||||
|
|
||||||
@ -118,11 +119,11 @@ class Consumer(object):
|
|||||||
self.log("info", "Consuming {}".format(doc))
|
self.log("info", "Consuming {}".format(doc))
|
||||||
|
|
||||||
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||||
pngs = self._get_greyscale(tempdir, doc)
|
imgs = self._get_greyscale(tempdir, doc)
|
||||||
thumbnail = self._get_thumbnail(tempdir, doc)
|
thumbnail = self._get_thumbnail(tempdir, doc)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
text = self._get_ocr(pngs)
|
text = self._get_ocr(imgs)
|
||||||
self._store(text, doc, thumbnail)
|
self._store(text, doc, thumbnail)
|
||||||
except OCRError as e:
|
except OCRError as e:
|
||||||
self._ignore.append(doc)
|
self._ignore.append(doc)
|
||||||
@ -140,19 +141,30 @@ class Consumer(object):
|
|||||||
|
|
||||||
self.log("info", "Generating greyscale image from {}".format(doc))
|
self.log("info", "Generating greyscale image from {}".format(doc))
|
||||||
|
|
||||||
png = os.path.join(tempdir, "convert-%04d.jpg")
|
# Convert PDF to multiple PNMs
|
||||||
|
pnm = os.path.join(tempdir, "convert-%04d.pnm")
|
||||||
subprocess.Popen((
|
subprocess.Popen((
|
||||||
self.CONVERT, "-density", "300", "-depth", "8",
|
self.CONVERT, "-density", "300", "-depth", "8",
|
||||||
"-type", "grayscale", doc, png
|
"-type", "grayscale", doc, pnm
|
||||||
)).wait()
|
)).wait()
|
||||||
|
|
||||||
pngs = []
|
# Get a list of converted images
|
||||||
|
pnms = []
|
||||||
for f in os.listdir(tempdir):
|
for f in os.listdir(tempdir):
|
||||||
if f.startswith("convert"):
|
if f.endswith(".pnm"):
|
||||||
pngs.append(os.path.join(tempdir, f))
|
pnms.append(os.path.join(tempdir, f))
|
||||||
|
|
||||||
return sorted(filter(lambda __: os.path.isfile(__), pngs))
|
# Run unpaper in parallel on converted images
|
||||||
|
with Pool(processes=self.THREADS) as pool:
|
||||||
|
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
|
||||||
|
|
||||||
|
# Return list of converted images, processed with unpaper
|
||||||
|
pnms = []
|
||||||
|
for f in os.listdir(tempdir):
|
||||||
|
if f.endswith(".unpaper.pnm"):
|
||||||
|
pnms.append(os.path.join(tempdir, f))
|
||||||
|
|
||||||
|
return sorted(filter(lambda __: os.path.isfile(__), pnms))
|
||||||
|
|
||||||
def _get_thumbnail(self, tempdir, doc):
|
def _get_thumbnail(self, tempdir, doc):
|
||||||
"""
|
"""
|
||||||
@ -179,21 +191,21 @@ class Consumer(object):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log("warning", "Language detection error: {}".format(e))
|
self.log("warning", "Language detection error: {}".format(e))
|
||||||
|
|
||||||
def _get_ocr(self, pngs):
|
def _get_ocr(self, imgs):
|
||||||
"""
|
"""
|
||||||
Attempts to do the best job possible OCR'ing the document based on
|
Attempts to do the best job possible OCR'ing the document based on
|
||||||
simple language detection trial & error.
|
simple language detection trial & error.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not pngs:
|
if not imgs:
|
||||||
raise OCRError("No images found")
|
raise OCRError("No images found")
|
||||||
|
|
||||||
self.log("info", "OCRing the document")
|
self.log("info", "OCRing the document")
|
||||||
|
|
||||||
# Since the division gets rounded down by int, this calculation works
|
# Since the division gets rounded down by int, this calculation works
|
||||||
# for every edge-case, i.e. 1
|
# for every edge-case, i.e. 1
|
||||||
middle = int(len(pngs) / 2)
|
middle = int(len(imgs) / 2)
|
||||||
raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
|
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
|
||||||
|
|
||||||
guessed_language = self._guess_language(raw_text)
|
guessed_language = self._guess_language(raw_text)
|
||||||
|
|
||||||
@ -205,16 +217,16 @@ class Consumer(object):
|
|||||||
"As FORGIVING_OCR is enabled, we're going to make the "
|
"As FORGIVING_OCR is enabled, we're going to make the "
|
||||||
"best with what we have."
|
"best with what we have."
|
||||||
)
|
)
|
||||||
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
|
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||||
return raw_text
|
return raw_text
|
||||||
raise OCRError("Language detection failed")
|
raise OCRError("Language detection failed")
|
||||||
|
|
||||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
||||||
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
|
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||||
return raw_text
|
return raw_text
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return self._ocr(pngs, ISO639[guessed_language])
|
return self._ocr(imgs, ISO639[guessed_language])
|
||||||
except pyocr.pyocr.tesseract.TesseractError:
|
except pyocr.pyocr.tesseract.TesseractError:
|
||||||
if settings.FORGIVING_OCR:
|
if settings.FORGIVING_OCR:
|
||||||
self.log(
|
self.log(
|
||||||
@ -224,34 +236,34 @@ class Consumer(object):
|
|||||||
guessed_language
|
guessed_language
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
|
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||||
return raw_text
|
return raw_text
|
||||||
raise OCRError(
|
raise OCRError(
|
||||||
"The guessed language is not available in this instance of "
|
"The guessed language is not available in this instance of "
|
||||||
"Tesseract."
|
"Tesseract."
|
||||||
)
|
)
|
||||||
|
|
||||||
def _assemble_ocr_sections(self, pngs, middle, text):
|
def _assemble_ocr_sections(self, imgs, middle, text):
|
||||||
"""
|
"""
|
||||||
Given a `middle` value and the text that middle page represents, we OCR
|
Given a `middle` value and the text that middle page represents, we OCR
|
||||||
the remainder of the document and return the whole thing.
|
the remainder of the document and return the whole thing.
|
||||||
"""
|
"""
|
||||||
text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
|
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
|
||||||
text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
|
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _ocr(self, pngs, lang):
|
def _ocr(self, imgs, lang):
|
||||||
"""
|
"""
|
||||||
Performs a single OCR attempt.
|
Performs a single OCR attempt.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not pngs:
|
if not imgs:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
self.log("info", "Parsing for {}".format(lang))
|
self.log("info", "Parsing for {}".format(lang))
|
||||||
|
|
||||||
with Pool(processes=self.THREADS) as pool:
|
with Pool(processes=self.THREADS) as pool:
|
||||||
r = pool.map(image_to_string, itertools.product(pngs, [lang]))
|
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||||
r = " ".join(r)
|
r = " ".join(r)
|
||||||
|
|
||||||
# Strip out excess white space to allow matching to go smoother
|
# Strip out excess white space to allow matching to go smoother
|
||||||
@ -374,16 +386,9 @@ class Consumer(object):
|
|||||||
|
|
||||||
|
|
||||||
def image_to_string(args):
|
def image_to_string(args):
|
||||||
"""
|
img, lang = args
|
||||||
I have no idea why, but if this function were a method of Consumer, it
|
|
||||||
would explode with:
|
|
||||||
|
|
||||||
`TypeError: cannot serialize '_io.TextIOWrapper' object`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
png, lang = args
|
|
||||||
ocr = pyocr.get_available_tools()[0]
|
ocr = pyocr.get_available_tools()[0]
|
||||||
with Image.open(os.path.join(Consumer.SCRATCH, png)) as f:
|
with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
|
||||||
if ocr.can_detect_orientation():
|
if ocr.can_detect_orientation():
|
||||||
try:
|
try:
|
||||||
orientation = ocr.detect_orientation(f, lang=lang)
|
orientation = ocr.detect_orientation(f, lang=lang)
|
||||||
@ -391,3 +396,10 @@ def image_to_string(args):
|
|||||||
except TesseractError:
|
except TesseractError:
|
||||||
pass
|
pass
|
||||||
return ocr.image_to_string(f, lang=lang)
|
return ocr.image_to_string(f, lang=lang)
|
||||||
|
|
||||||
|
|
||||||
|
def run_unpaper(args):
    """
    Run unpaper on a single image, writing the result next to the input.

    Takes a single tuple so that it can be handed to ``Pool.map``:

    * ``unpaper``: the unpaper executable to invoke
    * ``pnm``: path of the image to process

    The output is written to the same path with its trailing ``.pnm``
    extension replaced by ``.unpaper.pnm``.  The exit status of unpaper is
    not checked; nothing is returned.
    """

    unpaper, pnm = args

    # Only swap the trailing extension.  A plain str.replace() would also
    # rewrite any ".pnm" occurring earlier in the path (e.g. inside the name
    # of the scratch directory) and point the output at a directory that
    # does not exist.
    if pnm.endswith(".pnm"):
        unpapered = pnm[:-len(".pnm")] + ".unpaper.pnm"
    else:
        # Not expected from the caller; keep the previous behaviour.
        unpapered = pnm.replace(".pnm", ".unpaper.pnm")

    subprocess.Popen((unpaper, pnm, unpapered)).wait()
|
||||||
|
@ -189,6 +189,9 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
|
|||||||
# Convert is part of the ImageMagick package
|
# Convert is part of the ImageMagick package
|
||||||
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
|
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
|
||||||
|
|
||||||
|
# Unpaper is used to clean up the converted page images before they are OCR'd
|
||||||
|
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
|
||||||
|
|
||||||
# This will be created if it doesn't exist
|
# This will be created if it doesn't exist
|
||||||
SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
|
SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user