Add unpaper as another pre-processing step

This commit is contained in:
Pit Kleyersburg 2016-02-16 10:49:55 +01:00
parent 0e2d8cc0d8
commit fb36a49c26
5 changed files with 53 additions and 36 deletions

View File

@ -5,7 +5,7 @@ MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y --no-install-recommends \ && apt-get install -y --no-install-recommends \
sudo \ sudo \
tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \ tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Install python dependencies # Install python dependencies

View File

@ -10,11 +10,13 @@ should work) that has the following software installed on it:
* `GNU Privacy Guard`_ * `GNU Privacy Guard`_
* `Tesseract`_ * `Tesseract`_
* `Imagemagick`_ * `Imagemagick`_
* `unpaper`_
.. _Python3: https://python.org/ .. _Python3: https://python.org/
.. _GNU Privacy Guard: https://gnupg.org .. _GNU Privacy Guard: https://gnupg.org
.. _Tesseract: https://github.com/tesseract-ocr .. _Tesseract: https://github.com/tesseract-ocr
.. _Imagemagick: http://imagemagick.org/ .. _Imagemagick: http://imagemagick.org/
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
Notably, you should confirm how you access your Python3 installation. Many Notably, you should confirm how you access your Python3 installation. Many
Linux distributions will install Python3 in parallel to Python2, using the names Linux distributions will install Python3 in parallel to Python2, using the names

View File

@ -5,7 +5,7 @@ apt-get update
apt-get build-dep -y python-imaging apt-get build-dep -y python-imaging
apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper
# Python dependencies # Python dependencies
pip3 install -r /opt/paperless/requirements.txt pip3 install -r /opt/paperless/requirements.txt

View File

@ -39,8 +39,8 @@ class ConsumerError(Exception):
class Consumer(object): class Consumer(object):
""" """
Loop over every file found in CONSUMPTION_DIR and: Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale png 1. Convert it to a greyscale pnm
2. Use tesseract on the png 2. Use tesseract on the pnm
3. Encrypt and store the document in the MEDIA_ROOT 3. Encrypt and store the document in the MEDIA_ROOT
4. Store the OCR'd text in the database 4. Store the OCR'd text in the database
5. Delete the document and image(s) 5. Delete the document and image(s)
@ -48,6 +48,7 @@ class Consumer(object):
SCRATCH = settings.SCRATCH_DIR SCRATCH = settings.SCRATCH_DIR
CONVERT = settings.CONVERT_BINARY CONVERT = settings.CONVERT_BINARY
UNPAPER = settings.UNPAPER_BINARY
CONSUME = settings.CONSUMPTION_DIR CONSUME = settings.CONSUMPTION_DIR
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
@ -118,11 +119,11 @@ class Consumer(object):
self.log("info", "Consuming {}".format(doc)) self.log("info", "Consuming {}".format(doc))
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
pngs = self._get_greyscale(tempdir, doc) imgs = self._get_greyscale(tempdir, doc)
thumbnail = self._get_thumbnail(tempdir, doc) thumbnail = self._get_thumbnail(tempdir, doc)
try: try:
text = self._get_ocr(pngs) text = self._get_ocr(imgs)
self._store(text, doc, thumbnail) self._store(text, doc, thumbnail)
except OCRError as e: except OCRError as e:
self._ignore.append(doc) self._ignore.append(doc)
@ -140,19 +141,30 @@ class Consumer(object):
self.log("info", "Generating greyscale image from {}".format(doc)) self.log("info", "Generating greyscale image from {}".format(doc))
png = os.path.join(tempdir, "convert-%04d.jpg") # Convert PDF to multiple PNMs
pnm = os.path.join(tempdir, "convert-%04d.pnm")
subprocess.Popen(( subprocess.Popen((
self.CONVERT, "-density", "300", "-depth", "8", self.CONVERT, "-density", "300", "-depth", "8",
"-type", "grayscale", doc, png "-type", "grayscale", doc, pnm
)).wait() )).wait()
pngs = [] # Get a list of converted images
pnms = []
for f in os.listdir(tempdir): for f in os.listdir(tempdir):
if f.startswith("convert"): if f.endswith(".pnm"):
pngs.append(os.path.join(tempdir, f)) pnms.append(os.path.join(tempdir, f))
return sorted(filter(lambda __: os.path.isfile(__), pngs)) # Run unpaper in parallel on converted images
with Pool(processes=self.THREADS) as pool:
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
# Return list of converted images, processed with unpaper
pnms = []
for f in os.listdir(tempdir):
if f.endswith(".unpaper.pnm"):
pnms.append(os.path.join(tempdir, f))
return sorted(filter(lambda __: os.path.isfile(__), pnms))
def _get_thumbnail(self, tempdir, doc): def _get_thumbnail(self, tempdir, doc):
""" """
@ -179,21 +191,21 @@ class Consumer(object):
except Exception as e: except Exception as e:
self.log("warning", "Language detection error: {}".format(e)) self.log("warning", "Language detection error: {}".format(e))
def _get_ocr(self, pngs): def _get_ocr(self, imgs):
""" """
Attempts to do the best job possible OCR'ing the document based on Attempts to do the best job possible OCR'ing the document based on
simple language detection trial & error. simple language detection trial & error.
""" """
if not pngs: if not imgs:
raise OCRError("No images found") raise OCRError("No images found")
self.log("info", "OCRing the document") self.log("info", "OCRing the document")
# Since the division gets rounded down by int, this calculation works # Since the division gets rounded down by int, this calculation works
# for every edge-case, i.e. 1 # for every edge-case, i.e. 1
middle = int(len(pngs) / 2) middle = int(len(imgs) / 2)
raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE) raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
guessed_language = self._guess_language(raw_text) guessed_language = self._guess_language(raw_text)
@ -205,16 +217,16 @@ class Consumer(object):
"As FORGIVING_OCR is enabled, we're going to make the " "As FORGIVING_OCR is enabled, we're going to make the "
"best with what we have." "best with what we have."
) )
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text return raw_text
raise OCRError("Language detection failed") raise OCRError("Language detection failed")
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text return raw_text
try: try:
return self._ocr(pngs, ISO639[guessed_language]) return self._ocr(imgs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError: except pyocr.pyocr.tesseract.TesseractError:
if settings.FORGIVING_OCR: if settings.FORGIVING_OCR:
self.log( self.log(
@ -224,34 +236,34 @@ class Consumer(object):
guessed_language guessed_language
) )
) )
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text return raw_text
raise OCRError( raise OCRError(
"The guessed language is not available in this instance of " "The guessed language is not available in this instance of "
"Tesseract." "Tesseract."
) )
def _assemble_ocr_sections(self, pngs, middle, text): def _assemble_ocr_sections(self, imgs, middle, text):
""" """
Given a `middle` value and the text that middle page represents, we OCR Given a `middle` value and the text that middle page represents, we OCR
the remainder of the document and return the whole thing. the remainder of the document and return the whole thing.
""" """
text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE) text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text return text
def _ocr(self, pngs, lang): def _ocr(self, imgs, lang):
""" """
Performs a single OCR attempt. Performs a single OCR attempt.
""" """
if not pngs: if not imgs:
return "" return ""
self.log("info", "Parsing for {}".format(lang)) self.log("info", "Parsing for {}".format(lang))
with Pool(processes=self.THREADS) as pool: with Pool(processes=self.THREADS) as pool:
r = pool.map(image_to_string, itertools.product(pngs, [lang])) r = pool.map(image_to_string, itertools.product(imgs, [lang]))
r = " ".join(r) r = " ".join(r)
# Strip out excess white space to allow matching to go smoother # Strip out excess white space to allow matching to go smoother
@ -374,16 +386,9 @@ class Consumer(object):
def image_to_string(args): def image_to_string(args):
""" img, lang = args
I have no idea why, but if this function were a method of Consumer, it
would explode with:
`TypeError: cannot serialize '_io.TextIOWrapper' object`.
"""
png, lang = args
ocr = pyocr.get_available_tools()[0] ocr = pyocr.get_available_tools()[0]
with Image.open(os.path.join(Consumer.SCRATCH, png)) as f: with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
if ocr.can_detect_orientation(): if ocr.can_detect_orientation():
try: try:
orientation = ocr.detect_orientation(f, lang=lang) orientation = ocr.detect_orientation(f, lang=lang)
@ -391,3 +396,10 @@ def image_to_string(args):
except TesseractError: except TesseractError:
pass pass
return ocr.image_to_string(f, lang=lang) return ocr.image_to_string(f, lang=lang)
def run_unpaper(args):
    """
    Run unpaper on a single image, writing the result next to the input.

    `args` is an `(unpaper_binary, image_path)` tuple.  It is taken as one
    packed argument so the function can be handed straight to `Pool.map`
    together with `itertools.product`.

    The output file is the input path with `.unpaper` inserted in front of
    the extension, e.g. `convert-0001.pnm` -> `convert-0001.unpaper.pnm`.
    Blocks until the unpaper process has exited; returns nothing.
    """
    unpaper, pnm = args

    # Only touch the trailing extension.  A plain str.replace(".pnm", ...)
    # would also rewrite a ".pnm" that happens to occur earlier in the path
    # (e.g. in a directory name) and point the output at a non-existent dir.
    root, ext = os.path.splitext(pnm)
    target = root + ".unpaper" + ext

    subprocess.call((unpaper, pnm, target))

View File

@ -189,6 +189,9 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
# Convert is part of the ImageMagick package # Convert is part of the ImageMagick package
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
# Unpaper is run on the converted page images as a pre-processing step before OCR
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
# This will be created if it doesn't exist # This will be created if it doesn't exist
SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless") SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")