mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
new setting: PAPERLESS_OCR_PAGES
This commit is contained in:
parent
ea089de3b3
commit
fec9e54049
@ -26,6 +26,9 @@ next
|
|||||||
|
|
||||||
* Much better admin for mail rule editing.
|
* Much better admin for mail rule editing.
|
||||||
|
|
||||||
|
* New setting ``PAPERLESS_OCR_PAGES`` limits the tesseract parser
|
||||||
|
to the first n pages of scanned documents.
|
||||||
|
|
||||||
|
|
||||||
paperless-ng 0.9.1
|
paperless-ng 0.9.1
|
||||||
##################
|
##################
|
||||||
|
@ -184,6 +184,16 @@ PAPERLESS_TIME_ZONE=<timezone>
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
PAPERLESS_OCR_PAGES=<num>
|
||||||
|
Tells paperless to use only the specified number of pages for OCR. Documents
|
||||||
|
with fewer than the specified number of pages get OCR'ed completely.
|
||||||
|
|
||||||
|
Specifying 1 here will only use the first page.
|
||||||
|
|
||||||
|
Defaults to 0, which disables this feature and always uses all pages.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PAPERLESS_OCR_LANGUAGE=<lang>
|
PAPERLESS_OCR_LANGUAGE=<lang>
|
||||||
Customize the default language that tesseract will attempt to use when
|
Customize the default language that tesseract will attempt to use when
|
||||||
parsing documents. The default language is used whenever
|
parsing documents. The default language is used whenever
|
||||||
|
@ -358,4 +358,30 @@ management commands as below.
|
|||||||
7. Start paperless.
|
7. Start paperless.
|
||||||
|
|
||||||
|
|
||||||
|
Considerations for less powerful devices
|
||||||
|
########################################
|
||||||
|
|
||||||
|
Paperless runs on Raspberry Pi. However, some things are rather slow on the Pi and
|
||||||
|
configuring some options in paperless can help improve performance immensely:
|
||||||
|
|
||||||
|
* Consider setting ``PAPERLESS_OCR_PAGES`` to 1, so that paperless will only OCR
|
||||||
|
the first page of your documents.
|
||||||
|
* ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured
|
||||||
|
to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that
|
||||||
|
paperless will use 2 workers and 2 threads per worker. This may result in
|
||||||
|
sluggish response times during consumption, so you might want to lower these
|
||||||
|
settings (example: 2 workers and 1 thread to always have some computing power
|
||||||
|
left for other tasks).
|
||||||
|
* Keep ``PAPERLESS_OCR_ALWAYS`` at its default value 'false' and consider OCR'ing
|
||||||
|
your documents before feeding them into paperless. Some scanners are able to
|
||||||
|
do this!
|
||||||
|
* Lower ``PAPERLESS_CONVERT_DENSITY`` from its default value 300 to 200. This
|
||||||
|
will still result in rather accurate OCR, but will decrease consumption time
|
||||||
|
by quite a bit.
|
||||||
|
* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
|
||||||
|
times. Thumbnails will be about 20% larger.
|
||||||
|
|
||||||
|
For details, refer to :ref:`configuration`.
|
||||||
|
|
||||||
|
|
||||||
.. _redis: https://redis.io/
|
.. _redis: https://redis.io/
|
||||||
|
@ -35,6 +35,7 @@
|
|||||||
#PAPERLESS_TASK_WORKERS=1
|
#PAPERLESS_TASK_WORKERS=1
|
||||||
#PAPERLESS_THREADS_PER_WORKER=1
|
#PAPERLESS_THREADS_PER_WORKER=1
|
||||||
#PAPERLESS_TIME_ZONE=UTC
|
#PAPERLESS_TIME_ZONE=UTC
|
||||||
|
#PAPERLESS_OCR_PAGES=1
|
||||||
#PAPERLESS_OCR_LANGUAGE=eng
|
#PAPERLESS_OCR_LANGUAGE=eng
|
||||||
#PAPERLESS_OCR_ALWAYS=false
|
#PAPERLESS_OCR_ALWAYS=false
|
||||||
#PAPERLESS_CONSUMER_POLLING=10
|
#PAPERLESS_CONSUMER_POLLING=10
|
||||||
|
@ -322,6 +322,8 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES
|
|||||||
|
|
||||||
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
|
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
|
||||||
|
|
||||||
|
OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
|
||||||
|
|
||||||
# The default language that tesseract will attempt to use when parsing
|
# The default language that tesseract will attempt to use when parsing
|
||||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||||
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||||
|
@ -147,18 +147,25 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
Greyscale images are easier for Tesseract to OCR
|
Greyscale images are easier for Tesseract to OCR
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Convert PDF to multiple PNMs
|
||||||
|
input_file = self.document_path
|
||||||
|
|
||||||
|
if settings.OCR_PAGES == 1:
|
||||||
|
input_file += "[0]"
|
||||||
|
elif settings.OCR_PAGES > 1:
|
||||||
|
input_file += f"[0-{settings.OCR_PAGES - 1}]"
|
||||||
|
|
||||||
self.log(
|
self.log(
|
||||||
"debug",
|
"debug",
|
||||||
f"Converting document {self.document_path} into greyscale images")
|
f"Converting document {input_file} into greyscale images")
|
||||||
|
|
||||||
# Convert PDF to multiple PNMs
|
output_files = os.path.join(self.tempdir, "convert-%04d.pnm")
|
||||||
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
|
|
||||||
|
|
||||||
run_convert(density=settings.CONVERT_DENSITY,
|
run_convert(density=settings.CONVERT_DENSITY,
|
||||||
depth="8",
|
depth="8",
|
||||||
type="grayscale",
|
type="grayscale",
|
||||||
input_file=self.document_path,
|
input_file=input_file,
|
||||||
output_file=pnm,
|
output_file=output_files,
|
||||||
logging_group=self.logging_group)
|
logging_group=self.logging_group)
|
||||||
|
|
||||||
# Get a list of converted images
|
# Get a list of converted images
|
||||||
|
Loading…
x
Reference in New Issue
Block a user