diff --git a/docs/changelog.rst b/docs/changelog.rst index 2af97b33b..bb119bf1f 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -26,6 +26,9 @@ next * Much better admin for mail rule editing. +* New setting ``PAPERLESS_OCR_PAGES`` limits the tesseract parser + to the first n pages of scanned documents. + paperless-ng 0.9.1 ################## diff --git a/docs/configuration.rst b/docs/configuration.rst index 1ddd7ca0e..afb0b5f90 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -184,6 +184,16 @@ PAPERLESS_TIME_ZONE= +PAPERLESS_OCR_PAGES= + Tells paperless to use only the specified number of pages for OCR. Documents + with fewer than the specified number of pages get OCR'ed completely. + + Specifying 1 here will only use the first page. + + Defaults to 0, which disables this feature and always uses all pages. + + + PAPERLESS_OCR_LANGUAGE= Customize the default language that tesseract will attempt to use when parsing documents. The default language is used whenever diff --git a/docs/setup.rst b/docs/setup.rst index 5520f5594..dff605889 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -358,4 +358,30 @@ management commands as below. 7. Start paperless. +Considerations for less powerful devices +######################################## + +Paperless runs on Raspberry Pi. However, some things are rather slow on the Pi and +configuring some options in paperless can help improve performance immensely: + +* Consider setting ``PAPERLESS_OCR_PAGES`` to 1, so that paperless will only OCR + the first page of your documents. +* ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured + to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that + paperless will use 2 workers and 2 threads per worker. This may result in + sluggish response times during consumption, so you might want to lower these + settings (example: 2 workers and 1 thread to always have some computing power + left for other tasks).
+* Keep ``PAPERLESS_OCR_ALWAYS`` at its default value 'false' and consider OCR'ing + your documents before feeding them into paperless. Some scanners are able to + do this! +* Lower ``PAPERLESS_CONVERT_DENSITY`` from its default value 300 to 200. This + will still result in rather accurate OCR, but will decrease consumption time + by quite a bit. +* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption + times. Thumbnails will be about 20% larger. + +For details, refer to :ref:`configuration`. + + .. _redis: https://redis.io/ diff --git a/paperless.conf.example b/paperless.conf.example index e1fd17a77..4749151e7 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -35,6 +35,7 @@ #PAPERLESS_TASK_WORKERS=1 #PAPERLESS_THREADS_PER_WORKER=1 #PAPERLESS_TIME_ZONE=UTC +#PAPERLESS_OCR_PAGES=1 #PAPERLESS_OCR_LANGUAGE=eng #PAPERLESS_OCR_ALWAYS=false #PAPERLESS_CONSUMER_POLLING=10 diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 06895e92f..0d64efa57 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -322,6 +322,8 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") +OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) + # The default language that tesseract will attempt to use when parsing # documents. It should be a 3-letter language code consistent with ISO 639. 
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index c9e77486e..b8320a4f0 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -147,18 +147,25 @@ class RasterisedDocumentParser(DocumentParser): Greyscale images are easier for Tesseract to OCR """ + # Convert PDF to multiple PNMs + input_file = self.document_path + + if settings.OCR_PAGES == 1: + input_file += "[0]" + elif settings.OCR_PAGES > 1: + input_file += f"[0-{settings.OCR_PAGES - 1}]" + self.log( "debug", - f"Converting document {self.document_path} into greyscale images") + f"Converting document {input_file} into greyscale images") - # Convert PDF to multiple PNMs - pnm = os.path.join(self.tempdir, "convert-%04d.pnm") + output_files = os.path.join(self.tempdir, "convert-%04d.pnm") run_convert(density=settings.CONVERT_DENSITY, depth="8", type="grayscale", - input_file=self.document_path, - output_file=pnm, + input_file=input_file, + output_file=output_files, logging_group=self.logging_group) # Get a list of converted images