diff --git a/docs/configuration.rst b/docs/configuration.rst index d4e7752ec..f86c991f7 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -152,6 +152,115 @@ PAPERLESS_AUTO_LOGIN_USERNAME= Defaults to none, which disables this feature. +OCR settings +############ + +Paperless uses `OCRmyPDF <https://ocrmypdf.readthedocs.io/en/latest/>`_ for +performing OCR on documents and images. Paperless uses sensible defaults for +most settings, but all of them can be configured to your needs. + + +PAPERLESS_OCR_LANGUAGE= + Customize the language that paperless will attempt to use when + parsing documents. + + It should be a 3-letter language code consistent with ISO + 639: https://www.loc.gov/standards/iso639-2/php/code_list.php + + Set this to the language most of your documents are written in. + + This can be a combination of multiple languages such as ``deu+eng``, + in which case tesseract will use whatever language matches best. + Keep in mind that tesseract uses much more cpu time with multiple + languages enabled. + + Defaults to "eng". + +PAPERLESS_OCR_MODE= + Tell paperless when and how to perform ocr on your documents. Three modes + are available: + + * ``skip``: Paperless skips all pages that already contain text and will perform ocr only on pages + where no text is present. This is the safest and fastest option. + * ``redo``: Paperless will OCR all pages of your documents and attempt to + replace any existing text layers with new text. This will be useful for + documents from scanners that already performed OCR with insufficient + results. It will also perform OCR on purely digital documents. + + This option may fail on some documents that have features that cannot + be removed, such as forms. In this case, the text from the document is + used instead. + * ``force``: Paperless rasterizes your documents, converting any text + into images and puts the OCRed text on top. This works for all documents, + however, the resulting document may be significantly larger and text + won't appear as sharp when zoomed in. 
+ + The default is ``skip``, which only performs OCR when necessary. + +PAPERLESS_OCR_OUTPUT_TYPE= + Specify the type of PDF documents that paperless should produce. + + * ``pdf``: Modify the PDF document as little as possible. + * ``pdfa``: Convert PDF documents into PDF/A-2b documents, which is a + subset of the entire PDF specification and meant for storing + documents long term. + * ``pdfa-1``, ``pdfa-2``, ``pdfa-3`` to specify the exact version of + PDF/A you wish to use. + + If not specified, ``pdfa`` is used. Remember that paperless also keeps + the original input file as well as the archived version. + + +PAPERLESS_OCR_PAGES= + Tells paperless to use only the specified amount of pages for OCR. Documents + with less than the specified amount of pages get OCR'ed completely. + + Specifying 1 here will only use the first page. + + When combined with ``PAPERLESS_OCR_MODE=redo`` or ``PAPERLESS_OCR_MODE=force``, + paperless will not modify any text it finds on excluded pages and copy it + verbatim. + + Defaults to 0, which disables this feature and always uses all pages. + + +PAPERLESS_OCR_IMAGE_DPI= + Paperless will OCR any images you put into the system and convert them + into PDF documents. This is useful if your scanner produces images. + In order to do so, paperless needs to know the DPI of the image. + Most images from scanners will have this information embedded and + paperless will detect and use that information. In case this fails, it + uses this value as a fallback. + + Set this to the DPI your scanner produces images at. + + Default is none, which causes paperless to fail if no DPI information is + present in an image. + + +PAPERLESS_OCR_USER_ARGS= + OCRmyPDF offers many more options. Use this parameter to specify any + additional arguments you wish to pass to OCRmyPDF. Since Paperless uses + the API of OCRmyPDF, you have to specify these in a format that can be + passed to the API. 
See `the OCRmyPDF API reference <https://ocrmypdf.readthedocs.io/en/latest/api.html#reference>`_ + for valid parameters. All command line options are supported, but they + use underscores instead of dashes. + + .. caution:: + + Paperless has been tested to work with the OCR options provided + above. There are many options that are incompatible with each other, + so specifying invalid options may prevent paperless from consuming + any documents. + + Specify arguments as a JSON dictionary. Keep note of lower case booleans + and double quoted parameter names and strings. Examples: + + .. code:: json + + {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"} + + Software tweaks ############### @@ -193,79 +302,6 @@ PAPERLESS_TIME_ZONE= Defaults to UTC. -PAPERLESS_OCR_LANGUAGE= - Customize the default language that tesseract will attempt to use when - parsing documents. The default language is used whenever - - * No language could be detected on a document - * No tesseract data files are available for the detected language - - It should be a 3-letter language code consistent with ISO - 639: https://www.loc.gov/standards/iso639-2/php/code_list.php - - Set this to the language most of your documents are written in. - - Defaults to "eng". - -PAPERLESS_OCR_MODE= - Tell paperless when and how to perform ocr on your documents. Three modes - are available: - - * ``skip``: Paperless skips all pages and will perform ocr only on pages - where no text is present. This is the safest and fastest option. - * ``redo``: Paperless will OCR all pages of your documents and attempt to - replace any existing text layers with new text. This will be useful for - documents from scanners that already performed OCR with insufficient - results. It will also perform OCR on purely digital documents. - - This option may fail on some documents that have features that cannot - be removed, such as forms. In this case, the text from the document is - used instead. 
- * ``force``: Paperless rasterizes your documents, converting any text - into images and puts the OCRed text on top. This works for all documents, - however, the resulting document may be significantly larger and text - won't appear as sharp when zoomed in. - - The default is ``skip``, which only performs OCR when necessary. - -PAPERLESS_OCR_OUTPUT_TYPE= - Specify the the type of PDF documents that paperless should produce. - - * ``pdf``: Modify the PDF document as little as possible. - * ``pdfa``: Convert PDF documents into PDF/A documents, which is a - subset of the entire PDF specification and meant for storing - documents long term. - - If not specified, ``pdfa`` is used. Remember that paperless also keeps - the original input file as well as the archived version. - - -PAPERLESS_OCR_PAGES= - Tells paperless to use only the specified amount of pages for OCR. Documents - with less than the specified amount of pages get OCR'ed completely. - - Specifying 1 here will only use the first page. - - When combined with ``PAPERLESS_OCR_MODE=redo`` or ``PAPERLESS_OCR_MODE=force``, - paperless will not modify any text it finds on excluded pages and copy it - verbatim. - - Defaults to 0, which disables this feature and always uses all pages. - - -PAPERLESS_OCR_IMAGE_DPI= - Paperless will OCR any images you put into the system and convert them - into PDF documents. This is useful if your scanner produces images. - In order to do so, paperless needs to know the DPI of the image. - Most images from scanners will have this information embedded and - paperless will detect and use that information. In case this fails, it - uses this value as a fallback. - - Set this to the DPI your scanner produces images at. - - Default is none, which causes paperless to fail if no DPI information is - present in an image. - PAPERLESS_CONSUMER_POLLING= If paperless won't find documents added to your consume folder, it might not be able to automatically detect filesystem changes. 
In that case, diff --git a/paperless.conf.example b/paperless.conf.example index e19f4ba0e..32c0e56b4 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -31,20 +31,24 @@ #PAPERLESS_STATIC_URL=/static/ #PAPERLESS_AUTO_LOGIN_USERNAME= +# OCR settings + +#PAPERLESS_OCR_LANGUAGE=eng +#PAPERLESS_OCR_MODE=skip +#PAPERLESS_OCR_OUTPUT_TYPE=pdfa +#PAPERLESS_OCR_PAGES=1 +#PAPERLESS_OCR_IMAGE_DPI=300 +#PAPERLESS_OCR_USER_ARGS={} +#PAPERLESS_CONVERT_MEMORY_LIMIT=0 +#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless + # Software tweaks #PAPERLESS_TASK_WORKERS=1 #PAPERLESS_THREADS_PER_WORKER=1 #PAPERLESS_TIME_ZONE=UTC -#PAPERLESS_OCR_PAGES=1 -#PAPERLESS_OCR_LANGUAGE=eng -#PAPERLESS_OCR_OUTPUT_TYPE=pdfa -#PAPERLESS_OCR_MODE=skip -#PAPERLESS_OCR_IMAGE_DPI=300 #PAPERLESS_CONSUMER_POLLING=10 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false -#PAPERLESS_CONVERT_MEMORY_LIMIT=0 -#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless #PAPERLESS_OPTIMIZE_THUMBNAILS=true #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_FILENAME_DATE_ORDER=YMD diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 18f7cfac4..d7d061d80 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -350,6 +350,8 @@ OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") +OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") + # GNUPG needs a home directory for some reason GNUPG_HOME = os.getenv("HOME", "/tmp") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 2289619f6..d674f2b70 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -1,3 +1,4 @@ +import json import os import re import subprocess @@ -118,10 +119,22 @@ class RasterisedDocumentParser(DocumentParser): f"no DPI information is present in this image and " f"OCR_IMAGE_DPI is not set.") + if settings.OCR_USER_ARGS: + try: + user_args = json.loads(settings.OCR_USER_ARGS) + 
ocr_args = {**ocr_args, **user_args} + except Exception as e: + self.log( + "warning", + f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " + f"they will not be used: {e}") + # This forces tesseract to use one core per page. os.environ['OMP_THREAD_LIMIT'] = "1" try: + self.log("debug", + f"Calling OCRmyPDF with {str(ocr_args)}") ocrmypdf.ocr(**ocr_args) # success! announce results self.archive_path = archive_path