mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	new setting: PAPERLESS_OCR_PAGES
This commit is contained in:
		@@ -26,6 +26,9 @@ next
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
* Much better admin for mail rule editing.
 | 
					* Much better admin for mail rule editing.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					* New setting ``PAPERLESS_OCR_PAGES`` limits the tesseract parser
 | 
				
			||||||
 | 
					  to the first n pages of scanned documents.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
paperless-ng 0.9.1
 | 
					paperless-ng 0.9.1
 | 
				
			||||||
##################
 | 
					##################
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -184,6 +184,16 @@ PAPERLESS_TIME_ZONE=<timezone>
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					PAPERLESS_OCR_PAGES=<num>
 | 
				
			||||||
 | 
					    Tells paperless to use only the specified number of pages for OCR. Documents
 | 
				
			||||||
 | 
					    with fewer than the specified number of pages get OCR'ed completely.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Specifying 1 here will only use the first page.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Defaults to 0, which disables this feature and always uses all pages.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
PAPERLESS_OCR_LANGUAGE=<lang>
 | 
					PAPERLESS_OCR_LANGUAGE=<lang>
 | 
				
			||||||
    Customize the default language that tesseract will attempt to use when
 | 
					    Customize the default language that tesseract will attempt to use when
 | 
				
			||||||
    parsing documents. The default language is used whenever
 | 
					    parsing documents. The default language is used whenever
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -358,4 +358,30 @@ management commands as below.
 | 
				
			|||||||
7.  Start paperless.
 | 
					7.  Start paperless.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Considerations for less powerful devices
 | 
				
			||||||
 | 
					########################################
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Paperless runs on Raspberry Pi. However, some things are rather slow on the Pi, and
 | 
				
			||||||
 | 
					configuring some options in paperless can help improve performance immensely:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					*   Consider setting ``PAPERLESS_OCR_PAGES`` to 1, so that paperless will only OCR
 | 
				
			||||||
 | 
					    the first page of your documents.
 | 
				
			||||||
 | 
					*   ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured
 | 
				
			||||||
 | 
					    to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that
 | 
				
			||||||
 | 
					    paperless will use 2 workers and 2 threads per worker. This may result in
 | 
				
			||||||
 | 
					    sluggish response times during consumption, so you might want to lower these
 | 
				
			||||||
 | 
					    settings (example: 2 workers and 1 thread to always have some computing power
 | 
				
			||||||
 | 
					    left for other tasks).
 | 
				
			||||||
 | 
					*   Keep ``PAPERLESS_OCR_ALWAYS`` at its default value 'false' and consider OCR'ing
 | 
				
			||||||
 | 
					    your documents before feeding them into paperless. Some scanners are able to
 | 
				
			||||||
 | 
					    do this!
 | 
				
			||||||
 | 
					*   Lower ``PAPERLESS_CONVERT_DENSITY`` from its default value 300 to 200. This
 | 
				
			||||||
 | 
					    will still result in rather accurate OCR, but will decrease consumption time
 | 
				
			||||||
 | 
					    by quite a bit.
 | 
				
			||||||
 | 
					*   Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
 | 
				
			||||||
 | 
					    times. Thumbnails will be about 20% larger.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					For details, refer to :ref:`configuration`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.. _redis: https://redis.io/
 | 
					.. _redis: https://redis.io/
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -35,6 +35,7 @@
 | 
				
			|||||||
#PAPERLESS_TASK_WORKERS=1
 | 
					#PAPERLESS_TASK_WORKERS=1
 | 
				
			||||||
#PAPERLESS_THREADS_PER_WORKER=1
 | 
					#PAPERLESS_THREADS_PER_WORKER=1
 | 
				
			||||||
#PAPERLESS_TIME_ZONE=UTC
 | 
					#PAPERLESS_TIME_ZONE=UTC
 | 
				
			||||||
 | 
					#PAPERLESS_OCR_PAGES=1
 | 
				
			||||||
#PAPERLESS_OCR_LANGUAGE=eng
 | 
					#PAPERLESS_OCR_LANGUAGE=eng
 | 
				
			||||||
#PAPERLESS_OCR_ALWAYS=false
 | 
					#PAPERLESS_OCR_ALWAYS=false
 | 
				
			||||||
#PAPERLESS_CONSUMER_POLLING=10
 | 
					#PAPERLESS_CONSUMER_POLLING=10
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -322,6 +322,8 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
 | 
					OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# The default language that tesseract will attempt to use when parsing
 | 
					# The default language that tesseract will attempt to use when parsing
 | 
				
			||||||
# documents.  It should be a 3-letter language code consistent with ISO 639.
 | 
					# documents.  It should be a 3-letter language code consistent with ISO 639.
 | 
				
			||||||
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 | 
					OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -147,18 +147,25 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
        Greyscale images are easier for Tesseract to OCR
 | 
					        Greyscale images are easier for Tesseract to OCR
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Convert PDF to multiple PNMs
 | 
				
			||||||
 | 
					        input_file = self.document_path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if settings.OCR_PAGES == 1:
 | 
				
			||||||
 | 
					            input_file += "[0]"
 | 
				
			||||||
 | 
					        elif settings.OCR_PAGES > 1:
 | 
				
			||||||
 | 
					            input_file += f"[0-{settings.OCR_PAGES - 1}]"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.log(
 | 
					        self.log(
 | 
				
			||||||
            "debug",
 | 
					            "debug",
 | 
				
			||||||
            f"Converting document {self.document_path} into greyscale images")
 | 
					            f"Converting document {input_file} into greyscale images")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Convert PDF to multiple PNMs
 | 
					        output_files = os.path.join(self.tempdir, "convert-%04d.pnm")
 | 
				
			||||||
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        run_convert(density=settings.CONVERT_DENSITY,
 | 
					        run_convert(density=settings.CONVERT_DENSITY,
 | 
				
			||||||
                    depth="8",
 | 
					                    depth="8",
 | 
				
			||||||
                    type="grayscale",
 | 
					                    type="grayscale",
 | 
				
			||||||
                    input_file=self.document_path,
 | 
					                    input_file=input_file,
 | 
				
			||||||
                    output_file=pnm,
 | 
					                    output_file=output_files,
 | 
				
			||||||
                    logging_group=self.logging_group)
 | 
					                    logging_group=self.logging_group)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Get a list of converted images
 | 
					        # Get a list of converted images
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user