Feature: Allow a user to disable the pixel limit for OCR entirely (#5996)

This commit is contained in:
Trenton H 2024-03-04 14:37:36 -08:00 committed by GitHub
parent 6379e7b54f
commit 6779042242
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 40 additions and 15 deletions

View File

@ -437,7 +437,7 @@ with Prometheus, as it exports metrics. For details on its capabilities,
refer to the [Flower](https://flower.readthedocs.io/en/latest/index.html) refer to the [Flower](https://flower.readthedocs.io/en/latest/index.html)
documentation. documentation.
Flower can be enabled with the setting [PAPERLESS_ENABLE_FLOWER](configuration/#PAPERLESS_ENABLE_FLOWER). Flower can be enabled with the setting [PAPERLESS_ENABLE_FLOWER](configuration.md#PAPERLESS_ENABLE_FLOWER).
To configure Flower further, create a `flowerconfig.py` and To configure Flower further, create a `flowerconfig.py` and
place it into the `src/paperless` directory. For a Docker place it into the `src/paperless` directory. For a Docker
installation, you can use volumes to accomplish this: installation, you can use volumes to accomplish this:

View File

@ -766,6 +766,8 @@ but could result in missing text content.
If unset, will default to the value determined by If unset, will default to the value determined by
[Pillow](https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS). [Pillow](https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS).
Setting this value to 0 will entirely disable the limit. See the warning below.
!!! note !!! note
Increasing this limit could cause Paperless to consume additional Increasing this limit could cause Paperless to consume additional
@ -775,7 +777,7 @@ but could result in missing text content.
!!! warning !!! warning
The limit is intended to prevent malicious files from consuming The limit is intended to prevent malicious files from consuming
system resources and causing crashes and other errors. Only increase system resources and causing crashes and other errors. Only change
this value if you are certain your documents are not malicious and this value if you are certain your documents are not malicious and
you need the text which was not OCRed you need the text which was not OCRed

View File

@ -0,0 +1,24 @@
# Generated by Django 4.2.10 on 2024-03-04 17:30
import django.core.validators
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
    """Alter ``max_image_pixels`` on the ``applicationconfiguration`` model.

    The field is re-declared as a nullable ``FloatField`` whose only
    validator is a minimum value of 0.0, so a stored value of 0 passes
    model validation.
    """

    # Depends on this earlier migration of the ``paperless`` app.
    dependencies = [
        ("paperless", "0002_applicationconfiguration_app_logo_and_more"),
    ]
    operations = [
        migrations.AlterField(
            model_name="applicationconfiguration",
            name="max_image_pixels",
            field=models.FloatField(
                null=True,
                # 0.0 is the smallest value the validator accepts.
                validators=[django.core.validators.MinValueValidator(0.0)],
                verbose_name="Sets the maximum image size for decompression",
            ),
        ),
    ]

View File

@ -151,7 +151,7 @@ class ApplicationConfiguration(AbstractSingletonModel):
max_image_pixels = models.FloatField( max_image_pixels = models.FloatField(
verbose_name=_("Sets the maximum image size for decompression"), verbose_name=_("Sets the maximum image size for decompression"),
null=True, null=True,
validators=[MinValueValidator(1_000_000.0)], validators=[MinValueValidator(0.0)],
) )
color_conversion_strategy = models.CharField( color_conversion_strategy = models.CharField(

View File

@ -293,20 +293,19 @@ class RasterisedDocumentParser(DocumentParser):
f"they will not be used. Error: {e}", f"they will not be used. Error: {e}",
) )
if self.settings.max_image_pixel is not None: if (
self.settings.max_image_pixel is not None
and self.settings.max_image_pixel >= 0
):
# Convert pixels to mega-pixels and provide to ocrmypdf # Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0 max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
if max_pixels_mpixels > 0: msg = (
self.log.debug( "OCR pixel limit is disabled!"
f"Calculated {max_pixels_mpixels} megapixels for OCR", if max_pixels_mpixels == 0
) else f"Calculated {max_pixels_mpixels} megapixels for OCR"
)
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels self.log.debug(msg)
else: ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
self.log.warning(
"There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
"this value must be at least 1 megapixel if set",
)
return ocrmypdf_args return ocrmypdf_args