Configures ruff as the one-stop linter and resolves the warnings it raised

2025-12-20 01:45:58 -06:00 · 2023-03-28 09:39:30 -07:00
parent 5869467db3
commit ce41ac9158
110 changed files with 507 additions and 491 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -56,7 +56,7 @@ class RasterisedDocumentParser(DocumentParser):
                except Exception as e:
                    self.log(
                        "warning",
-                        f"Error while reading metadata {key}: {value}. Error: " f"{e}",
+                        f"Error while reading metadata {key}: {value}. Error: {e}",
                    )
        return result

@@ -160,11 +160,10 @@ class RasterisedDocumentParser(DocumentParser):
            return post_process_text(text)

        except Exception:
-            # TODO catch all for various issues with PDFminer.six.
            #  If pdftotext fails, fall back to OCR.
            self.log(
                "warning",
-                "Error while getting text from PDF document with " "pdfminer.six",
+                "Error while getting text from PDF document with pdftotext",
                exc_info=True,
            )
            # probably not a PDF file.
@@ -284,10 +283,13 @@ class RasterisedDocumentParser(DocumentParser):
    def parse(self, document_path: Path, mime_type, file_name=None):
        # This forces tesseract to use one core per page.
        os.environ["OMP_THREAD_LIMIT"] = "1"
+        VALID_TEXT_LENGTH = 50

        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
-            original_has_text = text_original is not None and len(text_original) > 50
+            original_has_text = (
+                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
+            )
        else:
            text_original = None
            original_has_text = False