code style fixes

This commit is contained in:
Jonas Winkler
2020-11-12 21:09:45 +01:00
parent 9c4cf5d7bd
commit 2e04ba1c04
31 changed files with 110 additions and 149 deletions

View File

@@ -5,15 +5,14 @@ import subprocess
from multiprocessing.pool import Pool
import langdetect
import pdftotext
import pyocr
from django.conf import settings
from PIL import Image
from django.conf import settings
from pyocr import PyocrException
import pdftotext
from documents.parsers import DocumentParser, ParseError, run_unpaper, \
run_convert
from .languages import ISO639
@@ -45,8 +44,8 @@ class RasterisedDocumentParser(DocumentParser):
alpha="remove",
strip=True,
trim=True,
input="{}[0]".format(self.document_path),
output=out_path,
input_file="{}[0]".format(self.document_path),
output_file=out_path,
logging_group=self.logging_group)
except ParseError:
# if convert fails, fall back to extracting
@@ -66,8 +65,8 @@ class RasterisedDocumentParser(DocumentParser):
alpha="remove",
strip=True,
trim=True,
input=gs_out_path,
output=out_path,
input_file=gs_out_path,
output_file=out_path,
logging_group=self.logging_group)
return out_path
@@ -99,7 +98,7 @@ class RasterisedDocumentParser(DocumentParser):
try:
sample_page_index = int(len(images) / 2)
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index+1, len(images)))
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text)
@@ -139,8 +138,8 @@ class RasterisedDocumentParser(DocumentParser):
run_convert(density=settings.CONVERT_DENSITY,
depth="8",
type="grayscale",
input=self.document_path,
output=pnm,
input_file=self.document_path,
output_file=pnm,
logging_group=self.logging_group)
# Get a list of converted images
@@ -189,7 +188,6 @@ class RasterisedDocumentParser(DocumentParser):
return [sample_page]
def strip_excess_whitespace(text):
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(

View File

@@ -5,10 +5,10 @@ from unittest import mock
from uuid import uuid4
from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings
from ..parsers import RasterisedDocumentParser
from django.conf import settings
class TestDate(TestCase):