mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-14 00:26:21 +00:00
Adds better handling for files with invalid UTF-8 content
This commit is contained in:
@@ -16,11 +16,7 @@ class TextDocumentParser(DocumentParser):
|
||||
logging_name = "paperless.parsing.text"
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
def read_text():
|
||||
with open(document_path) as src:
|
||||
lines = [line.strip() for line in src.readlines()]
|
||||
text = "\n".join(lines[:50])
|
||||
return text
|
||||
text = self.read_file_handle_unicode_errors(document_path)
|
||||
|
||||
img = Image.new("RGB", (500, 700), color="white")
|
||||
draw = ImageDraw.Draw(img)
|
||||
@@ -29,7 +25,7 @@ class TextDocumentParser(DocumentParser):
|
||||
size=20,
|
||||
layout_engine=ImageFont.Layout.BASIC,
|
||||
)
|
||||
draw.text((5, 5), read_text(), font=font, fill="black")
|
||||
draw.text((5, 5), text, font=font, fill="black")
|
||||
|
||||
out_path = os.path.join(self.tempdir, "thumb.webp")
|
||||
img.save(out_path, format="WEBP")
|
||||
@@ -37,5 +33,4 @@ class TextDocumentParser(DocumentParser):
|
||||
return out_path
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
with open(document_path) as f:
|
||||
self.text = f.read()
|
||||
self.text = self.read_file_handle_unicode_errors(document_path)
|
||||
|
Reference in New Issue
Block a user