mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	remove duplicate code
This commit is contained in:
		| @@ -144,6 +144,52 @@ def run_convert(input_file, | ||||
|         raise ParseError("Convert failed at {}".format(args)) | ||||
|  | ||||
|  | ||||
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
    """
    The thumbnail of a PDF is just a 500px wide image of the first page.

    ImageMagick (via run_convert) is tried first on page 0 of the PDF. If
    that raises ParseError, the first page is rendered to a PNG with
    Ghostscript and run_convert is applied to that PNG instead.

    :param in_path: path of the PDF to create a thumbnail for.
    :param temp_dir: directory that receives the intermediate and final
        images ("gs_out.png" and "convert.png").
    :param logging_group: forwarded to run_convert and attached to the
        fallback warning as the log record's 'group'.
    :return: path of the generated thumbnail ("convert.png" in temp_dir).
    :raises ParseError: if Ghostscript exits with a non-zero status, or if
        run_convert fails on the Ghostscript output.
    """
    out_path = os.path.join(temp_dir, "convert.png")

    def convert_to_thumbnail(input_file):
        # Both the direct attempt and the Ghostscript fallback use exactly
        # the same conversion settings; only the input differs.
        run_convert(density=300,
                    scale="500x5000>",
                    alpha="remove",
                    strip=True,
                    trim=False,
                    auto_orient=True,
                    input_file=input_file,
                    output_file=out_path,
                    logging_group=logging_group)

    # Run convert to get a decent thumbnail
    try:
        # "[0]" selects the first page only.
        convert_to_thumbnail("{}[0]".format(in_path))
    except ParseError:
        # if convert fails, fall back to extracting
        # the first PDF page as a PNG using Ghostscript
        logger.warning(
            "Thumbnail generation with ImageMagick failed, falling back "
            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            extra={'group': logging_group}
        )
        gs_out_path = os.path.join(temp_dir, "gs_out.png")
        cmd = [settings.GS_BINARY,
               "-q",
               "-sDEVICE=pngalpha",
               # Restrict rendering to page 1. Without a page range every
               # page is written to the same output file, so the result
               # would not be the first page for multi-page documents.
               "-dFirstPage=1",
               "-dLastPage=1",
               "-o", gs_out_path,
               in_path]
        if subprocess.call(cmd) != 0:
            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
        # then run convert on the output from gs
        convert_to_thumbnail(gs_out_path)

    return out_path
|  | ||||
| def parse_date(filename, text): | ||||
|     """ | ||||
|     Returns the date of the document. | ||||
|   | ||||
| @@ -1,7 +1,6 @@ | ||||
| import json | ||||
| import os | ||||
| import re | ||||
| import subprocess | ||||
|  | ||||
| import ocrmypdf | ||||
| import pdftotext | ||||
| @@ -10,7 +9,8 @@ from PIL import Image | ||||
| from django.conf import settings | ||||
| from ocrmypdf import InputFileError, EncryptedPdfError | ||||
|  | ||||
| from documents.parsers import DocumentParser, ParseError, run_convert | ||||
| from documents.parsers import DocumentParser, ParseError, \ | ||||
|     make_thumbnail_from_pdf | ||||
|  | ||||
|  | ||||
| class RasterisedDocumentParser(DocumentParser): | ||||
| @@ -47,50 +47,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         return result | ||||
|  | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|         """ | ||||
|  | ||||
|         out_path = os.path.join(self.tempdir, "convert.png") | ||||
|  | ||||
|         # Run convert to get a decent thumbnail | ||||
|         try: | ||||
|             run_convert(density=300, | ||||
|                         scale="500x5000>", | ||||
|                         alpha="remove", | ||||
|                         strip=True, | ||||
|                         trim=False, | ||||
|                         auto_orient=True, | ||||
|                         input_file="{}[0]".format(document_path), | ||||
|                         output_file=out_path, | ||||
|                         logging_group=self.logging_group) | ||||
|         except ParseError: | ||||
|             # if convert fails, fall back to extracting | ||||
|             # the first PDF page as a PNG using Ghostscript | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 "Thumbnail generation with ImageMagick failed, falling back " | ||||
|                 "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!") | ||||
|             gs_out_path = os.path.join(self.tempdir, "gs_out.png") | ||||
|             cmd = [settings.GS_BINARY, | ||||
|                    "-q", | ||||
|                    "-sDEVICE=pngalpha", | ||||
|                    "-o", gs_out_path, | ||||
|                    document_path] | ||||
|             if not subprocess.Popen(cmd).wait() == 0: | ||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) | ||||
|             # then run convert on the output from gs | ||||
|             run_convert(density=300, | ||||
|                         scale="500x5000>", | ||||
|                         alpha="remove", | ||||
|                         strip=True, | ||||
|                         trim=False, | ||||
|                         auto_orient=True, | ||||
|                         input_file=gs_out_path, | ||||
|                         output_file=out_path, | ||||
|                         logging_group=self.logging_group) | ||||
|  | ||||
|         return out_path | ||||
|         return make_thumbnail_from_pdf( | ||||
|             document_path, self.tempdir, self.logging_group) | ||||
|  | ||||
|     def is_image(self, mime_type): | ||||
|         return mime_type in [ | ||||
|   | ||||
| @@ -1,14 +1,11 @@ | ||||
| import os | ||||
| import subprocess | ||||
| import tika | ||||
| import requests | ||||
| import dateutil.parser | ||||
|  | ||||
| from PIL import ImageDraw, ImageFont, Image | ||||
| from django.conf import settings | ||||
|  | ||||
| from documents.parsers import DocumentParser, ParseError, run_convert | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
| from documents.parsers import DocumentParser, ParseError, \ | ||||
|     make_thumbnail_from_pdf | ||||
| from tika import parser | ||||
|  | ||||
|  | ||||
| @@ -18,55 +15,11 @@ class TikaDocumentParser(DocumentParser): | ||||
|     """ | ||||
|  | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         self.log("info", f"[TIKA_THUMB] Generating thumbnail for{document_path}") | ||||
|         archive_path = self.archive_path | ||||
|         if not self.archive_path: | ||||
|             self.archive_path = self.convert_to_pdf(document_path) | ||||
|  | ||||
|         out_path = os.path.join(self.tempdir, "convert.png") | ||||
|  | ||||
|         # Run convert to get a decent thumbnail | ||||
|         try: | ||||
|             run_convert( | ||||
|                 density=300, | ||||
|                 scale="500x5000>", | ||||
|                 alpha="remove", | ||||
|                 strip=True, | ||||
|                 trim=False, | ||||
|                 input_file="{}[0]".format(archive_path), | ||||
|                 output_file=out_path, | ||||
|                 logging_group=self.logging_group, | ||||
|             ) | ||||
|         except ParseError: | ||||
|             # if convert fails, fall back to extracting | ||||
|             # the first PDF page as a PNG using Ghostscript | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 "Thumbnail generation with ImageMagick failed, falling back " | ||||
|                 "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!", | ||||
|             ) | ||||
|             gs_out_path = os.path.join(self.tempdir, "gs_out.png") | ||||
|             cmd = [ | ||||
|                 settings.GS_BINARY, | ||||
|                 "-q", | ||||
|                 "-sDEVICE=pngalpha", | ||||
|                 "-o", | ||||
|                 gs_out_path, | ||||
|                 archive_path, | ||||
|             ] | ||||
|             if not subprocess.Popen(cmd).wait() == 0: | ||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) | ||||
|             # then run convert on the output from gs | ||||
|             run_convert( | ||||
|                 density=300, | ||||
|                 scale="500x5000>", | ||||
|                 alpha="remove", | ||||
|                 strip=True, | ||||
|                 trim=False, | ||||
|                 input_file=gs_out_path, | ||||
|                 output_file=out_path, | ||||
|                 logging_group=self.logging_group, | ||||
|             ) | ||||
|  | ||||
|         return out_path | ||||
|         return make_thumbnail_from_pdf( | ||||
|             self.archive_path, self.tempdir, self.logging_group) | ||||
|  | ||||
|     def parse(self, document_path, mime_type): | ||||
|         self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server") | ||||
| @@ -89,11 +42,9 @@ class TikaDocumentParser(DocumentParser): | ||||
|         except: | ||||
|             pass | ||||
|  | ||||
|         archive_path = os.path.join(self.tempdir, "convert.pdf") | ||||
|         convert_to_pdf(document_path, archive_path) | ||||
|         self.archive_path = archive_path | ||||
|         self.archive_path = self.convert_to_pdf(document_path) | ||||
|  | ||||
|     def convert_to_pdf(document_path, pdf_path): | ||||
|     def convert_to_pdf(self, document_path): | ||||
|         pdf_path = os.path.join(self.tempdir, "convert.pdf") | ||||
|         gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT | ||||
|         url = gotenberg_server + "/convert/office" | ||||
| @@ -113,3 +64,5 @@ class TikaDocumentParser(DocumentParser): | ||||
|         file = open(pdf_path, "wb") | ||||
|         file.write(response.content) | ||||
|         file.close() | ||||
|  | ||||
|         return pdf_path | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler