Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-10-30 03:56:23 -05:00)
			
		
		
		
	remove duplicate code
This commit is contained in:
		| @@ -144,6 +144,52 @@ def run_convert(input_file, | |||||||
|         raise ParseError("Convert failed at {}".format(args)) |         raise ParseError("Convert failed at {}".format(args)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): | ||||||
|  |     """ | ||||||
|  |     The thumbnail of a PDF is just a 500px wide image of the first page. | ||||||
|  |     """ | ||||||
|  |     out_path = os.path.join(temp_dir, "convert.png") | ||||||
|  |  | ||||||
|  |     # Run convert to get a decent thumbnail | ||||||
|  |     try: | ||||||
|  |         run_convert(density=300, | ||||||
|  |                     scale="500x5000>", | ||||||
|  |                     alpha="remove", | ||||||
|  |                     strip=True, | ||||||
|  |                     trim=False, | ||||||
|  |                     auto_orient=True, | ||||||
|  |                     input_file="{}[0]".format(in_path), | ||||||
|  |                     output_file=out_path, | ||||||
|  |                     logging_group=logging_group) | ||||||
|  |     except ParseError: | ||||||
|  |         # if convert fails, fall back to extracting | ||||||
|  |         # the first PDF page as a PNG using Ghostscript | ||||||
|  |         logger.warning( | ||||||
|  |             "Thumbnail generation with ImageMagick failed, falling back " | ||||||
|  |             "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!", | ||||||
|  |             extra={'group': logging_group} | ||||||
|  |         ) | ||||||
|  |         gs_out_path = os.path.join(temp_dir, "gs_out.png") | ||||||
|  |         cmd = [settings.GS_BINARY, | ||||||
|  |                "-q", | ||||||
|  |                "-sDEVICE=pngalpha", | ||||||
|  |                "-o", gs_out_path, | ||||||
|  |                in_path] | ||||||
|  |         if not subprocess.Popen(cmd).wait() == 0: | ||||||
|  |             raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) | ||||||
|  |         # then run convert on the output from gs | ||||||
|  |         run_convert(density=300, | ||||||
|  |                     scale="500x5000>", | ||||||
|  |                     alpha="remove", | ||||||
|  |                     strip=True, | ||||||
|  |                     trim=False, | ||||||
|  |                     auto_orient=True, | ||||||
|  |                     input_file=gs_out_path, | ||||||
|  |                     output_file=out_path, | ||||||
|  |                     logging_group=logging_group) | ||||||
|  |  | ||||||
|  |     return out_path | ||||||
|  |  | ||||||
| def parse_date(filename, text): | def parse_date(filename, text): | ||||||
|     """ |     """ | ||||||
|     Returns the date of the document. |     Returns the date of the document. | ||||||
|   | |||||||
| @@ -1,7 +1,6 @@ | |||||||
| import json | import json | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| import subprocess |  | ||||||
|  |  | ||||||
| import ocrmypdf | import ocrmypdf | ||||||
| import pdftotext | import pdftotext | ||||||
| @@ -10,7 +9,8 @@ from PIL import Image | |||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from ocrmypdf import InputFileError, EncryptedPdfError | from ocrmypdf import InputFileError, EncryptedPdfError | ||||||
|  |  | ||||||
| from documents.parsers import DocumentParser, ParseError, run_convert | from documents.parsers import DocumentParser, ParseError, \ | ||||||
|  |     make_thumbnail_from_pdf | ||||||
|  |  | ||||||
|  |  | ||||||
| class RasterisedDocumentParser(DocumentParser): | class RasterisedDocumentParser(DocumentParser): | ||||||
| @@ -47,50 +47,8 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|         return result |         return result | ||||||
|  |  | ||||||
|     def get_thumbnail(self, document_path, mime_type): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         """ |         return make_thumbnail_from_pdf( | ||||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. |             document_path, self.tempdir, self.logging_group) | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         out_path = os.path.join(self.tempdir, "convert.png") |  | ||||||
|  |  | ||||||
|         # Run convert to get a decent thumbnail |  | ||||||
|         try: |  | ||||||
|             run_convert(density=300, |  | ||||||
|                         scale="500x5000>", |  | ||||||
|                         alpha="remove", |  | ||||||
|                         strip=True, |  | ||||||
|                         trim=False, |  | ||||||
|                         auto_orient=True, |  | ||||||
|                         input_file="{}[0]".format(document_path), |  | ||||||
|                         output_file=out_path, |  | ||||||
|                         logging_group=self.logging_group) |  | ||||||
|         except ParseError: |  | ||||||
|             # if convert fails, fall back to extracting |  | ||||||
|             # the first PDF page as a PNG using Ghostscript |  | ||||||
|             self.log( |  | ||||||
|                 'warning', |  | ||||||
|                 "Thumbnail generation with ImageMagick failed, falling back " |  | ||||||
|                 "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!") |  | ||||||
|             gs_out_path = os.path.join(self.tempdir, "gs_out.png") |  | ||||||
|             cmd = [settings.GS_BINARY, |  | ||||||
|                    "-q", |  | ||||||
|                    "-sDEVICE=pngalpha", |  | ||||||
|                    "-o", gs_out_path, |  | ||||||
|                    document_path] |  | ||||||
|             if not subprocess.Popen(cmd).wait() == 0: |  | ||||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) |  | ||||||
|             # then run convert on the output from gs |  | ||||||
|             run_convert(density=300, |  | ||||||
|                         scale="500x5000>", |  | ||||||
|                         alpha="remove", |  | ||||||
|                         strip=True, |  | ||||||
|                         trim=False, |  | ||||||
|                         auto_orient=True, |  | ||||||
|                         input_file=gs_out_path, |  | ||||||
|                         output_file=out_path, |  | ||||||
|                         logging_group=self.logging_group) |  | ||||||
|  |  | ||||||
|         return out_path |  | ||||||
|  |  | ||||||
|     def is_image(self, mime_type): |     def is_image(self, mime_type): | ||||||
|         return mime_type in [ |         return mime_type in [ | ||||||
|   | |||||||
| @@ -1,14 +1,11 @@ | |||||||
| import os | import os | ||||||
| import subprocess |  | ||||||
| import tika |  | ||||||
| import requests | import requests | ||||||
| import dateutil.parser | import dateutil.parser | ||||||
|  |  | ||||||
| from PIL import ImageDraw, ImageFont, Image |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
|  |  | ||||||
| from documents.parsers import DocumentParser, ParseError, run_convert | from documents.parsers import DocumentParser, ParseError, \ | ||||||
| from paperless_tesseract.parsers import RasterisedDocumentParser |     make_thumbnail_from_pdf | ||||||
| from tika import parser | from tika import parser | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -18,55 +15,11 @@ class TikaDocumentParser(DocumentParser): | |||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def get_thumbnail(self, document_path, mime_type): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         self.log("info", f"[TIKA_THUMB] Generating thumbnail for{document_path}") |         if not self.archive_path: | ||||||
|         archive_path = self.archive_path |             self.archive_path = self.convert_to_pdf(document_path) | ||||||
|  |  | ||||||
|         out_path = os.path.join(self.tempdir, "convert.png") |         return make_thumbnail_from_pdf( | ||||||
|  |             self.archive_path, self.tempdir, self.logging_group) | ||||||
|         # Run convert to get a decent thumbnail |  | ||||||
|         try: |  | ||||||
|             run_convert( |  | ||||||
|                 density=300, |  | ||||||
|                 scale="500x5000>", |  | ||||||
|                 alpha="remove", |  | ||||||
|                 strip=True, |  | ||||||
|                 trim=False, |  | ||||||
|                 input_file="{}[0]".format(archive_path), |  | ||||||
|                 output_file=out_path, |  | ||||||
|                 logging_group=self.logging_group, |  | ||||||
|             ) |  | ||||||
|         except ParseError: |  | ||||||
|             # if convert fails, fall back to extracting |  | ||||||
|             # the first PDF page as a PNG using Ghostscript |  | ||||||
|             self.log( |  | ||||||
|                 "warning", |  | ||||||
|                 "Thumbnail generation with ImageMagick failed, falling back " |  | ||||||
|                 "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!", |  | ||||||
|             ) |  | ||||||
|             gs_out_path = os.path.join(self.tempdir, "gs_out.png") |  | ||||||
|             cmd = [ |  | ||||||
|                 settings.GS_BINARY, |  | ||||||
|                 "-q", |  | ||||||
|                 "-sDEVICE=pngalpha", |  | ||||||
|                 "-o", |  | ||||||
|                 gs_out_path, |  | ||||||
|                 archive_path, |  | ||||||
|             ] |  | ||||||
|             if not subprocess.Popen(cmd).wait() == 0: |  | ||||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) |  | ||||||
|             # then run convert on the output from gs |  | ||||||
|             run_convert( |  | ||||||
|                 density=300, |  | ||||||
|                 scale="500x5000>", |  | ||||||
|                 alpha="remove", |  | ||||||
|                 strip=True, |  | ||||||
|                 trim=False, |  | ||||||
|                 input_file=gs_out_path, |  | ||||||
|                 output_file=out_path, |  | ||||||
|                 logging_group=self.logging_group, |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         return out_path |  | ||||||
|  |  | ||||||
|     def parse(self, document_path, mime_type): |     def parse(self, document_path, mime_type): | ||||||
|         self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server") |         self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server") | ||||||
| @@ -89,11 +42,9 @@ class TikaDocumentParser(DocumentParser): | |||||||
|         except: |         except: | ||||||
|             pass |             pass | ||||||
|  |  | ||||||
|         archive_path = os.path.join(self.tempdir, "convert.pdf") |         self.archive_path = self.convert_to_pdf(document_path) | ||||||
|         convert_to_pdf(document_path, archive_path) |  | ||||||
|         self.archive_path = archive_path |  | ||||||
|  |  | ||||||
|     def convert_to_pdf(document_path, pdf_path): |     def convert_to_pdf(self, document_path): | ||||||
|         pdf_path = os.path.join(self.tempdir, "convert.pdf") |         pdf_path = os.path.join(self.tempdir, "convert.pdf") | ||||||
|         gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT |         gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT | ||||||
|         url = gotenberg_server + "/convert/office" |         url = gotenberg_server + "/convert/office" | ||||||
| @@ -113,3 +64,5 @@ class TikaDocumentParser(DocumentParser): | |||||||
|         file = open(pdf_path, "wb") |         file = open(pdf_path, "wb") | ||||||
|         file.write(response.content) |         file.write(response.content) | ||||||
|         file.close() |         file.close() | ||||||
|  |  | ||||||
|  |         return pdf_path | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler