mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Runs the pre-commit hooks over all the Python files
This commit is contained in:
		| @@ -1,10 +1,11 @@ | ||||
| import os | ||||
| import requests | ||||
|  | ||||
| import dateutil.parser | ||||
|  | ||||
| import requests | ||||
| from django.conf import settings | ||||
|  | ||||
| from documents.parsers import DocumentParser, ParseError, make_thumbnail_from_pdf | ||||
| from documents.parsers import DocumentParser | ||||
| from documents.parsers import make_thumbnail_from_pdf | ||||
| from documents.parsers import ParseError | ||||
| from tika import parser | ||||
|  | ||||
|  | ||||
| @@ -20,7 +21,9 @@ class TikaDocumentParser(DocumentParser): | ||||
|             self.archive_path = self.convert_to_pdf(document_path, file_name) | ||||
|  | ||||
|         return make_thumbnail_from_pdf( | ||||
|             self.archive_path, self.tempdir, self.logging_group | ||||
|             self.archive_path, | ||||
|             self.tempdir, | ||||
|             self.logging_group, | ||||
|         ) | ||||
|  | ||||
|     def extract_metadata(self, document_path, mime_type): | ||||
| @@ -53,7 +56,7 @@ class TikaDocumentParser(DocumentParser): | ||||
|         except Exception as err: | ||||
|             raise ParseError( | ||||
|                 f"Could not parse {document_path} with tika server at " | ||||
|                 f"{tika_server}: {err}" | ||||
|                 f"{tika_server}: {err}", | ||||
|             ) | ||||
|  | ||||
|         self.text = parsed["content"].strip() | ||||
| @@ -74,22 +77,23 @@ class TikaDocumentParser(DocumentParser): | ||||
|         url = gotenberg_server + "/forms/libreoffice/convert" | ||||
|  | ||||
|         self.log("info", f"Converting {document_path} to PDF as {pdf_path}") | ||||
|         files = { | ||||
|             "files": ( | ||||
|                 file_name or os.path.basename(document_path), | ||||
|                 open(document_path, "rb"), | ||||
|             ) | ||||
|         } | ||||
|         headers = {} | ||||
|         with open(document_path, "rb") as document_handle: | ||||
|             files = { | ||||
|                 "files": ( | ||||
|                     file_name or os.path.basename(document_path), | ||||
|                     document_handle, | ||||
|                 ), | ||||
|             } | ||||
|             headers = {} | ||||
|  | ||||
|         try: | ||||
|             response = requests.post(url, files=files, headers=headers) | ||||
|             response.raise_for_status()  # ensure we notice bad responses | ||||
|         except Exception as err: | ||||
|             raise ParseError(f"Error while converting document to PDF: {err}") | ||||
|             try: | ||||
|                 response = requests.post(url, files=files, headers=headers) | ||||
|                 response.raise_for_status()  # ensure we notice bad responses | ||||
|             except Exception as err: | ||||
|                 raise ParseError(f"Error while converting document to PDF: {err}") | ||||
|  | ||||
|         file = open(pdf_path, "wb") | ||||
|         file.write(response.content) | ||||
|         file.close() | ||||
|         with open(pdf_path, "wb") as file: | ||||
|             file.write(response.content) | ||||
|             file.close() | ||||
|  | ||||
|         return pdf_path | ||||
|   | ||||
| @@ -10,12 +10,12 @@ def tika_consumer_declaration(sender, **kwargs): | ||||
|         "weight": 10, | ||||
|         "mime_types": { | ||||
|             "application/msword": ".doc", | ||||
|             "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",  # NOQA: E501 | ||||
|             "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",  # noqa: E501 | ||||
|             "application/vnd.ms-excel": ".xls", | ||||
|             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",  # NOQA: E501 | ||||
|             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",  # noqa: E501 | ||||
|             "application/vnd.ms-powerpoint": ".ppt", | ||||
|             "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",  # NOQA: E501 | ||||
|             "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",  # NOQA: E501 | ||||
|             "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",  # noqa: E501 | ||||
|             "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",  # noqa: E501 | ||||
|             "application/vnd.oasis.opendocument.presentation": ".odp", | ||||
|             "application/vnd.oasis.opendocument.spreadsheet": ".ods", | ||||
|             "application/vnd.oasis.opendocument.text": ".odt", | ||||
|   | ||||
| @@ -4,9 +4,8 @@ from pathlib import Path | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from requests import Response | ||||
|  | ||||
| from paperless_tika.parsers import TikaDocumentParser | ||||
| from requests import Response | ||||
|  | ||||
|  | ||||
| class TestTikaParser(TestCase): | ||||
| @@ -42,14 +41,15 @@ class TestTikaParser(TestCase): | ||||
|     @mock.patch("paperless_tika.parsers.parser.from_file") | ||||
|     def test_metadata(self, from_file): | ||||
|         from_file.return_value = { | ||||
|             "metadata": {"Creation-Date": "2020-11-21", "Some-key": "value"} | ||||
|             "metadata": {"Creation-Date": "2020-11-21", "Some-key": "value"}, | ||||
|         } | ||||
|  | ||||
|         file = os.path.join(self.parser.tempdir, "input.odt") | ||||
|         Path(file).touch() | ||||
|  | ||||
|         metadata = self.parser.extract_metadata( | ||||
|             file, "application/vnd.oasis.opendocument.text" | ||||
|             file, | ||||
|             "application/vnd.oasis.opendocument.text", | ||||
|         ) | ||||
|  | ||||
|         self.assertTrue("Creation-Date" in [m["key"] for m in metadata]) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton Holmes
					Trenton Holmes