mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Format Python code with black
This commit is contained in:
@@ -4,8 +4,7 @@ import dateutil.parser
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from documents.parsers import DocumentParser, ParseError, \
|
||||
make_thumbnail_from_pdf
|
||||
from documents.parsers import DocumentParser, ParseError, make_thumbnail_from_pdf
|
||||
from tika import parser
|
||||
|
||||
|
||||
@@ -21,15 +20,18 @@ class TikaDocumentParser(DocumentParser):
|
||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||
|
||||
return make_thumbnail_from_pdf(
|
||||
self.archive_path, self.tempdir, self.logging_group)
|
||||
self.archive_path, self.tempdir, self.logging_group
|
||||
)
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
|
||||
try:
|
||||
parsed = parser.from_file(document_path, tika_server)
|
||||
except Exception as e:
|
||||
self.log("warning", f"Error while fetching document metadata for "
|
||||
f"{document_path}: {e}")
|
||||
self.log(
|
||||
"warning",
|
||||
f"Error while fetching document metadata for " f"{document_path}: {e}",
|
||||
)
|
||||
return []
|
||||
|
||||
return [
|
||||
@@ -37,8 +39,9 @@ class TikaDocumentParser(DocumentParser):
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": key,
|
||||
"value": parsed['metadata'][key]
|
||||
} for key in parsed['metadata']
|
||||
"value": parsed["metadata"][key],
|
||||
}
|
||||
for key in parsed["metadata"]
|
||||
]
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
@@ -56,11 +59,12 @@ class TikaDocumentParser(DocumentParser):
|
||||
self.text = parsed["content"].strip()
|
||||
|
||||
try:
|
||||
self.date = dateutil.parser.isoparse(
|
||||
parsed["metadata"]["Creation-Date"])
|
||||
self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
|
||||
except Exception as e:
|
||||
self.log("warning", f"Unable to extract date for document "
|
||||
f"{document_path}: {e}")
|
||||
self.log(
|
||||
"warning",
|
||||
f"Unable to extract date for document " f"{document_path}: {e}",
|
||||
)
|
||||
|
||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||
|
||||
@@ -70,17 +74,19 @@ class TikaDocumentParser(DocumentParser):
|
||||
url = gotenberg_server + "/forms/libreoffice/convert"
|
||||
|
||||
self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
|
||||
files = {"files": (file_name or os.path.basename(document_path),
|
||||
open(document_path, "rb"))}
|
||||
files = {
|
||||
"files": (
|
||||
file_name or os.path.basename(document_path),
|
||||
open(document_path, "rb"),
|
||||
)
|
||||
}
|
||||
headers = {}
|
||||
|
||||
try:
|
||||
response = requests.post(url, files=files, headers=headers)
|
||||
response.raise_for_status() # ensure we notice bad responses
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting document to PDF: {err}"
|
||||
)
|
||||
raise ParseError(f"Error while converting document to PDF: {err}")
|
||||
|
||||
file = open(pdf_path, "wb")
|
||||
file.write(response.content)
|
||||
|
@@ -10,7 +10,6 @@ from paperless_tika.parsers import TikaDocumentParser
|
||||
|
||||
|
||||
class TestTikaParser(TestCase):
|
||||
|
||||
def setUp(self) -> None:
|
||||
self.parser = TikaDocumentParser(logging_group=None)
|
||||
|
||||
@@ -22,9 +21,7 @@ class TestTikaParser(TestCase):
|
||||
def test_parse(self, post, from_file):
|
||||
from_file.return_value = {
|
||||
"content": "the content",
|
||||
"metadata": {
|
||||
"Creation-Date": "2020-11-21"
|
||||
}
|
||||
"metadata": {"Creation-Date": "2020-11-21"},
|
||||
}
|
||||
response = Response()
|
||||
response._content = b"PDF document"
|
||||
@@ -45,16 +42,15 @@ class TestTikaParser(TestCase):
|
||||
@mock.patch("paperless_tika.parsers.parser.from_file")
|
||||
def test_metadata(self, from_file):
|
||||
from_file.return_value = {
|
||||
"metadata": {
|
||||
"Creation-Date": "2020-11-21",
|
||||
"Some-key": "value"
|
||||
}
|
||||
"metadata": {"Creation-Date": "2020-11-21", "Some-key": "value"}
|
||||
}
|
||||
|
||||
file = os.path.join(self.parser.tempdir, "input.odt")
|
||||
Path(file).touch()
|
||||
|
||||
metadata = self.parser.extract_metadata(file, "application/vnd.oasis.opendocument.text")
|
||||
metadata = self.parser.extract_metadata(
|
||||
file, "application/vnd.oasis.opendocument.text"
|
||||
)
|
||||
|
||||
self.assertTrue("Creation-Date" in [m['key'] for m in metadata])
|
||||
self.assertTrue("Some-key" in [m['key'] for m in metadata])
|
||||
self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
|
||||
self.assertTrue("Some-key" in [m["key"] for m in metadata])
|
||||
|
Reference in New Issue
Block a user