mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Updates the tika client library and handles the changes to it
This commit is contained in:
		| @@ -13,7 +13,6 @@ from humanfriendly import format_size | ||||
| from imap_tools import MailAttachment | ||||
| from imap_tools import MailMessage | ||||
| from tika_client import TikaClient | ||||
| from tika_client.data_models import TikaKey | ||||
|  | ||||
| from documents.parsers import DocumentParser | ||||
| from documents.parsers import ParseError | ||||
| @@ -175,12 +174,8 @@ class MailDocumentParser(DocumentParser): | ||||
|             with TikaClient(tika_url=self.tika_server) as client: | ||||
|                 parsed = client.tika.as_text.from_buffer(html, "text/html") | ||||
|  | ||||
|                 if hasattr(parsed, "content") and parsed.content is not None: | ||||
|                 if parsed.content is not None: | ||||
|                     return parsed.content.strip() | ||||
|                 elif TikaKey.Content in parsed.data: | ||||
|                     # May not be a completely handled type, but | ||||
|                     # the Tika response may still include content | ||||
|                     return parsed.data[TikaKey.Content].strip() | ||||
|                 return "" | ||||
|         except Exception as err: | ||||
|             raise ParseError( | ||||
|   | ||||
| @@ -4,7 +4,6 @@ from pathlib import Path | ||||
| import httpx | ||||
| from django.conf import settings | ||||
| from tika_client import TikaClient | ||||
| from tika_client.data_models import TikaKey | ||||
|  | ||||
| from documents.parsers import DocumentParser | ||||
| from documents.parsers import ParseError | ||||
| @@ -59,13 +58,9 @@ class TikaDocumentParser(DocumentParser): | ||||
|                 f"{settings.TIKA_ENDPOINT}: {err}", | ||||
|             ) from err | ||||
|  | ||||
|         self.text = None | ||||
|         if hasattr(parsed, "content") and parsed.content is not None: | ||||
|             self.text = parsed.content.strip() | ||||
|         elif TikaKey.Content in parsed.data: | ||||
|             # May not be a completely handled type, but | ||||
|             # the Tika response may still include content | ||||
|             self.text = parsed.data[TikaKey.Content].strip() | ||||
|         self.text = parsed.content | ||||
|         if self.text is not None: | ||||
|             self.text = self.text.strip() | ||||
|  | ||||
|         self.date = parsed.created | ||||
|         self.archive_path = self.convert_to_pdf(document_path, file_name) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H