mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Working around current TIKA Library Bugs
This commit is contained in:
		| @@ -1,8 +1,13 @@ | ||||
| import os | ||||
| from pathlib import Path | ||||
|  | ||||
| import array | ||||
|  | ||||
| import dateutil.parser | ||||
| import httpx | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
|  | ||||
| from tika_client import TikaClient | ||||
|  | ||||
| from documents.parsers import DocumentParser | ||||
| @@ -51,7 +56,9 @@ class TikaDocumentParser(DocumentParser): | ||||
|  | ||||
|         try: | ||||
|             with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: | ||||
|                 parsed = client.tika.as_text.from_file(document_path, mime_type) | ||||
|                 with open(document_path, 'rb') as f: | ||||
|                     content = f.read() | ||||
|                     parsed = client.tika.as_text.from_buffer(content, mime_type) | ||||
|         except Exception as err: | ||||
|             raise ParseError( | ||||
|                 f"Could not parse {document_path} with tika server at " | ||||
| @@ -62,7 +69,9 @@ class TikaDocumentParser(DocumentParser): | ||||
|         if self.text is not None: | ||||
|             self.text = self.text.strip() | ||||
|  | ||||
|         self.date = parsed.created | ||||
|         tz = timezone.get_current_timezone() | ||||
|  | ||||
|         self.date = timezone.make_aware(parsed.created,tz) | ||||
|         self.archive_path = self.convert_to_pdf(document_path, file_name) | ||||
|  | ||||
|     def convert_to_pdf(self, document_path, file_name): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Simon Siebert
					Simon Siebert