mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Updates the tika client library and handles the changes to it
This commit is contained in:
		
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							| @@ -66,6 +66,7 @@ scipy = "==1.8.1" | ||||
| reportlab = "==3.6.12" | ||||
| # Pin this until piwheels is building a newer version (see https://www.piwheels.org/project/cryptography/) | ||||
| cryptography = "==40.0.1" | ||||
| httpx = "*" | ||||
|  | ||||
| [dev-packages] | ||||
| # Linting | ||||
|   | ||||
							
								
								
									
										10
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										10
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							| @@ -1,7 +1,7 @@ | ||||
| { | ||||
|     "_meta": { | ||||
|         "hash": { | ||||
|             "sha256": "db3fc8c37931534327f89c6211581495328b6f6bf2c533df848fa23faa5d0cd3" | ||||
|             "sha256": "e63cdbb928210fc4dcf0554bde381abd0ff956923ae03ab9f6984025cd5a454d" | ||||
|         }, | ||||
|         "pipfile-spec": 6, | ||||
|         "requires": {}, | ||||
| @@ -712,7 +712,7 @@ | ||||
|                 "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd", | ||||
|                 "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd" | ||||
|             ], | ||||
|             "markers": "python_version >= '3.7'", | ||||
|             "index": "pypi", | ||||
|             "version": "==0.24.1" | ||||
|         }, | ||||
|         "humanfriendly": { | ||||
| @@ -1746,11 +1746,11 @@ | ||||
|         }, | ||||
|         "tika-client": { | ||||
|             "hashes": [ | ||||
|                 "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0", | ||||
|                 "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d" | ||||
|                 "sha256:6110bd73eaa133f9c8eb1ef2566e6c0c8123a0e4efbcfb85b86f8c1b26cb4de2", | ||||
|                 "sha256:e8eaa52771c72426f5531c53dcc8dfc5e3bb6e1f91f89fc93674a81bfca59d6d" | ||||
|             ], | ||||
|             "index": "pypi", | ||||
|             "version": "==0.1.1" | ||||
|             "version": "==0.2.0" | ||||
|         }, | ||||
|         "tornado": { | ||||
|             "hashes": [ | ||||
|   | ||||
| @@ -13,7 +13,6 @@ from humanfriendly import format_size | ||||
| from imap_tools import MailAttachment | ||||
| from imap_tools import MailMessage | ||||
| from tika_client import TikaClient | ||||
| from tika_client.data_models import TikaKey | ||||
|  | ||||
| from documents.parsers import DocumentParser | ||||
| from documents.parsers import ParseError | ||||
| @@ -175,12 +174,8 @@ class MailDocumentParser(DocumentParser): | ||||
|             with TikaClient(tika_url=self.tika_server) as client: | ||||
|                 parsed = client.tika.as_text.from_buffer(html, "text/html") | ||||
|  | ||||
|                 if hasattr(parsed, "content") and parsed.content is not None: | ||||
|                 if parsed.content is not None: | ||||
|                     return parsed.content.strip() | ||||
|                 elif TikaKey.Content in parsed.data: | ||||
|                     # May not be a completely handled type, but | ||||
|                     # the Tika response may still include content | ||||
|                     return parsed.data[TikaKey.Content].strip() | ||||
|                 return "" | ||||
|         except Exception as err: | ||||
|             raise ParseError( | ||||
|   | ||||
| @@ -4,7 +4,6 @@ from pathlib import Path | ||||
| import httpx | ||||
| from django.conf import settings | ||||
| from tika_client import TikaClient | ||||
| from tika_client.data_models import TikaKey | ||||
|  | ||||
| from documents.parsers import DocumentParser | ||||
| from documents.parsers import ParseError | ||||
| @@ -59,13 +58,9 @@ class TikaDocumentParser(DocumentParser): | ||||
|                 f"{settings.TIKA_ENDPOINT}: {err}", | ||||
|             ) from err | ||||
|  | ||||
|         self.text = None | ||||
|         if hasattr(parsed, "content") and parsed.content is not None: | ||||
|             self.text = parsed.content.strip() | ||||
|         elif TikaKey.Content in parsed.data: | ||||
|             # May not be a completely handled type, but | ||||
|             # the Tika response may still include content | ||||
|             self.text = parsed.data[TikaKey.Content].strip() | ||||
|         self.text = parsed.content | ||||
|         if self.text is not None: | ||||
|             self.text = self.text.strip() | ||||
|  | ||||
|         self.date = parsed.created | ||||
|         self.archive_path = self.convert_to_pdf(document_path, file_name) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H