mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Updates the tika client library and handles the changes to it
This commit is contained in:
		
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							@@ -66,6 +66,7 @@ scipy = "==1.8.1"
 | 
			
		||||
reportlab = "==3.6.12"
 | 
			
		||||
# Pin this until piwheels is building a newer version (see https://www.piwheels.org/project/cryptography/)
 | 
			
		||||
cryptography = "==40.0.1"
 | 
			
		||||
httpx = "*"
 | 
			
		||||
 | 
			
		||||
[dev-packages]
 | 
			
		||||
# Linting
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										10
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										10
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							@@ -1,7 +1,7 @@
 | 
			
		||||
{
 | 
			
		||||
    "_meta": {
 | 
			
		||||
        "hash": {
 | 
			
		||||
            "sha256": "db3fc8c37931534327f89c6211581495328b6f6bf2c533df848fa23faa5d0cd3"
 | 
			
		||||
            "sha256": "e63cdbb928210fc4dcf0554bde381abd0ff956923ae03ab9f6984025cd5a454d"
 | 
			
		||||
        },
 | 
			
		||||
        "pipfile-spec": 6,
 | 
			
		||||
        "requires": {},
 | 
			
		||||
@@ -712,7 +712,7 @@
 | 
			
		||||
                "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd",
 | 
			
		||||
                "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"
 | 
			
		||||
            ],
 | 
			
		||||
            "markers": "python_version >= '3.7'",
 | 
			
		||||
            "index": "pypi",
 | 
			
		||||
            "version": "==0.24.1"
 | 
			
		||||
        },
 | 
			
		||||
        "humanfriendly": {
 | 
			
		||||
@@ -1746,11 +1746,11 @@
 | 
			
		||||
        },
 | 
			
		||||
        "tika-client": {
 | 
			
		||||
            "hashes": [
 | 
			
		||||
                "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
 | 
			
		||||
                "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
 | 
			
		||||
                "sha256:6110bd73eaa133f9c8eb1ef2566e6c0c8123a0e4efbcfb85b86f8c1b26cb4de2",
 | 
			
		||||
                "sha256:e8eaa52771c72426f5531c53dcc8dfc5e3bb6e1f91f89fc93674a81bfca59d6d"
 | 
			
		||||
            ],
 | 
			
		||||
            "index": "pypi",
 | 
			
		||||
            "version": "==0.1.1"
 | 
			
		||||
            "version": "==0.2.0"
 | 
			
		||||
        },
 | 
			
		||||
        "tornado": {
 | 
			
		||||
            "hashes": [
 | 
			
		||||
 
 | 
			
		||||
@@ -13,7 +13,6 @@ from humanfriendly import format_size
 | 
			
		||||
from imap_tools import MailAttachment
 | 
			
		||||
from imap_tools import MailMessage
 | 
			
		||||
from tika_client import TikaClient
 | 
			
		||||
from tika_client.data_models import TikaKey
 | 
			
		||||
 | 
			
		||||
from documents.parsers import DocumentParser
 | 
			
		||||
from documents.parsers import ParseError
 | 
			
		||||
@@ -175,12 +174,8 @@ class MailDocumentParser(DocumentParser):
 | 
			
		||||
            with TikaClient(tika_url=self.tika_server) as client:
 | 
			
		||||
                parsed = client.tika.as_text.from_buffer(html, "text/html")
 | 
			
		||||
 | 
			
		||||
                if hasattr(parsed, "content") and parsed.content is not None:
 | 
			
		||||
                if parsed.content is not None:
 | 
			
		||||
                    return parsed.content.strip()
 | 
			
		||||
                elif TikaKey.Content in parsed.data:
 | 
			
		||||
                    # May not be a completely handled type, but
 | 
			
		||||
                    # the Tika response may still include content
 | 
			
		||||
                    return parsed.data[TikaKey.Content].strip()
 | 
			
		||||
                return ""
 | 
			
		||||
        except Exception as err:
 | 
			
		||||
            raise ParseError(
 | 
			
		||||
 
 | 
			
		||||
@@ -4,7 +4,6 @@ from pathlib import Path
 | 
			
		||||
import httpx
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from tika_client import TikaClient
 | 
			
		||||
from tika_client.data_models import TikaKey
 | 
			
		||||
 | 
			
		||||
from documents.parsers import DocumentParser
 | 
			
		||||
from documents.parsers import ParseError
 | 
			
		||||
@@ -59,13 +58,9 @@ class TikaDocumentParser(DocumentParser):
 | 
			
		||||
                f"{settings.TIKA_ENDPOINT}: {err}",
 | 
			
		||||
            ) from err
 | 
			
		||||
 | 
			
		||||
        self.text = None
 | 
			
		||||
        if hasattr(parsed, "content") and parsed.content is not None:
 | 
			
		||||
            self.text = parsed.content.strip()
 | 
			
		||||
        elif TikaKey.Content in parsed.data:
 | 
			
		||||
            # May not be a completely handled type, but
 | 
			
		||||
            # the Tika response may still include content
 | 
			
		||||
            self.text = parsed.data[TikaKey.Content].strip()
 | 
			
		||||
        self.text = parsed.content
 | 
			
		||||
        if self.text is not None:
 | 
			
		||||
            self.text = self.text.strip()
 | 
			
		||||
 | 
			
		||||
        self.date = parsed.created
 | 
			
		||||
        self.archive_path = self.convert_to_pdf(document_path, file_name)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user