mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Working around current TIKA Library Bugs
This commit is contained in:
		@@ -1,8 +1,13 @@
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import array
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import dateutil.parser
 | 
				
			||||||
import httpx
 | 
					import httpx
 | 
				
			||||||
from django.conf import settings
 | 
					from django.conf import settings
 | 
				
			||||||
 | 
					from django.utils import timezone
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from tika_client import TikaClient
 | 
					from tika_client import TikaClient
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from documents.parsers import DocumentParser
 | 
					from documents.parsers import DocumentParser
 | 
				
			||||||
@@ -51,7 +56,9 @@ class TikaDocumentParser(DocumentParser):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
 | 
					            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
 | 
				
			||||||
                parsed = client.tika.as_text.from_file(document_path, mime_type)
 | 
					                with open(document_path, 'rb') as f:
 | 
				
			||||||
 | 
					                    content = f.read()
 | 
				
			||||||
 | 
					                    parsed = client.tika.as_text.from_buffer(content, mime_type)
 | 
				
			||||||
        except Exception as err:
 | 
					        except Exception as err:
 | 
				
			||||||
            raise ParseError(
 | 
					            raise ParseError(
 | 
				
			||||||
                f"Could not parse {document_path} with tika server at "
 | 
					                f"Could not parse {document_path} with tika server at "
 | 
				
			||||||
@@ -62,7 +69,9 @@ class TikaDocumentParser(DocumentParser):
 | 
				
			|||||||
        if self.text is not None:
 | 
					        if self.text is not None:
 | 
				
			||||||
            self.text = self.text.strip()
 | 
					            self.text = self.text.strip()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.date = parsed.created
 | 
					        tz = timezone.get_current_timezone()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.date = timezone.make_aware(parsed.created,tz)
 | 
				
			||||||
        self.archive_path = self.convert_to_pdf(document_path, file_name)
 | 
					        self.archive_path = self.convert_to_pdf(document_path, file_name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def convert_to_pdf(self, document_path, file_name):
 | 
					    def convert_to_pdf(self, document_path, file_name):
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user