mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Working around current TIKA Library Bugs
This commit is contained in:
parent
db48d4c576
commit
d875be60d4
@ -1,8 +1,13 @@
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import array
|
||||||
|
|
||||||
|
import dateutil.parser
|
||||||
import httpx
|
import httpx
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
from tika_client import TikaClient
|
from tika_client import TikaClient
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
@ -51,7 +56,9 @@ class TikaDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
||||||
parsed = client.tika.as_text.from_file(document_path, mime_type)
|
with open(document_path, 'rb') as f:
|
||||||
|
content = f.read()
|
||||||
|
parsed = client.tika.as_text.from_buffer(content, mime_type)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise ParseError(
|
raise ParseError(
|
||||||
f"Could not parse {document_path} with tika server at "
|
f"Could not parse {document_path} with tika server at "
|
||||||
@ -62,7 +69,9 @@ class TikaDocumentParser(DocumentParser):
|
|||||||
if self.text is not None:
|
if self.text is not None:
|
||||||
self.text = self.text.strip()
|
self.text = self.text.strip()
|
||||||
|
|
||||||
self.date = parsed.created
|
tz = timezone.get_current_timezone()
|
||||||
|
|
||||||
|
self.date = timezone.make_aware(parsed.created,tz)
|
||||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||||
|
|
||||||
def convert_to_pdf(self, document_path, file_name):
|
def convert_to_pdf(self, document_path, file_name):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user