mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-13 10:03:49 -05:00
Messing with conversion of azure output to hocr
This commit is contained in:
parent
ec505e41fa
commit
0e9d2f6831
src/paperless_remote
@ -109,6 +109,13 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
def get_bbox_from_polygon(self, polygon) -> str:  # Sequence[Point]
    """Return the axis-aligned bounding box of *polygon* as a string.

    The result has the form ``"x_min y_min x_max y_max"``, with the four
    values separated by single spaces and formatted exactly as the
    coordinates' own ``str()`` output (no rounding is applied).

    Args:
        polygon: An iterable of objects exposing ``x`` and ``y``
            attributes, or ``None``.

    Returns:
        The bounding-box string, or ``"0 0 0 0"`` when *polygon* is
        ``None`` or contains no points.
    """
    # Materialise once: a one-shot iterator is always truthy and would be
    # exhausted by the first pass, leaving nothing for the second one.
    points = list(polygon) if polygon else []
    if not points:
        return "0 0 0 0"
    x_coordinates = [point.x for point in points]
    y_coordinates = [point.y for point in points]
    return (
        f"{min(x_coordinates)} {min(y_coordinates)} "
        f"{max(x_coordinates)} {max(y_coordinates)}"
    )
|
||||||
|
|
||||||
def azure_ai_vision_parse(
|
def azure_ai_vision_parse(
|
||||||
self,
|
self,
|
||||||
file: Path,
|
file: Path,
|
||||||
@ -130,6 +137,21 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
)
|
)
|
||||||
result = poller.result()
|
result = poller.result()
|
||||||
|
|
||||||
|
hocr = "<html><body>"
|
||||||
|
|
||||||
|
for page_number, page in enumerate(result.pages, start=1):
|
||||||
|
hocr += f'<div class="ocr_page" id="page_{page_number}" title="bbox 0 0 {page.width} {page.height}">'
|
||||||
|
|
||||||
|
for idx, word in enumerate(page.words):
|
||||||
|
bbox = self.get_bbox_from_polygon(word.polygon)
|
||||||
|
hocr += f'<span class="ocr_word" id="line_{page_number}_{idx}" title="bbox {bbox}">{word.content}</span>'
|
||||||
|
|
||||||
|
hocr += "</div>"
|
||||||
|
|
||||||
|
hocr += "</body></html>"
|
||||||
|
|
||||||
|
self.log.info(f"HOCR output: {hocr}")
|
||||||
|
|
||||||
return result.content
|
return result.content
|
||||||
|
|
||||||
def google_cloud_vision_parse(
|
def google_cloud_vision_parse(
|
||||||
|
@ -34,6 +34,44 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
def test_get_text_with_azure(self, mock_azure_client):
|
def test_get_text_with_azure(self, mock_azure_client):
|
||||||
result = mock.Mock()
|
result = mock.Mock()
|
||||||
result.content = "This is a test document."
|
result.content = "This is a test document."
|
||||||
|
result.pages = [
|
||||||
|
mock.Mock(
|
||||||
|
width=100,
|
||||||
|
height=100,
|
||||||
|
words=[
|
||||||
|
mock.Mock(
|
||||||
|
content="This",
|
||||||
|
polygon=[
|
||||||
|
mock.Mock(x=0, y=0),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
mock.Mock(
|
||||||
|
content="is",
|
||||||
|
polygon=[
|
||||||
|
mock.Mock(x=10, y=10),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
mock.Mock(
|
||||||
|
content="a",
|
||||||
|
polygon=[
|
||||||
|
mock.Mock(x=20, y=20),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
mock.Mock(
|
||||||
|
content="test",
|
||||||
|
polygon=[
|
||||||
|
mock.Mock(x=30, y=30),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
mock.Mock(
|
||||||
|
content="document.",
|
||||||
|
polygon=[
|
||||||
|
mock.Mock(x=40, y=40),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
|
mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
|
||||||
result
|
result
|
||||||
|
Loading…
x
Reference in New Issue
Block a user