Messing with conversion of Azure output to hOCR

This commit is contained in:
shamoon 2024-03-08 22:28:14 -08:00
parent ec505e41fa
commit 0e9d2f6831
2 changed files with 60 additions and 0 deletions

View File

@ -109,6 +109,13 @@ class RemoteDocumentParser(RasterisedDocumentParser):
return "\n".join(lines)
def get_bbox_from_polygon(self, polygon) -> str:  # Sequence[Point]
    """Return the axis-aligned bounding box of *polygon* as a string.

    Args:
        polygon: A sequence of point-like objects, each exposing numeric
            ``x`` and ``y`` attributes. May be empty or ``None``.

    Returns:
        The four values ``"<min x> <min y> <max x> <max y>"`` separated by
        single spaces, or ``"0 0 0 0"`` when *polygon* is falsy.
    """
    if polygon:
        # Gather each axis separately so the extremes can be taken per axis.
        xs = tuple(vertex.x for vertex in polygon)
        ys = tuple(vertex.y for vertex in polygon)
        left, top = min(xs), min(ys)
        right, bottom = max(xs), max(ys)
        return f"{left} {top} {right} {bottom}"
    # No vertices to measure: fall back to a degenerate box at the origin.
    return "0 0 0 0"
def azure_ai_vision_parse(
self,
file: Path,
@ -130,6 +137,21 @@ class RemoteDocumentParser(RasterisedDocumentParser):
)
result = poller.result()
hocr = "<html><body>"
for page_number, page in enumerate(result.pages, start=1):
hocr += f'<div class="ocr_page" id="page_{page_number}" title="bbox 0 0 {page.width} {page.height}">'
for idx, word in enumerate(page.words):
bbox = self.get_bbox_from_polygon(word.polygon)
hocr += f'<span class="ocr_word" id="line_{page_number}_{idx}" title="bbox {bbox}">{word.content}</span>'
hocr += "</div>"
hocr += "</body></html>"
self.log.info(f"HOCR output: {hocr}")
return result.content
def google_cloud_vision_parse(

View File

@ -34,6 +34,44 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_get_text_with_azure(self, mock_azure_client):
result = mock.Mock()
result.content = "This is a test document."
result.pages = [
mock.Mock(
width=100,
height=100,
words=[
mock.Mock(
content="This",
polygon=[
mock.Mock(x=0, y=0),
],
),
mock.Mock(
content="is",
polygon=[
mock.Mock(x=10, y=10),
],
),
mock.Mock(
content="a",
polygon=[
mock.Mock(x=20, y=20),
],
),
mock.Mock(
content="test",
polygon=[
mock.Mock(x=30, y=30),
],
),
mock.Mock(
content="document.",
polygon=[
mock.Mock(x=40, y=40),
],
),
],
),
]
mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
result