diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py
index f6bd4594e..3bfb1ce0f 100644
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@@ -109,6 +109,13 @@ class RemoteDocumentParser(RasterisedDocumentParser):
return "\n".join(lines)
+ def get_bbox_from_polygon(self, polygon) -> str: # Sequence[Point]
+     if polygon:
+         xs, ys = [p.x for p in polygon], [p.y for p in polygon]
+         return f"{min(xs)} {min(ys)} {max(xs)} {max(ys)}"
+     # No points at all: fall back to an all-zero "x_min y_min x_max y_max" box.
+     return "0 0 0 0"
+
def azure_ai_vision_parse(
self,
file: Path,
@@ -130,6 +137,21 @@ class RemoteDocumentParser(RasterisedDocumentParser):
)
result = poller.result()
+ hocr = "
"
+
+ for page_number, page in enumerate(result.pages, start=1):
+ hocr += f''
+
+ for idx, word in enumerate(page.words):
+ bbox = self.get_bbox_from_polygon(word.polygon)
+ hocr += f'{word.content}'
+
+ hocr += "
"
+
+ hocr += ""
+
+ self.log.info(f"HOCR output: {hocr}")
+
return result.content
def google_cloud_vision_parse(
diff --git a/src/paperless_remote/tests/test_parser.py b/src/paperless_remote/tests/test_parser.py
index 3283eeffc..188cac916 100644
--- a/src/paperless_remote/tests/test_parser.py
+++ b/src/paperless_remote/tests/test_parser.py
@@ -34,6 +34,44 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_get_text_with_azure(self, mock_azure_client):
result = mock.Mock()
result.content = "This is a test document."
+ result.pages = [
+ mock.Mock(
+ width=100,
+ height=100,
+ words=[
+ mock.Mock(
+ content="This",
+ polygon=[
+ mock.Mock(x=0, y=0),
+ ],
+ ),
+ mock.Mock(
+ content="is",
+ polygon=[
+ mock.Mock(x=10, y=10),
+ ],
+ ),
+ mock.Mock(
+ content="a",
+ polygon=[
+ mock.Mock(x=20, y=20),
+ ],
+ ),
+ mock.Mock(
+ content="test",
+ polygon=[
+ mock.Mock(x=30, y=30),
+ ],
+ ),
+ mock.Mock(
+ content="document.",
+ polygon=[
+ mock.Mock(x=40, y=40),
+ ],
+ ),
+ ],
+ ),
+ ]
mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
result