From 0e9d2f6831d5e425d462d378431b885023c28077 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Fri, 8 Mar 2024 22:28:14 -0800 Subject: [PATCH] Messing with conversion of azure output to hocr --- src/paperless_remote/parsers.py | 22 +++++++++++++ src/paperless_remote/tests/test_parser.py | 38 +++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py index f6bd4594e..3bfb1ce0f 100644 --- a/src/paperless_remote/parsers.py +++ b/src/paperless_remote/parsers.py @@ -109,6 +109,13 @@ class RemoteDocumentParser(RasterisedDocumentParser): return "\n".join(lines) + def get_bbox_from_polygon(self, polygon) -> str: # Sequence[Point] + if not polygon: + return "0 0 0 0" + x_coordinates = [point.x for point in polygon] + y_coordinates = [point.y for point in polygon] + return f"{min(x_coordinates)} {min(y_coordinates)} {max(x_coordinates)} {max(y_coordinates)}" + def azure_ai_vision_parse( self, file: Path, @@ -130,6 +137,21 @@ class RemoteDocumentParser(RasterisedDocumentParser): ) result = poller.result() + hocr = "" + + for page_number, page in enumerate(result.pages, start=1): + hocr += f'
' + + for idx, word in enumerate(page.words): + bbox = self.get_bbox_from_polygon(word.polygon) + hocr += f'{word.content}' + + hocr += "
" + + hocr += "" + + self.log.info(f"HOCR output: {hocr}") + return result.content def google_cloud_vision_parse( diff --git a/src/paperless_remote/tests/test_parser.py b/src/paperless_remote/tests/test_parser.py index 3283eeffc..188cac916 100644 --- a/src/paperless_remote/tests/test_parser.py +++ b/src/paperless_remote/tests/test_parser.py @@ -34,6 +34,44 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_get_text_with_azure(self, mock_azure_client): result = mock.Mock() result.content = "This is a test document." + result.pages = [ + mock.Mock( + width=100, + height=100, + words=[ + mock.Mock( + content="This", + polygon=[ + mock.Mock(x=0, y=0), + ], + ), + mock.Mock( + content="is", + polygon=[ + mock.Mock(x=10, y=10), + ], + ), + mock.Mock( + content="a", + polygon=[ + mock.Mock(x=20, y=20), + ], + ), + mock.Mock( + content="test", + polygon=[ + mock.Mock(x=30, y=30), + ], + ), + mock.Mock( + content="document.", + polygon=[ + mock.Mock(x=40, y=40), + ], + ), + ], + ), + ] mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = ( result