mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Messing with conversion of azure output to hocr
This commit is contained in:
parent
ec505e41fa
commit
0e9d2f6831
@ -109,6 +109,13 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
def get_bbox_from_polygon(self, polygon) -> str:  # Sequence[Point]
    """Return the axis-aligned bounding box of *polygon* as a bbox string.

    The result has the form ``"x_min y_min x_max y_max"``, built from the
    ``x`` and ``y`` attributes of every point in *polygon*. Coordinates are
    emitted exactly as supplied; no rounding or unit conversion is applied.

    A missing or empty polygon yields ``"0 0 0 0"``.
    """
    # Materialise the points once: the coordinates are scanned twice below,
    # so a one-shot iterable (generator/iterator) would otherwise be
    # exhausted by the first pass and make min() fail on an empty sequence.
    points = list(polygon) if polygon else []
    if not points:
        return "0 0 0 0"
    x_coordinates = [point.x for point in points]
    y_coordinates = [point.y for point in points]
    return f"{min(x_coordinates)} {min(y_coordinates)} {max(x_coordinates)} {max(y_coordinates)}"
|
||||
|
||||
def azure_ai_vision_parse(
|
||||
self,
|
||||
file: Path,
|
||||
@ -130,6 +137,21 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
)
|
||||
result = poller.result()
|
||||
|
||||
hocr = "<html><body>"
|
||||
|
||||
for page_number, page in enumerate(result.pages, start=1):
|
||||
hocr += f'<div class="ocr_page" id="page_{page_number}" title="bbox 0 0 {page.width} {page.height}">'
|
||||
|
||||
for idx, word in enumerate(page.words):
|
||||
bbox = self.get_bbox_from_polygon(word.polygon)
|
||||
hocr += f'<span class="ocr_word" id="line_{page_number}_{idx}" title="bbox {bbox}">{word.content}</span>'
|
||||
|
||||
hocr += "</div>"
|
||||
|
||||
hocr += "</body></html>"
|
||||
|
||||
self.log.info(f"HOCR output: {hocr}")
|
||||
|
||||
return result.content
|
||||
|
||||
def google_cloud_vision_parse(
|
||||
|
@ -34,6 +34,44 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
def test_get_text_with_azure(self, mock_azure_client):
|
||||
result = mock.Mock()
|
||||
result.content = "This is a test document."
|
||||
result.pages = [
|
||||
mock.Mock(
|
||||
width=100,
|
||||
height=100,
|
||||
words=[
|
||||
mock.Mock(
|
||||
content="This",
|
||||
polygon=[
|
||||
mock.Mock(x=0, y=0),
|
||||
],
|
||||
),
|
||||
mock.Mock(
|
||||
content="is",
|
||||
polygon=[
|
||||
mock.Mock(x=10, y=10),
|
||||
],
|
||||
),
|
||||
mock.Mock(
|
||||
content="a",
|
||||
polygon=[
|
||||
mock.Mock(x=20, y=20),
|
||||
],
|
||||
),
|
||||
mock.Mock(
|
||||
content="test",
|
||||
polygon=[
|
||||
mock.Mock(x=30, y=30),
|
||||
],
|
||||
),
|
||||
mock.Mock(
|
||||
content="document.",
|
||||
polygon=[
|
||||
mock.Mock(x=40, y=40),
|
||||
],
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
|
||||
result
|
||||
|
Loading…
x
Reference in New Issue
Block a user