Merge pull request #532 from paperless-ngx/feature-barcode-splitter

Feature barcode splitter
This commit is contained in:
Florian 2022-04-11 08:51:48 +02:00 committed by GitHub
commit 4dedff00b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 778 additions and 1 deletions

View File

@ -132,7 +132,7 @@ jobs:
name: Install system dependencies
run: |
sudo apt-get update -qq
sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng
sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng libzbar0 poppler-utils
-
name: Install Python dependencies
run: |

View File

@ -51,6 +51,8 @@ concurrent-log-handler = "*"
"backports.zoneinfo" = {version = "*", markers = "python_version < '3.9'"}
"importlib-resources" = {version = "*", markers = "python_version < '3.9'"}
zipp = {version = "*", markers = "python_version < '3.9'"}
pyzbar = "*"
pdf2image = "*"
[dev-packages]
coveralls = "*"

24
Pipfile.lock generated
View File

@ -693,6 +693,14 @@
"index": "pypi",
"version": "==2.5.0"
},
"pdf2image": {
"hashes": [
"sha256:84f79f2b8fad943e36323ea4e937fcb05f26ded0caa0a01181df66049e42fb65",
"sha256:d58ed94d978a70c73c2bb7fdf8acbaf2a7089c29ff8141be5f45433c0c4293bb"
],
"index": "pypi",
"version": "==1.16.0"
},
"pdfminer.six": {
"hashes": [
"sha256:af0630f98a292bad4170f54e80f82ca81b916dd0b2c996437ec45c02f11d8762",
@ -960,6 +968,15 @@
],
"version": "==6.0"
},
"pyzbar": {
"hashes": [
"sha256:13e3ee5a2f3a545204a285f41814d5c0db571967e8d4af8699a03afc55182a9c",
"sha256:4559628b8192feb25766d954b36a3753baaf5c97c03135aec7e4a026036b475d",
"sha256:8f4c5264c9c7c6b9f20d01efc52a4eba1ded47d9ba857a94130afe33703eb518"
],
"index": "pypi",
"version": "==0.1.9"
},
"redis": {
"hashes": [
"sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2",
@ -1784,6 +1801,13 @@
],
"version": "==1.6.0"
},
"mypy-extensions": {
"hashes": [
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
],
"version": "==0.4.3"
},
"packaging": {
"hashes": [
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",

View File

@ -613,6 +613,27 @@ PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=<bool>
Defaults to false.
PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool>
Enables the scanning and page separation based on detected barcodes.
This allows for scanning and adding multiple documents per uploaded
file, which are separated by one or multiple barcode pages.
For ease of use, it is suggested to use a standardized separation page,
e.g. `here <https://www.alliancegroup.co.uk/patch-codes.htm>`_.
If no barcodes are detected in the uploaded file, no page separation
will happen.
Defaults to false.
PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
Defines the string to be detected as a separator barcode.
If paperless is used with the PATCH-T separator pages, users
shouldn't change this.
Defaults to "PATCHT"
PAPERLESS_CONVERT_MEMORY_LIMIT=<num>
On smaller systems, or even in the case of Very Large Documents, the consumer

View File

@ -62,6 +62,8 @@
#PAPERLESS_CONSUMER_RECURSIVE=false
#PAPERLESS_CONSUMER_IGNORE_PATTERNS=[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]
#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false
#PAPERLESS_CONSUMER_ENABLE_BARCODES=false
#PAPERLESS_CONSUMER_ENABLE_BARCODES=PATCHT
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
#PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh

View File

@ -61,6 +61,7 @@ ocrmypdf==13.4.2
packaging==21.3; python_version >= '3.6'
pathvalidate==2.5.0
pdfminer.six==20220319
pdf2image==1.16.0
pikepdf==5.1.1
pillow==9.1.0
pluggy==1.0.0; python_version >= '3.6'
@ -79,6 +80,7 @@ python-magic==0.4.25
pytz-deprecation-shim==0.1.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
pytz==2022.1
pyyaml==6.0
pyzbar==0.1.9
redis==3.5.3
regex==2022.3.2; python_version >= '3.6'
reportlab==3.6.9; python_version >= '3.7' and python_version < '4'

View File

@ -1,6 +1,12 @@
import logging
import os
import shutil
import tempfile
from typing import List # for type hinting. Can be removed, if only Python >3.8 is used
import tqdm
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.db.models.signals import post_save
from documents import index
@ -14,8 +20,12 @@ from documents.models import Document
from documents.models import DocumentType
from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from pdf2image import convert_from_path
from pikepdf import Pdf
from pyzbar import pyzbar
from whoosh.writing import AsyncWriter
logger = logging.getLogger("paperless.tasks")
@ -62,6 +72,115 @@ def train_classifier():
logger.warning("Classifier error: " + str(e))
def barcode_reader(image) -> List[str]:
"""
Read any barcodes contained in image
Returns a list containing all found barcodes
"""
barcodes = []
# Decode the barcode image
detected_barcodes = pyzbar.decode(image)
if detected_barcodes:
# Traverse through all the detected barcodes in image
for barcode in detected_barcodes:
if barcode.data:
decoded_barcode = barcode.data.decode("utf-8")
barcodes.append(decoded_barcode)
logger.debug(
f"Barcode of type {str(barcode.type)} found: {decoded_barcode}",
)
return barcodes
def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
"""
Scan the provided file for page separating barcodes
Returns a list of pagenumbers, which separate the file
"""
separator_page_numbers = []
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
# use a temporary directory in case the file os too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(filepath, output_folder=path)
for current_page_number, page in enumerate(pages_from_path):
current_barcodes = barcode_reader(page)
if separator_barcode in current_barcodes:
separator_page_numbers.append(current_page_number)
return separator_page_numbers
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
"""
Separate the provided file on the pages_to_split_on.
The pages which are defined by page_numbers will be removed.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
fname = os.path.splitext(os.path.basename(filepath))[0]
pdf = Pdf.open(filepath)
document_paths = []
logger.debug(f"Temp dir is {str(tempdir)}")
if not pages_to_split_on:
logger.warning("No pages to split on!")
else:
# go from the first page to the first separator page
dst = Pdf.new()
for n, page in enumerate(pdf.pages):
if n < pages_to_split_on[0]:
dst.pages.append(page)
output_filename = "{}_document_0.pdf".format(fname)
savepath = os.path.join(tempdir, output_filename)
with open(savepath, "wb") as out:
dst.save(out)
document_paths = [savepath]
# iterate through the rest of the document
for count, page_number in enumerate(pages_to_split_on):
logger.debug(f"Count: {str(count)} page_number: {str(page_number)}")
dst = Pdf.new()
try:
next_page = pages_to_split_on[count + 1]
except IndexError:
next_page = len(pdf.pages)
# skip the first page_number. This contains the barcode page
for page in range(page_number + 1, next_page):
logger.debug(
f"page_number: {str(page_number)} next_page: {str(next_page)}",
)
dst.pages.append(pdf.pages[page])
output_filename = "{}_document_{}.pdf".format(fname, str(count + 1))
logger.debug(f"pdf no:{str(count)} has {str(len(dst.pages))} pages")
savepath = os.path.join(tempdir, output_filename)
with open(savepath, "wb") as out:
dst.save(out)
document_paths.append(savepath)
logger.debug(f"Temp files are {str(document_paths)}")
return document_paths
def save_to_dir(
filepath: str,
newname: str = None,
target_dir: str = settings.CONSUMPTION_DIR,
):
"""
Copies filepath to target_dir.
Optionally rename the file.
"""
if os.path.isfile(filepath) and os.path.isdir(target_dir):
dst = shutil.copy(filepath, target_dir)
logging.debug(f"saved {str(filepath)} to {str(dst)}")
if newname:
dst_new = os.path.join(target_dir, newname)
logger.debug(f"moving {str(dst)} to {str(dst_new)}")
os.rename(dst, dst_new)
else:
logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
def consume_file(
path,
override_filename=None,
@ -72,6 +191,48 @@ def consume_file(
task_id=None,
):
# check for separators in current document
if settings.CONSUMER_ENABLE_BARCODES:
separators = []
document_list = []
separators = scan_file_for_separating_barcodes(path)
if separators:
logger.debug(f"Pages with separators found in: {str(path)}")
document_list = separate_pages(path, separators)
if document_list:
for n, document in enumerate(document_list):
# save to consumption dir
# rename it to the original filename with number prefix
if override_filename:
newname = f"{str(n)}_" + override_filename
else:
newname = None
save_to_dir(document, newname=newname)
# if we got here, the document was successfully split
# and can safely be deleted
logger.debug("Deleting file {}".format(path))
os.unlink(path)
# notify the sender, otherwise the progress bar
# in the UI stays stuck
payload = {
"filename": override_filename,
"task_id": task_id,
"current_progress": 100,
"max_progress": 100,
"status": "SUCCESS",
"message": "finished",
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except OSError as e:
logger.warning("OSError. It could be, the broker cannot be reached.")
logger.warning(str(e))
return "File successfully split"
# continue with consumption if no barcode was found
document = Consumer().try_consume_file(
path,
override_filename=override_filename,

Binary file not shown.

After

Width:  |  Height:  |  Size: 836 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 891 B

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 337 B

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.4 KiB

View File

@ -1,7 +1,10 @@
import os
import shutil
import tempfile
from unittest import mock
from django.conf import settings
from django.test import override_settings
from django.test import TestCase
from django.utils import timezone
from documents import tasks
@ -12,6 +15,7 @@ from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from documents.sanity_checker import SanityCheckMessages
from documents.tests.utils import DirectoriesMixin
from PIL import Image
class TestTasks(DirectoriesMixin, TestCase):
@ -89,6 +93,318 @@ class TestTasks(DirectoriesMixin, TestCase):
mtime3 = os.stat(settings.MODEL_FILE).st_mtime
self.assertNotEqual(mtime2, mtime3)
def test_barcode_reader(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-PATCHT.png",
)
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(tasks.barcode_reader(img), [separator_barcode])
def test_barcode_reader2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t.pbm",
)
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(tasks.barcode_reader(img), [separator_barcode])
def test_barcode_reader_distorsion(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-PATCHT-distorsion.png",
)
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(tasks.barcode_reader(img), [separator_barcode])
def test_barcode_reader_distorsion2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-PATCHT-distorsion2.png",
)
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(tasks.barcode_reader(img), [separator_barcode])
def test_barcode_reader_unreadable(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-PATCHT-unreadable.png",
)
img = Image.open(test_file)
self.assertEqual(tasks.barcode_reader(img), [])
def test_barcode_reader_qr(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"qr-code-PATCHT.png",
)
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(tasks.barcode_reader(img), [separator_barcode])
def test_barcode_reader_128(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-128-PATCHT.png",
)
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(tasks.barcode_reader(img), [separator_barcode])
def test_barcode_reader_no_barcode(self):
test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.png")
img = Image.open(test_file)
self.assertEqual(tasks.barcode_reader(img), [])
def test_barcode_reader_custom_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-custom.png",
)
img = Image.open(test_file)
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
def test_barcode_reader_custom_qr_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-qr-custom.png",
)
img = Image.open(test_file)
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
def test_barcode_reader_custom_128_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-128-custom.png",
)
img = Image.open(test_file)
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
def test_scan_file_for_separating_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t.pdf",
)
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
def test_scan_file_for_separating_barcodes2(self):
test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [])
def test_scan_file_for_separating_barcodes3(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t-middle.pdf",
)
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [1])
def test_scan_file_for_separating_barcodes4(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"several-patcht-codes.pdf",
)
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [2, 5])
def test_scan_file_for_separating_barcodes_upsidedown(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t-middle_reverse.pdf",
)
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [1])
def test_scan_file_for_separating_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t-qr.pdf",
)
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-custom.pdf",
)
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-qr-custom.pdf",
)
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_128_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-128-custom.pdf",
)
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
def test_scan_file_for_separating_wrong_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-custom.pdf",
)
pages = tasks.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [])
def test_separate_pages(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t-middle.pdf",
)
pages = tasks.separate_pages(test_file, [1])
self.assertEqual(len(pages), 2)
def test_separate_pages_no_list(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t-middle.pdf",
)
with self.assertLogs("paperless.tasks", level="WARNING") as cm:
pages = tasks.separate_pages(test_file, [])
self.assertEqual(pages, [])
self.assertEqual(
cm.output,
[
f"WARNING:paperless.tasks:No pages to split on!",
],
)
def test_save_to_dir(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
tasks.save_to_dir(test_file, target_dir=tempdir)
target_file = os.path.join(tempdir, "patch-code-t.pdf")
self.assertTrue(os.path.isfile(target_file))
def test_save_to_dir2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t.pdf",
)
nonexistingdir = "/nowhere"
if os.path.isdir(nonexistingdir):
self.fail("non-existing dir exists")
else:
with self.assertLogs("paperless.tasks", level="WARNING") as cm:
tasks.save_to_dir(test_file, target_dir=nonexistingdir)
self.assertEqual(
cm.output,
[
f"WARNING:paperless.tasks:{str(test_file)} or {str(nonexistingdir)} don't exist.",
],
)
def test_save_to_dir3(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
tasks.save_to_dir(test_file, newname="newname.pdf", target_dir=tempdir)
target_file = os.path.join(tempdir, "newname.pdf")
self.assertTrue(os.path.isfile(target_file))
def test_barcode_splitter(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t-middle.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
separators = tasks.scan_file_for_separating_barcodes(test_file)
self.assertTrue(separators)
document_list = tasks.separate_pages(test_file, separators)
self.assertTrue(document_list)
for document in document_list:
tasks.save_to_dir(document, target_dir=tempdir)
target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t-middle.pdf",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_success(self, m):
m.return_value = SanityCheckMessages()

View File

@ -498,6 +498,12 @@ CONSUMER_IGNORE_PATTERNS = list(
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
CONSUMER_ENABLE_BARCODES = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_BARCODES",
)
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))