diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 222b0e0a0..ee22ae929 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -132,7 +132,7 @@ jobs: name: Install system dependencies run: | sudo apt-get update -qq - sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng + sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng libzbar0 poppler-utils - name: Install Python dependencies run: | diff --git a/Pipfile b/Pipfile index 196bae4c9..0ab6775e6 100644 --- a/Pipfile +++ b/Pipfile @@ -51,6 +51,8 @@ concurrent-log-handler = "*" "backports.zoneinfo" = {version = "*", markers = "python_version < '3.9'"} "importlib-resources" = {version = "*", markers = "python_version < '3.9'"} zipp = {version = "*", markers = "python_version < '3.9'"} +pyzbar = "*" +pdf2image = "*" [dev-packages] coveralls = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 7b74fd5b5..00c82da60 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -693,6 +693,14 @@ "index": "pypi", "version": "==2.5.0" }, + "pdf2image": { + "hashes": [ + "sha256:84f79f2b8fad943e36323ea4e937fcb05f26ded0caa0a01181df66049e42fb65", + "sha256:d58ed94d978a70c73c2bb7fdf8acbaf2a7089c29ff8141be5f45433c0c4293bb" + ], + "index": "pypi", + "version": "==1.16.0" + }, "pdfminer.six": { "hashes": [ "sha256:af0630f98a292bad4170f54e80f82ca81b916dd0b2c996437ec45c02f11d8762", @@ -960,6 +968,15 @@ ], "version": "==6.0" }, + "pyzbar": { + "hashes": [ + "sha256:13e3ee5a2f3a545204a285f41814d5c0db571967e8d4af8699a03afc55182a9c", + "sha256:4559628b8192feb25766d954b36a3753baaf5c97c03135aec7e4a026036b475d", + "sha256:8f4c5264c9c7c6b9f20d01efc52a4eba1ded47d9ba857a94130afe33703eb518" + ], + "index": "pypi", + "version": "==0.1.9" + }, "redis": { "hashes": [ "sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2", @@ -1784,6 +1801,13 @@ ], "version": "==1.6.0" }, + "mypy-extensions": { + "hashes": [ + 
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", + "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" + ], + "version": "==0.4.3" + }, "packaging": { "hashes": [ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", diff --git a/docs/configuration.rst b/docs/configuration.rst index 9a59a80ba..3541f2e07 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -613,6 +613,27 @@ PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS= Defaults to false. +PAPERLESS_CONSUMER_ENABLE_BARCODES= + Enables the scanning and page separation based on detected barcodes. + This allows for scanning and adding multiple documents per uploaded + file, which are separated by one or multiple barcode pages. + + For ease of use, it is suggested to use a standardized separation page, + e.g. `here <https://www.alliancegroup.co.uk/patch-codes.htm>`_. + + If no barcodes are detected in the uploaded file, no page separation + will happen. + + Defaults to false. + + +PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT + Defines the string to be detected as a separator barcode. + If paperless is used with the PATCH-T separator pages, users + shouldn't change this. 
+ + Defaults to "PATCHT". + PAPERLESS_CONVERT_MEMORY_LIMIT= On smaller systems, or even in the case of Very Large Documents, the consumer diff --git a/paperless.conf.example b/paperless.conf.example index 0b37e210d..be5071636 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -62,6 +62,8 @@ #PAPERLESS_CONSUMER_RECURSIVE=false #PAPERLESS_CONSUMER_IGNORE_PATTERNS=[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"] #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false +#PAPERLESS_CONSUMER_ENABLE_BARCODES=false +#PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT #PAPERLESS_OPTIMIZE_THUMBNAILS=true #PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh diff --git a/requirements.txt b/requirements.txt index 045ef12ed..7aaf84cc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -61,6 +61,7 @@ ocrmypdf==13.4.2 packaging==21.3; python_version >= '3.6' pathvalidate==2.5.0 pdfminer.six==20220319 +pdf2image==1.16.0 pikepdf==5.1.1 pillow==9.1.0 pluggy==1.0.0; python_version >= '3.6' @@ -79,6 +80,7 @@ python-magic==0.4.25 pytz-deprecation-shim==0.1.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' pytz==2022.1 pyyaml==6.0 +pyzbar==0.1.9 redis==3.5.3 regex==2022.3.2; python_version >= '3.6' reportlab==3.6.9; python_version >= '3.7' and python_version < '4' diff --git a/src/documents/tasks.py b/src/documents/tasks.py index b43f211de..e9a015d67 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -1,6 +1,12 @@ import logging +import os +import shutil +import tempfile +from typing import List # for type hinting. 
Can be removed, if only Python >3.8 is used import tqdm +from asgiref.sync import async_to_sync +from channels.layers import get_channel_layer from django.conf import settings from django.db.models.signals import post_save from documents import index @@ -14,8 +20,12 @@ from documents.models import Document from documents.models import DocumentType from documents.models import Tag from documents.sanity_checker import SanityCheckFailedException +from pdf2image import convert_from_path +from pikepdf import Pdf +from pyzbar import pyzbar from whoosh.writing import AsyncWriter + logger = logging.getLogger("paperless.tasks") @@ -62,6 +72,115 @@ def train_classifier(): logger.warning("Classifier error: " + str(e)) +def barcode_reader(image) -> List[str]: + """ + Read any barcodes contained in image + Returns a list containing all found barcodes + """ + barcodes = [] + # Decode the barcode image + detected_barcodes = pyzbar.decode(image) + + if detected_barcodes: + # Traverse through all the detected barcodes in image + for barcode in detected_barcodes: + if barcode.data: + decoded_barcode = barcode.data.decode("utf-8") + barcodes.append(decoded_barcode) + logger.debug( + f"Barcode of type {str(barcode.type)} found: {decoded_barcode}", + ) + return barcodes + + +def scan_file_for_separating_barcodes(filepath: str) -> List[int]: + """ + Scan the provided file for page separating barcodes + Returns a list of pagenumbers, which separate the file + """ + separator_page_numbers = [] + separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + # use a temporary directory in case the file os too big to handle in memory + with tempfile.TemporaryDirectory() as path: + pages_from_path = convert_from_path(filepath, output_folder=path) + for current_page_number, page in enumerate(pages_from_path): + current_barcodes = barcode_reader(page) + if separator_barcode in current_barcodes: + separator_page_numbers.append(current_page_number) + return separator_page_numbers + + +def 
separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]: + """ + Separate the provided file on the pages_to_split_on. + The pages which are defined by page_numbers will be removed. + Returns a list of (temporary) filepaths to consume. + These will need to be deleted later. + """ + os.makedirs(settings.SCRATCH_DIR, exist_ok=True) + tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) + fname = os.path.splitext(os.path.basename(filepath))[0] + pdf = Pdf.open(filepath) + document_paths = [] + logger.debug(f"Temp dir is {str(tempdir)}") + if not pages_to_split_on: + logger.warning("No pages to split on!") + else: + # go from the first page to the first separator page + dst = Pdf.new() + for n, page in enumerate(pdf.pages): + if n < pages_to_split_on[0]: + dst.pages.append(page) + output_filename = "{}_document_0.pdf".format(fname) + savepath = os.path.join(tempdir, output_filename) + with open(savepath, "wb") as out: + dst.save(out) + document_paths = [savepath] + + # iterate through the rest of the document + for count, page_number in enumerate(pages_to_split_on): + logger.debug(f"Count: {str(count)} page_number: {str(page_number)}") + dst = Pdf.new() + try: + next_page = pages_to_split_on[count + 1] + except IndexError: + next_page = len(pdf.pages) + # skip the first page_number. 
This contains the barcode page + for page in range(page_number + 1, next_page): + logger.debug( + f"page_number: {str(page_number)} next_page: {str(next_page)}", + ) + dst.pages.append(pdf.pages[page]) + output_filename = "{}_document_{}.pdf".format(fname, str(count + 1)) + logger.debug(f"pdf no:{str(count)} has {str(len(dst.pages))} pages") + savepath = os.path.join(tempdir, output_filename) + with open(savepath, "wb") as out: + dst.save(out) + document_paths.append(savepath) + logger.debug(f"Temp files are {str(document_paths)}") + return document_paths + + +def save_to_dir( + filepath: str, + newname: str = None, + target_dir: str = settings.CONSUMPTION_DIR, +): + """ + Copies filepath to target_dir. + Optionally rename the file. + """ + if os.path.isfile(filepath) and os.path.isdir(target_dir): + dst = shutil.copy(filepath, target_dir) + logging.debug(f"saved {str(filepath)} to {str(dst)}") + if newname: + dst_new = os.path.join(target_dir, newname) + logger.debug(f"moving {str(dst)} to {str(dst_new)}") + os.rename(dst, dst_new) + else: + logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.") + + def consume_file( path, override_filename=None, @@ -72,6 +191,48 @@ def consume_file( task_id=None, ): + # check for separators in current document + if settings.CONSUMER_ENABLE_BARCODES: + separators = [] + document_list = [] + separators = scan_file_for_separating_barcodes(path) + if separators: + logger.debug(f"Pages with separators found in: {str(path)}") + document_list = separate_pages(path, separators) + if document_list: + for n, document in enumerate(document_list): + # save to consumption dir + # rename it to the original filename with number prefix + if override_filename: + newname = f"{str(n)}_" + override_filename + else: + newname = None + save_to_dir(document, newname=newname) + # if we got here, the document was successfully split + # and can safely be deleted + logger.debug("Deleting file {}".format(path)) + os.unlink(path) + # notify the 
sender, otherwise the progress bar + # in the UI stays stuck + payload = { + "filename": override_filename, + "task_id": task_id, + "current_progress": 100, + "max_progress": 100, + "status": "SUCCESS", + "message": "finished", + } + try: + async_to_sync(get_channel_layer().group_send)( + "status_updates", + {"type": "status_update", "data": payload}, + ) + except OSError as e: + logger.warning("OSError. It could be, the broker cannot be reached.") + logger.warning(str(e)) + return "File successfully split" + + # continue with consumption if no barcode was found document = Consumer().try_consume_file( path, override_filename=override_filename, diff --git a/src/documents/tests/samples/barcodes/barcode-128-PATCHT.png b/src/documents/tests/samples/barcodes/barcode-128-PATCHT.png new file mode 100644 index 000000000..80517d56d Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-128-PATCHT.png differ diff --git a/src/documents/tests/samples/barcodes/barcode-128-custom.pdf b/src/documents/tests/samples/barcodes/barcode-128-custom.pdf new file mode 100644 index 000000000..f603dff5f Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-128-custom.pdf differ diff --git a/src/documents/tests/samples/barcodes/barcode-128-custom.png b/src/documents/tests/samples/barcodes/barcode-128-custom.png new file mode 100644 index 000000000..c3f1b803a Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-128-custom.png differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png new file mode 100644 index 000000000..3f858f6ad Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png new file mode 100644 index 
000000000..cc81f8e36 Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-unreadable.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-unreadable.png new file mode 100644 index 000000000..1e24b4d84 Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-unreadable.png differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT.png new file mode 100644 index 000000000..0078026c8 Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-39-PATCHT.png differ diff --git a/src/documents/tests/samples/barcodes/barcode-39-custom.pdf b/src/documents/tests/samples/barcodes/barcode-39-custom.pdf new file mode 100644 index 000000000..ca78b93be --- /dev/null +++ b/src/documents/tests/samples/barcodes/barcode-39-custom.pdf @@ -0,0 +1,243 @@ +%PDF-1.6 +%äüöß +2 0 obj +<> +stream +x]Qj0 +t.+Vl1=:m_ަb<xH1\?Ը*"uW8 D?%L Ztأǐ;l7p6s!3. +A-:k<⟿2ΤZj}3z(L zB -#.B<2sHk& +NYRDZ~'6:-%.[';n+ٜ%DQYV7Nun"g#`Rmx +endstream +endobj + +3 0 obj +284 +endobj + +4 0 obj +<> +stream +xn`IdY̢`A8+Mei~2jޭvҶؒƶի$zmIOw4̟75ݲN[ضz5DV/-4fC287o޼y͛7o޼y͛7o޼y͛7o޼y͛7o޼y͛7o޼y͛7o޼y͛7o޼y͛7o޼y͛7o޼y͛7o޼y͛7o޼y͛7o;g +endstream +endobj + +5 0 obj +11207 +endobj + +7 0 obj +<> +stream +xzy\U=UA7BoС + KQ!lI0tD: 4CwqqAQ22sflLEQtt9>͸ddqy&dԧNݪf +s9ls {@KP B~CX:ƹ5MU؞ ;ӄ=C!ٿ *YR< ,^~/"uw"{?C%/_J_wqrI.Tꈟ={=C^EtvxArH䜗N_͛O(ˣMXJ^POAG$#Ov޹X19з] d#vH#1ݰ9Qσ3kl&=d/^s-27 +|#}s({T_ m "1;7]xugBm%i%)eʉQ)>.$uz2 476$U kT/~THM/ԛDQc{y7mP_W[zPvU++W,_VW\T(/w!8f1=Mj*R% X`s9s]C E\ &*oh >\"_9`B@0SK5%>h[G;izJ%I& QFӧׇS +Xzl%`kW1D&Mֆ-m5.7PT!!Lh'%+Nn +LJou4lM-Eڪxn˘5oo{܄uu-m0VXnHP (u8#qڊw=.2DGU@'P. 
ya&y<:@Q#B[0 0zdBZ )Ic\c AXǦ9f4p!V +>Cc)KB A٩[3OZ0\=lVj.)PFkMZDcHꄞUJk$CC{%e6}$y*Ie~Z, ` jI gtn :Hau3)!L{X`Ä!;CQœvx#t,j&;x O^:dJ(ûttK(d4G&;D$'s4CtE%pڝkYHcLʜi.sw^Xf\[7{b((е^uwKQRn}VϿuB*Xr{ Rbfd=Y@6bǑ Q[jӈ&<0*sQ=paJ+2ၑ9gEwԺLjQ3XuUJ6T\F9U*Pmת˾3(^|;/vJzkn,*dvfOeH-5,…SEHxZG&lvٔaIJ6flFh6T<`:H8`v9@0I!:s~H&hVq?r~Χ0)kehvV˅BA(,R(fջr_j'eJke<'ܫfaH,^e?rX?eCK K%h_MK )/A# ABg$\TQ気V:l<>Gcnym'u?*)Zg|K*U~=$4YUziRܞ ?BgɔcKcύ\BtBF(9 L\ ϙ-i@H1 V4rs1]}&.~!z޽=0(=pYES?[|h:zF6>s/9 lqUÙH+FՕilֵ%ҚXF!%]i@]ݜҀ}p̺\g0dXҺ v,)Oښە,Y5SuBIᖔ@ny嚂\GmPZpNU0d3l H>·|C0|ȇ{)~ +|%e:lʇIʂPWa)g2Whrd.x+ÚSm`-S˖phݘT\; ??~st S7]/ſO(AVG> '=+܁qg%["V ͮ1̤,c5QkHLjku>*t;P-\a5DC + +3,XCMe<V桝U<AE_c:Ai^pA滐~1-|rʠ7~ɘIŚ=_x87sm5=5o:kn_ӹ5{z5nuE+pG7m[S[oyi~S76;Y&Ac6N+fUW{;/ݫ!F1!ψ fDȁ\̨x?0"DGSW +tܼXVKYi'55Ns^aѳ pA4 d U)*HO +&Ss^ew0['@rɋQ ;04::5q\uW/*=SU/+!b$h>5*K UZͰӃ &ZeЪ`l޳Vx +ǭp +Zj+ĭe+X + ` +So!8V`Va+ Y!j+X +,&+{Yx^&2 /-g L zY`X LM^ 7|UH+)JB3GƢcFd@Xk3\i.=j4*g \)LLv@ +kJKHe1q +QN}0)DTf$ERfbSo;ʬ?3ypMⵐxLJFCO^󋼩tU0pő+~q5}FJ!(VfMsf:Hé5b& \&LdH&D3g9&Osr&/?O}1E3rQ8&TtD@Y]:k&ن`10E`d`R岢;=ನi؉lɆh6A,/-9`iIHƗ0ܻ [^MQ\" x@:WeŧL:="v7uOx +Yn5ZPoFc'^ϰTӤ +5VCo(v)f+4,b +|~y&Y(^c3SmgyMo2"גF,ϵϙD|22>Z੊i.ogOה ~6ᵏm^e[uV쨛Un7})R];g(5=]ʠ71 荘[|q/=EW9TRr/K~ƷT/j< + 0[x,˯WwJV6W[w1-z㍖M*WۍN<[+-wVttZXKΟAˣ'j$l6iȱo/I3UCwކ-_U\m6YoDU8*@V*eݬ=6iܩƵ>Hm*z7OΤmM;6R 9`}%¤Q7 mF@mJ!ZҭYCJ[8)myJ[J[K.#ǕXXitVک0~' fcyKi +VIF^%({©X͐tY B]J[MT*m q~i#cJ;,PSwԟ+m=Y{Mi)zF.JIΕNS^,u!3twWO\̕,-#]}an}d0 {#ũF+"P0Yq`o=}u@Wx+8-<:KKfq!.> +/"ûzc {b\hk!l S`gx0BH%h`oS-V<k4{yx< Tb8J;rzz;{}K4P^d7\rwc=B{BqIp|3w)?T}?6q[#eQ6hS?:Ke,u8Y+Gn=P'Z ACp@QH4_8֌E@8%͈b !N\,DA=Es$ đ4…PqVsO'4s<)\s0cѾPNJ{Jo߾}!5bѰAK_#@r_I \SSq +B!̥K)Ќx8Wkk$5;e$Lw!luK A(Gߥ,ś#>zl" Q2i4|;RlmUhԅڀȡ:pt._4SH/YrكrĐ*8]#8ym)E]rNķ9qq:"IOa6{pދH(Ww+b4S,?, Ps؄3v#}'d"BvbӋރT.J-3玍f*^:y.ct1E/f-TB%[CIy{h;DEP(;0oShC_{)%BO9j~SNϥ*Gsu(h]=/G6{ۀ7x*rt+qQ(#To$MTR+)ܲl=4:BԷaqA^]Q +)"4.Vlz>sr-876%QycsxPift-a)3|ti܊6qe d˱A=z95˅}# +]f"K?]=4d%>N)q8wt*kXR ]3Yu{ߤ'15|UNwi՜3Ҝ9_ 9{Ĩ-p gh$J-NHR: Lz` NZaYzKbXCo b5Wa +&[(A }JH2> +{#]^Z&Ab^EOJ8SpS+Cdrɧ'?~cc0~ :#G|I5 
4XN+9,9?9t2qR}wX4΍GLJ_ =u)O<'=O2cMcAσ]xnG=Gݞo[䙸m6uOB45hilpeħoMxG>7y݃lA悛/k9| ;tOebŞ@g~'٪-c[58 .lͯ vDڱij~'ҪFUhd=lFCӬVl{?gi5''&/r8PPc_zM{=܋u=] +u}uB[j/j*32.#>㴑1ۍTfj8Z +6Noݔw$DnlOhK;)pAR)Qܖf6%!H!l: ރOR;c2̌0E(HrY !@#C,$ŽpTQlV +endstream +endobj + +8 0 obj +8993 +endobj + +9 0 obj +<> +endobj + +10 0 obj +<> +stream +x]n0E|"RH$}@!E*9d3R5㸬sG,Y 3\zDNQra]7y: Q-]lºM f!穝brkyY˟}@$VŌSkQ.e!ө{d˹3RRfov)YoS8֐w^"5GX'kár%3dΟFq+̟Hפy9Fuί?2ט_q )ίꞟzV]<[Mq ޙQ s>.';{8o@g +endstream +endobj + +11 0 obj +<> +endobj + +12 0 obj +<> +stream +x8kxוΌ~J%ȃm9~1x001_H$#- I!miPh>iCڦiKxLдM ]HMڔmMS.I$ܫ14͏ߎ4s׽wd|40<_Y\.$Ͽ7)>ӂEPlȗOm t9o[dCA_ /j +!$ >C|KB#~/[8pﻹpI>g/;͟q1 >8q-"ĢKHIʏŃ1yd%OGFClxA32sr ]uֈEښvUD TH$+*8CIiIy^fhSHrMɖ7‡$[,T0$5W[]q M{G2NZ̬,RRl8pUSsi/t۬vf+1c&:ELa:#N՜w;)VxNw+jJjSn#*5R[RMvyuJhʍ86`8[S)r@A[^^Vzb% ozv|$X;hbz=VuG1zCdG ۩lu+\K ~n{l`r0v;M=2@DqqvXO(rq,3>ǙWJX>"oH{|]0Q}j&LJɊՆ@XT4$Z}CU& }'=e *LyJ P;RW QnMnEnC@kwϋ b*N)R}n)u +xlgJl]S8{qjh} +.hpu7$vE`=;mhea'ulqߨ:fPsByuf$5mPїE7g=(hDB@jmƧ+mĄ3*mfM0'n(Ub{M5F5uYR=U[![T'F =Mj )d?u0emz-%McaYVrj5؂da9&SqU[&WYy:98:&qI5h 7l/ Z½W4f zbJbF ̤q?z;+:IZ$rgJ&#W<}#:ogj OȨR"EDPK虼)`qF`h\fLOT&Csi8k +hL3l.NJ:xJfx<j24ʐiq:uـj ۥ(c] F'4XHkLtDdJV%KjJoIӵ%DZ +m%)u7 jѹ6Ҁ<&h9 z / )\ij45.Ϸ&|F5M +o_s]h+ +w=V`(¼ ]6[Al3sf {,h4pѠG{Nɸk'v"I=v20GZMyY=}l!s}V.'f`1ku,0(afW,̴pwdu𻏧>o,{zS&ȁOI>{́'ob$yq;ͩwb4An +heIeNg57-:z=Z-xrù\m. \.C,C3M `l &hA#680Gü6Ǩ1x_D"2XDEXKZMhiii^deRbi\56Zh` OLJv.- ͞SZÛߚyC7b֢Z%fKf3,|I6˓5krxAYkRL &#`J 5uD*3r\;+$/ϼ+~uBp)1xLʉd3>r >݇e᯴zĤ-d<F b[H|^wcvH, +~Jͤ.$~fᎽJSMHj NӏK}e 9+&yy33ʤ<.Zz-KL"N%wԈH81檱s!a=i97 +Y?M|H>h|OBv>? 
R˽'=4SOx>o<(0h*yS&o-)؍C`iތo$FVA+.3m"׮-gٲpW|%.ɼ;37CHC؉@L=7_;u&>v睴GoUσ>'{uZ"KY6@YqV[uCh2&vd{T];K5iJlYn1ZJPzk+|0vS;7y>c$'/6n?Izo%$Jt"L6/_b'=eN-քBoϵEl+bҡ;!gE..d41ܲucIW?[fyjc~%k+JVlXg}Fs#7ɋ3s9PPjZ|75Υ5jW4Z-zjCԡ/g[Cyw>T3?} ԳPVRN_wTflca o6flz pO:&A^q;ȱ9 AA.:9Q七;Ug⬇ +Qu(]H_=h,C3N^=wKvŗx8Ca 7H?614߱9ZMbx題u}7iZ!rp11e.>vư21rUw;[F2'bD <@X0Ý*}AnS*3YKU8I +g"븗U8x +B zB U@@T`Sa *,Cت(Ta-T_Ua\~zphNp,Ҽ™ T8 nԟUlؖQؕS\X\[xg8=OGcᝡW Kᠸ.Ed8\wX؋&:|qC_1#|Dkt86FX+^ǽG%4-*@@'}b2 G|btZWxpg8 Ƒu}ub/$E_$ nW +Ɠ>&Cx8l8$/o%D4K\xx$Z#8K`"3kuD0H$M ֠C`"4-`<<!_F>L~>,H Uw`7|.eS*Gb^^m#8/'F1Y?9cHmh< o*n &t$ $h!0*nP4E#IT@cDE#Ds><ذ/VFud2UcQвx}Z822+UeAm(v0?.tNTjĹ\ZT%up]4m!;Q@ۇ!?D! ΤBHUU86@=,[(E0ꋸiGQ>ƞ>f7 +Lpk ^t06-lD]hW>"VćQs-~ LeEu?+IK4WK@ sLvZD܁'~uՍ`~ҳP[Q^*{5nRTdQ\jYEA|l@a6Oڏ hPpy;0(κ͸Gl@l-[+{Yޮڎ0o,VgJGnT|d~.XsÝg#%bj̉Yks)1"ݍaē̟ea'p/p'-@J'kԱm#kqx4UHGC7G`5_wlıq:֐*E*߀8Vx8Dʘ|W#G&:{NA&f338C=2ѷ|&%4x)zW/iuW~{W^]}7BkH)6S[ƛ=CJeG"ׯYB~%4~"_y= :_,;*\IdI0I<{tQ⣗{vԣ';m4)$Ќun=pU1Ev _>jwT GlGG9rD{+l]^"ߛepq`,^{$ b#|\G ň'Fg/w0H-jƢ~]#߯źP;`q6Wml[\Km yGyb[.>5}DsԸ2|v~^k⮪.I\*mp:\vz /.LYa/h`v,ڦ5\b47e^5t-Hd@ &G6UWwNf{;]VTS٢h*пe{O{:|Zw* }nŻө)0qTzDrZH"ICD%͋0r"L&!N@5}"Le0~ }MCd"I(}2R!v P?OAO3 +endstream +endobj + +13 0 obj +6443 +endobj + +14 0 obj +<> +endobj + +15 0 obj +<> +stream +x]n <bL'1&Sp$ ·/\msn])iW,e3T$I*4W sƥ,Iz5;=\2^#Du~ fPƤyb זv?:˟}@SpX5`q\Ѳm+J%E #dI'<ȧs|> +endobj + +17 0 obj +<> +endobj + +18 0 obj +<> +/ProcSet[/PDF/Text/ImageC/ImageI/ImageB] +>> +endobj + +1 0 obj +<>/Contents 2 0 R>> +endobj + +6 0 obj +<> +endobj + +19 0 obj +<> +endobj + +20 0 obj +< +/Producer +/CreationDate(D:20220401110308+02'00')>> +endobj + +xref +0 21 +0000000000 65535 f +0000029273 00000 n +0000000019 00000 n +0000000374 00000 n +0000000394 00000 n +0000011762 00000 n +0000029442 00000 n +0000011784 00000 n +0000020862 00000 n +0000020883 00000 n +0000021078 00000 n +0000021508 00000 n +0000021790 00000 n +0000028320 00000 n +0000028342 00000 n +0000028545 00000 n +0000028904 00000 n +0000029131 00000 n +0000029174 00000 n +0000029541 00000 n 
+0000029638 00000 n +trailer +< +<77F1D5E7090F17C94EF7E43CF16B016F> ] +/DocChecksum /2F8AB12D558369FCB5C37CF4905E79AE +>> +startxref +29813 +%%EOF diff --git a/src/documents/tests/samples/barcodes/barcode-39-custom.png b/src/documents/tests/samples/barcodes/barcode-39-custom.png new file mode 100644 index 000000000..5c2d7b4f7 Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-39-custom.png differ diff --git a/src/documents/tests/samples/barcodes/barcode-qr-custom.pdf b/src/documents/tests/samples/barcodes/barcode-qr-custom.pdf new file mode 100644 index 000000000..0d60b9eed Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-qr-custom.pdf differ diff --git a/src/documents/tests/samples/barcodes/barcode-qr-custom.png b/src/documents/tests/samples/barcodes/barcode-qr-custom.png new file mode 100644 index 000000000..6574638bc Binary files /dev/null and b/src/documents/tests/samples/barcodes/barcode-qr-custom.png differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t-middle.pdf b/src/documents/tests/samples/barcodes/patch-code-t-middle.pdf new file mode 100644 index 000000000..2ccb76947 Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t-middle.pdf differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t-middle_reverse.pdf b/src/documents/tests/samples/barcodes/patch-code-t-middle_reverse.pdf new file mode 100644 index 000000000..e1f8bc39c Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t-middle_reverse.pdf differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t-qr.pdf b/src/documents/tests/samples/barcodes/patch-code-t-qr.pdf new file mode 100644 index 000000000..9d2299510 Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t-qr.pdf differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t.pbm b/src/documents/tests/samples/barcodes/patch-code-t.pbm new file mode 100644 index 
000000000..7e7214070 Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t.pbm differ diff --git a/src/documents/tests/samples/barcodes/patch-code-t.pdf b/src/documents/tests/samples/barcodes/patch-code-t.pdf new file mode 100644 index 000000000..3a8a2a2ff Binary files /dev/null and b/src/documents/tests/samples/barcodes/patch-code-t.pdf differ diff --git a/src/documents/tests/samples/barcodes/qr-code-PATCHT.png b/src/documents/tests/samples/barcodes/qr-code-PATCHT.png new file mode 100644 index 000000000..6f1d587ff Binary files /dev/null and b/src/documents/tests/samples/barcodes/qr-code-PATCHT.png differ diff --git a/src/documents/tests/samples/barcodes/several-patcht-codes.pdf b/src/documents/tests/samples/barcodes/several-patcht-codes.pdf new file mode 100644 index 000000000..de4c715c8 Binary files /dev/null and b/src/documents/tests/samples/barcodes/several-patcht-codes.pdf differ diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py index 952d3d920..c78fa16c2 100644 --- a/src/documents/tests/test_tasks.py +++ b/src/documents/tests/test_tasks.py @@ -1,7 +1,10 @@ import os +import shutil +import tempfile from unittest import mock from django.conf import settings +from django.test import override_settings from django.test import TestCase from django.utils import timezone from documents import tasks @@ -12,6 +15,7 @@ from documents.models import Tag from documents.sanity_checker import SanityCheckFailedException from documents.sanity_checker import SanityCheckMessages from documents.tests.utils import DirectoriesMixin +from PIL import Image class TestTasks(DirectoriesMixin, TestCase): @@ -89,6 +93,318 @@ class TestTasks(DirectoriesMixin, TestCase): mtime3 = os.stat(settings.MODEL_FILE).st_mtime self.assertNotEqual(mtime2, mtime3) + def test_barcode_reader(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-39-PATCHT.png", + ) + img = Image.open(test_file) + 
separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) + + def test_barcode_reader2(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t.pbm", + ) + img = Image.open(test_file) + separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) + + def test_barcode_reader_distorsion(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-39-PATCHT-distorsion.png", + ) + img = Image.open(test_file) + separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) + + def test_barcode_reader_distorsion2(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-39-PATCHT-distorsion2.png", + ) + img = Image.open(test_file) + separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) + + def test_barcode_reader_unreadable(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-39-PATCHT-unreadable.png", + ) + img = Image.open(test_file) + self.assertEqual(tasks.barcode_reader(img), []) + + def test_barcode_reader_qr(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "qr-code-PATCHT.png", + ) + img = Image.open(test_file) + separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) + + def test_barcode_reader_128(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-128-PATCHT.png", + ) + img = Image.open(test_file) + separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) + + def 
test_barcode_reader_no_barcode(self): + test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.png") + img = Image.open(test_file) + self.assertEqual(tasks.barcode_reader(img), []) + + def test_barcode_reader_custom_separator(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-39-custom.png", + ) + img = Image.open(test_file) + self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) + + def test_barcode_reader_custom_qr_separator(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-qr-custom.png", + ) + img = Image.open(test_file) + self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) + + def test_barcode_reader_custom_128_separator(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-128-custom.png", + ) + img = Image.open(test_file) + self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) + + def test_scan_file_for_separating_barcodes(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t.pdf", + ) + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, [0]) + + def test_scan_file_for_separating_barcodes2(self): + test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, []) + + def test_scan_file_for_separating_barcodes3(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t-middle.pdf", + ) + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, [1]) + + def test_scan_file_for_separating_barcodes4(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "several-patcht-codes.pdf", + ) + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, 
[2, 5]) + + def test_scan_file_for_separating_barcodes_upsidedown(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t-middle_reverse.pdf", + ) + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, [1]) + + def test_scan_file_for_separating_qr_barcodes(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t-qr.pdf", + ) + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, [0]) + + @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") + def test_scan_file_for_separating_custom_barcodes(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-39-custom.pdf", + ) + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, [0]) + + @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") + def test_scan_file_for_separating_custom_qr_barcodes(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-qr-custom.pdf", + ) + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, [0]) + + @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") + def test_scan_file_for_separating_custom_128_barcodes(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-128-custom.pdf", + ) + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, [0]) + + def test_scan_file_for_separating_wrong_qr_barcodes(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "barcode-39-custom.pdf", + ) + pages = tasks.scan_file_for_separating_barcodes(test_file) + self.assertEqual(pages, []) + + def test_separate_pages(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t-middle.pdf", + ) + pages = 
tasks.separate_pages(test_file, [1]) + self.assertEqual(len(pages), 2) + + def test_separate_pages_no_list(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t-middle.pdf", + ) + with self.assertLogs("paperless.tasks", level="WARNING") as cm: + pages = tasks.separate_pages(test_file, []) + self.assertEqual(pages, []) + self.assertEqual( + cm.output, + [ + f"WARNING:paperless.tasks:No pages to split on!", + ], + ) + + def test_save_to_dir(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t.pdf", + ) + tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) + tasks.save_to_dir(test_file, target_dir=tempdir) + target_file = os.path.join(tempdir, "patch-code-t.pdf") + self.assertTrue(os.path.isfile(target_file)) + + def test_save_to_dir2(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t.pdf", + ) + nonexistingdir = "/nowhere" + if os.path.isdir(nonexistingdir): + self.fail("non-existing dir exists") + else: + with self.assertLogs("paperless.tasks", level="WARNING") as cm: + tasks.save_to_dir(test_file, target_dir=nonexistingdir) + self.assertEqual( + cm.output, + [ + f"WARNING:paperless.tasks:{str(test_file)} or {str(nonexistingdir)} don't exist.", + ], + ) + + def test_save_to_dir3(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t.pdf", + ) + tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) + tasks.save_to_dir(test_file, newname="newname.pdf", target_dir=tempdir) + target_file = os.path.join(tempdir, "newname.pdf") + self.assertTrue(os.path.isfile(target_file)) + + def test_barcode_splitter(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t-middle.pdf", + ) + tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) + separators = 
tasks.scan_file_for_separating_barcodes(test_file) + self.assertTrue(separators) + document_list = tasks.separate_pages(test_file, separators) + self.assertTrue(document_list) + for document in document_list: + tasks.save_to_dir(document, target_dir=tempdir) + target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") + target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") + self.assertTrue(os.path.isfile(target_file1)) + self.assertTrue(os.path.isfile(target_file2)) + + @override_settings(CONSUMER_ENABLE_BARCODES=True) + def test_consume_barcode_file(self): + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "barcodes", + "patch-code-t-middle.pdf", + ) + dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd") + shutil.copy(test_file, dst) + + self.assertEqual(tasks.consume_file(dst), "File successfully split") + @mock.patch("documents.tasks.sanity_checker.check_sanity") def test_sanity_check_success(self, m): m.return_value = SanityCheckMessages() diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 30d6e87c4..b267ee10f 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -498,6 +498,12 @@ CONSUMER_IGNORE_PATTERNS = list( CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS") +CONSUMER_ENABLE_BARCODES = __get_boolean( + "PAPERLESS_CONSUMER_ENABLE_BARCODES", +) + +CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") + OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))