From 0c676b90f24ded60ddee4a95d43658496e99076f Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Mon, 8 Feb 2021 20:59:14 +0100 Subject: [PATCH] migration for #511 --- .../migrations/1012_fix_archive_files.py | 181 ++++++++++++++++++ src/documents/tests/samples/simple.jpg | Bin 0 -> 17740 bytes src/documents/tests/samples/simple.txt | 1 + .../tests/test_migration_archive_files.py | 175 +++++++++++++++++ 4 files changed, 357 insertions(+) create mode 100644 src/documents/migrations/1012_fix_archive_files.py create mode 100644 src/documents/tests/samples/simple.jpg create mode 100644 src/documents/tests/samples/simple.txt create mode 100644 src/documents/tests/test_migration_archive_files.py diff --git a/src/documents/migrations/1012_fix_archive_files.py b/src/documents/migrations/1012_fix_archive_files.py new file mode 100644 index 000000000..e95715265 --- /dev/null +++ b/src/documents/migrations/1012_fix_archive_files.py @@ -0,0 +1,181 @@ +# Generated by Django 3.1.6 on 2021-02-07 22:26 +import hashlib +import logging +import os +import shutil + +from django.conf import settings +from django.db import migrations + + +logger = logging.getLogger("paperless.migrations") + + +def archive_name_from_filename_old(filename): + return os.path.splitext(filename)[0] + ".pdf" + + +def archive_path_old(doc): + if doc.filename: + fname = archive_name_from_filename_old(doc.filename) + else: + fname = "{:07}.pdf".format(doc.pk) + + return os.path.join( + settings.ARCHIVE_DIR, + fname + ) + + +def archive_name_from_filename_new(filename): + name, ext = os.path.splitext(filename) + if ext == ".pdf": + return filename + else: + return filename + ".pdf" + + +def archive_path_new(doc): + if doc.filename: + fname = archive_name_from_filename_new(doc.filename) + else: + fname = "{:07}.pdf".format(doc.pk) + + return os.path.join( + settings.ARCHIVE_DIR, + fname + ) + + +STORAGE_TYPE_GPG = "gpg" + + +def source_path(doc): + if doc.filename: + fname = str(doc.filename) + else: + fname = "{:07}{}".format(doc.pk, doc.file_type) + if doc.storage_type == STORAGE_TYPE_GPG: + fname += ".gpg" # pragma: no cover + + return os.path.join( + settings.ORIGINALS_DIR, + fname + ) + + +def move_old_to_new_locations(apps, schema_editor): + Document = apps.get_model("documents", "Document") + + affected_document_ids = set() + + old_archive_path_to_id = {} + + # check for documents that have incorrect archive versions + for doc in Document.objects.filter(archive_checksum__isnull=False): + old_path = archive_path_old(doc) + + if not os.path.isfile(old_path): + raise ValueError( + f"Archived document of {doc.filename} does not exist at: " + f"{old_path}") + + if old_path in old_archive_path_to_id: + affected_document_ids.add(doc.id) + affected_document_ids.add(old_archive_path_to_id[old_path]) + else: + old_archive_path_to_id[old_path] = doc.id + + # check that we can regenerate these archive versions + for doc_id in affected_document_ids: + from documents.parsers import get_parser_class_for_mime_type + + doc = Document.objects.get(id=doc_id) + parser_class = get_parser_class_for_mime_type(doc.mime_type) + if not parser_class: + raise Exception( + f"document {doc.filename} has an invalid archived document, " + f"but no parsers are available. Cannot migrate.") + + # move files + for doc in Document.objects.filter(archive_checksum__isnull=False): + old_path = archive_path_old(doc) + new_path = archive_path_new(doc) + + if old_path != new_path and not os.path.isfile(new_path): + logger.debug( + f"Moving {old_path} to {new_path}" + ) + shutil.move(old_path, new_path) + + # regenerate archive documents + for doc_id in affected_document_ids: + from documents.parsers import get_parser_class_for_mime_type, \ + DocumentParser, \ + ParseError + + doc = Document.objects.get(id=doc_id) + logger.info( + f"Regenerating archive document for {doc.filename}" + ) + parser_class = get_parser_class_for_mime_type(doc.mime_type) + parser: DocumentParser = parser_class(None, None) + try: + parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename)) + doc.content = parser.get_text() + if parser.archive_path and os.path.isfile(parser.archive_path): + with open(parser.archive_path, "rb") as f: + doc.archive_checksum = hashlib.md5(f.read()).hexdigest() + shutil.copy2(parser.archive_path, archive_path_new(doc)) + else: + doc.archive_checksum = None + if os.path.isfile(archive_path_new(doc)): + os.unlink(archive_path_new(doc)) + doc.save() + except ParseError: + logger.exception( + f"Unable to regenerate archive document for {doc.filename}" + ) + finally: + parser.cleanup() + + +def move_new_to_old_locations(apps, schema_editor): + Document = apps.get_model("documents", "Document") + + old_archive_paths = set() + + for doc in Document.objects.filter(archive_checksum__isnull=False): + new_archive_path = archive_path_new(doc) + old_archive_path = archive_path_old(doc) + if old_archive_path in old_archive_paths: + raise ValueError( + f"Cannot migrate: Archive file name {old_archive_path} of " + f"document {doc.filename} would clash with another archive " + f"filename.") + old_archive_paths.add(old_archive_path) + if new_archive_path != old_archive_path and os.path.isfile(old_archive_path): + raise ValueError( + f"Cannot migrate: Cannot move {new_archive_path} to " + f"{old_archive_path}: file already exists." + ) + + for doc in Document.objects.filter(archive_checksum__isnull=False): + new_archive_path = archive_path_new(doc) + old_archive_path = archive_path_old(doc) + shutil.move(new_archive_path, old_archive_path) + logger.debug(f"Moving {new_archive_path} to {old_archive_path}") + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '1011_auto_20210101_2340'), + ] + + operations = [ + migrations.RunPython( + move_old_to_new_locations, + move_new_to_old_locations + ) + ] diff --git a/src/documents/tests/samples/simple.jpg b/src/documents/tests/samples/simple.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a8c58af0df6e4b4e6b72179b92aa9df4e2bbd8d8 GIT binary patch literal 17740 zcmd41WmFx_wl>;Jad&rFxVr>*cMq~~cLGaLXzOTg z_lUngVh-=e0097k`H$S@UzqkE?C>uv@DKLZ*OhzJc^oq>ro;chHva?LI(oZ3j$i*# zpYdNYJR$&~^m)7poJdz!B&Q)Cr;oRTjjy*Ymy_*djQ_h(0X~2(pajSQoB$GVecX_b z$PnOrtbG7)zyYuUd>@at02ko&SpUzy|J%s_7vPHwe7rUQAm`y3!VPZGt#`thpZzaJwQMtX*wA?2rhqf3C$p zy#4100RQHrk3B!%e~CbtvH$>EdwBT81OO0$u^HfoLEGhy&gL@jwEQ1SA6~Kq`<9WB{2!7LWtv0&jsl zpa3WYihyFE6et7AfeN4!s0M0)TA&Vi2fPOwfJUGRXa-tKsV3>d;&fL zy+A)O01N^{zz8r3i~-}o7vL)}348;lff-;HmmF$e}hK`;;s2o;15!T@1`ut7K= zJPt7K8*jfE+>2 zAXkt($OGgB@&WmQ{6T@BU{DAs3=|G}0g3=cfnI}RK(U}Vpaf77CkKsBIRP(A29s1eizY5}!@IzS&m-JnmPUQj=15Ht)L1&xEgfF?mx zpc&8{XaTeYS^=$retR>If4p5!ExYt za1uBLoDR+e=YZdW3&2I-QgAuA3S0xO2fqh@05^l%z@6YO@F#FDcmO;M9tBT;zk;X0 zv*3B~5_lE74&DH7fp@_N;3M!U_#Au*z6Rfd@4ycbFa!ocg`h*QAh-|$2oZ!7LII(M z&_Wm>%n$^G1HuL2g$O`|Az~0oh%`hNq5x5bs6jL#IuLz`A;bh?4zYsRK#&kehzrCW z;sx=A1VDlzp^)d0myjq(G$am^07-_VL9!sZkbFoHq!dyCsfN@+-a|e>S|II^kB}Zn zFJu5R0vU&Vg-k(aAq$XY$U0;LvJKgT970YY=a5UtZ^$3WJroRuK~bR?P#h>eln6=+ zrG(Nz>7h(e1e62H4dsUlK}DgG&?itis1j5assYu3>O+m7W>8D04HOA=gt|gKpgvH4 zXb?0M`WzYweGQF+CO}i5>CkNGTWBG)1X=;DhSozHpv}-W=tpP|v=2H69ff{@euK_J z7oaQ9@6b)?F7yC;0zHRbLT{jVFaQRHp~5g=I4}YjF^n8W4WoxK!4NP`7%xl!CIXXy zJ%P!=lwfKwEtoFM5M~OqgxSFCVNNhNm>0|s76c20y?{l+qG4}fNw73n7VIso5LODS zgw?{{!9KuRVV$rZSRZT%HU|3&n}*H9mSO9#P1r8%5OxZ?fc=L3g#&OX92JfU$AuHZ zN#T@mIye&?0q2DC!Uf@?a4EPfToJAc*M#fAjo@Z*E4VG(5$+22g!{sS;Gysr@F;jJ zJOQ2p&xGf~3*e>jN_Z{20p1L6hj+t!;e+ro_*eK0d;z`!{{i2IAHYxG=kROz9~1xu zih_oMg@TVlj6#7zgTjcyio%J)iz0*~h9Zq3hoX$4fue(Ah+>Ljg<^~1h~kFgh2oF$ z3?&>T5+w#D9wh}O6D1d=5Ty*I3Z)+914=8(N0iSfgD9gYUr}aI7ExAFHc)m^j!@1} zE>Uh#9#EmEXsB4I_^2ePl&Ey5%&6?B+^7PmqNq}+a;VCv8mPLcMyTef)~NQVE~uWU zeyG8y;i!?QF{lZssi;|~d8oyx6{xkS4X7=sov5Es2T(^*zoO2dE~2iZZlUg@o}m6h z{f&Bu20=qb!$QMHBSE7=qeo*w<3Qs@6GoFjlR;BNQ%BQ5Gek2(vqrN=b3yY$^GADz z_5$q{S{zz3S_WD!S`k_~S`AtQS_|4ow9jaRXya(#(B{!r&^FL^(T>s1(SD=dp@Y#; z(Xr48(8U!a%{m#K6ZO#h}Ju#9+hV z#t_61$B@BL!qC9b!!W_H!mz_|#_+`O#|XiAi4l#FfRToggHec4j!}!zh|z}8jnR)W ziZO{Xhp~dOfw7NqigAhY2NQ&ef{BGmfJuf)i^+`1fysv{f+>ZmfT@P5gK30mfoX^7 zjOmH#j~R*?ff<9Dh?#-;7PA<$67wBqGiE1dFXk}j7tC4AWy~L#dzdGfmzaOBKv<|) z*jPkZ6j*dvtXN!Ff>`2MvRKMkT37~H=2$jZPFNmT{#YSc5m+%;NmvsUKjM_9kGZm~hwDA?H8MA#JA^wV{9vI zdu&&1AM9Z47ueC*3E1h_Z?Q|TtFarf+pxQ_2e8Mnr?Ho?e_-!npJHEO-{C-UFmUj3 z$Z%+JSa7&-1aTyABSkv z`G&KAvyQWabBuF|^A{I_i;jzrONL8}%Yw^|D}*bFE03#=tA}ffYlG{A>xmnP8;<)5 zHvu;tHxIWIw+8nEZU=5J?g;KA?gH*Q?k?^L?iKDG9t;l?j}VUnj{%Pjj}K22PX8!i@qXh2_^9|e z_$2r=_$>Hb_(J$n_=@p^ zg76Jt8sS^QGQv8-7Q$}ALBg+u^Mv0C_Xy7jZ-_ud=tTHL6hsU}97F;{l0*tbnnXrK zRz!|Ooenp%}oJCwjTuuCexRbb_c!GG2c%68U_>B0L1VVyILP$bI z!c4+VB0?fVqDrDiVori2aVH5Nc}@~Tl0x#9q>QAVq?P0o$q305$uh|n$uY?_DL{%w zicd;G%1FvZDopx>RE1QR)SMJa>P{L!`hql;G?nx%X*uaT(l*jw(lOE*(lydu(x0Ta zWDqh;GGa0sG6WePnK+pOnI@SrnKhXUnJ-xw*=w?7vK+EfvO2O>vQK2AWYc7;WIJR( z$!^IZ(P^41iQB+VgP;^oZP<*9Wq}Zf5 zrnsgAQDRUMQPNN%DETQRDU~R7D9tF5l%ABql#!GPlv$L;l(m#Cl%FZbC}%0xDfcNa zDDSCIsqm>NsaU9Zsl=%isI;j}sqCmcsDh{>s1m5MsEVoTs9LE$Q;k#2QT?Dgq`IUA zsL`p3sA;I#s0FB{s8y)-s4b}-sePzJsiUb=sq?5SsT--gsE4Shs8^_Wsn4nZ(xA}b z(@@f|(D2bn&?wR9(wNgY(0J2?(7dKep~<7EplPJ(q8X-{qFJTcqxnU1Pm4-RKub-F zpcSB%rd6fYr?sMWruCzJP8&y?L0d#yOWR7@OFKckK)XqMLVH68rNf~kr(>eyp%bT5 zq|>1@r*ok5rVFEsp-ZDHpsS{9ru#%UPB%}tL3d2|n;uG!Lr+f6M9)JnL9axwOK(Z< zMDI%Z7Ife~}6NVc`7$YttB_k^%Kch6G8lxej4Wm0_5MvZ$GUHpuO2#I} z9>y`odB#n~6UJL6I1@e-H4}nKkV%F~gUN&m$>hlt!W7Mv##G2u%hbly&os%j!nDiu zi|K(GgPDYxftj0GoLPxkkJ*aZg*kvZk~xX_EpsJv6Z0qLapnc)E#{xhcPwZuL@abH zoGhX&iY&S;mMqRJ{wxtJi7dG+6)a6GpIF9O7Fo7g&RFhP(OHRE8Cbbl#aWeE^;w^? zy0HeazG6*b&1bD)ZDs9eon&2Q-DkZ*fDt$d3Ir=c03n0WK$s#N5I%_Kh&PCAL^+}n z(SsOAEFiWKXNY??3^r0WMmAnHDK=F$BQ`rWFSby&Shh^IQnm)RF1As&dA2RKGq!tn zbaoPUMs{9yDRwn>V|FCFH+wky8}@AWGWJIH9`TTT>CLQXnPZcYhK6;4A=J5DdoFwQv6Y|e7d z51gMlzi=*d?sHyoLAmg_sJS?}M7fl>^tr6LJh(!*V!1N8%D5W2K5vp^gP@=l00fWCOi&2 zzC15^l6dlYYI)juhInRqHhIo?9(b{M$$43Mg?Qz8b$Oriy7Pwc#`0$Imh(38_VP~h zuJIo6-twXH5%V$f@$o(3)8sSfbLI=;d(D^5SHjo8*TXl#x59V8_nRNiPsq=}&&w~( zufcE5@5~>_|B64Izl6Vm{}cZg{uTa1{u==l0b&700e%4)0WASb0at-%0x<$v0_6hD z0(}Bg0^bEr1?~hf1<3^wg2IA|g8G8Cg5HA91rr7H1ZxF51xExI1a}25g`h$NLUckr zLefH-Lgqp)Lcu~YLYYG4Ld`<`LeoM&gnkM=2xALV33CXG3#$kl3p)t=3r7j336}^r z3V#-!6kZoT5xx__6rm7d6A=|r7BLdB7x5E`6iF2+7HJUqEHWvwE^;DrCyFUbA<8Z) zCaNN8Eb1T{Ao@x)U9?oRNwiONO7w^5nHV63BSs^}B_=7RA!aV-BKAxyRxC%XN~}$6 zSZrQwSL{k0E>0xQBrYH>FRmwUEAB1+Qao9_Nc_F{C-F(~b@5a2dkJg_Y6(sWNeK-J za|u_85Q#X6T!|Wq4vA5TC5Z!x8%Z=tGD(D_sHC!_v81D9pk%aUmSlxwtK^X6yyULr zwG@gJi4=>Ju#}RNk(7f}fYfWLOsNW~R;eMWd8u8gYiSf|5@{A`VQD34BWVZeKb$eyr05qqNg#Po^FlV?xjp1gfh`{d)3i6^U1j-TAiV9QX; zaLY)`XvsX4@sxQXlPps#(S1Q*mHz+qRw=4Hs9!;KHo?TvCUR~Zo-d#Ri zK1sevzEQqcep-G@{!#(1K%&5^AgZ9MV5Z=z5UP-%P@wQ$;j_Y&!luH7B21A)kyTMt zQB~1Q(N!^2F;THlu|cs{aawUp@k$9riBt)pB(9{cWTE7)^js-fsaUB=X+UXCX;0}! z8C{uDnNwL>SzFm!*+)52IYYTZxm|fwc}4kH`9TF&gYPo8w>WJ#H>apsB8m=0>nt+<3nz5Rb+B3Cy zwF0#UwLZ03wOzFvb#!$~buRTM>bmN7>i+7{>N)DQ>fP#->Kp188gLCV4R#Gl4J{38 z4IhmtjZBRyjgJ~%G`?$`YeF?iG!dHOni`r`n%xn(Lb9T2L)wErgc1 zmZsKIEpM$TtxTy)(uQl3YO`xgYHMrTX!~iu*3Qwc)9%rp(%#a()wIn(mn% zRF7B>p(mlIrDvn(rx&f4t5>i0S#L&fSMOFIOP^MsUtdw*MBi0ETt7v>OutQkOn*)P z+yG`kYQSzFWuRk#Gzc_^Gsrh+Fc>gcFgP^0H^et&G88dXGqf`FHjFaNHmo)5F`PEs zF}yXxGNLsSFj6uyGjcb2X_RhMW%SW#(rDA@+8EuK+L+f^!PwZ?)%dw_s&R#Jhw&HV z4dY7_G!rTl9us*JV-pvXaFbM%3X=|#FD4r%SEgvDRHnS93Z^EeuBOjT(@ZN(KblUO zZkhfz!!V;U<2O?>Gc)rri!jSHt1;^_n>O1u`(ut{&R{NVu4ew!+}AwX{H^(W^M3OM z^CJt81(5~9LefIV0%;L!kzi45(PA-Xv2O9p64jE@lE+fs(!|ou@}*^lWwm9G<+SCl zuwuin`K*P+iN>-dt?W)Ber9=dtzr`=WG{lmu^>W*JC$hw{LfkBtRmN zQb;|dBQg}3imXI-A*Yag$UA#{dlq|1dtG}6`%wE7`wIIm`ziZ9`#T4G2UZ732R#Qz zhcJg!hbo6|hZ%=`hX+SOM>a=kM*~M^$LEe2jx~;-9p@a6oWM>bPMl71PR35|PLWR8 zPVbxsoR*z_I>VhQo%x)Voh_VwoMW5|oSU6ToxeL@xnR1`yNI}Gy4bk{yCk`kyL@z+ za@ljacO`UXbA94!=<4eF(lyJq-nHL#$@Qlj+>O$W-%Z8M%FWL$&aKF;&FzcZmfNj6 zo;!=Xq`RKGv-@-R4EH+sKKDiUpB``zN)LVy6_2ML{vK~UN<7*J1!4s<1xf_!2f75l49p2^2pkFg9{4*5Cx|6TD#$R%Jt!(D zFQ_SKJZLlMPcT6+Td-`fX|PvtOmITsT}S+#x(XJTv@#_;C35@SEp&&k@gMo|``RejfY03{pi2F#ANS;XLNbAVp$kfQ%$o|OH$g3!vDAp*MDAOqKsJN)o zsII8FsMA-dujpQhz0!N-`YP&G{;Sqkldtw)gI`m;7I>}k+Tr!{*V(TdUyr@sdVLp7 z63r8>5^WRxEIK{9K6)tnd-QD#K@3NXVvJQxU`%pMO-z5xTFiAUZY*1@T&zW`e{5oG zRcvqUO6+AEP8@5TY@At~UtB_5Wn6FEa@^$`>^H1$WZ#&*@q3f_rt(eio0T`0@i_5_ zc-eUKc>nmM_^SB6_|^F91iS?H1o;HZgusN9gxZ9`gzpKri3Ew9iAssqiO&+#6W=9{ zByJ|&C6OfYCaEPMlfsj7lA4mfB<&@GlPQyhlC_gvlB1FflG~G~lTT7mQ|MD9QjAi( zQsPp|Q$D3ErCg-qq#{z~Q!P^iQ&UsxQioGFQvarrr17Syr`e~yNXt!YNt;YNOoye@ zri-T=q6Pnz5Y;WKv`bW@=};WWLHQ%IwUX z%RI}%%woxs&9cY}$V$no%Noww%(~Ae%NEGi%686<$}Y_A%%00W%fZZH$&t;m%n8a# z%XybGnzNk?8E5MGd5&{8l}a8ig? z$W$m(XkHjtm{$0{aIA2*2vS5-Bvxct3JD;8C#iBnQd8kS#DWd z*-Y8ba?El>xk9;hd02T)c}w|p`Dq141#5+T#nXz=itLJ(im8gzN{mX@O8H9b%FxQ3 z%9hIM%F`;$Dnyk+l}%MxRc=*V)lAh{HFh;ywNkZR_4Dey>W=F9>R&auHJml7H4Zfq zHH9@@HA^+uwS=`iwHmd~wXbVSYd_bn)&8j?trM)%t@EgRQ&&|tShraZ)Kk@q)f?6O z)~D3Js~@l5e+PfZ@b1Yw%XiP-WxZ>DH~sGCd#v|t@0H&p-@klc@V@K)();TMq6WSO z?FP4o*oMl6friaSppm*!ywRl5zcH<`q48_u(FgPoEFTm;*nD{YA^$_?hs6(9O@vK+ zOs??O@mEa&ERI5X31u==Ah<`=BDPU=ASLtEgUT>EeR9i%>!j!u>oo54?@aIf(D|+N=SS?1oFCObI)9A*SpIR~<7OAQi?&O; z%c3i!E4Qn?YoY6^o2Z+=TesV*JF&aId!qZW2fYW;qts*H6V+4N)7$go6Yz=Vlhh~k zPtQK(d}{x+@ag(9(Px3rdY`>NCx3qb`RnJCUaVe@UbSB5-k9FX-l5)|K3E@PpIo0! z-;2J&zE6GYefRxT{Sy6V{m=Sy`rG>#`mYB_280F-2K)xn20jc-4V(|+4e|_X4|)tH z4%QD&3?2_*4si^r4Y>@(4pk404DAo24zms`4Lc0K8ZI9m7~UR%jWCYLkJybwjFgP@ zjckm9N9jjpMy*F*j24Z48vQ;7jM0ug8M7J-A1fH^8CxHF7^fMR9=9A18_yr_9$y>3 zpP-(Qny{P*o5-K&o>-f>|3dRc`is?<@Gk{ldcLfGdH72E^~u+#U!Q+1{QCLpk4exZ z-K6ZK&E(6;;>o_r&2NxzjNjzHA-_d^EBiM1ZD$H)ie*Y^%5f@ss%mOv>R=juntfV* z+I2d9x^DW*^vMj)49|@AjMq%cOykV-%*8C>tl+HSY`|>hY}@SO?Cl)+ocNsiT zT+iJ4JTOl;FFS8LA30w(KRCa;fVzNKP+f3Yc(YKq@MYn25qFVyQFqa2F>SGVac=SV z64{d2lG#$oQvOoU()VT1GX1jrGIIIVa^>>K^1%wm3g?REipNUwO5@7R%H=BYs>rIz z>a*3itKF;XYoImyHTgB<+UvE-wb8Ysb*y#nb?tTU^|bZo_4)Oi@8sVlzFU3||6cUH z@B8)-_z%_}sy|$R#Q%8rNC zIwUa=84xy z+DYrl(#hQ^?Wx?U{b|f;?djLk^PfaNMShz74EtI1bKvLR8O9mcna-K-S=L$S+1fei zobg=w+~qvsyy1NI{P!=4U(&y9entJN`ZfOR=LNxq(1qzm=ta>*|Ha-V#wGWq?xp`_ z_GQ=Q_bccX%az)d$5qNz%hlr5-8J2{{I%oto9p-2)7RI(DSk`+w)y?)clGZtzt3)n zZbWY^ZeHA!-HhBE-QwN~+#278+!o&U-|qdv{KNA{|4-nbw|_qU+4_t6m*cPYU%$WE zf4lzvxP#px?lkVa?=tQ>?$++X_bm5n_a66Y_igtp55NP{gUW;3L&`(T!}7z!zw$~R zvj8Cfqym)81Au-A0H{v@Kz0rQFgrZv5tu(LKd#(^A6KORS(AR;|0z(&KQ#;rg+Spj zIQ+kM6f{)$<0=#!j*5YbhW<}^M3@-pnE#yoW8`05!7vB}hK>S9`7f3KN7ch801p*N zeY60g2f%nB2p;I6AE5fLq#-cmpA4b@F8@RVf}um8kIhPtSq?A=6!fTofd)tUN8=Gg zVQ>^wG(3C)bUH$M-T)#7J`qtF9c%xlW?~XjMt)&2uNE>{K4Daws>)B~5%T7f_jgK?TkZXVdwLG5n+}q@$@?@hhX)`!j^y4w>H+XY z5d+NziJJuSQ`jyBab@b&Sn<%R+e^xFu@|az*bfNnZX|6cS2>2x2=}RN_i5>Ft_;B^ zg$~I%S9;wxHK!!+^y`%0XQGkF3930GMRkZcwg?a#YL0?rZ;ghB+Yjg06O`fc0ECZz z>+4JIsw@CUA@Mwce@x(YJ*H8zqmcmWG-V9EUyqT!3#YOY)#KAqHzmCyWt%x9lW7De z_@BD~`6e@JJX#e#F+eyTCzI?uZCz}kxMWj5dVpv9E}_1iGJQ)Hr!i1BMLs06r|n;T z{=?mqWLEA$Ec6ak+o3n3x9Y{K#(H#tmy3q0v_ct?=MBVs#D|C1f1!*s;f40`(jG>I z$0`Tyb+`+eOxv2H7zuyAm{e@{bYZDwXwHXugbi|o3js^ zZw=s!UfXppXQO_K#X5hx?UJ8)^3s$%O}x39d+`A5xOiOGL|VM~^AUfFblj0Ca(lniQu$TL zysdO8pK)(G8zTGX6=~~dc6uZHmOnWDVxFcsQ%RU9C+=Nw3ZYTLPtPthe@2Fvm{<49 zf3Vz@#&@Hu_iA^BSf*sQ+9+jz_7xXK#3rS;si}TOnmZF#)efZ`H{FiqIg_28l^mb= zYUq!Tyzewb&64<|N`#R3?O-gwHi5%fKfxVpZ{8dYK01l9fmv=&sWEz~pv{Fg%(F)K z(YfvkL#8|DZ_Q{1!+fWsGbY|hwc_$TY6@QM12VT3wB7FCcDH{MuDR~a=w(%?|2&tc zQ7G3L$>yd!z#C6ac9};nGAF*Jt8X_BGkKrRFWo@9s@su#$rX6})2cAsaAPr6s#8)S zBV#)F{kTD#6Vy-B{U zDBe^iu^aD(k{ollU*K5Yw*OKS`@u}rqOd$es(P*7T7*tQX2xPiUtLuzu9{4l$~GI{ zQ7Qvh5Cg+CZl-z@i|_FvxY|jYz~Wk;!%wM53pK3C-lWGh)B<(&YvRc9Knn&I?bv|zp!s-{*ta>`_y}iwhbsVBY zWu`}~Tf!`zp0TQg;W2V%G2_Umn@Xj;TRo>Vaq6LdHNA+Ve*OTID8vh82kX~G_1N2! zhsjr~WL;d79#a=?R;85N2{q>sH>!#u*rXy8(}oeo*xgfx50TKuqv?_QTya)3 zI!?ljS36v?c3e1u=r~sy1-D*Xgr`xBxkod~w`c8J( za$w!C*20X-lAj=v*_#7ZQ_7?ngBJTyn>zD4cGa0;*Y<-{hTQH#Np@p#!a1J$wwC%z zBQFh0gJt?`Z*4@Y{EfiUh|rXsfH!H%pU5UMy15nCuO%_2>Qky(wm5|Vb<23$@<)za zgQam>B$28M0sd(ENMW8EJL9jL-^ZWs7r*I=5lp82sUw&`z&EkUi4)Wkbc?T5GIylM zpYp>?uSFW)OKoORxU*@zpRnH@$1-p**b1FIqw3L!!Y=XJbkfUnp%i*1 z&&RLP&&ZPA-zXAeF2uS1eV4^+b)NA-y@g?+x0xoLW`I{yC2W`)w@Jhz*MznGHu;?1{{P)X zy_mS$U!5wa9&ZcRe&%g*vN85*aEy?xYaN?u*FePJPiPE6f1zIB*H^~XJ^9i=cQcvZizNoC4r)|*Muf+H_a>FGM7j8`8Ljls_XU*$)S`$x|wth7Kb zb@{}6qvm3cSg~2ovl`~Sl*T$k1-|v#Tou!OT-%nYiL5Do#nd1v$76<+A1t)?BB*%$oF8tE5ZJMuRzcsQPgv z#DA*|%G{T^nBz6FO)|C3eS7SOQkrhII}XRRl6Ns<)TB!7QhqSX%UsY(_^Ys{ICH7D z({HJ~)79K2Pwj7~!FKa1rZ`uqO8!-6%0MLE2q|9~wL|ks1+#mtXpK+hCwch$!PR7K z$ry)Mwk11NE(YA>1Alae-40&6jRdpNtNI&NZjo1+<4hc$3cOWMM5jFdg^yO^+7&(Z zE=)G^c{~NROQJrx!zzT$Rj1Z`$Y7ZFSjk>8w*F77Y<}f$h9Z2kH=A+%rpx}2s-(0k z)t|ZSvg(tiDdCclIFSec{q_QS*w@n5-+AGLM3`3*^PfXAou93PT z*C`dUzb> zj@V|AlrWUiy7IW!s|;#Rvg(;9N}oMR_TY?{U~6&k&9X4-PVBUxKi3;CTm5Q(q#@7HTT( zL&&<{9Fm>BFX}$VxNeRgYLzEzB8OA#2cmRRnaQr5c8Ku2bHW&2t)y79s}h@g+SmKL zZ*^xLORtl=_xLOcrWSCGm$@umqHmLLj0jH)`l_aqMRlu-rM_7#YW{tT*HlUo8vR>` zr79{K#NEXd{BeA#b{X4ESY$2rggirHn4h!5z_Nx(Ys5cC`^TttE0sykTcKH|xhZFU zJ~yn2y%e%9^TQdR zNhwc`A>614o7=2?@JlP(Rino)k`w6(Jd%rZZLPxB7Y*COyoAu!h7e z$UDZqt%sJ4Br@%yIqtCKn2P?ev_mkam$VukgLVZ&hhC9%#y~?ys8S z%Km9t7CL2)LL8N5lq>g_?#ru8-9`omK6)O)@j%s-;-hrD<4zMM~K5IpmY1FoZTI2f%^(*VT-SEfb8W`v2^D**8zehS<7CRj*mO3eZnd#-u zol>x@NUxgFD^l;=?v#t8IZ;rsZ$XS&^);TSQ+?1Ik>R*8V(ES3c31A-G%b~TR?a;u zCE3PDaV==}n5M-h89=Y^Fde=9$$BVQKB&)$lI-tCmMw0Z@9?ev`(GHEw?mBlc=OhS zEO~0qLfL|sC&k=$zi!&4-NSiMGY|d$>UtI|39PRWvKcO}r-o~-+Xz+s3@mo?NG`J2 zYKLe8x{Te(=roqOx5`SC>9EHor=Tu*#lxfO2l|t z&a#uRm`xIB#s`^A2(k=+ChM3cD6XB}QNkgYsp!rt6;)Zn#@EfVo#g!et<2r&%iUI{^=vDOQ=HPgaqpd) zro;+y8vc!tcJy`qKh%cu3fOgEH?f=*{?H1f@OLM*gYJRrDK9jaRGEtB1f6X5lBbfq zqOwDod6&esrxhKS=2b_{dRa%sLX#GmcQd(=9P z_eeCbl`@D_33Q7(XT~};X)~fc!!>p7Y_twbhW20Wj6`T@adR&0Z07{A3< zo9h-Qo0`I<)L`muDAcdS?X>fbKh!z@yylxiLYp6}Q*_|(54O~bllsh6)LQYssR=^_ zcMgBh-*ot2omi{1QZ2tz>&*YI;8dcQb|VE-`%~g!P=!A^pqk3tlq1E=mywS5GgAXY zHK_@~nCb-T867Qo3#ML$yVxFsRBhu>RYzPTNuY+06RqzT>e(QMFL5{-h=4 zucN;dm;3V#(@JAh6&P0{&`+XLPA7y9wd9QtMiWCf%I1xNHlKAI^Vx?4mpZ;_N5eOj zH4RMtZgPXAkz%M`8)KotG#~u=qwx3nFF9QClxCfli)~0zc zg#5T8CdE!!Q(0-66Tb@Uy_^(gg!VDnyLVSvd-#L1{eMhinQWTIqXU?j+0tBPE(uDRaL|KtJqsAxKAn189F_u1G5rMigxN6n&s zWuj3%8qZEK_;xg9xd3NPDO*`+N<~eB#ICIjFT?KA+VNFUSstg&a(+o2<$juxUQa=+ zXXC&=br~@?uDMlqMs{qkndk2V=j$8wu@`IC-Sf{ryESwUs;)p>-sssQ&7`?U3)p&G z%#A36gEvD$(#BK9%IL1da+pfzrfciM={u^Z4;b!*-IgJt8mkwdUD~s&vP_rimakL2 z_kP#UR7!}uQmiv>@B#qpE06K+4pBx-h z8Tf4d#ZxXnkVBPu@2)C`rI)T$zB)0N9hzb6ghG&D37s zv=}J(;Q9cRvoSaam$4e;oD5}0S5DM@jSO2)N3@Kk>ao^;5-hl3Nzu74%}$6~opzBN zSy)}BbU$q=d+l!J7YyTLbe<@&s2_S>D$(aL=sBTlfyA)hOZ(ZO0+rQJ%-4#+)6ZOIgD2Y$9ydtfCzG2Se^eImz)%=nSE(r#SMis-mHQ@d?Xd zuekM-S@?{|J6@dtky*bay<{VMgOUXAL;WR5&K>7p`J;-wwuQrh6%4KY>y=CgZCvW51 zLUPi&-5sDTY;3oC!CxO^22{S~HLGpcsnBtmr_hf{%F|V$wn1<=>g1bhFN#ER$*JzC z(yP81*Ty2Zo7g$8SB)~AGtB-uh-e@~o_6RvBNy3PJx6?(bokWUdSfYAS;P>|I&JnT z0%eTOCrO8^YyKEhhNnbg0IXxyOFZBJHDQ%{+j!aYo$ zow;GU(AAUo?iOC7Q4!DDo-lHv(e#&c=)dtM>aMzysEv-LGKD^VR%o;TqdDsFX)1Fz zH14!?1Am@5LJLxL`K_pHR(PSUVTWk8KASUM*jilbP$6u`I*6Wg!m{pTM#c!U=x5h# zyT4jeFHb@kSA{X>XF0RNsNWylaUCvhOm{eXOT4c(agSo7=u+=&+7>EpVvo4>PGum3 zxU7#9RfNCqRQfF-GcA7NmKS`Y)^c>r^VB45$#2YK&grs7mPp~3J;!T$y1_6!P0ui? zaft)%5-v2mgu%4z%8ojp*A)g zBs8vDzH%ypZB0z(2t|2^&rvkE7;`DFD^32WwwCozr9ZJdj68^!C{3HGHcKSA$#qt0 z-QB#zxaHls64x+&*0IlUlVLGA&>3XbEZuR%G{O`mGVT<)lxv2Q5@wrlPBa@DBv8fK zCf`G&0RI%vEU+*!{+bTE@c_+DY$-;?RK1-EFS8*zvxlB=QF`;$;~UsM=K+wFrTS5l zTQEyjI&L91m3~+(p^>_O)g)ANhtu%%FcUsx0#N#i^(#)TGUMec&f7sJ$ z;7wP(Yifk2kvk_O_4ET|)CPvJ%YvhldBLJGucqUQh1G>h{(3Wt6(g}qc0E&fbTuqY znecU~^tv%sgkD$2DI1kdDaOiQ8XxzjoqllCbytb6lDK!TF*OV?%uwY1)VsXkdLBRj zeznBQIG|Gq+ivieX2sK$)0~Vxu6Kj2*28A+*rNw|OJ zSLKSYwSUs99(Qpk)!i3x^ttg~?i;bmaPa9|y>7Xc>B}*kVecO861s4oy@oAi1ou!v zSs8retsm!=kvtO`600Re1)+99F@JYfJx_wVABl6N>p5-+phtX4HwqGmjDtE=wk zws&!_N~VX%jV2?hx5(8mEj|e^!fpa(^ZP^xWG8zS+u$C^1UphB#Uhv(s2r$Y$yDRnY&xUkD7O|q zQy)bRG``Y3v=mMqQ(%Lytp};kZzU4|bi3Bj1 z`z<{7TCw0mOXh7#BS-%G9>O$-ibnMf`GMh!LzlwW8GA% zmWplVZ??<#_HV7R)4oQTs4+05+?F+kMes>`68!llej|1szO0}4gYa8isH7NgxNc|E zfe*HMN85yP%J)`e_pi~u>kihGYALp+SB%U$Rj)%T-3D4eu z8vBISt(7a^+KC(>RP*hFTcGwd+p~y zqp?OXGn;0-tRtT00{`7QE!d8f?>N6uIa8HIlYbqerKg|2&r}&>?ro8Hk+{F62P}5z zU+me{3EFnVYg)kj2MMdGs11-8;+zZL%4MCJC0ZsU5uv)~Ioc_^dkPoSyst__RYke< zh*!V062phEvr~}f*Y=v`zn!?hG%1uz{2IDVa^VV%p0V7qc+tg$JC{+@MB%x2s!6K2 z)=}M#9vN+fH6nGcKrKi%SgQDJcqPEp&l|1J%})6wKT=`z&3#et9cOsb2ej}5^YsJM z!U6KX4*=;-{^LV+pXP!s!rzs*BF|&bMd9kpYe9{QU?OtPcvzY zl(i0iFA?cfwyKr~Lu43rnB`W=NKX9ne1Q6Dm%@uqEN+D>b8^2ZY)YY{DzbdBg8XIF zvN1^Ss*I~`?MiLY5*ANfDXBYZxN3%1Az5=BG_0NS(j?=}K z6Q9*oH&sj4!HWYo3`XsGa<^spN|ixM<%zmGWAr&-FyF7LWZcTZ#9!g#0C{sD)$f>e=eNcUSka^ZywL0lbd$a&Yl=jY1rer| zPOySx0hKqIC3m;Z`Gup3#T3`<@MZuE2>>+B?g;<@ literal 0 HcmV?d00001 diff --git a/src/documents/tests/samples/simple.txt b/src/documents/tests/samples/simple.txt new file mode 100644 index 000000000..6de7b8c69 --- /dev/null +++ b/src/documents/tests/samples/simple.txt @@ -0,0 +1 @@ +This is a test file. diff --git a/src/documents/tests/test_migration_archive_files.py b/src/documents/tests/test_migration_archive_files.py new file mode 100644 index 000000000..534a5b499 --- /dev/null +++ b/src/documents/tests/test_migration_archive_files.py @@ -0,0 +1,175 @@ +import hashlib +import os +import shutil +from pathlib import Path + +from django.conf import settings +from django.test import override_settings + +from documents.sanity_checker import SanityFailedError +from documents.tasks import sanity_check +from documents.tests.utils import DirectoriesMixin, TestMigrations + + +STORAGE_TYPE_GPG = "gpg" + + +def archive_name_from_filename_old(filename): + return os.path.splitext(filename)[0] + ".pdf" + + +def archive_path_old(self): + if self.filename: + fname = archive_name_from_filename_old(self.filename) + else: + fname = "{:07}.pdf".format(self.pk) + + return os.path.join( + settings.ARCHIVE_DIR, + fname + ) + + +def archive_name_from_filename_new(filename): + name, ext = os.path.splitext(filename) + if ext == ".pdf": + return filename + else: + return filename + ".pdf" + + +def archive_path_new(self): + if self.filename: + fname = archive_name_from_filename_new(self.filename) + else: + fname = "{:07}.pdf".format(self.pk) + + return os.path.join( + settings.ARCHIVE_DIR, + fname + ) + + +def source_path(doc): + if doc.filename: + fname = str(doc.filename) + else: + fname = "{:07}{}".format(doc.pk, doc.file_type) + if doc.storage_type == STORAGE_TYPE_GPG: + fname += ".gpg" # pragma: no cover + + return os.path.join( + settings.ORIGINALS_DIR, + fname + ) + + +def thumbnail_path(doc): + file_name = "{:07}.png".format(doc.pk) + if doc.storage_type == STORAGE_TYPE_GPG: + file_name += ".gpg" + + return os.path.join( + settings.THUMBNAIL_DIR, + file_name + ) + + +def make_test_document(document_class, title: str, filename: str, mime_type: str, original: str, archive: str = None, new: bool = False): + doc = document_class() + doc.filename = filename + doc.title = title + doc.mime_type = mime_type + doc.content = "the content, does not matter for this test" + + shutil.copy2(original, source_path(doc)) + with open(original, "rb") as f: + doc.checksum = hashlib.md5(f.read()).hexdigest() + + if archive: + if new: + shutil.copy2(archive, archive_path_new(doc)) + else: + shutil.copy2(archive, archive_path_old(doc)) + with open(archive, "rb") as f: + doc.archive_checksum = hashlib.md5(f.read()).hexdigest() + + doc.save() + + Path(thumbnail_path(doc)).touch() + + return doc + + +@override_settings(PAPERLESS_FILENAME_FORMAT="{title}") +class TestMigrateArchiveFiles(DirectoriesMixin, TestMigrations): + + migrate_from = '1011_auto_20210101_2340' + migrate_to = '1012_fix_archive_files' + + def setUpBeforeMigration(self, apps): + simple_jpg = os.path.join(os.path.dirname(__file__), "samples", "simple.jpg") + simple_pdf = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") + simple_pdf2 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf") + simple_txt = os.path.join(os.path.dirname(__file__), "samples", "simple.txt") + + Document = apps.get_model("documents", "Document") + + self.doc_unrelated = make_test_document(Document, "unrelated", "unrelated.txt", "application/pdf", simple_pdf2, simple_pdf2) + self.doc_no_archive = make_test_document(Document, "no_archive", "no_archive.txt", "text/plain", simple_txt) + self.clashA = make_test_document(Document, "clash", "clash.pdf", "application/pdf", simple_pdf, simple_pdf) + self.clashB = make_test_document(Document, "clash", "clash.jpg", "image/jpeg", simple_jpg, simple_pdf) + + self.assertEqual(archive_path_old(self.clashA), archive_path_old(self.clashB)) + self.assertRaises(SanityFailedError, sanity_check) + + def testArchiveFilesMigrated(self): + Document = self.apps.get_model('documents', 'Document') + + for doc in Document.objects.all(): + self.assertTrue(os.path.isfile(archive_path_new(self.clashB))) + with open(source_path(doc), "rb") as f: + original_checksum = hashlib.md5(f.read()).hexdigest() + self.assertEqual(original_checksum, doc.checksum) + + if doc.archive_checksum: + self.assertTrue(os.path.isfile(archive_path_new(doc))) + with open(archive_path_new(doc), "rb") as f: + archive_checksum = hashlib.md5(f.read()).hexdigest() + self.assertEqual(archive_checksum, doc.archive_checksum) + + # this will raise errors when any inconsistencies remain after migration + sanity_check() + + +class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations): + + migrate_from = '1012_fix_archive_files' + migrate_to = '1011_auto_20210101_2340' + + def setUpBeforeMigration(self, apps): + simple_jpg = os.path.join(os.path.dirname(__file__), "samples", "simple.jpg") + simple_pdf = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") + simple_pdf2 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf") + simple_txt = os.path.join(os.path.dirname(__file__), "samples", "simple.txt") + + Document = apps.get_model("documents", "Document") + + self.doc_unrelated = make_test_document(Document, "unrelated", "unrelated.txt", "application/pdf", simple_pdf2, simple_pdf2, new=True) + self.doc_no_archive = make_test_document(Document, "no_archive", "no_archive.txt", "text/plain", simple_txt, new=True) + self.clashB = make_test_document(Document, "clash", "clash.jpg", "image/jpeg", simple_jpg, simple_pdf, new=True) + + def testArchiveFilesReverted(self): + Document = self.apps.get_model('documents', 'Document') + + for doc in Document.objects.all(): + self.assertTrue(os.path.isfile(archive_path_old(self.clashB))) + with open(source_path(doc), "rb") as f: + original_checksum = hashlib.md5(f.read()).hexdigest() + self.assertEqual(original_checksum, doc.checksum) + + if doc.archive_checksum: + self.assertTrue(os.path.isfile(archive_path_old(doc))) + with open(archive_path_old(doc), "rb") as f: + archive_checksum = hashlib.md5(f.read()).hexdigest() + self.assertEqual(archive_checksum, doc.archive_checksum)