From aa5dec252f43a72857ec2abe18d577bf84a1a16d Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 23 Jan 2022 21:41:08 +0100 Subject: [PATCH] Fixes jbig2 writer to write valid jb2 files See: https://github.com/pdfminer/pdfminer.six/pull/653 Squashed commit of the following: commit 8748c9fcddab0826cca243eee45c40d2b6611e80 Author: Pieter Marsman Date: Sun Jan 23 21:40:50 2022 +0100 Remove prints in test commit bb977258a39fc7baa13bba1c3ea29726e17c0f6d Author: Pieter Marsman Date: Sun Jan 23 21:35:12 2022 +0100 Cleanup exception handling for jbig2 global streams commit cf0b47b01b7caad8acbd82097aadadb620606a8b Merge: a5831d1 708dd20 Author: Pieter Marsman Date: Sun Jan 23 21:29:15 2022 +0100 Merge branch 'develop' into jbig2_fix commit a5831d110a80d07f0297c3ce50d7f3e4e05153a5 Author: Forest Gregg Date: Sun Aug 1 22:59:17 2021 -0400 flake8 tests commit 18ffa2938759b707d38a56d2da9e3f5e3892bea6 Author: Forest Gregg Date: Sun Aug 1 22:52:11 2021 -0400 add description in changelog commit 6c7ee43d6cceaa6fc8747bbfda175227093e0e1e Author: Forest Gregg Date: Sun Aug 1 22:43:36 2021 -0400 Fixes jbig2 writer to write valid jb2 files - closes #652 --- CHANGELOG.md | 1 + pdfminer/image.py | 16 ++++++++++++++++ pdfminer/jbig2.py | 18 +++++++++++++----- samples/contrib/XIPLAYER0.jb2 | Bin 0 -> 9026 bytes tests/test_font_size.py | 3 --- tests/test_tools_pdf2txt.py | 19 ++++++++++++++++--- 6 files changed, 46 insertions(+), 11 deletions(-) create mode 100644 samples/contrib/XIPLAYER0.jb2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 35b2ee1..a0a17ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed - Hande decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) - Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645)) +- Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653)) ## [20211012] diff --git a/pdfminer/image.py b/pdfminer/image.py index 1a25006..cfed324 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -112,6 +112,13 @@ class ImageWriter: i.save(fp, 'JPEG2000') elif is_jbig2: input_stream = BytesIO() + global_streams = self.jbig2_global(image) + if len(global_streams) > 1: + msg = 'There should never be more than one JBIG2Globals ' \ + 'associated with a JBIG2 embedded image' + raise ValueError(msg) + if len(global_streams) == 1: + input_stream.write(global_streams[0].get_data().rstrip(b'\n')) input_stream.write(image.stream.get_data()) input_stream.seek(0) reader = JBIG2StreamReader(input_stream) @@ -157,6 +164,15 @@ class ImageWriter: break return is_jbig2 + @staticmethod + def jbig2_global(image): + global_streams = [] + filters = image.stream.get_filters() + for filter_name, params in filters: + if filter_name in LITERALS_JBIG2_DECODE: + global_streams.append(params['JBIG2Globals'].resolve()) + return global_streams + @staticmethod def _get_image_extension( image: LTImage, diff --git a/pdfminer/jbig2.py b/pdfminer/jbig2.py index 10ee7e6..269b028 100644 --- a/pdfminer/jbig2.py +++ b/pdfminer/jbig2.py @@ -27,12 +27,11 @@ DATA_LEN_UNKNOWN = 0xffffffff # segment types SEG_TYPE_IMMEDIATE_GEN_REGION = 38 SEG_TYPE_END_OF_PAGE = 49 -SEG_TYPE_END_OF_FILE = 50 +SEG_TYPE_END_OF_FILE = 51 # file literals FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A' FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001 -FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010 def bit_set(bit_pos: int, value: int) -> bool: @@ -243,8 +242,12 @@ class JBIG2StreamWriter: fix_last_page: bool = True ) -> int: header = FILE_HEADER_ID - header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN + header_flags = FILE_HEAD_FLAG_SEQUENTIAL header += pack(">B", header_flags) + # The embedded JBIG2 files in a PDF always + # only have one page + number_of_pages = pack(">L", 1) + header += number_of_pages self.stream.write(header) data_len = len(header) @@ -254,7 +257,11 @@ class JBIG2StreamWriter: for segment in segments: seg_num = cast(int, segment["number"]) - eof_segment = self.get_eof_segment(seg_num + 1) + if fix_last_page: + seg_num_offset = 2 + else: + seg_num_offset = 1 + eof_segment = self.get_eof_segment(seg_num + seg_num_offset) data = self.encode_segment(eof_segment) self.stream.write(data) @@ -305,7 +312,8 @@ class JBIG2StreamWriter: if ref_count <= 4: flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count) for ref_index, ref_retain in enumerate(retain_segments): - flags_byte |= 1 << ref_index + if ref_retain: + flags_byte |= 1 << ref_index flags.append(flags_byte) else: bytes_count = math.ceil((ref_count + 1) / 8) diff --git a/samples/contrib/XIPLAYER0.jb2 b/samples/contrib/XIPLAYER0.jb2 new file mode 100644 index 0000000000000000000000000000000000000000..e1d1aece899d5ad7cefdf7e40b3c70bee42e3c4a GIT binary patch literal 9026 zcmXY1V~{3F5**vMZQHhOYscELZCg7tJGO1}8{4+6d-v`-BD=b~BC;ZXRYYZ{%ZRYR z!{ERG0|5d37ysWUpnyPr{<(1e7fEphU|Hy3Wzg!d9#neHo_0mI-7qXYNxj`7C!ULL zg(WkHUL<2^CKBs}W0D$Ve{fU01Ocr&8L!{qKK>GGtzyrMGej7yrE=LA1!ofY>1sw*+JM77icISRPiR?mL5sDFIAXhS|u+7$oND4Lmdg zthl_bo!v3Q*iVcIEm-v-*aV%qed#FJt1sj|V(u#!3X@omeb1Bl8Q-yxEC_EcesT*K z)}N~XN@Ds~8!8YG+%6Ch^a~IW1L?nreEug30tEbza-9DEOLqR}$=Mf&Yif1ibw7iQ zSK}JWbc2}^>Jt1ll?{__#uZyN`q=cDIEE*eHCXBVmF#jy^N$++t-Cv*ogOm6R3=mC zy1!=KKde*-n2;|a>-XL}c7o3GhQjFP2vi^pUphqofpO2;RCzs~`JWY$7U|bbuh1NI zNsf)S!`1ZBEMJ2FS*>5YBfx|&HFy(CHUs%tDlG}>%!e_T=@h<3&445{5)m4TP0s}U zwJ4Di77F^{Kz1plIB(?U7ZpY&LEl=G_P&=^! zg)BUlHE_Xp-F=;XtmR7JKEi|?L%pV))`=LF${BY zgdMF2+qqTMRNWsXSs=RM^@SZCi#QEZK3|*KC7mnTm=>;5Ztu@6awQ>|Y$!f|@nLp~ zV5IIiL|cgK%X!rj^gyFNCvj(CM}QKt=oo;%S&#}F|A=~gFqsd|mKzr$9)8PxH|UM*gSg<2SG3H-pgL<~G4LNL zERFt%0_M=2B$tLvR>3b3rh75D71#>(n?tU!ec`-L=iMUJos7tglf?xbP`Ff-Dd_4( zKs#MciVMPB+h_r?r0G%?5{S~T(rAMsKVn4ab*S|)f!s|Ij0!7qpBnyFEg4xO_qRYW zpL!ueFB{H1BW55GqEMh)3C%Ac!Az9^r z0;Je3wL$XWJ`%TSaKN)c8KPY0Aa?=9lJKCPqQ@lvg9bw?$PBCmW zA6>*|d@pW&36JHs6w(Co3{H%2#k*7_Ph)x{)CGUU zb^%{LhMc~J2Zz3t)?X3}x7`sfIMb2IqT=H+Lbiw)Lx3evc)Z`+4Gyrrp}a5*3gOR8 zq-XG8WM9OQK$$0W=SxOEo`fjze@&B`Uyj#@Sq*Sqc>sNj0c$PU`qamO@Ev62`PrAf~PsuyXf_; zs>F>^pv|TMzR9%EkK)bP+M~iCXxcgSGzsB~LQW2;nR*a@Gio@P;#cuq&UMD0(5MLZ z&=t16s_R>f9QiTsRuD*uz0VN(^t#Bo4}7J}!gk48Y$6%&8SixZv0oZL);K!j_s}Op z3pYV^uko_ue*W;f3b-R9J@##y-DBM5u1{y3i~QckW{3(b7|F?0nW(ksmVlI4GYxlPSw1Y; zRg4Ogn*tq4yfetG${NQ^nlIvcDHQtMXcoiTRzUWIFA+)nn}X1u5oT1*IF2>+U9Xaq z34X>B*-UgNTk8$9sDMw4b-(6^i6)rk5WQ()Uo;_G1`@$Y1oLH6Hw%^|O+D0K|AUX+ z_TCfibH8;qW1<(uNOVb&9&aXGY23eD0wXi}`c|F~p~uKXP5y?7lkP%798{XRDw$I{J2i`l7v;$$7_}@>FLDCGS^}^nr@9cM^^_)w$NU!rPrF3BzwW2*P55$b6 zChjjQa0eDCl990%(}IP`ouPePH)x6GR_Np>m8+<7bW;8wOYmP|SQmy9JPWjN!YC}~ zUCPb$$?1%IwmAUPW?sAoSjn63>BngHhlrJSa=KrB4n1y0BOF5m?;te9Z4d&2eYmG{ zitHRJn%FT~VsQDusPeJU1IECJgRLmb%)wP6Z@RPg+sX4+s~%q>Fq%R*ETLAv4>s3JL-OC_JyNpt9qY@|u^0 z2lsz2Mdp|d`in14cq|~R(N!5$Pc8-JF7XYpr$P%)J}Z2Hm#j9;UhjiB(|KHCombb^ zrxIPnE}MKzyy@4}UxMGa*3hQ}5z{`>^w+pvFuW}+>dI9~Kf^l#b7RW1TsQd_R8z^N zRZFi$`8uOPH*o>Gyf!?cM-!PQZS6=oc4|^NHcEv|{^~zW1e$EoJPQc6e=Y3@IH8Gx zRZUWbwa)SCw~wqJ;tppoYq4N?Ob`o6;o zX}hXD&xbf2xZ_$gZ?oJ|mdE$4+&zf(03nS1L=xR%z=3I2QaSH^pu%w`Z zPBau+4?WeR4aySk(!pZ1U;cz~I+M8j**o<4>28`*PTxD0&}+U~afLR+!GfBUfy8|@ zK9CxLxTaX!l*Rt`)5ovQIcidlL!0od&udLkBW;efShHRV{6$6NRRacv-)LDr&dGte z7w}t4i+$#Huix7f!EOFUVR32uU|Sy7T}&v-&2Oui=My~6<-p!{xyWuVQ#AP4kI}?& zS|=H`aJug0#QG)aL`GQ_h zpiy-RYwV`50IPHDJCr?I+!EALiBS{=)jt^)Ea4b=)aU7U(rhY~;pd(C{7|5bIeAKZ zWSGVfmbJvo1bp`~N^sG^xhrX}q+I9hCS(of*4IgSfd`F4X?&%cMgIZwDg*wGMm+N` zy*RLv{5%cQ2$Zh9>d3ci!#@w{n%zDECLWH|M(wt0yxvFDgi0iG>1|P||Hz}qq=?9I zU)2{aQP$})mt*3$T+oEL5S24`I_en9`cgs`Ae#w4i?vC*Bd}Q&0NO?}s{~eWe4VpP zA{fYQ+3{+D4lwhLOKKC@?kbDKdL6@{1ljeP1lD8Htt#k(TdgcMrO; z6{qaZM#s1h9q^~(qkl?ht}#+CcFxjiOr=h(frK7dCfkg*poY&svX37_ICrN(jF|sKs_-c0W>mEpvUweM9 z_u)HKA|~Oy*(A=1(5;DL;B+_d^uOrD4jlcrqd)L>v&;OoUrO(E{m!_D2>h6UJ7c zkVl@Hf4+8{f+N}ZHXpd}n=BD+CctYg`gh)5Ws2ZCw;SQbs<&@|Vr7q7GEmZje`oJT zUVM)IpsF36jXHHry`Nx(0iDrUOc&eOZv6G-UAOxV-ZZT@N6lW-2FWYlg+qz{7kGs) z+Z>OoT=yGJ`*i>}kRVn~cAO#_aLg}xYNnbL2C^9X!})aEBm0_5soj_TH9tFMKw=!@ za8>m3b2v*qO0g`dTT|M&WdyJC1~YdNo8}+78S?_I3-D1j0)|xdF?3$3l`O9#m(h}l z=@>xP7j-TqY%W$Lzqs}?ySyzu9;TZ-FYy{f zhJEqn5#49L?-*<{D8sz+4VrjqaQd{`Hy@5CwWm8G(u(h3I^IW!Df`oS44)G12M##+{@Va*?OA8tHz6Rd%DLTe zh&4BFCt*ftb@S8udVTbXm@E>-*VL6xv&rf0*$eq5mV-8TgUm+ZNDEQ{`KlBe7}1dh z{zOmygD=S{bVqMMT*UL42yrGJvLk1k}uEaLYa7yRlXU2L+^X%YQr?!J(7?!~@q}R6Oudh81 ziP3tWTL26w8)=a5`f>1=qbl}^1;4q?+UKX?HH7NG?2YJ(A3GE!iD!=(dgEqifX&Hf}86Dp2 z#^Mo;<)FVD-)v0}sliIG^kO46FcOnw-MmN}#4ehu45(c}7feGmRSMnxUTnK4ewqgt z8+Wvp>9vN?Mz6YD(0FlAWk>PdHUrn<6jhUjWfTa@ubE1ly58~0V!eI; zj@Ml-{SJc9My{JiwYs}^{CuvO^c!T1D?-fg((itTIJjZRpbd6Den;E?GGo>akX|_n z^k|XK*Q*{SJ|ZBuWglW=KCLe`Nnz=IC(F9%vK6bnHeY_?Jms;13k+w~*T5}%&0N>_ zq^E28vA&h_80`%+nl{!GF15kw?S~Io7O-#OeeA+ZK9GS+5}#MDy3 z4zjv~LZi;Rd2~wLHWt>DQ>tYnb611lJ_E zdmZ>8=%=I{p2u)EbMmhw=PIgIXn0E0Hc?0YvZ`qt+Lt!}Rky;Sc3Jt(HCRllgQ|dm z#R{A(b223&5m*_ds`&dfu34r=6duF@E9BT(@Gly5dXUry-!rl66ym#XQ(@S=C&j>`Ey01)Y7EQ=>`TVy`Tpb7D1R)b=`-XW^8U$eQ_3a3b}KK5Ui&X$CYLHXP+& zEKuFa`NY6Hs%ET`$^gt|)4Ri_hk;4+uFwO2H7NXQYD@IW4z*2?)kSmQczxXpHD~?rtp3_=89Ee>U^ie;UN>2|?H!5PckLm)#EXxtT@T4=IF+e|@Qk>k5aIGQCGTe44dh;60&K#&G)QM@RaHJD8hDd{Xd-&GG(42Icw zVl22k+KNM>wjt39!9=V|Yn%MC4t7XoGo3aggY5yoRLQy9N3B&wIAr0eA6<(qIi(9M2oO4`+?!Wkud$a{CDgIZ&XewfJF?k#5}o``~IuEPW6V{VDyt{O6SZeLTd-i>qObU`yXdk&dMf23uiJspdT zXfX%Bh&#a-*y)kFUT#D>P<~0n(b?5ASJb3Q%kzo>sO>z!#ppC5ZT}rM?!vEYE;(>H zmo$^XA5aJ=z#DW7D-DvSb#yBeF&B4L{LvWoIg)UDD<~oA%WedGI=y#tB4*~I(w*u< zgHTLJqVMb1;_7?*cxv@6+CYctl!-hT>I{GTu{T447qp9>gs)_H?Ia0#oTtLCxU8|P z=HlcjcJ%-^0)UwY*ld>wckLOClOVF1&M&-+4A<%tbJLL<@?>mMLYF&p*G}KW1k_B0 z*K{$la-|Ne(k#&Y(Cc7QOiQ}neb9rbu=(n9C5>XJ+u(Sc(<9r1>T}+K6h?LT>6MRcTuk20bKKm9zaJ+k!cu<4!5%o><5*_$%JnfM< zStB}R$MmtJSXp5Ltb2}dwllVlV_iQUuD6^11doQ zty@0TZh6*zW7BpoT7}#OvN+s%ya0MO&S2;b61&DIj*Wnal;nRdULYP(y2`Qlbb8xq zRsJXWI@mpU>uv&3YP|XNU#ANcLIMcnzdoMC|GX{`t$)t{UvU3K!pKoL_s975#Z z3J`bys{on*Kl*=?P*6ZXp!<>c*oE$H6iXA`sF#tVoDYnLGX?b<(=Gg{sU-yba5i2p zgH)O=Y|&~#fH#+wrIO<`M-2s>Mwg}c_E5jQLU<=!oo1FG{f?C0kSDg6vtDp8q^&|{;a%Y_k@1S1u7dYF3w|Ip zDDZe2xYp~2dC1q!%zUFI*K!<4YSv!{8&e+6Gr4rhiRo$i_`&$?wXiHi<-H3fFK`W! zJi9mGPyz8w%ZR(0qPv0Z1c9EfElLA(-YJ@&!s0jeWSBFbcsgYMh+;l>^D1GGS~Oy& z%MvxHo{ZROJnn;PwM4@odVq3k7uyf3Ik~{(qx;a-MvkG;* zaE;PHG7Q7F!*A1UR{N$$-r{PpSamDsqkJm~Jh$)zZ_SjWY~sh9n%a0QgcTnV33bO? z;4z%^CI%UAv1xbS*PlzQU{xcWVK@5RE{lLLhrYTyBN1#d_WEGS75 zW~4?|Ak`H>ue$61*h1y*1NE8KfORkEf2Mh*CUGv}*HH{ueRM=WVD;%Q$<4mh3K9b7 zr)d+$dHVsT{IAM_P@I}TYY||e0CKM8yrIi4`}MX%)_FP2$t&|lNxVe`q5iynI%Qs< zuv^!+)T$MqZM1m)V1oiAg1SG1Om9|hJ#jHUr&l4$+g^qRPc302jR5 zh6i*1Z6v4-BiMTsB$cI$6Sb=!($g4Ht>b9qY6Q85?GH7k4QmTQY4mpGY0fvCI$QG8 z6o{^)v-0=wIiC*(Ql=PMk3jnUT$5St&2z1s795)V=8?cO6liEg<;!ST?UQ zU@v&7&AcQt=8~ETeoF9v6#(VoxXr)qgyPkI1c11dVva?iiCCY~)`S^jZx`d$aKJM~l10#>lRg&R9#QY0g@1 zw#MW=E4#K7ZdvWYqBY1FP{7g z86=4`hMRi9njS`*Gm|Y9_tyMwNvk1}Ud*LPmLuidGc|un517UPM*JyNo37#N`4|@p zN1oTr(6xvrV0#?dSLHP2RT9zS?^r24?S;Vd#$Wq%DiPu?uE15vSIAusi3WkWy~T}- z8Uq@O4^Rz__ROH2ne+L=b=%+e92s@LtTL~S4gdV0k%JSw7t^9|ub4JariM1n1uDs z5%rzAKb>n;p?R(JDdL~WHCP*3Hiio0`)62iW6U%~jwS#Wv^{4T0Q<&zdys{hSZ9dJ zt)S2g!)#~Co2h>j+j37;Fh46JgnU`+`GasEU&MVEHoJkQ&GtU-RO>46%U)Jky71TH z%TMGi<{)CC+X8s2T&#E1KrfTtDHALUq;PdQBjI0X4jOs6*bhA(pQhMp6@!+$%G;04 zW_5L6PUwRe`{Sz$&G7$ZEk+$HaCepQ!Mr_oe&0zvz@zBBK1NBxa`yP|bEmCvQz7go z`-k@b&YQ0B4QwnB{BZUbu%-d-8GFHe=Erc3@1}%ByDj;d(;j!v>F2Q6{E? z&3++~-}G)clUi41sH{C66AjrPO;Pn&Cpb1I(PK<~*CCOf*mHId@`D3B%6%2a8_hs< zIwYE^9QAHd&q*1v7>RqNRT4jl^Ig7L^x5e}Lyn7589daXA~I*1bQEG7!l5k{+jptH WyBPl43?Z2R|4Rl42$J=Gjqg9N9X=8O literal 0 HcmV?d00001 diff --git a/tests/test_font_size.py b/tests/test_font_size.py index e6b0ec4..1c388a6 100644 --- a/tests/test_font_size.py +++ b/tests/test_font_size.py @@ -16,7 +16,4 @@ def test_font_size(): for char in line: if isinstance(char, LTChar): actual_size = int(round(char.size)) - print(char, actual_size, expected_size) assert expected_size == actual_size - else: - print(repr(line.get_text())) diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 9fe2e36..f73bc0e 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -1,6 +1,7 @@ import os from shutil import rmtree from tempfile import mkdtemp +import filecmp import tools.pdf2txt as pdf2txt from helpers import absolute_sample_path @@ -144,9 +145,21 @@ class TestDumpImages: Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46 """ - image_files = self.extract_images( - absolute_sample_path('../samples/contrib/pdf-with-jbig2.pdf')) - assert image_files[0].endswith('.jb2') + input_file = absolute_sample_path( + '../samples/contrib/pdf-with-jbig2.pdf') + output_dir = mkdtemp() + with TemporaryFilePath() as output_file_name: + commands = ['-o', output_file_name, '--output-dir', + output_dir, input_file] + pdf2txt.main(commands) + image_files = os.listdir(output_dir) + try: + assert image_files[0].endswith('.jb2') + assert filecmp.cmp(output_dir + '/' + image_files[0], + absolute_sample_path( + '../samples/contrib/XIPLAYER0.jb2')) + finally: + rmtree(output_dir) def test_contrib_matplotlib(self): """Test a pdf with Type3 font"""