From 82af7f0aaca9091ab4891439e0638626e9b76af2 Mon Sep 17 00:00:00 2001 From: Philippe Guglielmetti Date: Wed, 19 Apr 2017 14:19:14 +0200 Subject: [PATCH] issue #56 reproduced, solution attempt unsucessful --- pdfminer/pdfdocument.py | 18 ++++++++++++++++-- samples/contrib/2b.pdf | Bin 0 -> 19959 bytes tests/test_tools_pdf2txt.py | 3 +++ 3 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 samples/contrib/2b.pdf diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 3b2b551..90da1a8 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -644,10 +644,24 @@ class PDFDocument(object): def _getobj_parse(self, pos, objid): self._parser.seek(pos) (_, objid1) = self._parser.nexttoken() # objid - if objid1 != objid: - raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid)) (_, genno) = self._parser.nexttoken() # genno (_, kwd) = self._parser.nexttoken() + # #### hack around malformed pdf files + # copied from https://github.com/jaepil/pdfminer3k/blob/master/pdfminer/pdfparser.py#L399 + #to solve https://github.com/pdfminer/pdfminer.six/issues/56 + #assert objid1 == objid, (objid, objid1) + if objid1 != objid: + x = [] + while kwd is not self.KEYWORD_OBJ: + (_,kwd) = self._parser.nexttoken() + x.append(kwd) + if x: + objid1 = x[-2] + genno = x[-1] + # #### end hack around malformed pdf files + if objid1 != objid: + raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid)) + if kwd != KWD(b'obj'): raise PDFSyntaxError('Invalid object spec: offset=%r' % pos) (_, obj) = self._parser.nextobject() diff --git a/samples/contrib/2b.pdf b/samples/contrib/2b.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6a8067608dd41d044193f046b3d71b339cb6d4b3 GIT binary patch literal 19959 zcmeHvdpy(s7ylTWX4({*l+7hc*n1aaF557YQVT_;8Z%NZg^E&hry{y2-RY{!r|7Et zkkY+RsdPo8KJxiQSBkxV@4c5Yz4P1e_x=3&>GA0C*!I4i^Lm|gp65Kz@g61Md)m-! zSvsR$Hr=_egQGFn6o3*FGEc|Znd%u48842ddPWAvi}_+vOsE+8DK;*i!lX_VM~BDH zr9h`sCsAlr|HK7i>g14le(_NpPF#GfI5aJ}xK`_rT42FHds6woTiiVl+t+X@(`CbxSJ ze|NAfCE$t1(UJp(t;1(8ye}dgEnk*U?UI_aYsX{Xho_gmNi;eC*qYt8vFG65(d%!V zdiw6lXYqx(SqpLVTz8CKeQ|5c2iafO8)j`xeLsT#&y3G`ha%=a{xBQmyldh15mRaC zy&JaqZ*`nb>e_3;zI18xj?;@`EBq{un>RfQ^tqK#-74DrrVZ>IX4gY%lrEjV!egbS zUFF};g2KGh?4RWth`x}hm7F^d8~)A@T)`>QF;Bl0v^{C}GtZs1*T_3sHa>jGY*x+YfER9|FC(pwUoN9x)CN#1$Ub)v zEtsx?1^+N$8q;ndhYHPg#iap6kE3+^LJfUAb8VZ{)oxzLhmNTje9vR#c;C{~?@r8G zYsf*Dg#-FE~5h@hUr1JwtYKebV%6g{M(y z)OjPi1CqW!wf!Cp8BCfT)o)QqygV2N`b<)6gog-Q2{oU2G{^}M<&gY0Z_jGGv(l*PtLKSVUE6h&+m_ezPXsrjgMd8NTn`W zvcz@?!!{;1oJwP}*;If|rPFPoGi>4#qvL}UY@*}L6gTO;4__Q7ij7zh9}yExk>58s zBxX^(i>aynK|lT}^a>UIc+P@Fv61joLPb<@q&P|(9Un)f+0v*;*FmfE=V~OBR@e?mp{#E`2`Km@nP?ZljQ{_J^ zpASFr*PF{986OcJDeiX%`3+nnQ~RCSXX$<%iHY@(iHYR6 zdI~7e_Otq2^ZiJ_i+o~2Bf=7UA8c;}Fl}fK{&cz{jSW592C)C};67*ezsGlPSLrT& z&+T&`$SNEKu`!{GMB-Rjdbi+s(Oe4jlSmvF7ZXdFCXS7hJCa;0Tlr!rUD4-Gu%SBg zV?>MOt}~v`S+po3)X`qdW{ClNrcIbg#Ij+sgT*$%qA-ySGn5t<#t<<>*bWSMv3z{` zUi-^^$4AG-2SZ$SqB`rmR5ygvPo z{(lS(f6LWxK?eT%-*OGSKK+gkv|PBKH55eWxR@@1DkOv&&EaQ_ z)?0>Az=>u z@?jZs-HTkycG%+ZpYiwwLG@B8s`3&60Fp9=SF#8sVPTgPiy?+_+tX4m6szZS8Ag&i zK~PQ91%jmJda?C{-!f;Jt>bKH>~L8yN$=`@68v19yP7X*&4S2c!xf7Jj1 zz(niv_7xYebz5b!DPvGRK@am@>U-@>(4-wfUmJ0rnYgdUI8OuI*GX!(alAGAkGhSY z5@EIJ;%c`OCIY|1`&U>?9NfrO+NahB$Bj@Z;A&Sjn$Tw>OgHn&WiN0q3 zI+mRn`&ZUj(q_vy%PdLLt=>3kI*cV*y;=6vYJ*d;HlROb5%63N;GZNc>5>Z3p0Q+7scE)#zJ`?yb*~0&06|Rt6s}@(7DqeZ zKvIXLk~#ph@8%r;uFVjGAKJ8d@@xfqHRu~ZH)jUCIcnn(J!xOyl9%htu6+2(xaW0ijLu|5tAdGn5oxKSQ^ z4+MR+#r>JdGYfHwG!w(=eg3lwcO_1LKWtZ@r7(Bg|`vo{whin0doAADDj zt!&tklgiz6h=E0~n}!+uV%S^$?{1nT6c!nL1uv%p)YO|ccy6p1tL}RoQdcFZsxcBOw^y!yhnA>eXz2fAJe=MgO%re8gQKz{cdVt5= znhs~5GzVnE7V@+efw4H`{sLu0D$gy_lp2H=6`ny zZoxFb|L&KMQ!U3>`MkQfWJ)$mqjlw{_$k7Y7@<%GrldfQjFF)wq-MjmWWkP#%#@wQ zsVNuKGPwXirlbeYb7gv68hMwQpGb*wZAoiWd%VA)^_nIsHIOAj(n>WvrYl9J)JHefeY0gzxStg+k%K=1`&zSrT&Q+j9SWPBzfChIsZiS9zk4BPTH3ViV zz;OVjGlJvvfBS~&3Ly9Ve9+?o{q9P`LuoUg`EXQ&YZ5Qu6U56QPz1T-9vA9s2YRN zJxDqBClz8I`ihqxXMLYU5RPp7iaOBF*p8&1O5p9Fv~7^t zCG#fyKtqbQxyH+sKI!_<0n>GwJY5HalOd0p;RR!L`w4Orl4vSnH4Sj+3xy1(!Y9;q zF+5Cu0ThRW)tcQCcxvd3pY?>IL+xkmLkO#g_Wt-VTEPGkHUvPx0G6mfArv^mil%_(e3 z(0!57WH|BdU%RB8St&i-v~Ccy>@&vbN+u3N3F(w3kDLksDE(R> zRS$i#9Sld!3s8uEV3ZU|e3c;HLAf$z$nzDt+7Js24$7;b(kC;FXIzLjnEXThDX{bk zEJXP#=d?C*Cnocf)zo;;6#485;&3-4*;GPnwz6z?a4zM0*Y21ApiKej0xw!Kou!T& zZdn*84gHaJTkCyv;ibY=^Djxak6gg_v9T-`OJ848lVEIFzBE=1bsIF`BT1erI=n`&!^+?X*}u!Y%Ouc6>VXa>2Z;Syn0XOVgi z2jECoQI6T&B(!z9&ghhIgpy7k94wd4b3)3B>{Y{GhMkfL%VQ<1fx?0z1^Sur2`C5p zy$R2Q6Qb@asVvV`CP^D9NrRjZM+r%4SId!sP{l(gktijpDaEO-6`3X#JGxT3L2MgH z0AwPCKYHa*qmi~lHPB`DQwFjcNEz+k?a)T|-ws0+mhjzv@ldi9Da>cfF!X>AnTyy(+n7;n7(opfLItiKRFPT zp;o1Mm<;LVPbrHa^5u$xJpBa*T;ODrn_L9jhtI2F`bd(ggr96Bw-isH;gSL3+oYp4 zgd+i~93J6bP|Dfa>mAQ#%bt!dd`@&5vTlTK@XB>FNVlRxbj)U2&_kT|mcPltkCblT zcp%Mn`Q_yLH~wbjVAhJ$fQE~{?YY+Tkpxo>Hfc&L2TgpzVFImtB@72~mGacX z6T`cxmW|yw{zvk`5)1@9ib}k*VnxFTZWo~*d<>#|xhvYq5aaqn=sso|oGk_BI#nSF ztr}7S2+@EEdyYcp7RnB_?7r#Ji2)_v0%v>eS&bhgneRX&S%-ahFt3KL3@xq=k&{hFT;bqprigw$$DqAa|npIS?(tN!|@1o)w8)TgEK~L7wa# z`g0zEM1q{zoR&N?t(%PLBH-U+6$R76hs;q(`l*J9G^O+qWofLAa523ognj!e$-FN@kr{y6;OD~RT&=?U>U$(jJxSUR#3_3p^u zu^GDrvKdEwJJsjcj7BO3*s5VCLTHzp4&38YWY}T9v_fVKRCUVPEzQiOm@6Q@4K(Z| z&}$e#9Sx`h0Dq}K4PAAWi?0V6-C%ut7AVDrQI@u+=%(OEWPgu5=s?)e604=!=20d?vEUn}Rn)?aY6(m1FMX^w=*qrgSU7eLBT0?{Ng>m~oS??GB zb3)b}k7B;EE$60gHAVt1wraqv+@V!CzFLU9k%)#VBdC5R?i*$o0wY8P4wNk~B1w zB#h`_0I0Pi(I3bdDABkHlAwUVJ9ZOG5=?i&}!wyxKyE9k#I~Zl4>f! zFpb$SgZOs}Rb|zC#3i<%7g#?}l2)EQQ-Mj>$+NH1Gmgn0 z<9wU4zI~f(OGO-pO=)owT$K{4bQa^=bXW@x!wxf~(pHA3l1jKv>*q1>Ov4m7X#&*b zNs~6Zs4yk9xY*F&g1za+we=IYK@~MT*LwGg8eiA?G1BQ30@wQc0mXYtm<1E+Jxrbu zOUBCg09%!XyMpEUS9MD>yHLbyW%kLsXf9W)?2ZUk%*Nl8@UlCwXd*31m`u5X#q%yF zC97rE8Ih8+FQugHOmT%-sH8T__lUZwm=GvM=DwEoy`KF4;&y~GsT9m|m66l7JE!Sz z-8Gk?HI@rGJiJx%0c`KI&)tb|MoV|K(?L`~#dJW)=Ur|(4Xy<6D8cT*Yma@@`lREV zm0{yMW{6>a9f>#;I7TVFzkA^ZV-r!9)=nciFydqyk9RBhP8L&C==_6BW_Ig)#uOl)bN znU1;m{HRI$!YrAr+_ApVa&i2sku|AFYeg0j1t+>`^TaDCJL@-YJF)G=T`t2;v}0mo zcxhH0{|n}l&r~f$l~l_E0Hw`hKqG!VaxUT0r_UT;N(P&hM?#7&Q`CD4rm>mGFAU$oYoN+dD@EF~_Osn;Ds@JDcV+vm8 z#abLVw*L9Em8}GC^T%t<$L`(X)!G(uT5p}{dM`^Ix1HI`0WOQ~Yl5hvN(q57<0an{ zXiSrG4H6 zK-FNKjsPi8daTR4N6hxapTX(sWq|)|F-$U#uiTgi@;;RPU3?<6W=DNvXxL`T!^RP|- zg-+O#2WQ87j}QD)Ba33m3(

a;eRl)T%Xy-U3lxve7L!_KK$=GJCCnn*7bM=&zd3jfR5HR%ekz+x9_{`$X@Z_szKA|yX@6Vmckyqb2 z*1svgx4>g_G(hP3UI%O?cSB>lIV-xu!H&GHJ6z95Ez8^8O=OTz>Rq(7=m;8=F%%s> zdyF^qC;7j$=4mw55WRmH3@PzHQ%#+Bq%u$qibU^0avdlUNZMUp+$$xRObJQLQftFO zF3Q(}@KC+BrhTUZxg!Vw5)VwHsfOIjTn^;7uo5Bg#0K1`{H2pI@J@KbBfEgCf}g_$ zAqOx6!8{kz19G?DozBHflJfQuPxk9!UD)g+QFVGSw|l1-x`1Ow<7L_iV&{E zN-0nQYPnD7WhJzKVI}B&8tD^F-e<+wlv z`G{&iM?ENMT5j(po0>k!?GGWQ;+E^S+CM$wqhrF>ywr7EY)Q4aU73e{mp1xgrCY1{ z;z4WmkL=poEE@6r=-FqB-kxCZnk{V;*aLqDqf-_xb0#0%${IOo$mH8c&;?&7bB-4V zuCeAGc7C*@M>>106+acX_6yN!=acd=S5Dj+9mZgUy!|-r>oT_^i0iHrUOOmF%>Fy; z{=F_bS7fV>D40qpOH*dI8O-nWpzc1;-u8$E#T88?Sg+r9<5lyuz2qap$b&`iiiM$j zir(op92S-oy;H8^%%B<~#r0zGn#MUj6`&3%6tdbDFcI@oA$_=}x zm_Wb7=Nv?oRyDlsztZv~QyxxwCwM4>4349;NB=>Cry8pKZeh<;pOuFhUwW0``L&uF`=sjul*=w`|kus3uGhk z^iCM2|2i|L4!|KR;zqOVLC?1@WDP@EV+Jhapht)#3yrQCcq-3Ck6cB=snhiMU`ScM zML?mE`BtlG+HZ0Dpy5tFvG$(li5G`nS+d={wHJ1uYu}fUH+@;cbGCz(ytyK(RMMD57ylWp~a^?fZ`c2pl@ zwVrA@wc?5Q#*D+*Z~BqzyU-OTPPRz)LRSq)0VPgc(Tr9LVL$^v3i^)m%Y?}mn0g(M zzN;bgq#!6Hnq*#HY7S18kwNZfNsa`b;eL;&@(|@!2`-hJO$yxabw=1oc=u2>Sl>Q$ zkQqb-5rDs-AcAU6Q2H~SMT$IF=IZ?Qj;sknz)!+ZunP+|l3_!s8t;tM6r<5q!a$m` zp~!I^t}H6~aABdoQZ12U)xWTSm=cIEQS(Yvk~3F6g48L`)a%=KR>&H-n65Mlsgn!} z8$ekaG^CBiw17~15`^XiJL^$bDIBj6<^*4?FX0~c9)XSx-2sO|r^ybJkrtkzQW#|R zoka3lxo1j6bS*%%o3|mk8v9Wiyk^k6ZDk^-Yf+CEk1`l^fD$lbntpn;fqvcLE&F`) z#(vuA@a;Nz;mo$SQ+rEZx|#Z1Y5EfO;(U-m16(|3ZuQ4}tIQ$%spY%ZX&y?LTJd<@ zn*Gdgs38%bhimMA{L$v)^QrE~J336tKj_|cXc)8I_|Lh=A0J*`s%x^0<6M?9YI0QP zO6Sv!99N@GA|^DnDnHPOZ$IDYpXY%_9H*2i<#QahHZ~YiBfY$YyHCb^TF+nnX5xy- z6}fLhaqr&LJ)^GO+-;UB5E|%h&Uat#6?+S(cCh(yETM4J_Ufhwtuq;ueKSap{&r^D z#TbX{r@nHUmh|GY&!eEvE4#ir1Zq!vmil^8eL|6CoTcu~ZCCc4F)p7v<(PMcU|aRk zjfW1Ms`qBcoUXri?VI^4j~!g%*E6!&r^9cJtX=)^U}y=Y;HmD8Hx?7|7s@vk{&UPX zcHip1CwuHGKh&eUJuB#y^vqD}XOV)tcJUqS26L7z>$=oduw`ZU!`aP-n>O=~toQdi zo%vVm=h#2v)53OWU+cV`TuAn+#Rg5td~@7=MC< zWCP4T&Zl^zmmoMcDh_(rmwQa)qNwON=+$0=SaE2C2zv7fg$B`PNG|~!&!6ecXY*++ zHxEw+ok4SBGd*2tEIOUx!17=*`3@`xKF68wz_g<=+?aqT^oke{rl$jw$!D-!Jv{+8 zCJS)2=ggvxj}D8W$WtEQ-WQ6|Sq!RgOiVmQZmhmkzlfz`3XREByh~zJrhUuEjN!OTY zx{5~jG-;^xuqD>)nbX9aes-}zUcgkB=MTs>E%#d~W{xkMSDq@)*vPOpxKQB7Xa?wk z*z~TRg&?ZcGS>!QOUl{yd`f#6z0>^sq6fp;2S3r{lu`v6UxYrv*(Z4`OH&tBFbdbr zFK@T$3c%Irl$$0=lTLfC57IYEp73bKom(MSJ@o3bju~xBaXb?cIml5sxMpQk(kqrv zQ^?GwloAffVt&oZt+lMK!fjhBFIQPmP<9_~R=&GbpR+M;{`K=`*LQw770b04X6z@e w&2U}SbaUykp+eMi?8KP}`whLHBQ`!ERvZSs*p1C(K(8?xHOgZ$G$i+b0HU+A6#xJL literal 0 HcmV?d00001 diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 82da814..9292b78 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -47,5 +47,8 @@ class TestDumpPDF(): def test_7(self): run('../samples/contrib/','stamp-no') """ + + def test_8(self): + run('../samples/contrib/','2b','-A -t xml') if __name__ == '__main__': nose.runmodule()