warning removal.
code cleanup. cmap bug fixed. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@168 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
7093bdbdfa
commit
98c8367339
34
Makefile
34
Makefile
|
@ -19,14 +19,8 @@ clean:
|
||||||
-$(RM) -r build dist
|
-$(RM) -r build dist
|
||||||
-cd $(PACKAGE) && $(MAKE) clean
|
-cd $(PACKAGE) && $(MAKE) clean
|
||||||
-cd tools && $(MAKE) clean
|
-cd tools && $(MAKE) clean
|
||||||
-cd samples && $(MAKE) clean
|
|
||||||
|
|
||||||
distclean: clean cmap_clean
|
distclean: clean test_clean cmap_clean
|
||||||
|
|
||||||
test:
|
|
||||||
cd samples && $(MAKE) test
|
|
||||||
check:
|
|
||||||
cd $(PACKAGE) && make check
|
|
||||||
|
|
||||||
commit: distclean
|
commit: distclean
|
||||||
$(SVN) commit
|
$(SVN) commit
|
||||||
|
@ -39,13 +33,23 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
|
||||||
publish:
|
publish:
|
||||||
$(CP) docs/*.html $(WEBDIR)
|
$(CP) docs/*.html $(WEBDIR)
|
||||||
|
|
||||||
|
test:
|
||||||
|
cd samples && $(MAKE) test
|
||||||
|
test_clean:
|
||||||
|
-cd samples && $(MAKE) clean
|
||||||
|
|
||||||
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
|
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
|
||||||
CMAPDIR=pdfminer/cmap
|
CMAPSRC=cmaprsrc
|
||||||
CMAPRSRC=cmaprsrc
|
CMAPDST=pdfminer/cmap
|
||||||
cmap: cmaprsrc
|
cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
|
||||||
$(CONV_CMAP) $(CMAPDIR) Adobe-CNS1 $(CMAPRSRC)/cid2code_Adobe_CNS1.txt cp950 big5
|
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
|
||||||
$(CONV_CMAP) $(CMAPDIR) Adobe-GB1 $(CMAPRSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
|
|
||||||
$(CONV_CMAP) $(CMAPDIR) Adobe-Japan1 $(CMAPRSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
|
|
||||||
$(CONV_CMAP) $(CMAPDIR) Adobe-Korea1 $(CMAPRSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
|
||||||
cmap_clean:
|
cmap_clean:
|
||||||
cd $(CMAPDIR) && make cmap_clean
|
cd $(CMAPDST) && make cmap_clean
|
||||||
|
$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py:
|
||||||
|
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
|
||||||
|
$(CMAPDST)/TO_UNICODE_Adobe_GB1.py:
|
||||||
|
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
|
||||||
|
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
|
||||||
|
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
|
||||||
|
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
|
||||||
|
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
||||||
|
|
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Dec 20 01:25:02 JST 2009
|
Last Modified: Fri Jan 1 12:04:47 JST 2010
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ Last Modified: Sun Dec 20 01:25:02 JST 2009
|
||||||
<li> <a href="#intro">What's It?</a>
|
<li> <a href="#intro">What's It?</a>
|
||||||
<li> <a href="#source">Download</a>
|
<li> <a href="#source">Download</a>
|
||||||
<li> <a href="#install">Install</a>
|
<li> <a href="#install">Install</a>
|
||||||
<small>(<a href="#cmap">for non-ASCII languages</a>)</small>
|
<small>(<a href="#cmap">for East Asian languages</a>)</small>
|
||||||
<li> <a href="#usage">How to Use</a>
|
<li> <a href="#usage">How to Use</a>
|
||||||
<small>(<a href="#pdf2txt">pdf2txt.py</a>, <a href="#dumppdf">dumppdf.py</a>)</small>
|
<small>(<a href="#pdf2txt">pdf2txt.py</a>, <a href="#dumppdf">dumppdf.py</a>)</small>
|
||||||
<li> <a href="#todos">TODOs</a>
|
<li> <a href="#todos">TODOs</a>
|
||||||
|
@ -54,7 +54,7 @@ PDF parser that can be used for other purposes instead of text analysis.
|
||||||
<ul>
|
<ul>
|
||||||
<li> Written entirely in Python. (for version 2.4 or newer)
|
<li> Written entirely in Python. (for version 2.4 or newer)
|
||||||
<li> PDF-1.7 specification support. (well, almost)
|
<li> PDF-1.7 specification support. (well, almost)
|
||||||
<li> Non-ASCII languages and vertical writing scripts support.
|
<li> East Asian languages and vertical writing scripts support.
|
||||||
<li> Various font types (Type1, TrueType, Type3, and CID) support.
|
<li> Various font types (Type1, TrueType, Type3, and CID) support.
|
||||||
<li> Basic encryption (RC4) support.
|
<li> Basic encryption (RC4) support.
|
||||||
<li> PDF to HTML conversion (with a sample converter web app).
|
<li> PDF to HTML conversion (with a sample converter web app).
|
||||||
|
@ -125,8 +125,8 @@ W o r l d
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
<a name="cmap"></a>
|
<a name="cmap"></a>
|
||||||
<h3>For non-ASCII languages</h3>
|
<h3>For East Asian languages</h3>
|
||||||
In order to handle non-ASCII languages (e.g. Japanese),
|
In order to handle East Asian languages (Chinese or Japanese, etc.),
|
||||||
you need to install an additional data called <code>CMap</code>,
|
you need to install an additional data called <code>CMap</code>,
|
||||||
which is originally distributed by Adobe. CMap is now included
|
which is originally distributed by Adobe. CMap is now included
|
||||||
in the pdfminer package, but not installed by default.
|
in the pdfminer package, but not installed by default.
|
||||||
|
@ -163,9 +163,6 @@ direction (horizontal or vertical) for each text portion.
|
||||||
You need to provide a password for protected PDF documents when its access is restricted.
|
You need to provide a password for protected PDF documents when its access is restricted.
|
||||||
You cannot extract any text from a PDF document which does not have extraction permission.
|
You cannot extract any text from a PDF document which does not have extraction permission.
|
||||||
<p>
|
<p>
|
||||||
For non-ASCII languages, you can specify the output encoding
|
|
||||||
(such as UTF-8).
|
|
||||||
<p>
|
|
||||||
<strong>Note:</strong> Not all characters in a PDF can be safely converted to Unicode.
|
<strong>Note:</strong> Not all characters in a PDF can be safely converted to Unicode.
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
|
@ -194,7 +191,7 @@ Page numbers are starting from one.
|
||||||
By default, it extracts texts from all the pages.
|
By default, it extracts texts from all the pages.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-c <em>codec</em></code>
|
<dt> <code>-c <em>codec</em></code>
|
||||||
<dd> Specifies the output codec for non-ASCII texts.
|
<dd> Specifies the output codec.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-t <em>type</em></code>
|
<dt> <code>-t <em>type</em></code>
|
||||||
<dd> Specifies the output format. The following formats are currently supported.
|
<dd> Specifies the output format. The following formats are currently supported.
|
||||||
|
@ -344,6 +341,9 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
|
||||||
|
<li> 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.
|
||||||
|
<li> 2009/12/20: Experimental polygon shape extraction added. Thanks to Yusuf Dewaswala for reporting.
|
||||||
<li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them.
|
<li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them.
|
||||||
<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
|
<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
|
||||||
<li> 2009/10/31: SGML output format is changed and renamed as XML.
|
<li> 2009/10/31: SGML output format is changed and renamed as XML.
|
||||||
|
|
|
@ -1,12 +1,7 @@
|
||||||
# Makefile for pdfminer
|
# Makefile for pdfminer
|
||||||
|
|
||||||
PYCHECKER=pychecker --limit=0
|
|
||||||
|
|
||||||
all:
|
all:
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-rm *.pyc *.pyo
|
-rm *.pyc *.pyo
|
||||||
cd cmap && make clean
|
cd cmap && make clean
|
||||||
|
|
||||||
check:
|
|
||||||
$(PYCHECKER) *.py
|
|
||||||
|
|
|
@ -90,14 +90,14 @@ class UnicodeMap(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, cid2unicode=None):
|
def __init__(self, cid2unichr=None):
|
||||||
self.cid2unicode = cid2unicode or {}
|
self.cid2unichr = cid2unichr or {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_unicode(self, cid):
|
def get_unichr(self, cid):
|
||||||
if self.debug:
|
if self.debug:
|
||||||
print >>sys.stderr, 'get_unicode: %r, %r' % (self, cid)
|
print >>sys.stderr, 'get_unichr: %r, %r' % (self, cid)
|
||||||
return self.cid2unicode.get(cid)
|
return self.cid2unichr[cid]
|
||||||
|
|
||||||
|
|
||||||
## FileCMap
|
## FileCMap
|
||||||
|
@ -151,16 +151,16 @@ class FileUnicodeMap(UnicodeMap):
|
||||||
self.attrs[k] = v
|
self.attrs[k] = v
|
||||||
return
|
return
|
||||||
|
|
||||||
def add_cid2unicode(self, cid, code):
|
def add_cid2unichr(self, cid, code):
|
||||||
assert isinstance(cid, int)
|
assert isinstance(cid, int)
|
||||||
if isinstance(code, PSLiteral):
|
if isinstance(code, PSLiteral):
|
||||||
# Interpret as an Adobe glyph name.
|
# Interpret as an Adobe glyph name.
|
||||||
self.cid2unicode[cid] = name2unicode(code.name)
|
self.cid2unichr[cid] = unichr(name2unicode(code.name))
|
||||||
elif isinstance(code, str):
|
elif isinstance(code, str):
|
||||||
# Interpret as UTF-16BE.
|
# Interpret as UTF-16BE.
|
||||||
self.cid2unicode[cid] = unpack('>H', code)[0]
|
self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore')
|
||||||
elif isinstance(code, int):
|
elif isinstance(code, int):
|
||||||
self.cid2unicode[cid] = code
|
self.cid2unichr[cid] = unichr(code)
|
||||||
else:
|
else:
|
||||||
raise TypeError(code)
|
raise TypeError(code)
|
||||||
return
|
return
|
||||||
|
@ -189,10 +189,10 @@ class PyUnicodeMap(UnicodeMap):
|
||||||
|
|
||||||
def __init__(self, name, module, vertical):
|
def __init__(self, name, module, vertical):
|
||||||
if vertical:
|
if vertical:
|
||||||
cid2unicode = module.CID2UNICODE_V
|
cid2unichr = module.CID2UNICHR_V
|
||||||
else:
|
else:
|
||||||
cid2unicode = module.CID2UNICODE_H
|
cid2unichr = module.CID2UNICHR_H
|
||||||
UnicodeMap.__init__(self, cid2unicode)
|
UnicodeMap.__init__(self, cid2unichr)
|
||||||
self.name = name
|
self.name = name
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -333,7 +333,7 @@ class CMapParser(PSStackParser):
|
||||||
#assert s1 <= e1
|
#assert s1 <= e1
|
||||||
if isinstance(code, list):
|
if isinstance(code, list):
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
self.cmap.add_cid2unicode(s1+i, code[i])
|
self.cmap.add_cid2unichr(s1+i, code[i])
|
||||||
else:
|
else:
|
||||||
var = code[-4:]
|
var = code[-4:]
|
||||||
base = nunpack(var)
|
base = nunpack(var)
|
||||||
|
@ -341,7 +341,7 @@ class CMapParser(PSStackParser):
|
||||||
vlen = len(var)
|
vlen = len(var)
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
x = prefix+pack('>L',base+i)[-vlen:]
|
x = prefix+pack('>L',base+i)[-vlen:]
|
||||||
self.cmap.add_cid2unicode(s1+i, x)
|
self.cmap.add_cid2unichr(s1+i, x)
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'beginbfchar':
|
if name == 'beginbfchar':
|
||||||
|
@ -351,7 +351,7 @@ class CMapParser(PSStackParser):
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
for (cid,code) in choplist(2, objs):
|
for (cid,code) in choplist(2, objs):
|
||||||
if isinstance(cid, str) and isinstance(code, str):
|
if isinstance(cid, str) and isinstance(code, str):
|
||||||
self.cmap.add_cid2unicode(nunpack(cid), code)
|
self.cmap.add_cid2unichr(nunpack(cid), code)
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'beginnotdefrange':
|
if name == 'beginnotdefrange':
|
||||||
|
|
|
@ -405,9 +405,10 @@ class PDFSimpleFont(PDFFont):
|
||||||
|
|
||||||
def to_unichr(self, cid):
|
def to_unichr(self, cid):
|
||||||
if self.unicode_map:
|
if self.unicode_map:
|
||||||
code = self.unicode_map.get_unicode(cid)
|
try:
|
||||||
if code is not None:
|
return self.unicode_map.get_unichr(cid)
|
||||||
return unichr(code)
|
except KeyError:
|
||||||
|
pass
|
||||||
try:
|
try:
|
||||||
return self.encoding[cid]
|
return self.encoding[cid]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -571,11 +572,10 @@ class PDFCIDFont(PDFFont):
|
||||||
return self.disps.get(cid, self.default_disp)
|
return self.disps.get(cid, self.default_disp)
|
||||||
|
|
||||||
def to_unichr(self, cid):
|
def to_unichr(self, cid):
|
||||||
if not self.unicode_map:
|
try:
|
||||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
if not self.unicode_map: raise KeyError(cid)
|
||||||
code = self.unicode_map.get_unicode(cid)
|
return self.unicode_map.get_unichr(cid)
|
||||||
if code is not None:
|
except KeyError:
|
||||||
return unichr(code)
|
|
||||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -766,7 +766,9 @@ class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
||||||
|
|
||||||
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
|
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
|
||||||
doc = PDFDocument()
|
doc = PDFDocument()
|
||||||
parser = PDFParser(doc, fp)
|
parser = PDFParser(fp)
|
||||||
|
parser.set_document(doc)
|
||||||
|
doc.set_parser(parser)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
if not doc.is_extractable:
|
if not doc.is_extractable:
|
||||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
import md5
|
|
||||||
import struct
|
import struct
|
||||||
from sys import stderr
|
from sys import stderr
|
||||||
|
try:
|
||||||
|
import hashlib as md5
|
||||||
|
except ImportError:
|
||||||
|
import md5
|
||||||
try:
|
try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -19,7 +22,7 @@ from pdftypes import int_value, float_value, num_value
|
||||||
from pdftypes import str_value, list_value, dict_value, stream_value
|
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||||
from arcfour import Arcfour
|
from arcfour import Arcfour
|
||||||
from utils import choplist, nunpack
|
from utils import choplist, nunpack
|
||||||
from utils import decode_text
|
from utils import decode_text, ObjIdRange
|
||||||
|
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
|
@ -39,57 +42,29 @@ LITERAL_CATALOG = LIT('Catalog')
|
||||||
|
|
||||||
## XRefs
|
## XRefs
|
||||||
##
|
##
|
||||||
class XRefObjRange(object):
|
|
||||||
def __init__(self, start, nobjs):
|
|
||||||
self.start = start
|
|
||||||
self.nobjs = nobjs
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<XRefObjRange: %d-%d>' % (self.get_start_id(), self.get_end_id())
|
|
||||||
|
|
||||||
def get_start_id(self):
|
|
||||||
return self.start
|
|
||||||
|
|
||||||
def get_end_id(self):
|
|
||||||
return self.start + self.nobjs - 1
|
|
||||||
|
|
||||||
def get_nobjs(self):
|
|
||||||
return self.nobjs
|
|
||||||
|
|
||||||
class PDFBaseXRef(object):
|
class PDFBaseXRef(object):
|
||||||
def __init__(self):
|
|
||||||
self.objid_ranges = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def objids(self):
|
def get_trailer(self):
|
||||||
if self.objid_ranges:
|
raise NotImplementedError
|
||||||
for objid_range in self.objid_ranges:
|
|
||||||
for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
|
def get_pos(self, objid):
|
||||||
yield objid
|
raise KeyError(objid)
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## PDFXRef
|
## PDFXRef
|
||||||
##
|
##
|
||||||
class PDFXRef(PDFBaseXRef):
|
class PDFXRef(PDFBaseXRef):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
PDFBaseXRef.__init__(self)
|
self.offsets = {}
|
||||||
self.offsets = None
|
|
||||||
self.trailer = {}
|
self.trailer = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<PDFXRef: objs=%d>' % len(self.offsets)
|
|
||||||
|
|
||||||
def load(self, parser, debug=0):
|
def load(self, parser, debug=0):
|
||||||
self.offsets = {}
|
|
||||||
self.objid_ranges = []
|
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
(pos, line) = parser.nextline()
|
(pos, line) = parser.nextline()
|
||||||
if not line.strip():
|
if not line.strip(): continue
|
||||||
continue
|
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||||
if not line:
|
if not line:
|
||||||
|
@ -104,8 +79,6 @@ class PDFXRef(PDFBaseXRef):
|
||||||
(start, nobjs) = map(long, f)
|
(start, nobjs) = map(long, f)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
|
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
|
||||||
self.newoffsets = {}
|
|
||||||
self.objid_ranges.append(XRefObjRange(start, nobjs))
|
|
||||||
for objid in xrange(start, start+nobjs):
|
for objid in xrange(start, start+nobjs):
|
||||||
try:
|
try:
|
||||||
(_, line) = parser.nextline()
|
(_, line) = parser.nextline()
|
||||||
|
@ -133,10 +106,33 @@ class PDFXRef(PDFBaseXRef):
|
||||||
if not x:
|
if not x:
|
||||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
||||||
(_,dic) = x[0]
|
(_,dic) = x[0]
|
||||||
self.trailer.update( dict_value(dic))
|
self.trailer.update(dict_value(dic))
|
||||||
return
|
return
|
||||||
|
|
||||||
def getpos(self, objid):
|
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
||||||
|
def load_fallback(self, parser, debug=0):
|
||||||
|
parser.seek(0)
|
||||||
|
while 1:
|
||||||
|
try:
|
||||||
|
(pos, line) = parser.nextline()
|
||||||
|
except PSEOF:
|
||||||
|
break
|
||||||
|
if line.startswith('trailer'):
|
||||||
|
parser.seek(pos)
|
||||||
|
self.load_trailer(parser)
|
||||||
|
if 1 <= debug:
|
||||||
|
print >>stderr, 'trailer: %r' % self.get_trailer()
|
||||||
|
break
|
||||||
|
m = self.PDFOBJ_CUE.match(line)
|
||||||
|
if not m: continue
|
||||||
|
(objid, genno) = m.groups()
|
||||||
|
self.offsets[int(objid)] = (0, pos)
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_trailer(self):
|
||||||
|
return self.trailer
|
||||||
|
|
||||||
|
def get_pos(self, objid):
|
||||||
try:
|
try:
|
||||||
(genno, pos) = self.offsets[objid]
|
(genno, pos) = self.offsets[objid]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -149,10 +145,10 @@ class PDFXRef(PDFBaseXRef):
|
||||||
class PDFXRefStream(PDFBaseXRef):
|
class PDFXRefStream(PDFBaseXRef):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
PDFBaseXRef.__init__(self)
|
|
||||||
self.data = None
|
self.data = None
|
||||||
self.entlen = None
|
self.entlen = None
|
||||||
self.fl1 = self.fl2 = self.fl3 = None
|
self.fl1 = self.fl2 = self.fl3 = None
|
||||||
|
self.objid_ranges = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -169,17 +165,22 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
index_array = stream.dic.get('Index', (0,size))
|
index_array = stream.dic.get('Index', (0,size))
|
||||||
if len(index_array) % 2 != 0:
|
if len(index_array) % 2 != 0:
|
||||||
raise PDFSyntaxError('Invalid index number')
|
raise PDFSyntaxError('Invalid index number')
|
||||||
self.objid_ranges = [ XRefObjRange(start,nobjs) for (start,nobjs) in choplist(2, index_array) ]
|
self.objid_ranges.extend( ObjIdRange(start, nobjs)
|
||||||
|
for (start,nobjs) in choplist(2, index_array) )
|
||||||
(self.fl1, self.fl2, self.fl3) = stream.dic['W']
|
(self.fl1, self.fl2, self.fl3) = stream.dic['W']
|
||||||
self.data = stream.get_data()
|
self.data = stream.get_data()
|
||||||
self.entlen = self.fl1+self.fl2+self.fl3
|
self.entlen = self.fl1+self.fl2+self.fl3
|
||||||
self.trailer = stream.dic
|
self.trailer = stream.dic
|
||||||
if debug:
|
if debug:
|
||||||
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
|
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
|
||||||
(', '.join(map(repr, self.objid_ranges), self.fl1, self.fl2, self.fl3)))
|
(', '.join(map(repr, self.objid_ranges),
|
||||||
|
self.fl1, self.fl2, self.fl3)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def getpos(self, objid):
|
def get_trailer(self):
|
||||||
|
return self.trailer
|
||||||
|
|
||||||
|
def get_pos(self, objid):
|
||||||
offset = 0
|
offset = 0
|
||||||
found = False
|
found = False
|
||||||
for objid_range in self.objid_ranges:
|
for objid_range in self.objid_ranges:
|
||||||
|
@ -207,14 +208,35 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
|
|
||||||
## PDFPage
|
## PDFPage
|
||||||
##
|
##
|
||||||
## A PDFPage object is nothing more than a bunch of keys and values
|
|
||||||
## that describe the properties of the page and point to its contents,
|
|
||||||
## and has nothing to do with a real graphical entity. For a real graphical
|
|
||||||
## object, look at layout.LTPage.
|
|
||||||
##
|
|
||||||
class PDFPage(object):
|
class PDFPage(object):
|
||||||
|
|
||||||
|
"""An object that holds the information about a page.
|
||||||
|
|
||||||
|
A PDFPage object is merely a convenience class that has a set
|
||||||
|
of keys and values, which describe the properties of a page
|
||||||
|
and point to its contents.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
doc: a PDFDocument object.
|
||||||
|
pageid: any Python object that can uniquely identify the page.
|
||||||
|
attrs: a dictionary of page attributes.
|
||||||
|
contents: a list of PDFStream objects that represents the page content.
|
||||||
|
lastmod: the last modified time of the page.
|
||||||
|
resources: a list of resources used by the page.
|
||||||
|
mediabox: the physical size of the page.
|
||||||
|
cropbox: the crop rectangle of the page.
|
||||||
|
rotate: the page rotation (in degree).
|
||||||
|
annots: the page annotations.
|
||||||
|
beads: a chain that represents natural reading order.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, doc, pageid, attrs):
|
def __init__(self, doc, pageid, attrs):
|
||||||
|
"""Initialize a page object.
|
||||||
|
|
||||||
|
doc: a PDFDocument object.
|
||||||
|
pageid: any Python object that can uniquely identify the page.
|
||||||
|
attrs: a dictionary of page attributes.
|
||||||
|
"""
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.pageid = pageid
|
self.pageid = pageid
|
||||||
self.attrs = dict_value(attrs)
|
self.attrs = dict_value(attrs)
|
||||||
|
@ -243,13 +265,23 @@ class PDFPage(object):
|
||||||
|
|
||||||
## PDFDocument
|
## PDFDocument
|
||||||
##
|
##
|
||||||
## A PDFDocument object represents a PDF document.
|
|
||||||
## Since a PDF file is usually pretty big, normally it is not loaded
|
|
||||||
## at once. Rather it is parsed dynamically as processing goes.
|
|
||||||
## A PDF parser is associated with the document.
|
|
||||||
##
|
|
||||||
class PDFDocument(object):
|
class PDFDocument(object):
|
||||||
|
|
||||||
|
"""PDFDocument object represents a PDF document.
|
||||||
|
|
||||||
|
Since a PDF file can be very big, normally it is not loaded at
|
||||||
|
once. Each PDF document has a PDF parser object associated,
|
||||||
|
and the data stream is parsed dynamically as processing goes.
|
||||||
|
|
||||||
|
Typical usage:
|
||||||
|
doc = PDFDocument()
|
||||||
|
parser = PDFParser(fp)
|
||||||
|
parser.set_document(doc)
|
||||||
|
doc.set_parser(parser)
|
||||||
|
doc.initialize(password)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -261,24 +293,23 @@ class PDFDocument(object):
|
||||||
self.parser = None
|
self.parser = None
|
||||||
self.encryption = None
|
self.encryption = None
|
||||||
self.decipher = None
|
self.decipher = None
|
||||||
self.ready = False
|
self._initialized = False
|
||||||
return
|
return
|
||||||
|
|
||||||
# set_parser(parser)
|
|
||||||
# Associates the document with an (already initialized) parser object.
|
|
||||||
def set_parser(self, parser):
|
def set_parser(self, parser):
|
||||||
|
"Set the document to use a given PDFParser object."
|
||||||
if self.parser: return
|
if self.parser: return
|
||||||
self.parser = parser
|
self.parser = parser
|
||||||
# The document is set to be temporarily ready during collecting
|
# The document is set to be temporarily ready during collecting
|
||||||
# all the basic information about the document, e.g.
|
# all the basic information about the document, e.g.
|
||||||
# the header, the encryption information, and the access rights
|
# the header, the encryption information, and the access rights
|
||||||
# for the document.
|
# for the document.
|
||||||
self.ready = True
|
self._initialized = True
|
||||||
# Retrieve the information of each header that was appended
|
# Retrieve the information of each header that was appended
|
||||||
# (maybe multiple times) at the end of the document.
|
# (maybe multiple times) at the end of the document.
|
||||||
self.xrefs = parser.read_xref()
|
self.xrefs = parser.read_xref()
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
trailer = xref.trailer
|
trailer = xref.get_trailer()
|
||||||
if not trailer: continue
|
if not trailer: continue
|
||||||
# If there's an encryption info, remember it.
|
# If there's an encryption info, remember it.
|
||||||
if 'Encrypt' in trailer:
|
if 'Encrypt' in trailer:
|
||||||
|
@ -293,7 +324,7 @@ class PDFDocument(object):
|
||||||
# The document is set to be non-ready again, until all the
|
# The document is set to be non-ready again, until all the
|
||||||
# proper initialization (asking the password key and
|
# proper initialization (asking the password key and
|
||||||
# verifying the access permission, so on) is finished.
|
# verifying the access permission, so on) is finished.
|
||||||
self.ready = False
|
self._initialized = False
|
||||||
return
|
return
|
||||||
|
|
||||||
# set_root(root)
|
# set_root(root)
|
||||||
|
@ -315,7 +346,7 @@ class PDFDocument(object):
|
||||||
def initialize(self, password=''):
|
def initialize(self, password=''):
|
||||||
if not self.encryption:
|
if not self.encryption:
|
||||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||||
self.ready = True
|
self._initialized = True
|
||||||
return
|
return
|
||||||
(docid, param) = self.encryption
|
(docid, param) = self.encryption
|
||||||
if literal_name(param['Filter']) != 'Standard':
|
if literal_name(param['Filter']) != 'Standard':
|
||||||
|
@ -367,7 +398,7 @@ class PDFDocument(object):
|
||||||
raise PDFPasswordIncorrect
|
raise PDFPasswordIncorrect
|
||||||
self.decrypt_key = key
|
self.decrypt_key = key
|
||||||
self.decipher = self.decrypt_rc4 # XXX may be AES
|
self.decipher = self.decrypt_rc4 # XXX may be AES
|
||||||
self.ready = True
|
self._initialized = True
|
||||||
return
|
return
|
||||||
|
|
||||||
def decrypt_rc4(self, objid, genno, data):
|
def decrypt_rc4(self, objid, genno, data):
|
||||||
|
@ -378,7 +409,7 @@ class PDFDocument(object):
|
||||||
|
|
||||||
KEYWORD_OBJ = KWD('obj')
|
KEYWORD_OBJ = KWD('obj')
|
||||||
def getobj(self, objid):
|
def getobj(self, objid):
|
||||||
if not self.ready:
|
if not self._initialized:
|
||||||
raise PDFException('PDFDocument not initialized')
|
raise PDFException('PDFDocument not initialized')
|
||||||
#assert self.xrefs
|
#assert self.xrefs
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
|
@ -389,7 +420,7 @@ class PDFDocument(object):
|
||||||
else:
|
else:
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
try:
|
try:
|
||||||
(strmid, index) = xref.getpos(objid)
|
(strmid, index) = xref.get_pos(objid)
|
||||||
break
|
break
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
@ -411,7 +442,7 @@ class PDFDocument(object):
|
||||||
if strmid in self.parsed_objs:
|
if strmid in self.parsed_objs:
|
||||||
objs = self.parsed_objs[strmid]
|
objs = self.parsed_objs[strmid]
|
||||||
else:
|
else:
|
||||||
parser = PDFObjStrmParser(self, stream.get_data())
|
parser = PDFObjStrmParser(stream.get_data())
|
||||||
objs = []
|
objs = []
|
||||||
try:
|
try:
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -458,7 +489,7 @@ class PDFDocument(object):
|
||||||
|
|
||||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||||
def get_pages(self):
|
def get_pages(self):
|
||||||
if not self.ready:
|
if not self._initialized:
|
||||||
raise PDFException('PDFDocument is not initialized')
|
raise PDFException('PDFDocument is not initialized')
|
||||||
#assert self.xrefs
|
#assert self.xrefs
|
||||||
def search(obj, parent):
|
def search(obj, parent):
|
||||||
|
@ -529,14 +560,15 @@ class PDFDocument(object):
|
||||||
##
|
##
|
||||||
class PDFParser(PSStackParser):
|
class PDFParser(PSStackParser):
|
||||||
|
|
||||||
def __init__(self, doc, fp):
|
def __init__(self, fp):
|
||||||
PSStackParser.__init__(self, fp)
|
PSStackParser.__init__(self, fp)
|
||||||
self.doc = doc
|
self.doc = None
|
||||||
self.doc.set_parser(self)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def set_document(self, doc):
|
||||||
return '<PDFParser>'
|
"Associates the parser with a PDFDocument object."
|
||||||
|
self.doc = doc
|
||||||
|
return
|
||||||
|
|
||||||
KEYWORD_R = KWD('R')
|
KEYWORD_R = KWD('R')
|
||||||
KEYWORD_ENDOBJ = KWD('endobj')
|
KEYWORD_ENDOBJ = KWD('endobj')
|
||||||
|
@ -647,7 +679,7 @@ class PDFParser(PSStackParser):
|
||||||
xref = PDFXRef()
|
xref = PDFXRef()
|
||||||
xref.load(self, debug=self.debug)
|
xref.load(self, debug=self.debug)
|
||||||
xrefs.append(xref)
|
xrefs.append(xref)
|
||||||
trailer = xref.trailer
|
trailer = xref.get_trailer()
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'trailer: %r' % trailer
|
print >>stderr, 'trailer: %r' % trailer
|
||||||
if 'XRefStm' in trailer:
|
if 'XRefStm' in trailer:
|
||||||
|
@ -669,26 +701,8 @@ class PDFParser(PSStackParser):
|
||||||
# fallback
|
# fallback
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'no xref, fallback'
|
print >>stderr, 'no xref, fallback'
|
||||||
self.seek(0)
|
|
||||||
pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
|
||||||
offsets = {}
|
|
||||||
xref = PDFXRef()
|
xref = PDFXRef()
|
||||||
while 1:
|
xref.load_fallback(self)
|
||||||
try:
|
|
||||||
(pos, line) = self.nextline()
|
|
||||||
except PSEOF:
|
|
||||||
break
|
|
||||||
if line.startswith('trailer'):
|
|
||||||
xref.offsets = offsets
|
|
||||||
self.seek(pos)
|
|
||||||
xref.load_trailer(self)
|
|
||||||
if 1 <= self.debug:
|
|
||||||
print >>stderr, 'trailer: %r' % xref.trailer
|
|
||||||
continue
|
|
||||||
m = pat.match(line)
|
|
||||||
if not m: continue
|
|
||||||
(objid, genno) = m.groups()
|
|
||||||
offsets[int(objid)] = (0, pos)
|
|
||||||
xrefs.append(xref)
|
xrefs.append(xref)
|
||||||
return xrefs
|
return xrefs
|
||||||
|
|
||||||
|
@ -697,8 +711,8 @@ class PDFParser(PSStackParser):
|
||||||
##
|
##
|
||||||
class PDFObjStrmParser(PSStackParser):
|
class PDFObjStrmParser(PSStackParser):
|
||||||
|
|
||||||
def __init__(self, doc, data):
|
def __init__(self, data):
|
||||||
PDFParser.__init__(self, doc, StringIO(data))
|
PSStackParser.__init__(self, StringIO(data))
|
||||||
return
|
return
|
||||||
|
|
||||||
def flush(self):
|
def flush(self):
|
||||||
|
|
|
@ -135,3 +135,27 @@ def enc(x, codec='ascii'):
|
||||||
'''Encodes a string for SGML/XML/HTML'''
|
'''Encodes a string for SGML/XML/HTML'''
|
||||||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
||||||
return x.encode(codec, 'xmlcharrefreplace')
|
return x.encode(codec, 'xmlcharrefreplace')
|
||||||
|
|
||||||
|
|
||||||
|
## ObjIdRange
|
||||||
|
##
|
||||||
|
class ObjIdRange(object):
|
||||||
|
|
||||||
|
"A utility class to represent a range of object IDs."
|
||||||
|
|
||||||
|
def __init__(self, start, nobjs):
|
||||||
|
self.start = start
|
||||||
|
self.nobjs = nobjs
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<ObjIdRange: %d-%d>' % (self.get_start_id(), self.get_end_id())
|
||||||
|
|
||||||
|
def get_start_id(self):
|
||||||
|
return self.start
|
||||||
|
|
||||||
|
def get_end_id(self):
|
||||||
|
return self.start + self.nobjs - 1
|
||||||
|
|
||||||
|
def get_nobjs(self):
|
||||||
|
return self.nobjs
|
||||||
|
|
|
@ -12,7 +12,7 @@ def process_cid2code(fp, check_codecs=[]):
|
||||||
else:
|
else:
|
||||||
return (name+'-H', name+'-V')
|
return (name+'-H', name+'-V')
|
||||||
|
|
||||||
def get_unicode(codes):
|
def get_unichr(codes):
|
||||||
# determine the "most popular" candidate.
|
# determine the "most popular" candidate.
|
||||||
d = {}
|
d = {}
|
||||||
for code in codes:
|
for code in codes:
|
||||||
|
@ -26,7 +26,7 @@ def process_cid2code(fp, check_codecs=[]):
|
||||||
except UnicodeError:
|
except UnicodeError:
|
||||||
pass
|
pass
|
||||||
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
|
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
|
||||||
return ord(chars[0])
|
return chars[0]
|
||||||
|
|
||||||
def put(dmap, code, cid, force=False):
|
def put(dmap, code, cid, force=False):
|
||||||
for b in code[:-1]:
|
for b in code[:-1]:
|
||||||
|
@ -45,8 +45,8 @@ def process_cid2code(fp, check_codecs=[]):
|
||||||
names = []
|
names = []
|
||||||
code2cid = {} # {'cmapname': ...}
|
code2cid = {} # {'cmapname': ...}
|
||||||
is_vertical = {}
|
is_vertical = {}
|
||||||
cid2unicode_h = {} # {cid: unicode}
|
cid2unichr_h = {} # {cid: unichr}
|
||||||
cid2unicode_v = {} # {cid: unicode}
|
cid2unichr_v = {} # {cid: unichr}
|
||||||
|
|
||||||
for line in fp:
|
for line in fp:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
@ -95,21 +95,21 @@ def process_cid2code(fp, check_codecs=[]):
|
||||||
put(hmap, code, cid, True)
|
put(hmap, code, cid, True)
|
||||||
if name.endswith('-UTF8'):
|
if name.endswith('-UTF8'):
|
||||||
if hcodes:
|
if hcodes:
|
||||||
cid2unicode_h[cid] = get_unicode(hcodes)
|
cid2unichr_h[cid] = get_unichr(hcodes)
|
||||||
if vcodes:
|
if vcodes:
|
||||||
cid2unicode_v[cid] = get_unicode(vcodes)
|
cid2unichr_v[cid] = get_unichr(vcodes)
|
||||||
else:
|
else:
|
||||||
for code in hcodes:
|
for code in hcodes:
|
||||||
put(hmap, code, cid)
|
put(hmap, code, cid)
|
||||||
put(vmap, code, cid)
|
put(vmap, code, cid)
|
||||||
if name.endswith('-UTF8') and hcodes:
|
if name.endswith('-UTF8') and hcodes:
|
||||||
code = get_unicode(hcodes)
|
code = get_unichr(hcodes)
|
||||||
if cid not in cid2unicode_h:
|
if cid not in cid2unichr_h:
|
||||||
cid2unicode_h[cid] = code
|
cid2unichr_h[cid] = code
|
||||||
if cid not in cid2unicode_v:
|
if cid not in cid2unichr_v:
|
||||||
cid2unicode_v[cid] = code
|
cid2unichr_v[cid] = code
|
||||||
|
|
||||||
return (code2cid, is_vertical, cid2unicode_h, cid2unicode_v)
|
return (code2cid, is_vertical, cid2unichr_h, cid2unichr_v)
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
|
@ -128,7 +128,7 @@ def main(argv):
|
||||||
|
|
||||||
print >>sys.stderr, 'reading %r...' % src
|
print >>sys.stderr, 'reading %r...' % src
|
||||||
fp = file(src)
|
fp = file(src)
|
||||||
(code2cid, is_vertical, cid2unicode_h, cid2unicode_v) = process_cid2code(fp, check_codecs)
|
(code2cid, is_vertical, cid2unichr_h, cid2unichr_v) = process_cid2code(fp, check_codecs)
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
for (name, cmap) in code2cid.iteritems():
|
for (name, cmap) in code2cid.iteritems():
|
||||||
|
@ -146,8 +146,8 @@ def main(argv):
|
||||||
fp = file(os.path.join(outdir, fname), 'w')
|
fp = file(os.path.join(outdir, fname), 'w')
|
||||||
print >>fp, '#!/usr/bin/env python'
|
print >>fp, '#!/usr/bin/env python'
|
||||||
print >>fp, '#', fname
|
print >>fp, '#', fname
|
||||||
print >>fp, 'CID2UNICODE_H = %r' % cid2unicode_h
|
print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h
|
||||||
print >>fp, 'CID2UNICODE_V = %r' % cid2unicode_v
|
print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
|
@ -99,7 +99,9 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None):
|
dumpall=False, codec=None):
|
||||||
doc = PDFDocument()
|
doc = PDFDocument()
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
parser = PDFParser(doc, fp)
|
parser = PDFParser(fp)
|
||||||
|
parser.set_document(doc)
|
||||||
|
doc.set_parser(parser)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
||||||
for (level,title,dest,a,se) in doc.get_outlines():
|
for (level,title,dest,a,se) in doc.get_outlines():
|
||||||
|
@ -119,7 +121,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None):
|
dumpall=False, codec=None):
|
||||||
doc = PDFDocument()
|
doc = PDFDocument()
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
parser = PDFParser(doc, fp)
|
parser = PDFParser(fp)
|
||||||
|
parser.set_document(doc)
|
||||||
|
doc.set_parser(parser)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
if objids:
|
if objids:
|
||||||
for objid in objids:
|
for objid in objids:
|
||||||
|
|
Loading…
Reference in New Issue