warning removal.

code cleanup.
cmap bug fixed.


git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@168 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-01-01 03:09:26 +00:00
parent 7093bdbdfa
commit 98c8367339
10 changed files with 208 additions and 165 deletions

View File

@ -19,14 +19,8 @@ clean:
-$(RM) -r build dist -$(RM) -r build dist
-cd $(PACKAGE) && $(MAKE) clean -cd $(PACKAGE) && $(MAKE) clean
-cd tools && $(MAKE) clean -cd tools && $(MAKE) clean
-cd samples && $(MAKE) clean
distclean: clean cmap_clean distclean: clean test_clean cmap_clean
test:
cd samples && $(MAKE) test
check:
cd $(PACKAGE) && make check
commit: distclean commit: distclean
$(SVN) commit $(SVN) commit
@ -39,13 +33,23 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish: publish:
$(CP) docs/*.html $(WEBDIR) $(CP) docs/*.html $(WEBDIR)
test:
cd samples && $(MAKE) test
test_clean:
-cd samples && $(MAKE) clean
CONV_CMAP=$(PYTHON) tools/conv_cmap.py CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPDIR=pdfminer/cmap CMAPSRC=cmaprsrc
CMAPRSRC=cmaprsrc CMAPDST=pdfminer/cmap
cmap: cmaprsrc cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
$(CONV_CMAP) $(CMAPDIR) Adobe-CNS1 $(CMAPRSRC)/cid2code_Adobe_CNS1.txt cp950 big5 $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
$(CONV_CMAP) $(CMAPDIR) Adobe-GB1 $(CMAPRSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
$(CONV_CMAP) $(CMAPDIR) Adobe-Japan1 $(CMAPRSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
$(CONV_CMAP) $(CMAPDIR) Adobe-Korea1 $(CMAPRSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
cmap_clean: cmap_clean:
cd $(CMAPDIR) && make cmap_clean cd $(CMAPDST) && make cmap_clean
$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
$(CMAPDST)/TO_UNICODE_Adobe_GB1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Dec 20 01:25:02 JST 2009 Last Modified: Fri Jan 1 12:04:47 JST 2010
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -27,7 +27,7 @@ Last Modified: Sun Dec 20 01:25:02 JST 2009
<li> <a href="#intro">What's It?</a> <li> <a href="#intro">What's It?</a>
<li> <a href="#source">Download</a> <li> <a href="#source">Download</a>
<li> <a href="#install">Install</a> <li> <a href="#install">Install</a>
&nbsp; <small>(<a href="#cmap">for non-ASCII languages</a>)</small> &nbsp; <small>(<a href="#cmap">for East Asian languages</a>)</small>
<li> <a href="#usage">How to Use</a> <li> <a href="#usage">How to Use</a>
&nbsp; <small>(<a href="#pdf2txt">pdf2txt.py</a>, <a href="#dumppdf">dumppdf.py</a>)</small> &nbsp; <small>(<a href="#pdf2txt">pdf2txt.py</a>, <a href="#dumppdf">dumppdf.py</a>)</small>
<li> <a href="#todos">TODOs</a> <li> <a href="#todos">TODOs</a>
@ -54,7 +54,7 @@ PDF parser that can be used for other purposes instead of text analysis.
<ul> <ul>
<li> Written entirely in Python. (for version 2.4 or newer) <li> Written entirely in Python. (for version 2.4 or newer)
<li> PDF-1.7 specification support. (well, almost) <li> PDF-1.7 specification support. (well, almost)
<li> Non-ASCII languages and vertical writing scripts support. <li> East Asian languages and vertical writing scripts support.
<li> Various font types (Type1, TrueType, Type3, and CID) support. <li> Various font types (Type1, TrueType, Type3, and CID) support.
<li> Basic encryption (RC4) support. <li> Basic encryption (RC4) support.
<li> PDF to HTML conversion (with a sample converter web app). <li> PDF to HTML conversion (with a sample converter web app).
@ -125,8 +125,8 @@ W o r l d
<p> <p>
<a name="cmap"></a> <a name="cmap"></a>
<h3>For non-ASCII languages</h3> <h3>For East Asian languages</h3>
In order to handle non-ASCII languages (e.g. Japanese), In order to handle East Asian languages (Chinese or Japanese, etc.),
you need to install an additional data called <code>CMap</code>, you need to install an additional data called <code>CMap</code>,
which is originally distributed by Adobe. CMap is now included which is originally distributed by Adobe. CMap is now included
in the pdfminer package, but not installed by default. in the pdfminer package, but not installed by default.
@ -163,9 +163,6 @@ direction (horizontal or vertical) for each text portion.
You need to provide a password for protected PDF documents when its access is restricted. You need to provide a password for protected PDF documents when its access is restricted.
You cannot extract any text from a PDF document which does not have extraction permission. You cannot extract any text from a PDF document which does not have extraction permission.
<p> <p>
For non-ASCII languages, you can specify the output encoding
(such as UTF-8).
<p>
<strong>Note:</strong> Not all characters in a PDF can be safely converted to Unicode. <strong>Note:</strong> Not all characters in a PDF can be safely converted to Unicode.
<p> <p>
@ -194,7 +191,7 @@ Page numbers are starting from one.
By default, it extracts texts from all the pages. By default, it extracts texts from all the pages.
<p> <p>
<dt> <code>-c <em>codec</em></code> <dt> <code>-c <em>codec</em></code>
<dd> Specifies the output codec for non-ASCII texts. <dd> Specifies the output codec.
<p> <p>
<dt> <code>-t <em>type</em></code> <dt> <code>-t <em>type</em></code>
<dd> Specifies the output format. The following formats are currently supported. <dd> Specifies the output format. The following formats are currently supported.
@ -344,6 +341,9 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
<li> 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.
<li> 2009/12/20: Experimental polygon shape extraction added. Thanks to Yusuf Dewaswala for reporting.
<li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them. <li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them.
<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras. <li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
<li> 2009/10/31: SGML output format is changed and renamed as XML. <li> 2009/10/31: SGML output format is changed and renamed as XML.

View File

@ -1,12 +1,7 @@
# Makefile for pdfminer # Makefile for pdfminer
PYCHECKER=pychecker --limit=0
all: all:
clean: clean:
-rm *.pyc *.pyo -rm *.pyc *.pyo
cd cmap && make clean cd cmap && make clean
check:
$(PYCHECKER) *.py

View File

@ -90,14 +90,14 @@ class UnicodeMap(object):
debug = 0 debug = 0
def __init__(self, cid2unicode=None): def __init__(self, cid2unichr=None):
self.cid2unicode = cid2unicode or {} self.cid2unichr = cid2unichr or {}
return return
def get_unicode(self, cid): def get_unichr(self, cid):
if self.debug: if self.debug:
print >>sys.stderr, 'get_unicode: %r, %r' % (self, cid) print >>sys.stderr, 'get_unichr: %r, %r' % (self, cid)
return self.cid2unicode.get(cid) return self.cid2unichr[cid]
## FileCMap ## FileCMap
@ -151,16 +151,16 @@ class FileUnicodeMap(UnicodeMap):
self.attrs[k] = v self.attrs[k] = v
return return
def add_cid2unicode(self, cid, code): def add_cid2unichr(self, cid, code):
assert isinstance(cid, int) assert isinstance(cid, int)
if isinstance(code, PSLiteral): if isinstance(code, PSLiteral):
# Interpret as an Adobe glyph name. # Interpret as an Adobe glyph name.
self.cid2unicode[cid] = name2unicode(code.name) self.cid2unichr[cid] = unichr(name2unicode(code.name))
elif isinstance(code, str): elif isinstance(code, str):
# Interpret as UTF-16BE. # Interpret as UTF-16BE.
self.cid2unicode[cid] = unpack('>H', code)[0] self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore')
elif isinstance(code, int): elif isinstance(code, int):
self.cid2unicode[cid] = code self.cid2unichr[cid] = unichr(code)
else: else:
raise TypeError(code) raise TypeError(code)
return return
@ -189,10 +189,10 @@ class PyUnicodeMap(UnicodeMap):
def __init__(self, name, module, vertical): def __init__(self, name, module, vertical):
if vertical: if vertical:
cid2unicode = module.CID2UNICODE_V cid2unichr = module.CID2UNICHR_V
else: else:
cid2unicode = module.CID2UNICODE_H cid2unichr = module.CID2UNICHR_H
UnicodeMap.__init__(self, cid2unicode) UnicodeMap.__init__(self, cid2unichr)
self.name = name self.name = name
return return
@ -333,7 +333,7 @@ class CMapParser(PSStackParser):
#assert s1 <= e1 #assert s1 <= e1
if isinstance(code, list): if isinstance(code, list):
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
self.cmap.add_cid2unicode(s1+i, code[i]) self.cmap.add_cid2unichr(s1+i, code[i])
else: else:
var = code[-4:] var = code[-4:]
base = nunpack(var) base = nunpack(var)
@ -341,7 +341,7 @@ class CMapParser(PSStackParser):
vlen = len(var) vlen = len(var)
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
x = prefix+pack('>L',base+i)[-vlen:] x = prefix+pack('>L',base+i)[-vlen:]
self.cmap.add_cid2unicode(s1+i, x) self.cmap.add_cid2unichr(s1+i, x)
return return
if name == 'beginbfchar': if name == 'beginbfchar':
@ -351,7 +351,7 @@ class CMapParser(PSStackParser):
objs = [ obj for (_,obj) in self.popall() ] objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs): for (cid,code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str): if isinstance(cid, str) and isinstance(code, str):
self.cmap.add_cid2unicode(nunpack(cid), code) self.cmap.add_cid2unichr(nunpack(cid), code)
return return
if name == 'beginnotdefrange': if name == 'beginnotdefrange':

View File

@ -405,9 +405,10 @@ class PDFSimpleFont(PDFFont):
def to_unichr(self, cid): def to_unichr(self, cid):
if self.unicode_map: if self.unicode_map:
code = self.unicode_map.get_unicode(cid) try:
if code is not None: return self.unicode_map.get_unichr(cid)
return unichr(code) except KeyError:
pass
try: try:
return self.encoding[cid] return self.encoding[cid]
except KeyError: except KeyError:
@ -571,11 +572,10 @@ class PDFCIDFont(PDFFont):
return self.disps.get(cid, self.default_disp) return self.disps.get(cid, self.default_disp)
def to_unichr(self, cid): def to_unichr(self, cid):
if not self.unicode_map: try:
raise PDFUnicodeNotDefined(self.cidcoding, cid) if not self.unicode_map: raise KeyError(cid)
code = self.unicode_map.get_unicode(cid) return self.unicode_map.get_unichr(cid)
if code is not None: except KeyError:
return unichr(code)
raise PDFUnicodeNotDefined(self.cidcoding, cid) raise PDFUnicodeNotDefined(self.cidcoding, cid)

View File

@ -766,7 +766,9 @@ class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''): def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
doc = PDFDocument() doc = PDFDocument()
parser = PDFParser(doc, fp) parser = PDFParser(fp)
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize(password) doc.initialize(password)
if not doc.is_extractable: if not doc.is_extractable:
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)

View File

@ -1,9 +1,12 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
import re import re
import md5
import struct import struct
from sys import stderr from sys import stderr
try:
import hashlib as md5
except ImportError:
import md5
try: try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
@ -19,7 +22,7 @@ from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value from pdftypes import str_value, list_value, dict_value, stream_value
from arcfour import Arcfour from arcfour import Arcfour
from utils import choplist, nunpack from utils import choplist, nunpack
from utils import decode_text from utils import decode_text, ObjIdRange
## Exceptions ## Exceptions
@ -39,57 +42,29 @@ LITERAL_CATALOG = LIT('Catalog')
## XRefs ## XRefs
## ##
class XRefObjRange(object):
def __init__(self, start, nobjs):
self.start = start
self.nobjs = nobjs
return
def __repr__(self):
return '<XRefObjRange: %d-%d>' % (self.get_start_id(), self.get_end_id())
def get_start_id(self):
return self.start
def get_end_id(self):
return self.start + self.nobjs - 1
def get_nobjs(self):
return self.nobjs
class PDFBaseXRef(object): class PDFBaseXRef(object):
def __init__(self):
self.objid_ranges = None
return
def objids(self): def get_trailer(self):
if self.objid_ranges: raise NotImplementedError
for objid_range in self.objid_ranges:
for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1): def get_pos(self, objid):
yield objid raise KeyError(objid)
return
## PDFXRef ## PDFXRef
## ##
class PDFXRef(PDFBaseXRef): class PDFXRef(PDFBaseXRef):
def __init__(self): def __init__(self):
PDFBaseXRef.__init__(self) self.offsets = {}
self.offsets = None
self.trailer = {} self.trailer = {}
return return
def __repr__(self):
return '<PDFXRef: objs=%d>' % len(self.offsets)
def load(self, parser, debug=0): def load(self, parser, debug=0):
self.offsets = {}
self.objid_ranges = []
while 1: while 1:
try: try:
(pos, line) = parser.nextline() (pos, line) = parser.nextline()
if not line.strip(): if not line.strip(): continue
continue
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?') raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line: if not line:
@ -104,8 +79,6 @@ class PDFXRef(PDFBaseXRef):
(start, nobjs) = map(long, f) (start, nobjs) = map(long, f)
except ValueError: except ValueError:
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line)) raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
self.newoffsets = {}
self.objid_ranges.append(XRefObjRange(start, nobjs))
for objid in xrange(start, start+nobjs): for objid in xrange(start, start+nobjs):
try: try:
(_, line) = parser.nextline() (_, line) = parser.nextline()
@ -136,7 +109,30 @@ class PDFXRef(PDFBaseXRef):
self.trailer.update(dict_value(dic)) self.trailer.update(dict_value(dic))
return return
def getpos(self, objid): PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
def load_fallback(self, parser, debug=0):
parser.seek(0)
while 1:
try:
(pos, line) = parser.nextline()
except PSEOF:
break
if line.startswith('trailer'):
parser.seek(pos)
self.load_trailer(parser)
if 1 <= debug:
print >>stderr, 'trailer: %r' % self.get_trailer()
break
m = self.PDFOBJ_CUE.match(line)
if not m: continue
(objid, genno) = m.groups()
self.offsets[int(objid)] = (0, pos)
return
def get_trailer(self):
return self.trailer
def get_pos(self, objid):
try: try:
(genno, pos) = self.offsets[objid] (genno, pos) = self.offsets[objid]
except KeyError: except KeyError:
@ -149,10 +145,10 @@ class PDFXRef(PDFBaseXRef):
class PDFXRefStream(PDFBaseXRef): class PDFXRefStream(PDFBaseXRef):
def __init__(self): def __init__(self):
PDFBaseXRef.__init__(self)
self.data = None self.data = None
self.entlen = None self.entlen = None
self.fl1 = self.fl2 = self.fl3 = None self.fl1 = self.fl2 = self.fl3 = None
self.objid_ranges = []
return return
def __repr__(self): def __repr__(self):
@ -169,17 +165,22 @@ class PDFXRefStream(PDFBaseXRef):
index_array = stream.dic.get('Index', (0,size)) index_array = stream.dic.get('Index', (0,size))
if len(index_array) % 2 != 0: if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number') raise PDFSyntaxError('Invalid index number')
self.objid_ranges = [ XRefObjRange(start,nobjs) for (start,nobjs) in choplist(2, index_array) ] self.objid_ranges.extend( ObjIdRange(start, nobjs)
for (start,nobjs) in choplist(2, index_array) )
(self.fl1, self.fl2, self.fl3) = stream.dic['W'] (self.fl1, self.fl2, self.fl3) = stream.dic['W']
self.data = stream.get_data() self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3 self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.dic self.trailer = stream.dic
if debug: if debug:
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.objid_ranges), self.fl1, self.fl2, self.fl3))) (', '.join(map(repr, self.objid_ranges),
self.fl1, self.fl2, self.fl3)))
return return
def getpos(self, objid): def get_trailer(self):
return self.trailer
def get_pos(self, objid):
offset = 0 offset = 0
found = False found = False
for objid_range in self.objid_ranges: for objid_range in self.objid_ranges:
@ -207,14 +208,35 @@ class PDFXRefStream(PDFBaseXRef):
## PDFPage ## PDFPage
## ##
## A PDFPage object is nothing more than a bunch of keys and values
## that describe the properties of the page and point to its contents,
## and has nothing to do with a real graphical entity. For a real graphical
## object, look at layout.LTPage.
##
class PDFPage(object): class PDFPage(object):
"""An object that holds the information about a page.
A PDFPage object is merely a convenience class that has a set
of keys and values, which describe the properties of a page
and point to its contents.
Attributes:
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
contents: a list of PDFStream objects that represents the page content.
lastmod: the last modified time of the page.
resources: a list of resources used by the page.
mediabox: the physical size of the page.
cropbox: the crop rectangle of the page.
rotate: the page rotation (in degree).
annots: the page annotations.
beads: a chain that represents natural reading order.
"""
def __init__(self, doc, pageid, attrs): def __init__(self, doc, pageid, attrs):
"""Initialize a page object.
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
"""
self.doc = doc self.doc = doc
self.pageid = pageid self.pageid = pageid
self.attrs = dict_value(attrs) self.attrs = dict_value(attrs)
@ -243,13 +265,23 @@ class PDFPage(object):
## PDFDocument ## PDFDocument
## ##
## A PDFDocument object represents a PDF document.
## Since a PDF file is usually pretty big, normally it is not loaded
## at once. Rather it is parsed dynamically as processing goes.
## A PDF parser is associated with the document.
##
class PDFDocument(object): class PDFDocument(object):
"""PDFDocument object represents a PDF document.
Since a PDF file can be very big, normally it is not loaded at
once. Each PDF document has a PDF parser object associated,
and the data stream is parsed dynamically as processing goes.
Typical usage:
doc = PDFDocument()
parser = PDFParser(fp)
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize(password)
"""
debug = 0 debug = 0
def __init__(self): def __init__(self):
@ -261,24 +293,23 @@ class PDFDocument(object):
self.parser = None self.parser = None
self.encryption = None self.encryption = None
self.decipher = None self.decipher = None
self.ready = False self._initialized = False
return return
# set_parser(parser)
# Associates the document with an (already initialized) parser object.
def set_parser(self, parser): def set_parser(self, parser):
"Set the document to use a given PDFParser object."
if self.parser: return if self.parser: return
self.parser = parser self.parser = parser
# The document is set to be temporarily ready during collecting # The document is set to be temporarily ready during collecting
# all the basic information about the document, e.g. # all the basic information about the document, e.g.
# the header, the encryption information, and the access rights # the header, the encryption information, and the access rights
# for the document. # for the document.
self.ready = True self._initialized = True
# Retrieve the information of each header that was appended # Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document. # (maybe multiple times) at the end of the document.
self.xrefs = parser.read_xref() self.xrefs = parser.read_xref()
for xref in self.xrefs: for xref in self.xrefs:
trailer = xref.trailer trailer = xref.get_trailer()
if not trailer: continue if not trailer: continue
# If there's an encryption info, remember it. # If there's an encryption info, remember it.
if 'Encrypt' in trailer: if 'Encrypt' in trailer:
@ -293,7 +324,7 @@ class PDFDocument(object):
# The document is set to be non-ready again, until all the # The document is set to be non-ready again, until all the
# proper initialization (asking the password key and # proper initialization (asking the password key and
# verifying the access permission, so on) is finished. # verifying the access permission, so on) is finished.
self.ready = False self._initialized = False
return return
# set_root(root) # set_root(root)
@ -315,7 +346,7 @@ class PDFDocument(object):
def initialize(self, password=''): def initialize(self, password=''):
if not self.encryption: if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True self.is_printable = self.is_modifiable = self.is_extractable = True
self.ready = True self._initialized = True
return return
(docid, param) = self.encryption (docid, param) = self.encryption
if literal_name(param['Filter']) != 'Standard': if literal_name(param['Filter']) != 'Standard':
@ -367,7 +398,7 @@ class PDFDocument(object):
raise PDFPasswordIncorrect raise PDFPasswordIncorrect
self.decrypt_key = key self.decrypt_key = key
self.decipher = self.decrypt_rc4 # XXX may be AES self.decipher = self.decrypt_rc4 # XXX may be AES
self.ready = True self._initialized = True
return return
def decrypt_rc4(self, objid, genno, data): def decrypt_rc4(self, objid, genno, data):
@ -378,7 +409,7 @@ class PDFDocument(object):
KEYWORD_OBJ = KWD('obj') KEYWORD_OBJ = KWD('obj')
def getobj(self, objid): def getobj(self, objid):
if not self.ready: if not self._initialized:
raise PDFException('PDFDocument not initialized') raise PDFException('PDFDocument not initialized')
#assert self.xrefs #assert self.xrefs
if 2 <= self.debug: if 2 <= self.debug:
@ -389,7 +420,7 @@ class PDFDocument(object):
else: else:
for xref in self.xrefs: for xref in self.xrefs:
try: try:
(strmid, index) = xref.getpos(objid) (strmid, index) = xref.get_pos(objid)
break break
except KeyError: except KeyError:
pass pass
@ -411,7 +442,7 @@ class PDFDocument(object):
if strmid in self.parsed_objs: if strmid in self.parsed_objs:
objs = self.parsed_objs[strmid] objs = self.parsed_objs[strmid]
else: else:
parser = PDFObjStrmParser(self, stream.get_data()) parser = PDFObjStrmParser(stream.get_data())
objs = [] objs = []
try: try:
while 1: while 1:
@ -458,7 +489,7 @@ class PDFDocument(object):
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
def get_pages(self): def get_pages(self):
if not self.ready: if not self._initialized:
raise PDFException('PDFDocument is not initialized') raise PDFException('PDFDocument is not initialized')
#assert self.xrefs #assert self.xrefs
def search(obj, parent): def search(obj, parent):
@ -529,14 +560,15 @@ class PDFDocument(object):
## ##
class PDFParser(PSStackParser): class PDFParser(PSStackParser):
def __init__(self, doc, fp): def __init__(self, fp):
PSStackParser.__init__(self, fp) PSStackParser.__init__(self, fp)
self.doc = doc self.doc = None
self.doc.set_parser(self)
return return
def __repr__(self): def set_document(self, doc):
return '<PDFParser>' "Associates the parser with a PDFDocument object."
self.doc = doc
return
KEYWORD_R = KWD('R') KEYWORD_R = KWD('R')
KEYWORD_ENDOBJ = KWD('endobj') KEYWORD_ENDOBJ = KWD('endobj')
@ -647,7 +679,7 @@ class PDFParser(PSStackParser):
xref = PDFXRef() xref = PDFXRef()
xref.load(self, debug=self.debug) xref.load(self, debug=self.debug)
xrefs.append(xref) xrefs.append(xref)
trailer = xref.trailer trailer = xref.get_trailer()
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'trailer: %r' % trailer print >>stderr, 'trailer: %r' % trailer
if 'XRefStm' in trailer: if 'XRefStm' in trailer:
@ -669,26 +701,8 @@ class PDFParser(PSStackParser):
# fallback # fallback
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'no xref, fallback' print >>stderr, 'no xref, fallback'
self.seek(0)
pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
offsets = {}
xref = PDFXRef() xref = PDFXRef()
while 1: xref.load_fallback(self)
try:
(pos, line) = self.nextline()
except PSEOF:
break
if line.startswith('trailer'):
xref.offsets = offsets
self.seek(pos)
xref.load_trailer(self)
if 1 <= self.debug:
print >>stderr, 'trailer: %r' % xref.trailer
continue
m = pat.match(line)
if not m: continue
(objid, genno) = m.groups()
offsets[int(objid)] = (0, pos)
xrefs.append(xref) xrefs.append(xref)
return xrefs return xrefs
@ -697,8 +711,8 @@ class PDFParser(PSStackParser):
## ##
class PDFObjStrmParser(PSStackParser): class PDFObjStrmParser(PSStackParser):
def __init__(self, doc, data): def __init__(self, data):
PDFParser.__init__(self, doc, StringIO(data)) PSStackParser.__init__(self, StringIO(data))
return return
def flush(self): def flush(self):

View File

@ -135,3 +135,27 @@ def enc(x, codec='ascii'):
'''Encodes a string for SGML/XML/HTML''' '''Encodes a string for SGML/XML/HTML'''
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
return x.encode(codec, 'xmlcharrefreplace') return x.encode(codec, 'xmlcharrefreplace')
## ObjIdRange
##
class ObjIdRange(object):
"A utility class to represent a range of object IDs."
def __init__(self, start, nobjs):
self.start = start
self.nobjs = nobjs
return
def __repr__(self):
return '<ObjIdRange: %d-%d>' % (self.get_start_id(), self.get_end_id())
def get_start_id(self):
return self.start
def get_end_id(self):
return self.start + self.nobjs - 1
def get_nobjs(self):
return self.nobjs

View File

@ -12,7 +12,7 @@ def process_cid2code(fp, check_codecs=[]):
else: else:
return (name+'-H', name+'-V') return (name+'-H', name+'-V')
def get_unicode(codes): def get_unichr(codes):
# determine the "most popular" candidate. # determine the "most popular" candidate.
d = {} d = {}
for code in codes: for code in codes:
@ -26,7 +26,7 @@ def process_cid2code(fp, check_codecs=[]):
except UnicodeError: except UnicodeError:
pass pass
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True) chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
return ord(chars[0]) return chars[0]
def put(dmap, code, cid, force=False): def put(dmap, code, cid, force=False):
for b in code[:-1]: for b in code[:-1]:
@ -45,8 +45,8 @@ def process_cid2code(fp, check_codecs=[]):
names = [] names = []
code2cid = {} # {'cmapname': ...} code2cid = {} # {'cmapname': ...}
is_vertical = {} is_vertical = {}
cid2unicode_h = {} # {cid: unicode} cid2unichr_h = {} # {cid: unichr}
cid2unicode_v = {} # {cid: unicode} cid2unichr_v = {} # {cid: unichr}
for line in fp: for line in fp:
line = line.strip() line = line.strip()
@ -95,21 +95,21 @@ def process_cid2code(fp, check_codecs=[]):
put(hmap, code, cid, True) put(hmap, code, cid, True)
if name.endswith('-UTF8'): if name.endswith('-UTF8'):
if hcodes: if hcodes:
cid2unicode_h[cid] = get_unicode(hcodes) cid2unichr_h[cid] = get_unichr(hcodes)
if vcodes: if vcodes:
cid2unicode_v[cid] = get_unicode(vcodes) cid2unichr_v[cid] = get_unichr(vcodes)
else: else:
for code in hcodes: for code in hcodes:
put(hmap, code, cid) put(hmap, code, cid)
put(vmap, code, cid) put(vmap, code, cid)
if name.endswith('-UTF8') and hcodes: if name.endswith('-UTF8') and hcodes:
code = get_unicode(hcodes) code = get_unichr(hcodes)
if cid not in cid2unicode_h: if cid not in cid2unichr_h:
cid2unicode_h[cid] = code cid2unichr_h[cid] = code
if cid not in cid2unicode_v: if cid not in cid2unichr_v:
cid2unicode_v[cid] = code cid2unichr_v[cid] = code
return (code2cid, is_vertical, cid2unicode_h, cid2unicode_v) return (code2cid, is_vertical, cid2unichr_h, cid2unichr_v)
# main # main
def main(argv): def main(argv):
@ -128,7 +128,7 @@ def main(argv):
print >>sys.stderr, 'reading %r...' % src print >>sys.stderr, 'reading %r...' % src
fp = file(src) fp = file(src)
(code2cid, is_vertical, cid2unicode_h, cid2unicode_v) = process_cid2code(fp, check_codecs) (code2cid, is_vertical, cid2unichr_h, cid2unichr_v) = process_cid2code(fp, check_codecs)
fp.close() fp.close()
for (name, cmap) in code2cid.iteritems(): for (name, cmap) in code2cid.iteritems():
@ -146,8 +146,8 @@ def main(argv):
fp = file(os.path.join(outdir, fname), 'w') fp = file(os.path.join(outdir, fname), 'w')
print >>fp, '#!/usr/bin/env python' print >>fp, '#!/usr/bin/env python'
print >>fp, '#', fname print >>fp, '#', fname
print >>fp, 'CID2UNICODE_H = %r' % cid2unicode_h print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h
print >>fp, 'CID2UNICODE_V = %r' % cid2unicode_v print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v
fp.close() fp.close()
return 0 return 0

View File

@ -99,7 +99,9 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None):
doc = PDFDocument() doc = PDFDocument()
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(doc, fp) parser = PDFParser(fp)
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize(password) doc.initialize(password)
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) ) pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
for (level,title,dest,a,se) in doc.get_outlines(): for (level,title,dest,a,se) in doc.get_outlines():
@ -119,7 +121,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None):
doc = PDFDocument() doc = PDFDocument()
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(doc, fp) parser = PDFParser(fp)
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize(password) doc.initialize(password)
if objids: if objids:
for objid in objids: for objid in objids: