Merge pull request #76 from speedplane/master

Fix Unicode Bug + Add GitIgnore + Add Debug Flags
pull/55/head
Yusuke Shinyama 2014-12-09 22:22:33 +09:00
commit 4b585221e2
5 changed files with 89 additions and 13 deletions

54
.gitignore vendored Normal file
View File

@ -0,0 +1,54 @@
# Intermediate documents
*.xps
# Password and Key Files
*.pem
*.p12
# Compiled source #
###################
*.pyc
*.com
*.class
*.dll
*.exe
*.o
*.so
# Mercurial Files?
*.i
*.d
*.mo
*.hg/
# Python data files #
*.shelf
*.shelve
# Don't track these files, they are output from scripts
# Subversion files
*.svn-base
all-wcprops
entries
# Logs and databases #
######################
bulkloader-log-*
*.log
*.sql
*.sql3
*.sql3-journal
*.sqlite
# OS generated files #
######################
.DS_Store?
ehthumbs.db
Icon?
Thumbs.db
# tmp files #
#############
~$*.doc
~WRL*.tmp

View File

@ -65,6 +65,8 @@ LITERAL_CATALOG = LIT('Catalog')
## ##
class PDFBaseXRef(object): class PDFBaseXRef(object):
debug = False
def get_trailer(self): def get_trailer(self):
raise NotImplementedError raise NotImplementedError
@ -122,7 +124,7 @@ class PDFXRef(PDFBaseXRef):
if use != b'n': if use != b'n':
continue continue
self.offsets[objid] = (None, long(pos), int(genno)) self.offsets[objid] = (None, long(pos), int(genno))
logging.info('xref objects: %r' % self.offsets) if self.debug: logging.info('xref objects: %r' % self.offsets)
self.load_trailer(parser) self.load_trailer(parser)
return return
@ -173,7 +175,7 @@ class PDFXRefFallback(PDFXRef):
if line.startswith(b'trailer'): if line.startswith(b'trailer'):
parser.seek(pos) parser.seek(pos)
self.load_trailer(parser) self.load_trailer(parser)
logging.info('trailer: %r' % self.get_trailer()) if self.debug: logging.info('trailer: %r' % self.get_trailer())
break break
m = self.PDFOBJ_CUE.match(line) m = self.PDFOBJ_CUE.match(line)
if not m: if not m:
@ -212,6 +214,8 @@ class PDFXRefFallback(PDFXRef):
## ##
class PDFXRefStream(PDFBaseXRef): class PDFXRefStream(PDFBaseXRef):
debug = False
def __init__(self): def __init__(self):
self.data = None self.data = None
self.entlen = None self.entlen = None
@ -238,6 +242,7 @@ class PDFXRefStream(PDFBaseXRef):
self.data = stream.get_data() self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3 self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.attrs self.trailer = stream.attrs
if self.debug:
logging.info('xref stream: objid=%s, fields=%d,%d,%d' % logging.info('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.ranges)), (', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3)) self.fl1, self.fl2, self.fl3))
@ -761,6 +766,7 @@ class PDFDocument(object):
prev = line prev = line
else: else:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef('Unexpected EOF')
if self.debug:
logging.info('xref found: pos=%r' % prev) logging.info('xref found: pos=%r' % prev)
return long(prev) return long(prev)
@ -773,6 +779,7 @@ class PDFDocument(object):
(pos, token) = parser.nexttoken() (pos, token) = parser.nexttoken()
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef('Unexpected EOF')
if self.debug:
logging.info('read_xref_from: start=%d, token=%r' % (start, token)) logging.info('read_xref_from: start=%d, token=%r' % (start, token))
if isinstance(token, int): if isinstance(token, int):
# XRefStream: PDF-1.5 # XRefStream: PDF-1.5
@ -787,6 +794,7 @@ class PDFDocument(object):
xref.load(parser) xref.load(parser)
xrefs.append(xref) xrefs.append(xref)
trailer = xref.get_trailer() trailer = xref.get_trailer()
if self.debug:
logging.info('trailer: %r' % trailer) logging.info('trailer: %r' % trailer)
if 'XRefStm' in trailer: if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm']) pos = int_value(trailer['XRefStm'])

View File

@ -139,6 +139,8 @@ class PDFResourceManager(object):
allocated multiple times. allocated multiple times.
""" """
debug = False
def __init__(self, caching=True): def __init__(self, caching=True):
self.caching = caching self.caching = caching
self._cached_fonts = {} self._cached_fonts = {}
@ -167,6 +169,7 @@ class PDFResourceManager(object):
if objid and objid in self._cached_fonts: if objid and objid in self._cached_fonts:
font = self._cached_fonts[objid] font = self._cached_fonts[objid]
else: else:
if self.debug:
logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec))
if STRICT: if STRICT:
if spec['Type'] is not LITERAL_FONT: if spec['Type'] is not LITERAL_FONT:
@ -799,7 +802,7 @@ class PDFPageInterpreter(object):
if STRICT: if STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
return return
logging.info('Processing xobj: %r' % xobj) if self.debug: logging.info('Processing xobj: %r' % xobj)
subtype = xobj.get('Subtype') subtype = xobj.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj: if subtype is LITERAL_FORM and 'BBox' in xobj:
interpreter = self.dup() interpreter = self.dup()
@ -822,7 +825,7 @@ class PDFPageInterpreter(object):
return return
def process_page(self, page): def process_page(self, page):
logging.info('Processing page: %r' % page) if self.debug: logging.info('Processing page: %r' % page)
(x0, y0, x1, y1) = page.mediabox (x0, y0, x1, y1) = page.mediabox
if page.rotate == 90: if page.rotate == 90:
ctm = (0, -1, 1, 0, -y0, x1) ctm = (0, -1, 1, 0, -y0, x1)
@ -841,6 +844,7 @@ class PDFPageInterpreter(object):
# Render the content streams. # Render the content streams.
# This method may be called recursively. # This method may be called recursively.
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
if self.debug:
logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % logging.info('render_contents: resources=%r, streams=%r, ctm=%r' %
(resources, streams, ctm)) (resources, streams, ctm))
self.init_resources(resources) self.init_resources(resources)

View File

@ -39,6 +39,8 @@ class PDFPage(object):
beads: a chain that represents natural reading order. beads: a chain that represents natural reading order.
""" """
debug = False
def __init__(self, doc, pageid, attrs): def __init__(self, doc, pageid, attrs):
"""Initialize a page object. """Initialize a page object.
@ -86,12 +88,12 @@ class PDFPage(object):
if k in klass.INHERITABLE_ATTRS and k not in tree: if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
logging.info('Pages: Kids=%r' % tree['Kids']) if klass.debug: logging.info('Pages: Kids=%r' % tree['Kids'])
for c in list_value(tree['Kids']): for c in list_value(tree['Kids']):
for x in search(c, tree): for x in search(c, tree):
yield x yield x
elif tree.get('Type') is LITERAL_PAGE: elif tree.get('Type') is LITERAL_PAGE:
logging.info('Page: %r' % tree) if klass.debug: logging.info('Page: %r' % tree)
yield (objid, tree) yield (objid, tree)
pages = False pages = False
if 'Pages' in document.catalog: if 'Pages' in document.catalog:

View File

@ -343,7 +343,15 @@ class PSBaseParser(object):
self.hex = b'' self.hex = b''
self._parse1 = self._parse_literal_hex self._parse1 = self._parse_literal_hex
return j+1 return j+1
self._add_token(LIT(unicode(self._curtoken)))
try:
# Try to interpret the token as a utf-8 string
utoken = self._curtoken.decode('utf-8')
except UnicodeDecodeError:
# We failed, there is possibly a corrupt PDF here.
if STRICT: raise
utoken = ""
self._add_token(LIT(utoken))
self._parse1 = self._parse_main self._parse1 = self._parse_main
return j return j