Merge pull request #76 from speedplane/master
Fix Unicode Bug + Add GitIgnore + Add Debug Flags (pull/55/head)
commit
4b585221e2
|
@ -0,0 +1,54 @@
|
||||||
|
# Intermediate documents
|
||||||
|
*.xps
|
||||||
|
|
||||||
|
# Password and Key Files
|
||||||
|
*.pem
|
||||||
|
*.p12
|
||||||
|
|
||||||
|
# Compiled source #
|
||||||
|
###################
|
||||||
|
*.pyc
|
||||||
|
*.com
|
||||||
|
*.class
|
||||||
|
*.dll
|
||||||
|
*.exe
|
||||||
|
*.o
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Mercurial Files?
|
||||||
|
*.i
|
||||||
|
*.d
|
||||||
|
*.mo
|
||||||
|
*.hg/
|
||||||
|
|
||||||
|
# Python data files #
|
||||||
|
*.shelf
|
||||||
|
*.shelve
|
||||||
|
|
||||||
|
# Don't track these files, they are output from scripts
|
||||||
|
|
||||||
|
# Subversion files
|
||||||
|
*.svn-base
|
||||||
|
all-wcprops
|
||||||
|
entries
|
||||||
|
|
||||||
|
# Logs and databases #
|
||||||
|
######################
|
||||||
|
bulkloader-log-*
|
||||||
|
*.log
|
||||||
|
*.sql
|
||||||
|
*.sql3
|
||||||
|
*.sql3-journal
|
||||||
|
*.sqlite
|
||||||
|
|
||||||
|
# OS generated files #
|
||||||
|
######################
|
||||||
|
.DS_Store?
|
||||||
|
ehthumbs.db
|
||||||
|
Icon?
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# tmp files #
|
||||||
|
#############
|
||||||
|
~$*.doc
|
||||||
|
~WRL*.tmp
|
|
@ -65,6 +65,8 @@ LITERAL_CATALOG = LIT('Catalog')
|
||||||
##
|
##
|
||||||
class PDFBaseXRef(object):
|
class PDFBaseXRef(object):
|
||||||
|
|
||||||
|
debug = False
|
||||||
|
|
||||||
def get_trailer(self):
|
def get_trailer(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -122,7 +124,7 @@ class PDFXRef(PDFBaseXRef):
|
||||||
if use != b'n':
|
if use != b'n':
|
||||||
continue
|
continue
|
||||||
self.offsets[objid] = (None, long(pos), int(genno))
|
self.offsets[objid] = (None, long(pos), int(genno))
|
||||||
logging.info('xref objects: %r' % self.offsets)
|
if self.debug: logging.info('xref objects: %r' % self.offsets)
|
||||||
self.load_trailer(parser)
|
self.load_trailer(parser)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -173,7 +175,7 @@ class PDFXRefFallback(PDFXRef):
|
||||||
if line.startswith(b'trailer'):
|
if line.startswith(b'trailer'):
|
||||||
parser.seek(pos)
|
parser.seek(pos)
|
||||||
self.load_trailer(parser)
|
self.load_trailer(parser)
|
||||||
logging.info('trailer: %r' % self.get_trailer())
|
if self.debug: logging.info('trailer: %r' % self.get_trailer())
|
||||||
break
|
break
|
||||||
m = self.PDFOBJ_CUE.match(line)
|
m = self.PDFOBJ_CUE.match(line)
|
||||||
if not m:
|
if not m:
|
||||||
|
@ -212,6 +214,8 @@ class PDFXRefFallback(PDFXRef):
|
||||||
##
|
##
|
||||||
class PDFXRefStream(PDFBaseXRef):
|
class PDFXRefStream(PDFBaseXRef):
|
||||||
|
|
||||||
|
debug = False
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.data = None
|
self.data = None
|
||||||
self.entlen = None
|
self.entlen = None
|
||||||
|
@ -238,6 +242,7 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
self.data = stream.get_data()
|
self.data = stream.get_data()
|
||||||
self.entlen = self.fl1+self.fl2+self.fl3
|
self.entlen = self.fl1+self.fl2+self.fl3
|
||||||
self.trailer = stream.attrs
|
self.trailer = stream.attrs
|
||||||
|
if self.debug:
|
||||||
logging.info('xref stream: objid=%s, fields=%d,%d,%d' %
|
logging.info('xref stream: objid=%s, fields=%d,%d,%d' %
|
||||||
(', '.join(map(repr, self.ranges)),
|
(', '.join(map(repr, self.ranges)),
|
||||||
self.fl1, self.fl2, self.fl3))
|
self.fl1, self.fl2, self.fl3))
|
||||||
|
@ -761,6 +766,7 @@ class PDFDocument(object):
|
||||||
prev = line
|
prev = line
|
||||||
else:
|
else:
|
||||||
raise PDFNoValidXRef('Unexpected EOF')
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
|
if self.debug:
|
||||||
logging.info('xref found: pos=%r' % prev)
|
logging.info('xref found: pos=%r' % prev)
|
||||||
return long(prev)
|
return long(prev)
|
||||||
|
|
||||||
|
@ -773,6 +779,7 @@ class PDFDocument(object):
|
||||||
(pos, token) = parser.nexttoken()
|
(pos, token) = parser.nexttoken()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
raise PDFNoValidXRef('Unexpected EOF')
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
|
if self.debug:
|
||||||
logging.info('read_xref_from: start=%d, token=%r' % (start, token))
|
logging.info('read_xref_from: start=%d, token=%r' % (start, token))
|
||||||
if isinstance(token, int):
|
if isinstance(token, int):
|
||||||
# XRefStream: PDF-1.5
|
# XRefStream: PDF-1.5
|
||||||
|
@ -787,6 +794,7 @@ class PDFDocument(object):
|
||||||
xref.load(parser)
|
xref.load(parser)
|
||||||
xrefs.append(xref)
|
xrefs.append(xref)
|
||||||
trailer = xref.get_trailer()
|
trailer = xref.get_trailer()
|
||||||
|
if self.debug:
|
||||||
logging.info('trailer: %r' % trailer)
|
logging.info('trailer: %r' % trailer)
|
||||||
if 'XRefStm' in trailer:
|
if 'XRefStm' in trailer:
|
||||||
pos = int_value(trailer['XRefStm'])
|
pos = int_value(trailer['XRefStm'])
|
||||||
|
|
|
@ -139,6 +139,8 @@ class PDFResourceManager(object):
|
||||||
allocated multiple times.
|
allocated multiple times.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
debug = False
|
||||||
|
|
||||||
def __init__(self, caching=True):
|
def __init__(self, caching=True):
|
||||||
self.caching = caching
|
self.caching = caching
|
||||||
self._cached_fonts = {}
|
self._cached_fonts = {}
|
||||||
|
@ -167,6 +169,7 @@ class PDFResourceManager(object):
|
||||||
if objid and objid in self._cached_fonts:
|
if objid and objid in self._cached_fonts:
|
||||||
font = self._cached_fonts[objid]
|
font = self._cached_fonts[objid]
|
||||||
else:
|
else:
|
||||||
|
if self.debug:
|
||||||
logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec))
|
logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec))
|
||||||
if STRICT:
|
if STRICT:
|
||||||
if spec['Type'] is not LITERAL_FONT:
|
if spec['Type'] is not LITERAL_FONT:
|
||||||
|
@ -799,7 +802,7 @@ class PDFPageInterpreter(object):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
||||||
return
|
return
|
||||||
logging.info('Processing xobj: %r' % xobj)
|
if self.debug: logging.info('Processing xobj: %r' % xobj)
|
||||||
subtype = xobj.get('Subtype')
|
subtype = xobj.get('Subtype')
|
||||||
if subtype is LITERAL_FORM and 'BBox' in xobj:
|
if subtype is LITERAL_FORM and 'BBox' in xobj:
|
||||||
interpreter = self.dup()
|
interpreter = self.dup()
|
||||||
|
@ -822,7 +825,7 @@ class PDFPageInterpreter(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def process_page(self, page):
|
def process_page(self, page):
|
||||||
logging.info('Processing page: %r' % page)
|
if self.debug: logging.info('Processing page: %r' % page)
|
||||||
(x0, y0, x1, y1) = page.mediabox
|
(x0, y0, x1, y1) = page.mediabox
|
||||||
if page.rotate == 90:
|
if page.rotate == 90:
|
||||||
ctm = (0, -1, 1, 0, -y0, x1)
|
ctm = (0, -1, 1, 0, -y0, x1)
|
||||||
|
@ -841,6 +844,7 @@ class PDFPageInterpreter(object):
|
||||||
# Render the content streams.
|
# Render the content streams.
|
||||||
# This method may be called recursively.
|
# This method may be called recursively.
|
||||||
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
||||||
|
if self.debug:
|
||||||
logging.info('render_contents: resources=%r, streams=%r, ctm=%r' %
|
logging.info('render_contents: resources=%r, streams=%r, ctm=%r' %
|
||||||
(resources, streams, ctm))
|
(resources, streams, ctm))
|
||||||
self.init_resources(resources)
|
self.init_resources(resources)
|
||||||
|
|
|
@ -39,6 +39,8 @@ class PDFPage(object):
|
||||||
beads: a chain that represents natural reading order.
|
beads: a chain that represents natural reading order.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
debug = False
|
||||||
|
|
||||||
def __init__(self, doc, pageid, attrs):
|
def __init__(self, doc, pageid, attrs):
|
||||||
"""Initialize a page object.
|
"""Initialize a page object.
|
||||||
|
|
||||||
|
@ -86,12 +88,12 @@ class PDFPage(object):
|
||||||
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
||||||
logging.info('Pages: Kids=%r' % tree['Kids'])
|
if klass.debug: logging.info('Pages: Kids=%r' % tree['Kids'])
|
||||||
for c in list_value(tree['Kids']):
|
for c in list_value(tree['Kids']):
|
||||||
for x in search(c, tree):
|
for x in search(c, tree):
|
||||||
yield x
|
yield x
|
||||||
elif tree.get('Type') is LITERAL_PAGE:
|
elif tree.get('Type') is LITERAL_PAGE:
|
||||||
logging.info('Page: %r' % tree)
|
if klass.debug: logging.info('Page: %r' % tree)
|
||||||
yield (objid, tree)
|
yield (objid, tree)
|
||||||
pages = False
|
pages = False
|
||||||
if 'Pages' in document.catalog:
|
if 'Pages' in document.catalog:
|
||||||
|
|
|
@ -343,7 +343,15 @@ class PSBaseParser(object):
|
||||||
self.hex = b''
|
self.hex = b''
|
||||||
self._parse1 = self._parse_literal_hex
|
self._parse1 = self._parse_literal_hex
|
||||||
return j+1
|
return j+1
|
||||||
self._add_token(LIT(unicode(self._curtoken)))
|
|
||||||
|
try:
|
||||||
|
# Try to interpret the token as a utf-8 string
|
||||||
|
utoken = self._curtoken.decode('utf-8')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# We failed, there is possibly a corrupt PDF here.
|
||||||
|
if STRICT: raise
|
||||||
|
utoken = ""
|
||||||
|
self._add_token(LIT(utoken))
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j
|
return j
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue