integrate TODO html.
reorder the code a bit.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@177 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
0f8fe3f19e
commit
dda60dcafc
7
TODO
7
TODO
|
@ -1,7 +0,0 @@
|
||||||
TODOs:
|
|
||||||
- PEP-8 conformance.
|
|
||||||
- Better text extraction / layout analysis.
|
|
||||||
- Better API Documentation.
|
|
||||||
- Robust error handling.
|
|
||||||
- Crypt stream filter support. (More sample documents are needed!)
|
|
||||||
- CCITTFax stream filter support.
|
|
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Jan 31 10:38:26 JST 2010
|
Last Modified: Sun Jan 31 11:11:26 JST 2010
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -333,6 +333,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>TODOs</h2>
|
<h2>TODOs</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> Automated testing.
|
||||||
<li> <A href="http://www.python.org/dev/peps/pep-0008/">PEP-8</a> and
|
<li> <A href="http://www.python.org/dev/peps/pep-0008/">PEP-8</a> and
|
||||||
<a href="http://www.python.org/dev/peps/pep-0257/">PEP-257</a> conformance.
|
<a href="http://www.python.org/dev/peps/pep-0257/">PEP-257</a> conformance.
|
||||||
<li> Better text extraction / layout analysis.
|
<li> Better text extraction / layout analysis.
|
||||||
|
|
|
@ -10,64 +10,6 @@ from utils import apply_matrix_pt, mult_matrix
|
||||||
from utils import enc, strbbox
|
from utils import enc, strbbox
|
||||||
|
|
||||||
|
|
||||||
## TagExtractor
|
|
||||||
##
|
|
||||||
class TagExtractor(PDFDevice):
|
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8'):
|
|
||||||
PDFDevice.__init__(self, rsrc)
|
|
||||||
self.outfp = outfp
|
|
||||||
self.codec = codec
|
|
||||||
self.pageno = 0
|
|
||||||
self.tag = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def render_string(self, textstate, seq):
|
|
||||||
font = textstate.font
|
|
||||||
text = ''
|
|
||||||
for obj in seq:
|
|
||||||
if not isinstance(obj, str): continue
|
|
||||||
chars = font.decode(obj)
|
|
||||||
for cid in chars:
|
|
||||||
try:
|
|
||||||
char = font.to_unichr(cid)
|
|
||||||
text += char
|
|
||||||
except PDFUnicodeNotDefined:
|
|
||||||
pass
|
|
||||||
self.outfp.write(enc(text, self.codec))
|
|
||||||
return
|
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
|
||||||
(self.pageno, strbbox(page.mediabox), page.rotate))
|
|
||||||
return
|
|
||||||
|
|
||||||
def end_page(self, page):
|
|
||||||
self.outfp.write('</page>\n')
|
|
||||||
self.pageno += 1
|
|
||||||
return
|
|
||||||
|
|
||||||
def begin_tag(self, tag, props=None):
|
|
||||||
s = ''
|
|
||||||
if props:
|
|
||||||
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
|
|
||||||
in sorted(props.iteritems()) )
|
|
||||||
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
|
||||||
self.tag = tag
|
|
||||||
return
|
|
||||||
|
|
||||||
def end_tag(self):
|
|
||||||
assert self.tag
|
|
||||||
self.outfp.write('</%s>' % enc(self.tag.name))
|
|
||||||
self.tag = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def do_tag(self, tag, props=None):
|
|
||||||
self.begin_tag(tag, props)
|
|
||||||
self.tag = None
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## PDFPageAggregator
|
## PDFPageAggregator
|
||||||
##
|
##
|
||||||
class PDFPageAggregator(PDFTextDevice):
|
class PDFPageAggregator(PDFTextDevice):
|
||||||
|
@ -171,82 +113,34 @@ class PDFConverter(PDFPageAggregator):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## XMLConverter
|
## TextConverter
|
||||||
##
|
##
|
||||||
class XMLConverter(PDFConverter):
|
class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
|
showpageno=False):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.outdir = outdir
|
self.showpageno = showpageno
|
||||||
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
|
return
|
||||||
self.outfp.write('<pages>\n')
|
|
||||||
|
def write(self, text):
|
||||||
|
self.outfp.write(text.encode(self.codec, 'ignore'))
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_image(self, image):
|
|
||||||
if image.type in LITERALS_DCT_DECODE:
|
|
||||||
ext = '.jpg'
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
name = image.name+ext
|
|
||||||
path = os.path.join(self.outdir, name)
|
|
||||||
fp = file(path, 'wb')
|
|
||||||
fp.write(image.data)
|
|
||||||
fp.close()
|
|
||||||
return name
|
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTText):
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
|
||||||
(item.id, strbbox(item.bbox), item.rotate))
|
|
||||||
for child in item:
|
|
||||||
render(child)
|
|
||||||
self.outfp.write('</page>\n')
|
|
||||||
elif isinstance(item, LTLine) and item.direction:
|
|
||||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox)))
|
|
||||||
elif isinstance(item, LTRect):
|
|
||||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox)))
|
|
||||||
elif isinstance(item, LTPolygon):
|
|
||||||
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
|
|
||||||
elif isinstance(item, LTFigure):
|
|
||||||
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
|
|
||||||
for child in item:
|
|
||||||
render(child)
|
|
||||||
self.outfp.write('</figure>\n')
|
|
||||||
elif isinstance(item, LTTextLine):
|
|
||||||
self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox))
|
|
||||||
for child in item:
|
|
||||||
render(child)
|
|
||||||
self.outfp.write('</textline>\n')
|
|
||||||
elif isinstance(item, LTTextBox):
|
|
||||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
|
|
||||||
for child in item:
|
|
||||||
render(child)
|
|
||||||
self.outfp.write('</textbox>\n')
|
|
||||||
elif isinstance(item, LTTextItem):
|
|
||||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
|
||||||
(enc(item.font.fontname), item.is_vertical(),
|
|
||||||
strbbox(item.bbox), item.fontsize))
|
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
self.outfp.write('</text>\n')
|
elif isinstance(item, LayoutContainer):
|
||||||
elif isinstance(item, LTText):
|
for child in item:
|
||||||
self.outfp.write('<text>%s</text>\n' % item.text)
|
render(child)
|
||||||
elif isinstance(item, LTImage):
|
if isinstance(item, LTTextBox):
|
||||||
x = ''
|
self.write('\n')
|
||||||
if self.outdir:
|
|
||||||
name = self.write_image(item)
|
|
||||||
if name:
|
|
||||||
x = 'name="%s" ' % enc(name)
|
|
||||||
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' % (x, item.type, item.width, item.height))
|
|
||||||
else:
|
|
||||||
assert 0, item
|
|
||||||
return
|
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
|
if self.showpageno:
|
||||||
|
self.write('Page %d\n' % page.id)
|
||||||
render(page)
|
render(page)
|
||||||
return
|
self.write('\f')
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self.outfp.write('</pages>\n')
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -344,32 +238,138 @@ class HTMLConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## TextConverter
|
## XMLConverter
|
||||||
##
|
##
|
||||||
class TextConverter(PDFConverter):
|
class XMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
|
||||||
showpageno=False):
|
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.showpageno = showpageno
|
self.outdir = outdir
|
||||||
|
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
|
||||||
|
self.outfp.write('<pages>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
def write_image(self, image):
|
||||||
self.outfp.write(text.encode(self.codec, 'ignore'))
|
if image.type in LITERALS_DCT_DECODE:
|
||||||
|
ext = '.jpg'
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
name = image.name+ext
|
||||||
|
path = os.path.join(self.outdir, name)
|
||||||
|
fp = file(path, 'wb')
|
||||||
|
fp.write(image.data)
|
||||||
|
fp.close()
|
||||||
|
return name
|
||||||
|
|
||||||
|
def end_page(self, page):
|
||||||
|
def render(item):
|
||||||
|
if isinstance(item, LTPage):
|
||||||
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||||
|
(item.id, strbbox(item.bbox), item.rotate))
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
self.outfp.write('</page>\n')
|
||||||
|
elif isinstance(item, LTLine) and item.direction:
|
||||||
|
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox)))
|
||||||
|
elif isinstance(item, LTRect):
|
||||||
|
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox)))
|
||||||
|
elif isinstance(item, LTPolygon):
|
||||||
|
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
|
||||||
|
elif isinstance(item, LTFigure):
|
||||||
|
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
self.outfp.write('</figure>\n')
|
||||||
|
elif isinstance(item, LTTextLine):
|
||||||
|
self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox))
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
self.outfp.write('</textline>\n')
|
||||||
|
elif isinstance(item, LTTextBox):
|
||||||
|
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
self.outfp.write('</textbox>\n')
|
||||||
|
elif isinstance(item, LTTextItem):
|
||||||
|
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||||
|
(enc(item.font.fontname), item.is_vertical(),
|
||||||
|
strbbox(item.bbox), item.fontsize))
|
||||||
|
self.write(item.text)
|
||||||
|
self.outfp.write('</text>\n')
|
||||||
|
elif isinstance(item, LTText):
|
||||||
|
self.outfp.write('<text>%s</text>\n' % item.text)
|
||||||
|
elif isinstance(item, LTImage):
|
||||||
|
x = ''
|
||||||
|
if self.outdir:
|
||||||
|
name = self.write_image(item)
|
||||||
|
if name:
|
||||||
|
x = 'name="%s" ' % enc(name)
|
||||||
|
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' % (x, item.type, item.width, item.height))
|
||||||
|
else:
|
||||||
|
assert 0, item
|
||||||
|
return
|
||||||
|
page = PDFConverter.end_page(self, page)
|
||||||
|
render(page)
|
||||||
|
return
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.outfp.write('</pages>\n')
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## TagExtractor
|
||||||
|
##
|
||||||
|
class TagExtractor(PDFDevice):
|
||||||
|
|
||||||
|
def __init__(self, rsrc, outfp, codec='utf-8'):
|
||||||
|
PDFDevice.__init__(self, rsrc)
|
||||||
|
self.outfp = outfp
|
||||||
|
self.codec = codec
|
||||||
|
self.pageno = 0
|
||||||
|
self.tag = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def render_string(self, textstate, seq):
|
||||||
|
font = textstate.font
|
||||||
|
text = ''
|
||||||
|
for obj in seq:
|
||||||
|
if not isinstance(obj, str): continue
|
||||||
|
chars = font.decode(obj)
|
||||||
|
for cid in chars:
|
||||||
|
try:
|
||||||
|
char = font.to_unichr(cid)
|
||||||
|
text += char
|
||||||
|
except PDFUnicodeNotDefined:
|
||||||
|
pass
|
||||||
|
self.outfp.write(enc(text, self.codec))
|
||||||
|
return
|
||||||
|
|
||||||
|
def begin_page(self, page, ctm):
|
||||||
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
||||||
|
(self.pageno, strbbox(page.mediabox), page.rotate))
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
self.outfp.write('</page>\n')
|
||||||
if isinstance(item, LTText):
|
self.pageno += 1
|
||||||
self.write(item.text)
|
return
|
||||||
elif isinstance(item, LayoutContainer):
|
|
||||||
for child in item:
|
def begin_tag(self, tag, props=None):
|
||||||
render(child)
|
s = ''
|
||||||
if isinstance(item, LTTextBox):
|
if props:
|
||||||
self.write('\n')
|
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
|
||||||
page = PDFConverter.end_page(self, page)
|
in sorted(props.iteritems()) )
|
||||||
if self.showpageno:
|
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
||||||
self.write('Page %d\n' % page.id)
|
self.tag = tag
|
||||||
render(page)
|
return
|
||||||
self.write('\f')
|
|
||||||
|
def end_tag(self):
|
||||||
|
assert self.tag
|
||||||
|
self.outfp.write('</%s>' % enc(self.tag.name))
|
||||||
|
self.tag = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def do_tag(self, tag, props=None):
|
||||||
|
self.begin_tag(tag, props)
|
||||||
|
self.tag = None
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue