Integrate the TODO list into the HTML documentation.

Reorder the code a bit.


git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@177 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-01-31 02:12:51 +00:00
parent 0f8fe3f19e
commit dda60dcafc
3 changed files with 146 additions and 152 deletions

7
TODO
View File

@ -1,7 +0,0 @@
TODOs:
- PEP-8 conformance.
- Better text extraction / layout analysis.
- Better API Documentation.
- Robust error handling.
- Crypt stream filter support. (More sample documents are needed!)
- CCITTFax stream filter support.

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Jan 31 10:38:26 JST 2010 Last Modified: Sun Jan 31 11:11:26 JST 2010
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -333,6 +333,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>TODOs</h2> <h2>TODOs</h2>
<ul> <ul>
<li> Automated testing.
<li> <A href="http://www.python.org/dev/peps/pep-0008/">PEP-8</a> and <li> <A href="http://www.python.org/dev/peps/pep-0008/">PEP-8</a> and
<a href="http://www.python.org/dev/peps/pep-0257/">PEP-257</a> conformance. <a href="http://www.python.org/dev/peps/pep-0257/">PEP-257</a> conformance.
<li> Better text extraction / layout analysis. <li> Better text extraction / layout analysis.

View File

@ -10,64 +10,6 @@ from utils import apply_matrix_pt, mult_matrix
from utils import enc, strbbox from utils import enc, strbbox
## TagExtractor
##
class TagExtractor(PDFDevice):
    """Device that writes page markers, marked-content tags and text to outfp.

    Every page is wrapped in a ``<page ...>`` / ``</page>`` pair, every
    begin_tag()/end_tag() pair is written as an opening/closing tag, and
    the strings handed to render_string() are written in between.

    Open tags are kept on a stack so that nested begin_tag()/end_tag()
    pairs are closed in the right order.  ``self.tag`` always mirrors the
    innermost open tag (None when no tag is open).
    """

    def __init__(self, rsrc, outfp, codec='utf-8'):
        """Store the output stream and codec; start at page 0, no open tag."""
        PDFDevice.__init__(self, rsrc)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        # Innermost open tag; kept for code that reads this attribute.
        self.tag = None
        # All currently open tags, outermost first.
        self._stack = []
        return

    def render_string(self, textstate, seq):
        """Decode the string objects in seq with the current font and write them.

        Non-string items of seq are skipped.  Character ids for which the
        font raises PDFUnicodeNotDefined are silently dropped.
        """
        font = textstate.font
        pieces = []
        for obj in seq:
            if not isinstance(obj, str):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Deliberate best-effort: unmappable characters are omitted.
                    pass
        # Collect the characters first and join once instead of growing a
        # string with += inside the loop.
        self.outfp.write(enc(''.join(pieces), self.codec))
        return

    def begin_page(self, page, ctm):
        """Write the opening <page> tag for the current page (ctm is unused)."""
        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
                         (self.pageno, strbbox(page.mediabox), page.rotate))
        return

    def end_page(self, page):
        """Write the closing </page> tag and advance the page counter."""
        self.outfp.write('</page>\n')
        self.pageno += 1
        return

    def _write_open_tag(self, tag, props):
        """Write '<name attr="value" ...>' for tag; attributes sorted by key."""
        attrs = ''
        if props:
            attrs = ''.join(' %s="%s"' % (enc(k), enc(str(v)))
                            for (k, v) in sorted(props.iteritems()))
        self.outfp.write('<%s%s>' % (enc(tag.name), attrs))
        return

    def begin_tag(self, tag, props=None):
        """Write the opening tag and remember it as the innermost open tag."""
        self._write_open_tag(tag, props)
        self._stack.append(tag)
        self.tag = tag
        return

    def end_tag(self):
        """Write the closing tag for the innermost open tag.

        Raises AssertionError if no tag is open (same as before).
        """
        assert self._stack, 'end_tag() called without a matching begin_tag()'
        tag = self._stack.pop()
        self.outfp.write('</%s>' % enc(tag.name))
        # Restore the enclosing tag so that an outer end_tag() still works.
        if self._stack:
            self.tag = self._stack[-1]
        else:
            self.tag = None
        return

    def do_tag(self, tag, props=None):
        """Write a stand-alone tag that has no matching end_tag().

        The tag is not pushed on the stack, so an enclosing open tag is
        left intact.
        """
        # NOTE(review): the tag is written in opening form and never closed,
        # exactly as before; confirm whether '<name ... />' was intended.
        self._write_open_tag(tag, props)
        return
## PDFPageAggregator ## PDFPageAggregator
## ##
class PDFPageAggregator(PDFTextDevice): class PDFPageAggregator(PDFTextDevice):
@ -171,82 +113,34 @@ class PDFConverter(PDFPageAggregator):
return return
## XMLConverter ## TextConverter
## ##
class XMLConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None): def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
showpageno=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.outdir = outdir self.showpageno = showpageno
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
self.outfp.write('<pages>\n')
return return
def write_image(self, image): def write(self, text):
if image.type in LITERALS_DCT_DECODE: self.outfp.write(text.encode(self.codec, 'ignore'))
ext = '.jpg' return
else:
return None
name = image.name+ext
path = os.path.join(self.outdir, name)
fp = file(path, 'wb')
fp.write(image.data)
fp.close()
return name
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTText):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, strbbox(item.bbox), item.rotate))
for child in item:
render(child)
self.outfp.write('</page>\n')
elif isinstance(item, LTLine) and item.direction:
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox)))
elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox)))
elif isinstance(item, LTPolygon):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
for child in item:
render(child)
self.outfp.write('</figure>\n')
elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox))
for child in item:
render(child)
self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
for child in item:
render(child)
self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname), item.is_vertical(),
strbbox(item.bbox), item.fontsize))
self.write(item.text) self.write(item.text)
self.outfp.write('</text>\n') elif isinstance(item, LayoutContainer):
elif isinstance(item, LTText): for child in item:
self.outfp.write('<text>%s</text>\n' % item.text) render(child)
elif isinstance(item, LTImage): if isinstance(item, LTTextBox):
x = '' self.write('\n')
if self.outdir:
name = self.write_image(item)
if name:
x = 'name="%s" ' % enc(name)
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' % (x, item.type, item.width, item.height))
else:
assert 0, item
return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
if self.showpageno:
self.write('Page %d\n' % page.id)
render(page) render(page)
return self.write('\f')
def close(self):
self.outfp.write('</pages>\n')
return return
@ -344,32 +238,138 @@ class HTMLConverter(PDFConverter):
return return
## TextConverter ## XMLConverter
## ##
class TextConverter(PDFConverter): class XMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
showpageno=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno self.outdir = outdir
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
self.outfp.write('<pages>\n')
return return
def write(self, text): def write_image(self, image):
self.outfp.write(text.encode(self.codec, 'ignore')) if image.type in LITERALS_DCT_DECODE:
return ext = '.jpg'
else:
return None
name = image.name+ext
path = os.path.join(self.outdir, name)
fp = file(path, 'wb')
fp.write(image.data)
fp.close()
return name
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTText): if isinstance(item, LTPage):
self.write(item.text) self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
elif isinstance(item, LayoutContainer): (item.id, strbbox(item.bbox), item.rotate))
for child in item: for child in item:
render(child) render(child)
if isinstance(item, LTTextBox): self.outfp.write('</page>\n')
self.write('\n') elif isinstance(item, LTLine) and item.direction:
page = PDFConverter.end_page(self, page) self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox)))
if self.showpageno: elif isinstance(item, LTRect):
self.write('Page %d\n' % page.id) self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox)))
render(page) elif isinstance(item, LTPolygon):
self.write('\f') self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
for child in item:
render(child)
self.outfp.write('</figure>\n')
elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox))
for child in item:
render(child)
self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
for child in item:
render(child)
self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname), item.is_vertical(),
strbbox(item.bbox), item.fontsize))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.text)
elif isinstance(item, LTImage):
x = ''
if self.outdir:
name = self.write_image(item)
if name:
x = 'name="%s" ' % enc(name)
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' % (x, item.type, item.width, item.height))
else:
assert 0, item
return
page = PDFConverter.end_page(self, page)
render(page)
return
def close(self):
self.outfp.write('</pages>\n')
return
## TagExtractor
##
class TagExtractor(PDFDevice):
    """Device that writes page markers, marked-content tags and text to outfp.

    Every page is wrapped in a ``<page ...>`` / ``</page>`` pair, every
    begin_tag()/end_tag() pair is written as an opening/closing tag, and
    the strings handed to render_string() are written in between.

    Open tags are kept on a stack so that nested begin_tag()/end_tag()
    pairs are closed in the right order.  ``self.tag`` always mirrors the
    innermost open tag (None when no tag is open).
    """

    def __init__(self, rsrc, outfp, codec='utf-8'):
        """Store the output stream and codec; start at page 0, no open tag."""
        PDFDevice.__init__(self, rsrc)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        # Innermost open tag; kept for code that reads this attribute.
        self.tag = None
        # All currently open tags, outermost first.
        self._stack = []
        return

    def render_string(self, textstate, seq):
        """Decode the string objects in seq with the current font and write them.

        Non-string items of seq are skipped.  Character ids for which the
        font raises PDFUnicodeNotDefined are silently dropped.
        """
        font = textstate.font
        pieces = []
        for obj in seq:
            if not isinstance(obj, str):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Deliberate best-effort: unmappable characters are omitted.
                    pass
        # Collect the characters first and join once instead of growing a
        # string with += inside the loop.
        self.outfp.write(enc(''.join(pieces), self.codec))
        return

    def begin_page(self, page, ctm):
        """Write the opening <page> tag for the current page (ctm is unused)."""
        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
                         (self.pageno, strbbox(page.mediabox), page.rotate))
        return

    def end_page(self, page):
        """Write the closing </page> tag and advance the page counter."""
        self.outfp.write('</page>\n')
        self.pageno += 1
        return

    def _write_open_tag(self, tag, props):
        """Write '<name attr="value" ...>' for tag; attributes sorted by key."""
        attrs = ''
        if props:
            attrs = ''.join(' %s="%s"' % (enc(k), enc(str(v)))
                            for (k, v) in sorted(props.iteritems()))
        self.outfp.write('<%s%s>' % (enc(tag.name), attrs))
        return

    def begin_tag(self, tag, props=None):
        """Write the opening tag and remember it as the innermost open tag."""
        self._write_open_tag(tag, props)
        self._stack.append(tag)
        self.tag = tag
        return

    def end_tag(self):
        """Write the closing tag for the innermost open tag.

        Raises AssertionError if no tag is open (same as before).
        """
        assert self._stack, 'end_tag() called without a matching begin_tag()'
        tag = self._stack.pop()
        self.outfp.write('</%s>' % enc(tag.name))
        # Restore the enclosing tag so that an outer end_tag() still works.
        if self._stack:
            self.tag = self._stack[-1]
        else:
            self.tag = None
        return

    def do_tag(self, tag, props=None):
        """Write a stand-alone tag that has no matching end_tag().

        The tag is not pushed on the stack, so an enclosing open tag is
        left intact.
        """
        # NOTE(review): the tag is written in opening form and never closed,
        # exactly as before; confirm whether '<name ... />' was intended.
        self._write_open_tag(tag, props)
        return