experimental polygon extraction.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@166 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-12-20 02:38:01 +00:00
parent 665196161c
commit 6590ad42f5
4 changed files with 63 additions and 32 deletions

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Dec 20 00:09:12 JST 2009 Last Modified: Sun Dec 20 01:25:02 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -41,7 +41,7 @@ Last Modified: Sun Dec 20 00:09:12 JST 2009
<h2>What's It?</h2> <h2>What's It?</h2>
<p> <p>
PDFMiner is a suite of programs that help PDFMiner is a suite of programs that help
extracting some meaningful informatin out of PDF documents. extracting some meaningful information out of PDF documents.
Unlike other PDF-related tools, it focuses entirely on getting Unlike other PDF-related tools, it focuses entirely on getting
and analyzing text data from PDFs. PDFMiner allows to obtain and analyzing text data from PDFs. PDFMiner allows to obtain
the exact location of texts in a page, as well as the exact location of texts in a page, as well as
@ -95,7 +95,7 @@ http://pdf2html.tabesugi.net:8080/
<ol> <ol>
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer. <li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
<li> Download the <a href="#source">PDFMiner source</a>. <li> Download the <a href="#source">PDFMiner source</a>.
<li> Extract it. <li> Unpack it.
<li> Run <code>setup.py</code> to install:<br> <li> Run <code>setup.py</code> to install:<br>
<blockquote><pre> <blockquote><pre>
# <strong>python setup.py install</strong> # <strong>python setup.py install</strong>
@ -344,7 +344,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for opensourcing them. <li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them.
<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras. <li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
<li> 2009/10/31: SGML output format is changed and renamed as XML. <li> 2009/10/31: SGML output format is changed and renamed as XML.
<li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation. <li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.

View File

@ -3,7 +3,7 @@ import sys
from pdfdevice import PDFDevice, PDFTextDevice from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
from utils import enc from utils import enc
from utils import apply_matrix_pt, mult_matrix from utils import apply_matrix_pt, mult_matrix
@ -116,12 +116,7 @@ class PDFPageAggregator(PDFTextDevice):
(_,x1,y1) = path[1] (_,x1,y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
if y0 == y1: self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
# horizontal ruler
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
elif x0 == x1:
# vertical ruler
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
elif shape == 'mlllh': elif shape == 'mlllh':
# rectangle # rectangle
(_,x0,y0) = path[0] (_,x0,y0) = path[0]
@ -135,6 +130,13 @@ class PDFPageAggregator(PDFTextDevice):
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
else:
# other polygon
pts = []
for p in path:
for i in xrange(1, len(p), 2):
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
self.cur_item.add(LTPolygon(gstate.linewidth, pts))
return return
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars): def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
@ -177,10 +179,12 @@ class XMLConverter(PDFConverter):
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</page>\n') self.outfp.write('</page>\n')
elif isinstance(item, LTLine): elif isinstance(item, LTLine) and item.direction:
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox())) self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect): elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox())) self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
elif isinstance(item, LTPolygon):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>' % (item.linewidth, item.get_bbox(), item.get_pts()))
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item: for child in item:
@ -263,7 +267,7 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</span>\n') self.outfp.write('</span>\n')
if self.debug: if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTLine) or isinstance(item, LTRect): elif isinstance(item, LTPolygon):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextLine): elif isinstance(item, LTTextLine):
for child in item: for child in item:

View File

@ -6,6 +6,20 @@ from utils import apply_matrix_pt
from utils import bsearch from utils import bsearch
## get_bounds
##
def get_bounds(pts):
    """Return the bounding box (x0, y0, x1, y1) of an iterable of (x, y) points.

    The result is the smallest axis-aligned rectangle containing every
    point.  When pts yields nothing the starting sentinels are returned
    unchanged, i.e. (INF, INF, -INF, -INF).
    """
    xmin = ymin = INF
    xmax = ymax = -INF
    for (px, py) in pts:
        # Push an edge outwards only when the point lies beyond it.
        if px < xmin:
            xmin = px
        if py < ymin:
            ymin = py
        if px > xmax:
            xmax = px
        if py > ymax:
            ymax = py
    return (xmin, ymin, xmax, ymax)
## LAParams ## LAParams
## ##
class LAParams(object): class LAParams(object):
@ -228,24 +242,44 @@ class LayoutContainer(LayoutItem):
return None return None
## LTPolygon
##
class LTPolygon(LayoutItem):
    """Layout item described by a list of points and a line width.

    The bounding box passed to LayoutItem is derived from the points via
    get_bounds().
    """

    def __init__(self, linewidth, pts):
        # pts is iterated by get_bounds() and again by get_pts(); presumably
        # a sequence of (x, y) tuples -- confirm against callers.
        bounds = get_bounds(pts)
        LayoutItem.__init__(self, bounds)
        self.pts = pts
        self.linewidth = linewidth
        return

    def get_pts(self):
        """Return all points as one comma-separated string, 3 decimals per coordinate."""
        formatted = ['%.3f,%.3f' % p for p in self.pts]
        return ','.join(formatted)
## LTLine ## LTLine
## ##
class LTLine(LTPolygon):
    """Straight line segment between two points, stored as a 2-point polygon.

    direction is 'H' when both end points share the same y coordinate,
    'V' when they share the same x coordinate (tested only if not 'H'),
    and None for any other segment.
    """

    def __init__(self, linewidth, p0, p1):
        (x0, y0) = p0
        # The second end point must come from p1; unpacking p0 twice would
        # make y0 == y1 always true and classify every line as horizontal.
        (x1, y1) = p1
        self.direction = None
        if y0 == y1:
            # horizontal ruler
            self.direction = 'H'
        elif x0 == x1:
            # vertical ruler
            self.direction = 'V'
        LTPolygon.__init__(self, linewidth, [p0, p1])
        return
## LTRect ## LTRect
## ##
class LTRect(LTPolygon):
    """Axis-aligned rectangle stored as a four-point polygon.

    bbox is an (x0, y0, x1, y1) tuple; the polygon visits the corners in
    the order (x0,y0), (x1,y0), (x1,y1), (x0,y1).
    """

    def __init__(self, linewidth, bbox):
        # Unpack inside the body instead of in the signature: tuple
        # parameters are Python 2-only syntax (removed by PEP 3113), while
        # this form accepts exactly the same positional calls.
        (x0, y0, x1, y1) = bbox
        LTPolygon.__init__(self, linewidth,
                           [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
        return
@ -339,15 +373,8 @@ class LTFigure(LayoutContainer):
def __init__(self, id, bbox, matrix):
    """Set up the figure from an (x, y, w, h) box and a transformation matrix.

    The four corners of the box are passed through apply_matrix_pt() with
    matrix, and the bounds of the transformed corners (via get_bounds())
    are handed to LayoutContainer as the figure's bounding box.
    """
    (x, y, w, h) = bbox
    corners = ((x, y), (x+w, y), (x, y+h), (x+w, y+h))
    bbox = get_bounds(apply_matrix_pt(matrix, (cx, cy)) for (cx, cy) in corners)
    self.matrix = matrix
    LayoutContainer.__init__(self, id, bbox)
    return

View File

@ -518,7 +518,7 @@ class PDFCIDFont(PDFFont):
try: try:
self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical()) self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
except CMapDB.CMapNotFound, e: except CMapDB.CMapNotFound, e:
raise PDFFontError(e) pass
def get_width(seq): def get_width(seq):
dic = {} dic = {}