experimental polygon extraction.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@166 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-12-20 02:38:01 +00:00
parent 665196161c
commit 6590ad42f5
4 changed files with 63 additions and 32 deletions

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sun Dec 20 00:09:12 JST 2009
Last Modified: Sun Dec 20 01:25:02 JST 2009
<!-- hhmts end -->
</div>
@ -41,7 +41,7 @@ Last Modified: Sun Dec 20 00:09:12 JST 2009
<h2>What's It?</h2>
<p>
PDFMiner is a suite of programs that help
extracting some meaningful informatin out of PDF documents.
extracting some meaningful information out of PDF documents.
Unlike other PDF-related tools, it focuses entirely on getting
and analyzing text data from PDFs. PDFMiner allows one to obtain
the exact location of texts in a page, as well as
@ -95,7 +95,7 @@ http://pdf2html.tabesugi.net:8080/
<ol>
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
<li> Download the <a href="#source">PDFMiner source</a>.
<li> Extract it.
<li> Unpack it.
<li> Run <code>setup.py</code> to install:<br>
<blockquote><pre>
# <strong>python setup.py install</strong>
@ -344,7 +344,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for opensourcing them.
<li> 2009/12/19: CMap resources are now part of the package. Thanks to Adobe for open-sourcing them.
<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
<li> 2009/10/31: SGML output format is changed and renamed as XML.
<li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.

View File

@ -3,7 +3,7 @@ import sys
from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
from utils import enc
from utils import apply_matrix_pt, mult_matrix
@ -116,12 +116,7 @@ class PDFPageAggregator(PDFTextDevice):
(_,x1,y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
if y0 == y1:
# horizontal ruler
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
elif x0 == x1:
# vertical ruler
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
elif shape == 'mlllh':
# rectangle
(_,x0,y0) = path[0]
@ -135,6 +130,13 @@ class PDFPageAggregator(PDFTextDevice):
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
else:
# other polygon
pts = []
for p in path:
for i in xrange(1, len(p), 2):
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
self.cur_item.add(LTPolygon(gstate.linewidth, pts))
return
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
@ -177,10 +179,12 @@ class XMLConverter(PDFConverter):
for child in item:
render(child)
self.outfp.write('</page>\n')
elif isinstance(item, LTLine):
elif isinstance(item, LTLine) and item.direction:
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
elif isinstance(item, LTPolygon):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>' % (item.linewidth, item.get_bbox(), item.get_pts()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item:
@ -263,7 +267,7 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</span>\n')
if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTLine) or isinstance(item, LTRect):
elif isinstance(item, LTPolygon):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextLine):
for child in item:

View File

@ -6,6 +6,20 @@ from utils import apply_matrix_pt
from utils import bsearch
## get_bounds
##
def get_bounds(pts):
    """Return the smallest axis-aligned rectangle enclosing all points.

    pts is any iterable of (x, y) pairs; it is traversed exactly once,
    so a generator may be passed.  The result is the tuple
    (x0, y0, x1, y1) holding the minimum x, minimum y, maximum x and
    maximum y seen.  For an empty iterable the untouched seed values
    (INF, INF, -INF, -INF) are returned.
    """
    min_x = min_y = INF
    max_x = max_y = -INF
    for (px, py) in pts:
        # Shrink the lower corner / grow the upper corner as needed.
        if px < min_x:
            min_x = px
        if py < min_y:
            min_y = py
        if px > max_x:
            max_x = px
        if py > max_y:
            max_y = py
    return (min_x, min_y, max_x, max_y)
## LAParams
##
class LAParams(object):
@ -228,24 +242,44 @@ class LayoutContainer(LayoutItem):
return None
## LTPolygon
##
class LTPolygon(LayoutItem):
    """Layout item described by a sequence of (x, y) vertices.

    The item's bounding box is derived from the vertices via
    get_bounds(); the vertices themselves are kept in self.pts and
    the stroke width in self.linewidth.
    """

    def __init__(self, linewidth, pts):
        # The bounding box handed to LayoutItem is computed from the vertices.
        bbox = get_bounds(pts)
        LayoutItem.__init__(self, bbox)
        self.linewidth = linewidth
        self.pts = pts
        return

    def get_pts(self):
        """Return the vertices as one comma-separated string.

        Each vertex contributes "x,y" with three decimal places, and
        the vertices are joined with commas as well.
        """
        coords = ['%.3f,%.3f' % pt for pt in self.pts]
        return ','.join(coords)
## LTLine
##
class LTLine(LTPolygon):
    """Straight line segment between two points.

    The segment is stored as a two-vertex LTPolygon.  self.direction
    is 'H' when both end points share the same y coordinate, 'V' when
    they share the same x coordinate, and None for any other segment.
    """

    def __init__(self, linewidth, p0, p1):
        """Create a segment from p0 to p1.

        linewidth -- stroke width, stored by LTPolygon.
        p0, p1    -- the (x, y) end points of the segment.
        """
        (x0, y0) = p0
        # The second end point must come from p1: the direction test
        # below compares the two end points with each other.
        (x1, y1) = p1
        if y0 == y1:
            # horizontal ruler
            self.direction = 'H'
        elif x0 == x1:
            # vertical ruler
            self.direction = 'V'
        else:
            # neither horizontal nor vertical
            self.direction = None
        LTPolygon.__init__(self, linewidth, [p0, p1])
        return
## LTRect
##
class LTRect(LTPolygon):
    """Axis-aligned rectangle stored as a four-vertex LTPolygon."""

    def __init__(self, linewidth, bbox):
        """Create a rectangle from its two opposite corners.

        linewidth -- stroke width, stored by LTPolygon.
        bbox      -- a 4-tuple (x0, y0, x1, y1) giving two opposite
                     corners of the rectangle.
        """
        # Unpack in the body rather than in the signature: tuple
        # parameters leave the argument unnamed and are not accepted
        # by newer Python versions.
        (x0, y0, x1, y1) = bbox
        corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
        LTPolygon.__init__(self, linewidth, corners)
        return
@ -339,15 +373,8 @@ class LTFigure(LayoutContainer):
def __init__(self, id, bbox, matrix):
    """Create a figure container.

    id     -- identifier passed through to LayoutContainer.
    bbox   -- (x, y, w, h): origin, width and height of the figure
              before the matrix is applied.
    matrix -- transformation applied to the four corners of bbox;
              also kept as self.matrix.

    The bounding box handed to LayoutContainer is the rectangle
    enclosing the four transformed corners.
    """
    (x, y, w, h) = bbox
    corners = ((x, y), (x+w, y), (x, y+h), (x+w, y+h))
    # Transform each corner, then let get_bounds() take the enclosing box.
    bbox = get_bounds(apply_matrix_pt(matrix, corner) for corner in corners)
    self.matrix = matrix
    LayoutContainer.__init__(self, id, bbox)
    return

View File

@ -518,7 +518,7 @@ class PDFCIDFont(PDFFont):
try:
self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
except CMapDB.CMapNotFound, e:
raise PDFFontError(e)
pass
def get_width(seq):
dic = {}