experimental polygon extraction.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@166 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
665196161c
commit
6590ad42f5
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Dec 20 00:09:12 JST 2009
|
Last Modified: Sun Dec 20 01:25:02 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@ Last Modified: Sun Dec 20 00:09:12 JST 2009
|
||||||
<h2>What's It?</h2>
|
<h2>What's It?</h2>
|
||||||
<p>
|
<p>
|
||||||
PDFMiner is a suite of programs that help
|
PDFMiner is a suite of programs that help
|
||||||
extracting some meaningful informatin out of PDF documents.
|
extracting some meaningful information out of PDF documents.
|
||||||
Unlike other PDF-related tools, it focuses entirely on getting
|
Unlike other PDF-related tools, it focuses entirely on getting
|
||||||
and analyzing text data from PDFs. PDFMiner allows to obtain
|
and analyzing text data from PDFs. PDFMiner allows to obtain
|
||||||
the exact location of texts in a page, as well as
|
the exact location of texts in a page, as well as
|
||||||
|
@ -95,7 +95,7 @@ http://pdf2html.tabesugi.net:8080/
|
||||||
<ol>
|
<ol>
|
||||||
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
|
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
|
||||||
<li> Download the <a href="#source">PDFMiner source</a>.
|
<li> Download the <a href="#source">PDFMiner source</a>.
|
||||||
<li> Extract it.
|
<li> Unpack it.
|
||||||
<li> Run <code>setup.py</code> to install:<br>
|
<li> Run <code>setup.py</code> to install:<br>
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
# <strong>python setup.py install</strong>
|
# <strong>python setup.py install</strong>
|
||||||
|
@ -344,7 +344,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
<li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for opensourcing them.
|
<li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them.
|
||||||
<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
|
<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
|
||||||
<li> 2009/10/31: SGML output format is changed and renamed as XML.
|
<li> 2009/10/31: SGML output format is changed and renamed as XML.
|
||||||
<li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
|
<li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
|
||||||
|
|
|
@ -3,7 +3,7 @@ import sys
|
||||||
from pdfdevice import PDFDevice, PDFTextDevice
|
from pdfdevice import PDFDevice, PDFTextDevice
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
from layout import LayoutContainer
|
from layout import LayoutContainer
|
||||||
from layout import LTPage, LTText, LTLine, LTRect
|
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
|
||||||
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
|
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
|
||||||
from utils import enc
|
from utils import enc
|
||||||
from utils import apply_matrix_pt, mult_matrix
|
from utils import apply_matrix_pt, mult_matrix
|
||||||
|
@ -116,12 +116,7 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
(_,x1,y1) = path[1]
|
(_,x1,y1) = path[1]
|
||||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
||||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
||||||
if y0 == y1:
|
self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
|
||||||
# horizontal ruler
|
|
||||||
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
|
|
||||||
elif x0 == x1:
|
|
||||||
# vertical ruler
|
|
||||||
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
|
|
||||||
elif shape == 'mlllh':
|
elif shape == 'mlllh':
|
||||||
# rectangle
|
# rectangle
|
||||||
(_,x0,y0) = path[0]
|
(_,x0,y0) = path[0]
|
||||||
|
@ -135,6 +130,13 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
||||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
||||||
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
||||||
|
else:
|
||||||
|
# other polygon
|
||||||
|
pts = []
|
||||||
|
for p in path:
|
||||||
|
for i in xrange(1, len(p), 2):
|
||||||
|
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
|
||||||
|
self.cur_item.add(LTPolygon(gstate.linewidth, pts))
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
|
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||||
|
@ -177,10 +179,12 @@ class XMLConverter(PDFConverter):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</page>\n')
|
self.outfp.write('</page>\n')
|
||||||
elif isinstance(item, LTLine):
|
elif isinstance(item, LTLine) and item.direction:
|
||||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
||||||
elif isinstance(item, LTRect):
|
elif isinstance(item, LTRect):
|
||||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
|
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
|
||||||
|
elif isinstance(item, LTPolygon):
|
||||||
|
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>' % (item.linewidth, item.get_bbox(), item.get_pts()))
|
||||||
elif isinstance(item, LTFigure):
|
elif isinstance(item, LTFigure):
|
||||||
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
for child in item:
|
for child in item:
|
||||||
|
@ -263,7 +267,7 @@ class HTMLConverter(PDFConverter):
|
||||||
self.outfp.write('</span>\n')
|
self.outfp.write('</span>\n')
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
elif isinstance(item, LTPolygon):
|
||||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTTextLine):
|
elif isinstance(item, LTTextLine):
|
||||||
for child in item:
|
for child in item:
|
||||||
|
|
|
@ -6,6 +6,20 @@ from utils import apply_matrix_pt
|
||||||
from utils import bsearch
|
from utils import bsearch
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## get_bounds
|
||||||
|
##
|
||||||
|
def get_bounds(pts):
|
||||||
|
"""Compute a maximal rectangle that covers all the points."""
|
||||||
|
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
||||||
|
for (x,y) in pts:
|
||||||
|
x0 = min(x0, x)
|
||||||
|
y0 = min(y0, y)
|
||||||
|
x1 = max(x1, x)
|
||||||
|
y1 = max(y1, y)
|
||||||
|
return (x0,y0,x1,y1)
|
||||||
|
|
||||||
|
|
||||||
## LAParams
|
## LAParams
|
||||||
##
|
##
|
||||||
class LAParams(object):
|
class LAParams(object):
|
||||||
|
@ -228,24 +242,44 @@ class LayoutContainer(LayoutItem):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
## LTPolygon
|
||||||
|
##
|
||||||
|
class LTPolygon(LayoutItem):
|
||||||
|
|
||||||
|
def __init__(self, linewidth, pts):
|
||||||
|
LayoutItem.__init__(self, get_bounds(pts))
|
||||||
|
self.pts = pts
|
||||||
|
self.linewidth = linewidth
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_pts(self):
|
||||||
|
return ','.join( '%.3f,%.3f' % p for p in self.pts )
|
||||||
|
|
||||||
|
|
||||||
## LTLine
|
## LTLine
|
||||||
##
|
##
|
||||||
class LTLine(LayoutItem):
|
class LTLine(LTPolygon):
|
||||||
|
|
||||||
def __init__(self, linewidth, direction, bbox):
|
def __init__(self, linewidth, p0, p1):
|
||||||
LayoutItem.__init__(self, bbox)
|
(x0,y0) = p0
|
||||||
self.linewidth = linewidth
|
(x1,y1) = p0
|
||||||
self.direction = direction
|
self.direction = None
|
||||||
|
if y0 == y1:
|
||||||
|
# horizontal ruler
|
||||||
|
self.direction = 'H'
|
||||||
|
elif x0 == x1:
|
||||||
|
# vertical ruler
|
||||||
|
self.direction = 'V'
|
||||||
|
LTPolygon.__init__(self, linewidth, [p0, p1])
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## LTRect
|
## LTRect
|
||||||
##
|
##
|
||||||
class LTRect(LayoutItem):
|
class LTRect(LTPolygon):
|
||||||
|
|
||||||
def __init__(self, linewidth, bbox):
|
def __init__(self, linewidth, (x0,y0,x1,y1)):
|
||||||
LayoutItem.__init__(self, bbox)
|
LTPolygon.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)])
|
||||||
self.linewidth = linewidth
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -339,15 +373,8 @@ class LTFigure(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, bbox, matrix):
|
def __init__(self, id, bbox, matrix):
|
||||||
(x,y,w,h) = bbox
|
(x,y,w,h) = bbox
|
||||||
x0 = y0 = INF
|
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
||||||
x1 = y1 = -INF
|
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
||||||
for (p,q) in ((x,y),(x+w,y),(x,y+h),(x+w,y+h)):
|
|
||||||
(p,q) = apply_matrix_pt(matrix, (p,q))
|
|
||||||
x0 = min(x0, p)
|
|
||||||
x1 = max(x1, p)
|
|
||||||
y0 = min(y0, q)
|
|
||||||
y1 = max(y1, q)
|
|
||||||
bbox = (x0,y0,x1,y1)
|
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
LayoutContainer.__init__(self, id, bbox)
|
LayoutContainer.__init__(self, id, bbox)
|
||||||
return
|
return
|
||||||
|
|
|
@ -518,7 +518,7 @@ class PDFCIDFont(PDFFont):
|
||||||
try:
|
try:
|
||||||
self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
|
self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
|
||||||
except CMapDB.CMapNotFound, e:
|
except CMapDB.CMapNotFound, e:
|
||||||
raise PDFFontError(e)
|
pass
|
||||||
|
|
||||||
def get_width(seq):
|
def get_width(seq):
|
||||||
dic = {}
|
dic = {}
|
||||||
|
|
Loading…
Reference in New Issue