diff --git a/docs/index.html b/docs/index.html
index 63f92f8..b5e58f1 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -19,7 +19,7 @@ Python PDF parser and analyzer
-Last Modified: Sun Dec 20 00:09:12 JST 2009
+Last Modified: Sun Dec 20 01:25:02 JST 2009
@@ -41,7 +41,7 @@ Last Modified: Sun Dec 20 00:09:12 JST 2009
What's It?
PDFMiner is a suite of programs that help
-extracting some meaningful informatin out of PDF documents.
+extracting some meaningful information out of PDF documents.
Unlike other PDF-related tools, it focuses entirely on getting
and analyzing text data from PDFs. PDFMiner allows to obtain
the exact location of texts in a page, as well as
@@ -95,7 +95,7 @@ http://pdf2html.tabesugi.net:8080/
Install Python 2.4 or newer.
Download the PDFMiner source .
- Extract it.
+ Unpack it.
Run setup.py
to install:
# python setup.py install
@@ -344,7 +344,7 @@ no stream header is displayed for the ease of saving it to a file.
Changes
- 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for opensourcing them.
+ 2009/12/19: CMap resources are now part of the package. Thanks to Adobe for open-sourcing them.
2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
2009/10/31: SGML output format is changed and renamed as XML.
2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 4d39103..69c25b8 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -3,7 +3,7 @@ import sys
from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer
-from layout import LTPage, LTText, LTLine, LTRect
+from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
from utils import enc
from utils import apply_matrix_pt, mult_matrix
@@ -116,12 +116,7 @@ class PDFPageAggregator(PDFTextDevice):
(_,x1,y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
- if y0 == y1:
- # horizontal ruler
- self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
- elif x0 == x1:
- # vertical ruler
- self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
+ self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
elif shape == 'mlllh':
# rectangle
(_,x0,y0) = path[0]
@@ -135,6 +130,13 @@ class PDFPageAggregator(PDFTextDevice):
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
+ else:
+ # other polygon
+ pts = []
+ for p in path:
+ for i in xrange(1, len(p), 2):
+ pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
+ self.cur_item.add(LTPolygon(gstate.linewidth, pts))
return
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
@@ -177,10 +179,12 @@ class XMLConverter(PDFConverter):
for child in item:
render(child)
self.outfp.write('\n')
- elif isinstance(item, LTLine):
+ elif isinstance(item, LTLine) and item.direction:
self.outfp.write(' ' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect):
self.outfp.write(' ' % (item.linewidth, item.get_bbox()))
+ elif isinstance(item, LTPolygon):
+ self.outfp.write(' ' % (item.linewidth, item.get_bbox(), item.get_pts()))
elif isinstance(item, LTFigure):
self.outfp.write('\n' % (item.id, item.get_bbox()))
for child in item:
@@ -263,7 +267,7 @@ class HTMLConverter(PDFConverter):
self.outfp.write('\n')
if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
- elif isinstance(item, LTLine) or isinstance(item, LTRect):
+ elif isinstance(item, LTPolygon):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextLine):
for child in item:
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index 7a15ce7..1b940e4 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -6,6 +6,20 @@ from utils import apply_matrix_pt
from utils import bsearch
+
+## get_bounds
+##
+def get_bounds(pts):
+ """Compute a minimal rectangle that covers all the points."""
+ (x0, y0, x1, y1) = (INF, INF, -INF, -INF)  # inverted extremes; assumes INF exceeds every coordinate
+ for (x,y) in pts:
+ x0 = min(x0, x)
+ y0 = min(y0, y)
+ x1 = max(x1, x)
+ y1 = max(y1, y)
+ return (x0,y0,x1,y1)  # stays (INF,INF,-INF,-INF) when pts is empty
+
+
## LAParams
##
class LAParams(object):
@@ -228,24 +242,44 @@ class LayoutContainer(LayoutItem):
return None
+## LTPolygon
+##
+class LTPolygon(LayoutItem):  # layout item described by a sequence of (x,y) points
+
+ def __init__(self, linewidth, pts):
+ LayoutItem.__init__(self, get_bounds(pts))  # bbox is derived from the points; pts is iterated again in get_pts, so it must be re-iterable
+ self.pts = pts
+ self.linewidth = linewidth
+ return
+
+ def get_pts(self):
+ return ','.join( '%.3f,%.3f' % p for p in self.pts )  # flat "x,y,x,y,..." string, three decimals per coordinate
+
+
## LTLine
##
-class LTLine(LayoutItem):
+class LTLine(LTPolygon):
- def __init__(self, linewidth, direction, bbox):
- LayoutItem.__init__(self, bbox)
- self.linewidth = linewidth
- self.direction = direction
+ def __init__(self, linewidth, p0, p1):  # p0, p1: the two (x,y) endpoints of the segment
+ (x0,y0) = p0
+ (x1,y1) = p1  # must be p1: direction is decided by comparing the two endpoints
+ self.direction = None  # stays None for a slanted line (neither horizontal nor vertical)
+ if y0 == y1:
+ # horizontal ruler
+ self.direction = 'H'
+ elif x0 == x1:
+ # vertical ruler
+ self.direction = 'V'
+ LTPolygon.__init__(self, linewidth, [p0, p1])  # a line is stored as a two-point polygon
return
## LTRect
##
-class LTRect(LayoutItem):
+class LTRect(LTPolygon):
- def __init__(self, linewidth, bbox):
- LayoutItem.__init__(self, bbox)
- self.linewidth = linewidth
+ def __init__(self, linewidth, (x0,y0,x1,y1)):  # Python 2 tuple parameter: callers still pass a single 4-tuple
+ LTPolygon.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)])  # the four corners, walked around the perimeter
return
@@ -339,15 +373,8 @@ class LTFigure(LayoutContainer):
def __init__(self, id, bbox, matrix):
(x,y,w,h) = bbox
- x0 = y0 = INF
- x1 = y1 = -INF
- for (p,q) in ((x,y),(x+w,y),(x,y+h),(x+w,y+h)):
- (p,q) = apply_matrix_pt(matrix, (p,q))
- x0 = min(x0, p)
- x1 = max(x1, p)
- y0 = min(y0, q)
- y1 = max(y1, q)
- bbox = (x0,y0,x1,y1)
+ bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
+ for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )  # bounds of the four corners after applying matrix
self.matrix = matrix
LayoutContainer.__init__(self, id, bbox)
return
diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
index 1ed201f..fe544e6 100644
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@@ -518,7 +518,7 @@ class PDFCIDFont(PDFFont):
try:
self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
except CMapDB.CMapNotFound, e:
- raise PDFFontError(e)
+ pass
def get_width(seq):
dic = {}