From 6590ad42f5e17641b9ff204664be097015845026 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 20 Dec 2009 02:38:01 +0000 Subject: [PATCH] experimental polygon extraction. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@166 1aa58f4a-7d42-0410-adbc-911cccaed67c --- docs/index.html | 8 +++--- pdfminer/converter.py | 22 ++++++++------- pdfminer/layout.py | 63 ++++++++++++++++++++++++++++++------------- pdfminer/pdffont.py | 2 +- 4 files changed, 63 insertions(+), 32 deletions(-) diff --git a/docs/index.html b/docs/index.html index 63f92f8..b5e58f1 100644 --- a/docs/index.html +++ b/docs/index.html @@ -19,7 +19,7 @@ Python PDF parser and analyzer
-Last Modified: Sun Dec 20 00:09:12 JST 2009 +Last Modified: Sun Dec 20 01:25:02 JST 2009
@@ -41,7 +41,7 @@ Last Modified: Sun Dec 20 00:09:12 JST 2009

What's It?

PDFMiner is a suite of programs that help -extracting some meaningful informatin out of PDF documents. +extracting some meaningful information out of PDF documents. Unlike other PDF-related tools, it focuses entirely on getting and analyzing text data from PDFs. PDFMiner allows to obtain the exact location of texts in a page, as well as @@ -95,7 +95,7 @@ http://pdf2html.tabesugi.net:8080/

  1. Install Python 2.4 or newer.
  2. Download the PDFMiner source. -
  3. Extract it. +
  4. Unpack it.
  5. Run setup.py to install:
     # python setup.py install
    @@ -344,7 +344,7 @@ no stream header is displayed for the ease of saving it to a file.
     

    Changes

      -
    • 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for opensourcing them. +
    • 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them.
    • 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
    • 2009/10/31: SGML output format is changed and renamed as XML.
    • 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation. diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 4d39103..69c25b8 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -3,7 +3,7 @@ import sys from pdfdevice import PDFDevice, PDFTextDevice from pdffont import PDFUnicodeNotDefined from layout import LayoutContainer -from layout import LTPage, LTText, LTLine, LTRect +from layout import LTPage, LTText, LTLine, LTRect, LTPolygon from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine from utils import enc from utils import apply_matrix_pt, mult_matrix @@ -116,12 +116,7 @@ class PDFPageAggregator(PDFTextDevice): (_,x1,y1) = path[1] (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) - if y0 == y1: - # horizontal ruler - self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1))) - elif x0 == x1: - # vertical ruler - self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1))) + self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1))) elif shape == 'mlllh': # rectangle (_,x0,y0) = path[0] @@ -135,6 +130,13 @@ class PDFPageAggregator(PDFTextDevice): if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) + else: + # other polygon + pts = [] + for p in path: + for i in xrange(1, len(p), 2): + pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1]))) + self.cur_item.add(LTPolygon(gstate.linewidth, pts)) return def render_chars(self, matrix, font, fontsize, charspace, scaling, chars): @@ -177,10 +179,12 @@ class XMLConverter(PDFConverter): for child in item: render(child) self.outfp.write('\n') - elif isinstance(item, LTLine): + elif isinstance(item, LTLine) and item.direction: self.outfp.write('' % (item.linewidth, item.direction, item.get_bbox())) elif isinstance(item, LTRect): self.outfp.write('' % (item.linewidth, item.get_bbox())) + elif isinstance(item, 
LTPolygon): + self.outfp.write('' % (item.linewidth, item.get_bbox(), item.get_pts())) elif isinstance(item, LTFigure): self.outfp.write('
      \n' % (item.id, item.get_bbox())) for child in item: @@ -263,7 +267,7 @@ class HTMLConverter(PDFConverter): self.outfp.write('\n') if self.debug: self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) - elif isinstance(item, LTLine) or isinstance(item, LTRect): + elif isinstance(item, LTPolygon): self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) elif isinstance(item, LTTextLine): for child in item: diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 7a15ce7..1b940e4 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -6,6 +6,20 @@ from utils import apply_matrix_pt from utils import bsearch + +## get_bounds +## +def get_bounds(pts): + """Compute a minimal rectangle that covers all the points.""" + (x0, y0, x1, y1) = (INF, INF, -INF, -INF) + for (x,y) in pts: + x0 = min(x0, x) + y0 = min(y0, y) + x1 = max(x1, x) + y1 = max(y1, y) + return (x0,y0,x1,y1) + + ## LAParams ## class LAParams(object): @@ -228,24 +242,44 @@ class LayoutContainer(LayoutItem): return None +## LTPolygon +## +class LTPolygon(LayoutItem): + + def __init__(self, linewidth, pts): + LayoutItem.__init__(self, get_bounds(pts)) + self.pts = pts + self.linewidth = linewidth + return + + def get_pts(self): + return ','.join( '%.3f,%.3f' % p for p in self.pts ) + + ## LTLine ## -class LTLine(LayoutItem): +class LTLine(LTPolygon): - def __init__(self, linewidth, direction, bbox): - LayoutItem.__init__(self, bbox) - self.linewidth = linewidth - self.direction = direction + def __init__(self, linewidth, p0, p1): + (x0,y0) = p0 + (x1,y1) = p1 + self.direction = None + if y0 == y1: + # horizontal ruler + self.direction = 'H' + elif x0 == x1: + # vertical ruler + self.direction = 'V' + LTPolygon.__init__(self, linewidth, [p0, p1]) return ## LTRect ## -class LTRect(LayoutItem): +class LTRect(LTPolygon): - def __init__(self, linewidth, bbox): - LayoutItem.__init__(self, bbox) - self.linewidth = linewidth + def __init__(self, 
linewidth, (x0,y0,x1,y1)): + LTPolygon.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)]) return @@ -339,15 +373,8 @@ class LTFigure(LayoutContainer): def __init__(self, id, bbox, matrix): (x,y,w,h) = bbox - x0 = y0 = INF - x1 = y1 = -INF - for (p,q) in ((x,y),(x+w,y),(x,y+h),(x+w,y+h)): - (p,q) = apply_matrix_pt(matrix, (p,q)) - x0 = min(x0, p) - x1 = max(x1, p) - y0 = min(y0, q) - y1 = max(y1, q) - bbox = (x0,y0,x1,y1) + bbox = get_bounds( apply_matrix_pt(matrix, (p,q)) + for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) self.matrix = matrix LayoutContainer.__init__(self, id, bbox) return diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 1ed201f..fe544e6 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -518,7 +518,7 @@ class PDFCIDFont(PDFFont): try: self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical()) except CMapDB.CMapNotFound, e: - raise PDFFontError(e) + pass def get_width(seq): dic = {}