reorganize
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@95 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
fa13a28200
commit
13efd3faf4
118
pdflib/layout.py
118
pdflib/layout.py
|
@ -1,10 +1,124 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from pdfdevice import PageItem
|
||||
from utils import pick
|
||||
from utils import matrix2str, rect2str, point2str, pick, apply_matrix_norm
|
||||
INF = sys.maxint
|
||||
|
||||
|
||||
## PageItem
|
||||
##
|
||||
class PageItem(object):
    """Rectangular item on a page, described by its bounding box.

    The constructor stores the four coordinates as x0, y0, x1, y1 and the
    derived extents as width (x1-x0) and height (y1-y0).
    """

    def __init__(self, bbox):
        """Store the bounding box and derive its extent.

        bbox -- a 4-item sequence (x0, y0, x1, y1).
        """
        # Unpack in the body instead of in the signature: tuple parameters
        # were removed from the language by PEP 3113, whereas this form is
        # accepted everywhere and by existing positional callers.
        (x0, y0, x1, y1) = bbox
        # The ordering of the coordinates is not checked here (there is no
        # x0 <= x1 / y0 <= y1 assertion), so width/height may be negative.
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1-x0
        self.height = y1-y0
        return

    def __repr__(self):
        return ('<pageitem bbox=%s>' % (self.bbox()))

    def bbox(self):
        """Return the bounding box formatted by rect2str()."""
        return rect2str((self.x0, self.y0, self.x1, self.y1))

    def hoverlap(self, obj):
        """Return a horizontal overlap measure between this item and obj.

        Returns 0 when the x-ranges do not overlap (ranges that merely
        touch count as not overlapping); otherwise the smaller of the two
        distances between one item's left edge and the other's right edge.
        """
        assert isinstance(obj, PageItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        # NOTE(review): when one x-range lies strictly inside the other,
        # this value exceeds the geometric intersection length -- confirm
        # that callers expect this.
        return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))

    def voverlap(self, obj):
        """Return a vertical overlap measure between this item and obj.

        Returns 0 when the y-ranges do not overlap (ranges that merely
        touch count as not overlapping); otherwise the smaller of the two
        distances between one item's lower edge and the other's upper edge.
        """
        assert isinstance(obj, PageItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        # NOTE(review): same caveat as hoverlap() for nested y-ranges.
        return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||
|
||||
|
||||
class PageContainer(PageItem):
    """A PageItem that additionally keeps a list of child objects (objs)."""

    def __init__(self, bbox):
        """Set up the bounding box via PageItem and start with no children."""
        PageItem.__init__(self, bbox)
        self.objs = []

    def add(self, obj):
        """Append obj to the end of the objs list."""
        self.objs.append(obj)
|
||||
|
||||
class Page(PageContainer):
    """Container for one page: an id, a bounding box and a rotate value."""

    def __init__(self, id, bbox, rotate=0):
        """Record the page id and rotate value on top of the container setup."""
        PageContainer.__init__(self, bbox)
        self.id = id
        self.rotate = rotate

    def __repr__(self):
        fields = (self.id, self.bbox(), self.rotate)
        return '<page id=%r bbox=%s rotate=%r>' % fields
|
||||
|
||||
|
||||
## FigureItem
|
||||
##
|
||||
class FigureItem(PageContainer):
    """Container for a figure: an id plus a bounding box and child objects."""

    def __init__(self, id, bbox):
        """Record the figure id on top of the container setup."""
        PageContainer.__init__(self, bbox)
        self.id = id

    def __repr__(self):
        fields = (self.id, self.bbox())
        return '<figure id=%r bbox=%s>' % fields
|
||||
|
||||
|
||||
## TextItem
|
||||
##
|
||||
class TextItem(PageItem):
    """A run of characters rendered with a single font and matrix.

    The constructor computes the run's text, its advance vector (adv), an
    effective fontsize and the bounding box handed to PageItem.
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Build the item from text-state values and (char, cid) pairs.

        matrix    -- 6-item sequence; its last two items (tx, ty) seed the
                     bounding box.
        font      -- object providing is_vertical(), char_width(cid),
                     get_ascent(), get_descent() and, for vertical text,
                     char_disp(cid).
        fontsize  -- multiplier applied to the font metrics.
        charspace -- extra advance added once per character.
        scaling   -- multiplied by .01 when computing the advance
                     (presumably a percentage -- confirm with the caller).
        chars     -- non-empty sequence of (char, cid) pairs.
        """
        assert chars
        self.matrix = matrix
        self.font = font
        (_a, _b, _c, _d, tx, ty) = matrix
        self.vertical = font.is_vertical()
        self.text = ''.join([char for (char, _cid) in chars])
        # Total advance of the run along its writing direction.
        total_width = 0
        for (_char, cid) in chars:
            total_width += font.char_width(cid)
        advance = (total_width * fontsize + len(chars)*charspace) * scaling * .01
        # Ascent-to-descent extent of the font at this size.
        extent = (font.get_ascent() - font.get_descent()) * fontsize
        if self.vertical:
            # vertical text: the displacement is taken from the first
            # character only.
            (_char, first_cid) = chars[0]
            shift = (1000-font.char_disp(first_cid))*fontsize*.001
            (_x, disp) = apply_matrix_norm(matrix, (0, shift))
            (dx, dy) = apply_matrix_norm(matrix, (extent, advance))
            tx -= dx/2
            ty += disp
            self.adv = (0, dy)
            bbox = (tx, ty+dy, tx+dx, ty)
        else:
            # horizontal text: normalise the flag to a plain False.
            self.vertical = False
            (dx, dy) = apply_matrix_norm(matrix, (advance, extent))
            (_x, descent) = apply_matrix_norm(matrix, (0, font.get_descent() * fontsize))
            ty += descent
            self.adv = (dx, 0)
            bbox = (tx, ty, tx+dx, ty+dy)
        self.fontsize = max(apply_matrix_norm(matrix, (extent, extent)))
        PageItem.__init__(self, bbox)

    def __len__(self):
        """Return the number of characters in the text."""
        return len(self.text)

    def __repr__(self):
        template = '<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>'
        return template % (matrix2str(self.matrix), self.font, self.fontsize,
                           self.bbox(), point2str(self.adv), self.text)
|
||||
|
||||
|
||||
## bsearch
|
||||
##
|
||||
## Finds objects whose coordinates overlap with [v0,v1].
|
||||
|
|
|
@ -2,7 +2,8 @@
|
|||
import sys
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfdevice import PDFDevice, PageItem, Page, FigureItem, TextItem, PDFPageAggregator
|
||||
from pdfdevice import PDFDevice, PDFPageAggregator
|
||||
from layout import Page, FigureItem, TextItem, cluster_textobjs
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from cmap import CMapDB
|
||||
|
||||
|
@ -79,7 +80,6 @@ class HTMLConverter(PDFConverter):
|
|||
return
|
||||
|
||||
def end_page(self, page):
|
||||
from cluster import cluster_textobjs
|
||||
page = PDFConverter.end_page(self, page)
|
||||
self.yoffset += page.y1
|
||||
if self.pagenum:
|
||||
|
@ -142,7 +142,6 @@ class TextConverter(PDFConverter):
|
|||
return
|
||||
|
||||
def end_page(self, page):
|
||||
from cluster import cluster_textobjs
|
||||
page = PDFConverter.end_page(self, page)
|
||||
if self.pagenum:
|
||||
self.outfp.write('Page %d\n' % page.id)
|
||||
|
|
|
@ -3,8 +3,8 @@ import sys
|
|||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \
|
||||
matrix2str, rect2str, point2str
|
||||
from layout import PageItem, Page, FigureItem, TextItem
|
||||
from utils import mult_matrix, translate_matrix
|
||||
|
||||
|
||||
## PDFDevice
|
||||
|
@ -52,121 +52,6 @@ class PDFDevice(object):
|
|||
return
|
||||
|
||||
|
||||
## Page
|
||||
##
|
||||
class PageItem(object):
    """Rectangular item on a page, described by its bounding box.

    The constructor stores the four coordinates as x0, y0, x1, y1 and the
    derived extents as width (x1-x0) and height (y1-y0).
    """

    def __init__(self, bbox):
        """Store the bounding box and derive its extent.

        bbox -- a 4-item sequence (x0, y0, x1, y1).
        """
        # Unpack in the body instead of in the signature: tuple parameters
        # were removed from the language by PEP 3113, whereas this form is
        # accepted everywhere and by existing positional callers.
        (x0, y0, x1, y1) = bbox
        # The ordering of the coordinates is not checked here (there is no
        # x0 <= x1 / y0 <= y1 assertion), so width/height may be negative.
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1-x0
        self.height = y1-y0
        return

    def __repr__(self):
        return ('<pageitem bbox=%s>' % (self.bbox()))

    def bbox(self):
        """Return the bounding box formatted by rect2str()."""
        return rect2str((self.x0, self.y0, self.x1, self.y1))

    def hoverlap(self, obj):
        """Return a horizontal overlap measure between this item and obj.

        Returns 0 when the x-ranges do not overlap (ranges that merely
        touch count as not overlapping); otherwise the smaller of the two
        distances between one item's left edge and the other's right edge.
        """
        assert isinstance(obj, PageItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        # NOTE(review): when one x-range lies strictly inside the other,
        # this value exceeds the geometric intersection length -- confirm
        # that callers expect this.
        return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))

    def voverlap(self, obj):
        """Return a vertical overlap measure between this item and obj.

        Returns 0 when the y-ranges do not overlap (ranges that merely
        touch count as not overlapping); otherwise the smaller of the two
        distances between one item's lower edge and the other's upper edge.
        """
        assert isinstance(obj, PageItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        # NOTE(review): same caveat as hoverlap() for nested y-ranges.
        return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||
|
||||
|
||||
class PageContainer(PageItem):
    """A PageItem that additionally keeps a list of child objects (objs)."""

    def __init__(self, bbox):
        """Set up the bounding box via PageItem and start with no children."""
        PageItem.__init__(self, bbox)
        self.objs = []

    def add(self, obj):
        """Append obj to the end of the objs list."""
        self.objs.append(obj)
|
||||
|
||||
class Page(PageContainer):
    """Container for one page: an id, a bounding box and a rotate value."""

    def __init__(self, id, bbox, rotate=0):
        """Record the page id and rotate value on top of the container setup."""
        PageContainer.__init__(self, bbox)
        self.id = id
        self.rotate = rotate

    def __repr__(self):
        fields = (self.id, self.bbox(), self.rotate)
        return '<page id=%r bbox=%s rotate=%r>' % fields
|
||||
|
||||
|
||||
## FigureItem
|
||||
##
|
||||
class FigureItem(PageContainer):
    """Container for a figure: an id plus a bounding box and child objects."""

    def __init__(self, id, bbox):
        """Record the figure id on top of the container setup."""
        PageContainer.__init__(self, bbox)
        self.id = id

    def __repr__(self):
        fields = (self.id, self.bbox())
        return '<figure id=%r bbox=%s>' % fields
|
||||
|
||||
|
||||
## TextItem
|
||||
##
|
||||
class TextItem(PageItem):
    """A run of characters rendered with a single font and matrix.

    The constructor computes the run's text, its advance vector (adv), an
    effective fontsize and the bounding box handed to PageItem.
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Build the item from text-state values and (char, cid) pairs.

        matrix    -- 6-item sequence; its last two items (tx, ty) seed the
                     bounding box.
        font      -- object providing is_vertical(), char_width(cid),
                     get_ascent(), get_descent() and, for vertical text,
                     char_disp(cid).
        fontsize  -- multiplier applied to the font metrics.
        charspace -- extra advance added once per character.
        scaling   -- multiplied by .01 when computing the advance
                     (presumably a percentage -- confirm with the caller).
        chars     -- non-empty sequence of (char, cid) pairs.
        """
        assert chars
        self.matrix = matrix
        self.font = font
        (_a, _b, _c, _d, tx, ty) = matrix
        self.vertical = font.is_vertical()
        self.text = ''.join([char for (char, _cid) in chars])
        # Total advance of the run along its writing direction.
        total_width = 0
        for (_char, cid) in chars:
            total_width += font.char_width(cid)
        advance = (total_width * fontsize + len(chars)*charspace) * scaling * .01
        # Ascent-to-descent extent of the font at this size.
        extent = (font.get_ascent() - font.get_descent()) * fontsize
        if self.vertical:
            # vertical text: the displacement is taken from the first
            # character only.
            (_char, first_cid) = chars[0]
            shift = (1000-font.char_disp(first_cid))*fontsize*.001
            (_x, disp) = apply_matrix_norm(matrix, (0, shift))
            (dx, dy) = apply_matrix_norm(matrix, (extent, advance))
            tx -= dx/2
            ty += disp
            self.adv = (0, dy)
            bbox = (tx, ty+dy, tx+dx, ty)
        else:
            # horizontal text: normalise the flag to a plain False.
            self.vertical = False
            (dx, dy) = apply_matrix_norm(matrix, (advance, extent))
            (_x, descent) = apply_matrix_norm(matrix, (0, font.get_descent() * fontsize))
            ty += descent
            self.adv = (dx, 0)
            bbox = (tx, ty, tx+dx, ty+dy)
        self.fontsize = max(apply_matrix_norm(matrix, (extent, extent)))
        PageItem.__init__(self, bbox)

    def __len__(self):
        """Return the number of characters in the text."""
        return len(self.text)

    def __repr__(self):
        template = '<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>'
        return template % (matrix2str(self.matrix), self.font, self.fontsize,
                           self.bbox(), point2str(self.adv), self.text)
|
||||
|
||||
|
||||
## PDFPageAggregator
|
||||
##
|
||||
class PDFPageAggregator(PDFDevice):
|
||||
|
|
Loading…
Reference in New Issue