reorganize

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@95 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-04 08:59:27 +00:00
parent fa13a28200
commit 13efd3faf4
6 changed files with 120 additions and 122 deletions

0
pdflib/arcfour.py Executable file → Normal file
View File

View File

@ -1,10 +1,124 @@
#!/usr/bin/env python
import sys
from pdfdevice import PageItem
from utils import pick
from utils import matrix2str, rect2str, point2str, pick, apply_matrix_norm
# Largest "ordinary" integer, used as an infinity stand-in.
# sys.maxint only exists on Python 2; fall back to sys.maxsize where it
# has been removed so the module still imports.
try:
    INF = sys.maxint
except AttributeError:
    INF = sys.maxsize
## PageItem
##
class PageItem(object):
    """Base class for an object on a page, described by a bounding box.

    The constructor stores the four corners as x0, y0, x1, y1 and the
    derived width (x1-x0) and height (y1-y0).
    """

    def __init__(self, bbox):
        """Store the corners of *bbox* and the derived width/height.

        bbox: a 4-item sequence (x0, y0, x1, y1).  It is unpacked in the
        body instead of in the signature because tuple parameters are a
        syntax error on Python 3; positional callers are unaffected.
        """
        (x0, y0, x1, y1) = bbox
        # The corners are not checked for ordering (x0 <= x1, y0 <= y1),
        # so width and height may come out negative.
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1 - x0
        self.height = y1 - y0

    def __repr__(self):
        return '<pageitem bbox=%s>' % (self.bbox())

    def bbox(self):
        """Return the bounding box formatted by rect2str()."""
        return rect2str((self.x0, self.y0, self.x1, self.y1))

    def hoverlap(self, obj):
        """Return a measure of how far self and obj overlap horizontally.

        Returns 0 when the x-extents are disjoint or only touch; otherwise
        the smaller of |self.x0-obj.x1| and |self.x1-obj.x0|.
        """
        assert isinstance(obj, PageItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        # NOTE(review): when one extent fully contains the other this is
        # larger than the geometric overlap -- confirm callers expect that.
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def voverlap(self, obj):
        """Return a measure of how far self and obj overlap vertically.

        Returns 0 when the y-extents are disjoint or only touch; otherwise
        the smaller of |self.y0-obj.y1| and |self.y1-obj.y0|.
        """
        assert isinstance(obj, PageItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        # NOTE(review): same containment caveat as hoverlap().
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
class PageContainer(PageItem):
    """A PageItem that additionally keeps a list of child objects."""

    def __init__(self, bbox):
        """Set up the bounding box, then start with an empty ``objs`` list."""
        PageItem.__init__(self, bbox)
        self.objs = list()

    def add(self, obj):
        """Append *obj* to the end of ``objs``."""
        children = self.objs
        children.append(obj)
class Page(PageContainer):
    """A container for one page, tagged with an id and a rotate value."""

    def __init__(self, id, bbox, rotate=0):
        """Initialise the container with *bbox* and record id and rotate."""
        PageContainer.__init__(self, bbox)
        (self.id, self.rotate) = (id, rotate)

    def __repr__(self):
        fields = (self.id, self.bbox(), self.rotate)
        return '<page id=%r bbox=%s rotate=%r>' % fields
## FigureItem
##
class FigureItem(PageContainer):
    """A container for a figure, tagged with an id."""

    def __init__(self, id, bbox):
        """Initialise the container with *bbox* and record the id."""
        PageContainer.__init__(self, bbox)
        self.id = id

    def __repr__(self):
        fields = (self.id, self.bbox())
        return '<figure id=%r bbox=%s>' % fields
## TextItem
##
class TextItem(PageItem):
    """A run of characters drawn with a single font and text matrix.

    Besides the PageItem bounding box, the constructor sets:
      matrix, font -- stored as given
      vertical     -- font.is_vertical() (forced to False for horizontal text)
      text         -- the chars of the (char, cid) pairs, joined together
      adv          -- (dx, 0) for horizontal text, (0, dy) for vertical text
      fontsize     -- max of the transformed (size, size) pair; note this is
                      not the raw ``fontsize`` argument
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Compute text, advance and bounding box from (char, cid) pairs.

        chars must be non-empty.  scaling is applied as a percentage
        (the advance is multiplied by scaling * .01).
        """
        assert chars
        self.matrix = matrix
        self.font = font
        # Only the last two of the six matrix entries are used directly.
        (_a, _b, _c, _d, origin_x, origin_y) = self.matrix
        self.vertical = self.font.is_vertical()
        self.text = ''.join(char for (char, _cid) in chars)
        glyph_total = sum(font.char_width(cid) for (_char, cid) in chars)
        advance = (glyph_total * fontsize + len(chars)*charspace) * scaling * .01
        size = (font.get_ascent() - font.get_descent()) * fontsize
        # NOTE(review): apply_matrix_norm presumably applies the matrix
        # without its translation part -- confirm in utils.
        if self.vertical:
            # vertical text: the box is centred on the origin's x and is
            # shifted in y by a displacement taken from the first character.
            (_char, first_cid) = chars[0]
            shift = (1000-font.char_disp(first_cid))*fontsize*.001
            (_unused, disp) = apply_matrix_norm(self.matrix, (0, shift))
            (dx, dy) = apply_matrix_norm(self.matrix, (size, advance))
            left = origin_x - dx/2
            top = origin_y + disp
            self.adv = (0, dy)
            bbox = (left, top+dy, left+dx, top)
        else:
            # horizontal text: the box starts at the origin, moved in y by
            # the transformed font descent.
            self.vertical = False
            (dx, dy) = apply_matrix_norm(self.matrix, (advance, size))
            drop = font.get_descent() * fontsize
            (_unused, descent) = apply_matrix_norm(self.matrix, (0, drop))
            bottom = origin_y + descent
            self.adv = (dx, 0)
            bbox = (origin_x, bottom, origin_x+dx, bottom+dy)
        self.fontsize = max(apply_matrix_norm(self.matrix, (size, size)))
        PageItem.__init__(self, bbox)

    def __len__(self):
        """Return the number of characters in ``text``."""
        return len(self.text)

    def __repr__(self):
        fields = (matrix2str(self.matrix), self.font, self.fontsize,
                  self.bbox(), point2str(self.adv), self.text)
        return '<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' % fields
## bsearch
##
## Finds objects whose coordinates overlap with [v0,v1].

0
pdflib/lzw.py Executable file → Normal file
View File

View File

@ -2,7 +2,8 @@
import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, PageItem, Page, FigureItem, TextItem, PDFPageAggregator
from pdfdevice import PDFDevice, PDFPageAggregator
from layout import Page, FigureItem, TextItem, cluster_textobjs
from pdffont import PDFUnicodeNotDefined
from cmap import CMapDB
@ -79,7 +80,6 @@ class HTMLConverter(PDFConverter):
return
def end_page(self, page):
from cluster import cluster_textobjs
page = PDFConverter.end_page(self, page)
self.yoffset += page.y1
if self.pagenum:
@ -142,7 +142,6 @@ class TextConverter(PDFConverter):
return
def end_page(self, page):
from cluster import cluster_textobjs
page = PDFConverter.end_page(self, page)
if self.pagenum:
self.outfp.write('Page %d\n' % page.id)

View File

@ -3,8 +3,8 @@ import sys
stdout = sys.stdout
stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \
matrix2str, rect2str, point2str
from layout import PageItem, Page, FigureItem, TextItem
from utils import mult_matrix, translate_matrix
## PDFDevice
@ -52,121 +52,6 @@ class PDFDevice(object):
return
## Page
##
class PageItem(object):
    """Base class for an object on a page, described by a bounding box.

    The constructor stores the four corners as x0, y0, x1, y1 and the
    derived width (x1-x0) and height (y1-y0).
    """

    def __init__(self, bbox):
        """Store the corners of *bbox* and the derived width/height.

        bbox: a 4-item sequence (x0, y0, x1, y1).  It is unpacked in the
        body instead of in the signature because tuple parameters are a
        syntax error on Python 3; positional callers are unaffected.
        """
        (x0, y0, x1, y1) = bbox
        # The corners are not checked for ordering (x0 <= x1, y0 <= y1),
        # so width and height may come out negative.
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1 - x0
        self.height = y1 - y0

    def __repr__(self):
        return '<pageitem bbox=%s>' % (self.bbox())

    def bbox(self):
        """Return the bounding box formatted by rect2str()."""
        return rect2str((self.x0, self.y0, self.x1, self.y1))

    def hoverlap(self, obj):
        """Return a measure of how far self and obj overlap horizontally.

        Returns 0 when the x-extents are disjoint or only touch; otherwise
        the smaller of |self.x0-obj.x1| and |self.x1-obj.x0|.
        """
        assert isinstance(obj, PageItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        # NOTE(review): when one extent fully contains the other this is
        # larger than the geometric overlap -- confirm callers expect that.
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def voverlap(self, obj):
        """Return a measure of how far self and obj overlap vertically.

        Returns 0 when the y-extents are disjoint or only touch; otherwise
        the smaller of |self.y0-obj.y1| and |self.y1-obj.y0|.
        """
        assert isinstance(obj, PageItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        # NOTE(review): same containment caveat as hoverlap().
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
class PageContainer(PageItem):
    """A PageItem that additionally keeps a list of child objects."""

    def __init__(self, bbox):
        """Set up the bounding box, then start with an empty ``objs`` list."""
        PageItem.__init__(self, bbox)
        self.objs = list()

    def add(self, obj):
        """Append *obj* to the end of ``objs``."""
        children = self.objs
        children.append(obj)
class Page(PageContainer):
    """A container for one page, tagged with an id and a rotate value."""

    def __init__(self, id, bbox, rotate=0):
        """Initialise the container with *bbox* and record id and rotate."""
        PageContainer.__init__(self, bbox)
        (self.id, self.rotate) = (id, rotate)

    def __repr__(self):
        fields = (self.id, self.bbox(), self.rotate)
        return '<page id=%r bbox=%s rotate=%r>' % fields
## FigureItem
##
class FigureItem(PageContainer):
    """A container for a figure, tagged with an id."""

    def __init__(self, id, bbox):
        """Initialise the container with *bbox* and record the id."""
        PageContainer.__init__(self, bbox)
        self.id = id

    def __repr__(self):
        fields = (self.id, self.bbox())
        return '<figure id=%r bbox=%s>' % fields
## TextItem
##
class TextItem(PageItem):
    """A run of characters drawn with a single font and text matrix.

    Besides the PageItem bounding box, the constructor sets:
      matrix, font -- stored as given
      vertical     -- font.is_vertical() (forced to False for horizontal text)
      text         -- the chars of the (char, cid) pairs, joined together
      adv          -- (dx, 0) for horizontal text, (0, dy) for vertical text
      fontsize     -- max of the transformed (size, size) pair; note this is
                      not the raw ``fontsize`` argument
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Compute text, advance and bounding box from (char, cid) pairs.

        chars must be non-empty.  scaling is applied as a percentage
        (the advance is multiplied by scaling * .01).
        """
        assert chars
        self.matrix = matrix
        self.font = font
        # Only the last two of the six matrix entries are used directly.
        (_a, _b, _c, _d, origin_x, origin_y) = self.matrix
        self.vertical = self.font.is_vertical()
        self.text = ''.join(char for (char, _cid) in chars)
        glyph_total = sum(font.char_width(cid) for (_char, cid) in chars)
        advance = (glyph_total * fontsize + len(chars)*charspace) * scaling * .01
        size = (font.get_ascent() - font.get_descent()) * fontsize
        # NOTE(review): apply_matrix_norm presumably applies the matrix
        # without its translation part -- confirm in utils.
        if self.vertical:
            # vertical text: the box is centred on the origin's x and is
            # shifted in y by a displacement taken from the first character.
            (_char, first_cid) = chars[0]
            shift = (1000-font.char_disp(first_cid))*fontsize*.001
            (_unused, disp) = apply_matrix_norm(self.matrix, (0, shift))
            (dx, dy) = apply_matrix_norm(self.matrix, (size, advance))
            left = origin_x - dx/2
            top = origin_y + disp
            self.adv = (0, dy)
            bbox = (left, top+dy, left+dx, top)
        else:
            # horizontal text: the box starts at the origin, moved in y by
            # the transformed font descent.
            self.vertical = False
            (dx, dy) = apply_matrix_norm(self.matrix, (advance, size))
            drop = font.get_descent() * fontsize
            (_unused, descent) = apply_matrix_norm(self.matrix, (0, drop))
            bottom = origin_y + descent
            self.adv = (dx, 0)
            bbox = (origin_x, bottom, origin_x+dx, bottom+dy)
        self.fontsize = max(apply_matrix_norm(self.matrix, (size, size)))
        PageItem.__init__(self, bbox)

    def __len__(self):
        """Return the number of characters in ``text``."""
        return len(self.text)

    def __repr__(self):
        fields = (matrix2str(self.matrix), self.font, self.fontsize,
                  self.bbox(), point2str(self.adv), self.text)
        return '<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' % fields
## PDFPageAggregator
##
class PDFPageAggregator(PDFDevice):

0
pdflib/pdfparser.py Executable file → Normal file
View File