reorganize

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@95 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-04 08:59:27 +00:00
parent fa13a28200
commit 13efd3faf4
6 changed files with 120 additions and 122 deletions

0
pdflib/arcfour.py Executable file → Normal file
View File

View File

@ -1,10 +1,124 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from pdfdevice import PageItem from utils import matrix2str, rect2str, point2str, pick, apply_matrix_norm
from utils import pick
INF = sys.maxint INF = sys.maxint
## PageItem
##
class PageItem(object):
    """Rectangular region described by two corner points.

    The constructor stores the corners as x0, y0, x1, y1 and the derived
    width (x1-x0) and height (y1-y0).
    """

    def __init__(self, bbox):
        """Store the corners of bbox, a 4-element (x0, y0, x1, y1) sequence."""
        # Unpack in the body instead of the signature: tuple parameters
        # (``def f(self, (a, b))``) are a syntax error on Python 3, while
        # this form runs on both Python 2 and 3.  Callers still pass one
        # positional 4-element sequence, exactly as before.
        (x0, y0, x1, y1) = bbox
        # The corners are stored as given; x0 <= x1 and y0 <= y1 is
        # deliberately not enforced here.
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1 - x0
        self.height = y1 - y0

    def __repr__(self):
        return '<pageitem bbox=%s>' % (self.bbox())

    def bbox(self):
        """Return the four corner values formatted by rect2str."""
        return rect2str((self.x0, self.y0, self.x1, self.y1))

    def hoverlap(self, obj):
        """Return a measure of how far self and obj overlap along x.

        The result is 0 when the x-ranges are disjoint or merely touch,
        otherwise the smaller of |x0 - obj.x1| and |x1 - obj.x0|.
        """
        assert isinstance(obj, PageItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        # NOTE(review): when one range lies strictly inside the other this is
        # larger than the shared length (e.g. [0,10] vs [2,3] gives 3, not 1)
        # -- confirm that callers expect this before changing it.
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def voverlap(self, obj):
        """Return a measure of how far self and obj overlap along y.

        The result is 0 when the y-ranges are disjoint or merely touch,
        otherwise the smaller of |y0 - obj.y1| and |y1 - obj.y0|.
        """
        assert isinstance(obj, PageItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        # NOTE(review): same containment caveat as hoverlap -- confirm.
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
class PageContainer(PageItem):
    """A PageItem that additionally collects child objects in ``objs``."""

    def __init__(self, bbox):
        """Initialise the region from bbox and start with no children."""
        PageItem.__init__(self, bbox)
        # Children are kept in the order in which add() receives them.
        self.objs = []

    def add(self, obj):
        """Append obj to the end of this container's ``objs`` list."""
        self.objs.append(obj)
class Page(PageContainer):
    """Container for one page, carrying an ``id`` and a ``rotate`` value."""

    def __init__(self, id, bbox, rotate=0):
        """Set up the container for bbox, then record id and rotate."""
        PageContainer.__init__(self, bbox)
        self.id, self.rotate = id, rotate

    def __repr__(self):
        fields = (self.id, self.bbox(), self.rotate)
        return '<page id=%r bbox=%s rotate=%r>' % fields
## FigureItem
##
class FigureItem(PageContainer):
    """Container for a figure, identified by ``id``."""

    def __init__(self, id, bbox):
        """Set up the container for bbox, then record the figure id."""
        PageContainer.__init__(self, bbox)
        self.id = id

    def __repr__(self):
        fields = (self.id, self.bbox())
        return '<figure id=%r bbox=%s>' % fields
## TextItem
##
class TextItem(PageItem):
    """A run of characters placed with a single font and matrix.

    The constructor derives the joined ``text``, the advance vector
    ``adv``, an effective ``fontsize`` and the bounding box from the font's
    metrics and the given matrix.
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Build the item from chars, a non-empty sequence of (char, cid) pairs."""
        assert chars
        self.matrix = matrix
        self.font = font
        # The last two matrix entries serve as the box's starting point.
        (_, _, _, _, tx, ty) = self.matrix
        self.vertical = self.font.is_vertical()
        self.text = ''.join(char for (char, _) in chars)
        total_width = sum(font.char_width(cid) for (_, cid) in chars)
        # presumably scaling is a percentage, hence the .01 -- TODO confirm
        advance = (total_width * fontsize + len(chars) * charspace) * scaling * .01
        extent = (font.get_ascent() - font.get_descent()) * fontsize
        if self.vertical:
            # vertical text: only the first character's displacement is used
            (_, first_cid) = chars[0]
            shift = (1000 - font.char_disp(first_cid)) * fontsize * .001
            (_, disp) = apply_matrix_norm(self.matrix, (0, shift))
            (dx, dy) = apply_matrix_norm(self.matrix, (extent, advance))
            tx -= dx/2
            ty += disp
            self.adv = (0, dy)
            bbox = (tx, ty + dy, tx + dx, ty)
        else:
            # horizontal text
            self.vertical = False
            (dx, dy) = apply_matrix_norm(self.matrix, (advance, extent))
            drop = font.get_descent() * fontsize
            (_, descent) = apply_matrix_norm(self.matrix, (0, drop))
            # Move the box's y origin by the transformed descent.
            ty += descent
            self.adv = (dx, 0)
            bbox = (tx, ty, tx + dx, ty + dy)
        self.fontsize = max(apply_matrix_norm(self.matrix, (extent, extent)))
        PageItem.__init__(self, bbox)

    def __len__(self):
        """Return the number of characters in ``text``."""
        return len(self.text)

    def __repr__(self):
        template = '<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>'
        return template % (matrix2str(self.matrix), self.font, self.fontsize,
                           self.bbox(), point2str(self.adv), self.text)
## bsearch ## bsearch
## ##
## Finds objects whose coordinates overlap with [v0,v1]. ## Finds objects whose coordinates overlap with [v0,v1].

0
pdflib/lzw.py Executable file → Normal file
View File

View File

@ -2,7 +2,8 @@
import sys import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, PageItem, Page, FigureItem, TextItem, PDFPageAggregator from pdfdevice import PDFDevice, PDFPageAggregator
from layout import Page, FigureItem, TextItem, cluster_textobjs
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from cmap import CMapDB from cmap import CMapDB
@ -79,7 +80,6 @@ class HTMLConverter(PDFConverter):
return return
def end_page(self, page): def end_page(self, page):
from cluster import cluster_textobjs
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
self.yoffset += page.y1 self.yoffset += page.y1
if self.pagenum: if self.pagenum:
@ -142,7 +142,6 @@ class TextConverter(PDFConverter):
return return
def end_page(self, page): def end_page(self, page):
from cluster import cluster_textobjs
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
if self.pagenum: if self.pagenum:
self.outfp.write('Page %d\n' % page.id) self.outfp.write('Page %d\n' % page.id)

View File

@ -3,8 +3,8 @@ import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \ from layout import PageItem, Page, FigureItem, TextItem
matrix2str, rect2str, point2str from utils import mult_matrix, translate_matrix
## PDFDevice ## PDFDevice
@ -52,121 +52,6 @@ class PDFDevice(object):
return return
## Page
##
class PageItem(object):
    """Rectangular region described by two corner points.

    The constructor stores the corners as x0, y0, x1, y1 and the derived
    width (x1-x0) and height (y1-y0).
    """

    def __init__(self, bbox):
        """Store the corners of bbox, a 4-element (x0, y0, x1, y1) sequence."""
        # Unpack in the body instead of the signature: tuple parameters
        # (``def f(self, (a, b))``) are a syntax error on Python 3, while
        # this form runs on both Python 2 and 3.  Callers still pass one
        # positional 4-element sequence, exactly as before.
        (x0, y0, x1, y1) = bbox
        # The corners are stored as given; x0 <= x1 and y0 <= y1 is
        # deliberately not enforced here.
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1 - x0
        self.height = y1 - y0

    def __repr__(self):
        return '<pageitem bbox=%s>' % (self.bbox())

    def bbox(self):
        """Return the four corner values formatted by rect2str."""
        return rect2str((self.x0, self.y0, self.x1, self.y1))

    def hoverlap(self, obj):
        """Return a measure of how far self and obj overlap along x.

        The result is 0 when the x-ranges are disjoint or merely touch,
        otherwise the smaller of |x0 - obj.x1| and |x1 - obj.x0|.
        """
        assert isinstance(obj, PageItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        # NOTE(review): when one range lies strictly inside the other this is
        # larger than the shared length (e.g. [0,10] vs [2,3] gives 3, not 1)
        # -- confirm that callers expect this before changing it.
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def voverlap(self, obj):
        """Return a measure of how far self and obj overlap along y.

        The result is 0 when the y-ranges are disjoint or merely touch,
        otherwise the smaller of |y0 - obj.y1| and |y1 - obj.y0|.
        """
        assert isinstance(obj, PageItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        # NOTE(review): same containment caveat as hoverlap -- confirm.
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
class PageContainer(PageItem):
    """A PageItem that additionally collects child objects in ``objs``."""

    def __init__(self, bbox):
        """Initialise the region from bbox and start with no children."""
        PageItem.__init__(self, bbox)
        # Children are kept in the order in which add() receives them.
        self.objs = []

    def add(self, obj):
        """Append obj to the end of this container's ``objs`` list."""
        self.objs.append(obj)
class Page(PageContainer):
    """Container for one page, carrying an ``id`` and a ``rotate`` value."""

    def __init__(self, id, bbox, rotate=0):
        """Set up the container for bbox, then record id and rotate."""
        PageContainer.__init__(self, bbox)
        self.id, self.rotate = id, rotate

    def __repr__(self):
        fields = (self.id, self.bbox(), self.rotate)
        return '<page id=%r bbox=%s rotate=%r>' % fields
## FigureItem
##
class FigureItem(PageContainer):
    """Container for a figure, identified by ``id``."""

    def __init__(self, id, bbox):
        """Set up the container for bbox, then record the figure id."""
        PageContainer.__init__(self, bbox)
        self.id = id

    def __repr__(self):
        fields = (self.id, self.bbox())
        return '<figure id=%r bbox=%s>' % fields
## TextItem
##
class TextItem(PageItem):
    """A run of characters placed with a single font and matrix.

    The constructor derives the joined ``text``, the advance vector
    ``adv``, an effective ``fontsize`` and the bounding box from the font's
    metrics and the given matrix.
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Build the item from chars, a non-empty sequence of (char, cid) pairs."""
        assert chars
        self.matrix = matrix
        self.font = font
        # The last two matrix entries serve as the box's starting point.
        (_, _, _, _, tx, ty) = self.matrix
        self.vertical = self.font.is_vertical()
        self.text = ''.join(char for (char, _) in chars)
        total_width = sum(font.char_width(cid) for (_, cid) in chars)
        # presumably scaling is a percentage, hence the .01 -- TODO confirm
        advance = (total_width * fontsize + len(chars) * charspace) * scaling * .01
        extent = (font.get_ascent() - font.get_descent()) * fontsize
        if self.vertical:
            # vertical text: only the first character's displacement is used
            (_, first_cid) = chars[0]
            shift = (1000 - font.char_disp(first_cid)) * fontsize * .001
            (_, disp) = apply_matrix_norm(self.matrix, (0, shift))
            (dx, dy) = apply_matrix_norm(self.matrix, (extent, advance))
            tx -= dx/2
            ty += disp
            self.adv = (0, dy)
            bbox = (tx, ty + dy, tx + dx, ty)
        else:
            # horizontal text
            self.vertical = False
            (dx, dy) = apply_matrix_norm(self.matrix, (advance, extent))
            drop = font.get_descent() * fontsize
            (_, descent) = apply_matrix_norm(self.matrix, (0, drop))
            # Move the box's y origin by the transformed descent.
            ty += descent
            self.adv = (dx, 0)
            bbox = (tx, ty, tx + dx, ty + dy)
        self.fontsize = max(apply_matrix_norm(self.matrix, (extent, extent)))
        PageItem.__init__(self, bbox)

    def __len__(self):
        """Return the number of characters in ``text``."""
        return len(self.text)

    def __repr__(self):
        template = '<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>'
        return template % (matrix2str(self.matrix), self.font, self.fontsize,
                           self.bbox(), point2str(self.adv), self.text)
## PDFPageAggregator ## PDFPageAggregator
## ##
class PDFPageAggregator(PDFDevice): class PDFPageAggregator(PDFDevice):

0
pdflib/pdfparser.py Executable file → Normal file
View File