diff --git a/pdflib/arcfour.py b/pdflib/arcfour.py old mode 100755 new mode 100644 diff --git a/pdflib/layout.py b/pdflib/layout.py index ef18a5e..9d67771 100644 --- a/pdflib/layout.py +++ b/pdflib/layout.py @@ -1,10 +1,124 @@ #!/usr/bin/env python import sys -from pdfdevice import PageItem -from utils import pick +from utils import matrix2str, rect2str, point2str, pick, apply_matrix_norm INF = sys.maxint +## PageItem +## +class PageItem(object): + + def __init__(self, (x0,y0,x1,y1)): + #assert x0 <= x1 and y0 <= y1 + self.x0 = x0 + self.y0 = y0 + self.x1 = x1 + self.y1 = y1 + self.width = x1-x0 + self.height = y1-y0 + return + + def __repr__(self): + return ('' % (self.bbox())) + + def bbox(self): + return rect2str((self.x0, self.y0, self.x1, self.y1)) + + def hoverlap(self, obj): + assert isinstance(obj, PageItem) + if self.x1 <= obj.x0 or obj.x1 <= self.x0: + return 0 + else: + return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) + + def voverlap(self, obj): + assert isinstance(obj, PageItem) + if self.y1 <= obj.y0 or obj.y1 <= self.y0: + return 0 + else: + return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) + + +class PageContainer(PageItem): + + def __init__(self, bbox): + PageItem.__init__(self, bbox) + self.objs = [] + return + + def add(self, obj): + self.objs.append(obj) + return + +class Page(PageContainer): + + def __init__(self, id, bbox, rotate=0): + PageContainer.__init__(self, bbox) + self.id = id + self.rotate = rotate + return + + def __repr__(self): + return ('' % (self.id, self.bbox(), self.rotate)) + + +## FigureItem +## +class FigureItem(PageContainer): + + def __init__(self, id, bbox): + PageContainer.__init__(self, bbox) + self.id = id + return + + def __repr__(self): + return ('
' % (self.id, self.bbox())) + + +## TextItem +## +class TextItem(PageItem): + + def __init__(self, matrix, font, fontsize, charspace, scaling, chars): + assert chars + self.matrix = matrix + self.font = font + (_,_,_,_,tx,ty) = self.matrix + self.vertical = self.font.is_vertical() + self.text = ''.join( char for (char,_) in chars ) + adv = sum( font.char_width(cid) for (_,cid) in chars ) + adv = (adv * fontsize + len(chars)*charspace) * scaling * .01 + size = (font.get_ascent() - font.get_descent()) * fontsize + if not self.vertical: + # horizontal text + self.vertical = False + (dx,dy) = apply_matrix_norm(self.matrix, (adv,size)) + (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) + ty += descent + self.adv = (dx, 0) + bbox = (tx, ty, tx+dx, ty+dy) + else: + # vertical text + (_,cid) = chars[0] + (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001)) + (dx,dy) = apply_matrix_norm(self.matrix, (size,adv)) + tx -= dx/2 + ty += disp + self.adv = (0, dy) + bbox = (tx, ty+dy, tx+dx, ty) + self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) + PageItem.__init__(self, bbox) + return + + def __len__(self): + return len(self.text) + + def __repr__(self): + return ('' % + (matrix2str(self.matrix), self.font, self.fontsize, self.bbox(), + point2str(self.adv), self.text)) + + ## bsearch ## ## Finds objects whose coordinates overlap with [v0,v1]. diff --git a/pdflib/lzw.py b/pdflib/lzw.py old mode 100755 new mode 100644 diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index b20cbc7..11ed698 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -2,7 +2,8 @@ import sys from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfinterp import PDFResourceManager, PDFPageInterpreter -from pdfdevice import PDFDevice, PageItem, Page, FigureItem, TextItem, PDFPageAggregator +from pdfdevice import PDFDevice, PDFPageAggregator +from layout import Page, FigureItem, TextItem, cluster_textobjs from pdffont import PDFUnicodeNotDefined from cmap import CMapDB @@ -79,7 +80,6 @@ class HTMLConverter(PDFConverter): return def end_page(self, page): - from cluster import cluster_textobjs page = PDFConverter.end_page(self, page) self.yoffset += page.y1 if self.pagenum: @@ -142,7 +142,6 @@ class TextConverter(PDFConverter): return def end_page(self, page): - from cluster import cluster_textobjs page = PDFConverter.end_page(self, page) if self.pagenum: self.outfp.write('Page %d\n' % page.id) diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index 7f08bc1..eacff71 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -3,8 +3,8 @@ import sys stdout = sys.stdout stderr = sys.stderr from pdffont import PDFUnicodeNotDefined -from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \ - matrix2str, rect2str, point2str +from layout import PageItem, Page, FigureItem, TextItem +from utils import mult_matrix, translate_matrix ## PDFDevice @@ -52,121 +52,6 @@ class PDFDevice(object): return -## Page -## -class PageItem(object): - - def __init__(self, (x0,y0,x1,y1)): - #assert x0 <= x1 and y0 <= y1 - self.x0 = x0 - self.y0 = y0 - self.x1 = x1 - self.y1 = y1 - self.width = x1-x0 - self.height = y1-y0 - return - - def __repr__(self): - return ('' % (self.bbox())) - - def bbox(self): - return rect2str((self.x0, self.y0, self.x1, self.y1)) - - def hoverlap(self, obj): - assert isinstance(obj, PageItem) - if self.x1 <= obj.x0 or obj.x1 <= self.x0: - return 0 - else: - return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) - - def voverlap(self, obj): - assert isinstance(obj, PageItem) - if self.y1 <= obj.y0 or obj.y1 <= self.y0: - return 0 - else: - return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) - - -class PageContainer(PageItem): - - def __init__(self, bbox): - PageItem.__init__(self, bbox) - self.objs = [] - return - - def add(self, obj): - self.objs.append(obj) - return - -class Page(PageContainer): - - def __init__(self, id, bbox, rotate=0): - PageContainer.__init__(self, bbox) - self.id = id - self.rotate = rotate - return - - def __repr__(self): - return ('' % (self.id, self.bbox(), self.rotate)) - - -## FigureItem -## -class FigureItem(PageContainer): - - def __init__(self, id, bbox): - PageContainer.__init__(self, bbox) - self.id = id - return - - def __repr__(self): - return ('
' % (self.id, self.bbox())) - - -## TextItem -## -class TextItem(PageItem): - - def __init__(self, matrix, font, fontsize, charspace, scaling, chars): - assert chars - self.matrix = matrix - self.font = font - (_,_,_,_,tx,ty) = self.matrix - self.vertical = self.font.is_vertical() - self.text = ''.join( char for (char,_) in chars ) - adv = sum( font.char_width(cid) for (_,cid) in chars ) - adv = (adv * fontsize + len(chars)*charspace) * scaling * .01 - size = (font.get_ascent() - font.get_descent()) * fontsize - if not self.vertical: - # horizontal text - self.vertical = False - (dx,dy) = apply_matrix_norm(self.matrix, (adv,size)) - (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) - ty += descent - self.adv = (dx, 0) - bbox = (tx, ty, tx+dx, ty+dy) - else: - # vertical text - (_,cid) = chars[0] - (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001)) - (dx,dy) = apply_matrix_norm(self.matrix, (size,adv)) - tx -= dx/2 - ty += disp - self.adv = (0, dy) - bbox = (tx, ty+dy, tx+dx, ty) - self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) - PageItem.__init__(self, bbox) - return - - def __len__(self): - return len(self.text) - - def __repr__(self): - return ('' % - (matrix2str(self.matrix), self.font, self.fontsize, self.bbox(), - point2str(self.adv), self.text)) - - ## PDFPageAggregator ## class PDFPageAggregator(PDFDevice): diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py old mode 100755 new mode 100644