diff --git a/pdflib/layout.py b/pdflib/page.py similarity index 90% rename from pdflib/layout.py rename to pdflib/page.py index f6395af..ea458a3 100644 --- a/pdflib/layout.py +++ b/pdflib/page.py @@ -49,6 +49,7 @@ def bsearch(objs, v0, v1): ## reorder_hv, reorder_vh +## chop_hv, chop_vh ## ## Reorders objects according to its writing direction. ## @@ -210,7 +211,7 @@ class LayoutItem(object): return 0 def get_direction(self): - return False + return None ## LayoutContainer @@ -227,7 +228,7 @@ class LayoutContainer(LayoutItem): return def __repr__(self): - return ('' % (self.get_bbox(), len(self.objs))) + return ('' % (self.get_bbox())) def __iter__(self): return iter(self.objs) @@ -267,8 +268,14 @@ class LayoutContainer(LayoutItem): return self.weight def get_direction(self): - return ((sum( obj.get_weight() for obj in self.objs )/2) < - sum( obj.get_weight() for obj in self.objs if obj.get_direction() )) + if not self.objs: return None + d = {} + for obj in self.objs: + k = obj.get_direction() + if k not in d: d[k] = 0 + d[k] += 1 + (direction,_) = sorted(d.iteritems(), key=lambda (k,v):v)[0] + return direction ## FigureItem @@ -327,7 +334,7 @@ class TextItem(LayoutItem): def get_weight(self): return len(self.text) - def get_direction(self): + def is_vertical(self): return self.vertical @@ -340,40 +347,41 @@ class TextBox(LayoutContainer): def __init__(self, objs): LayoutContainer.__init__(self, None, (0,0,0,0), objs) - self.vertical = False + self.direction = None return + def __repr__(self): + return ('' % (self.get_bbox(), self.direction)) + def fixate(self): LayoutContainer.fixate(self) + self.direction = 'H' for obj in self.objs: - self.vertical = bool(obj.get_direction()) + if obj.is_vertical(): + self.direction = 'V' break if 2 <= len(self.objs): objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) if objs[0].get_weight() == 1 and objs[1].get_weight() == 1: h = objs[0].voverlap(objs[1]) v = objs[0].hoverlap(objs[1]) - self.vertical = (h < v) + if h < v: + self.direction = 'V' + if self.direction == 'H': + self.lines = reorder_vh(self.objs, +1) + else: + self.lines = reorder_hv(self.objs, -1) + self.objs = [] + for line in self.lines: + self.objs.extend(line) return def get_direction(self): - return self.vertical + return self.direction def get_lines(self, ratio): - if self.get_direction(): - for line in reorder_hv(self.objs, -1): - s = '' - y0 = -INF - for obj in line: - if not isinstance(obj, TextItem): continue - margin = obj.get_margin(ratio) - if obj.y1+margin < y0: - s += ' ' - s += obj.text - y0 = obj.y0 - yield s - else: - for line in reorder_vh(self.objs, +1): + if self.get_direction() == 'H': + for line in self.lines: s = '' x1 = INF for obj in line: @@ -384,6 +392,18 @@ class TextBox(LayoutContainer): s += obj.text x1 = obj.x1 yield s + else: + for line in self.lines: + s = '' + y0 = -INF + for obj in line: + if not isinstance(obj, TextItem): continue + margin = obj.get_margin(ratio) + if obj.y1+margin < y0: + s += ' ' + s += obj.text + y0 = obj.y0 + yield s return @@ -404,10 +424,10 @@ class Page(LayoutContainer): def group_text(self, ratio): self.group_objs(ratio, TextBox) - if self.get_direction(): - lines = reorder_hv(self.objs, -1) - else: + if self.get_direction() == 'H': lines = reorder_vh(self.objs, +1) + else: + lines = reorder_hv(self.objs, -1) self.objs = [] for line in lines: self.objs.extend(line) diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 70e39f9..6d770f4 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -3,8 +3,8 @@ import sys from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfdevice import PDFDevice, PDFPageAggregator -from layout import Page, LayoutContainer, TextItem, TextBox from pdffont import PDFUnicodeNotDefined +from page import Page, LayoutContainer, TextItem, FigureItem, TextBox from cmap import CMapDB @@ -104,24 +104,33 @@ class TagExtractor(PDFDevice): class SGMLConverter(PDFConverter): def end_page(self, page): - def draw(item): - if isinstance(item, TextItem): - self.outfp.write('' % - (e(item.font.fontname), item.get_direction(), + def render(item): + if isinstance(item, Page): + self.outfp.write('\n' % + (item.id, item.get_bbox(), item.rotate)) + for child in item: + render(child) + self.outfp.write('\n') + elif isinstance(item, TextItem): + self.outfp.write('' % + (e(item.font.fontname), item.is_vertical(), item.get_bbox(), item.fontsize)) self.write(item.text) self.outfp.write('\n') - elif isinstance(item, LayoutContainer): - self.outfp.write('\n' % (item.id, item.get_bbox())) + elif isinstance(item, FigureItem): + self.outfp.write('
\n' % (item.id, item.get_bbox())) for child in item: - draw(child) - self.outfp.write('\n') + render(child) + self.outfp.write('
\n') + elif isinstance(item, TextBox): + self.outfp.write('\n' % (item.id, item.get_bbox())) + print item + for child in item: + render(child) + self.outfp.write('\n') return page = PDFConverter.end_page(self, page) - self.outfp.write('\n' % - (page.id, page.get_bbox(), page.rotate)) - draw(page) - self.outfp.write('\n') + render(page) return @@ -150,7 +159,7 @@ class HTMLConverter(PDFConverter): return def end_page(self, page): - def draw(item): + def render(item): if isinstance(item, Page): self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height) if self.pagenum: @@ -158,7 +167,7 @@ class HTMLConverter(PDFConverter): ((self.yoffset-page.y1)*self.scale)) self.outfp.write('Page %s\n' % (page.id, page.id)) for child in item: - draw(child) + render(child) elif isinstance(item, TextItem): if item.vertical: wmode = 'tb-rl' @@ -175,11 +184,11 @@ class HTMLConverter(PDFConverter): elif isinstance(item, LayoutContainer): self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height) for child in item: - draw(child) + render(child) return page = PDFConverter.end_page(self, page) self.yoffset += page.y1 - draw(page) + render(page) self.yoffset += self.pagepad return @@ -204,7 +213,7 @@ class TextConverter(PDFConverter): return def end_page(self, page): - def draw(item): + def render(item): if isinstance(item, TextItem): self.outfp.write(obj.text.encode(self.codec, 'replace')) self.outfp.write('\n') @@ -214,11 +223,11 @@ class TextConverter(PDFConverter): self.outfp.write('\n') elif isinstance(item, LayoutContainer): for child in item: - draw(child) + render(child) page = PDFConverter.end_page(self, page) if self.pagenum: self.outfp.write('Page %d\n' % page.id) - draw(page) + render(page) self.outfp.write('\f') return @@ -294,7 +303,7 @@ def main(argv): CMapDB.initialize(cmapdir, cdbcmapdir) rsrc = PDFResourceManager() if outtype == 'sgml': - device = SGMLConverter(rsrc, outfp, codec=codec) + device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) elif outtype == 'html': device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) elif outtype == 'text': diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index 3a3b3b3..7ff2048 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -3,7 +3,7 @@ import sys stdout = sys.stdout stderr = sys.stderr from pdffont import PDFUnicodeNotDefined -from layout import Page, FigureItem, TextItem +from page import Page, FigureItem, TextItem from utils import mult_matrix, translate_matrix