diff --git a/Makefile b/Makefile index 1f4bfe2..e4cbd27 100644 --- a/Makefile +++ b/Makefile @@ -22,9 +22,10 @@ test: cd samples && make clean: - cd pdflib && make clean - cd tools && make clean - cd samples && make clean + -cd pdflib && make clean + -cd tools && make clean + -cd samples && make clean + -rm -rf build # Maintainance: @@ -32,7 +33,7 @@ pack: clean $(SVN) cleanup $(SVN) export . $(WORKDIR)/$(DISTNAME) $(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner - rm -rf $(WORKDIR)/$(DISTNAME) + -rm -rf $(WORKDIR)/$(DISTNAME) check: -pychecker --limit=0 *.py diff --git a/pdflib/page.py b/pdflib/page.py index ea458a3..25d3673 100644 --- a/pdflib/page.py +++ b/pdflib/page.py @@ -134,10 +134,10 @@ class Plane(object): ## ClusterSet ## -## Maintains a set of TextBox objects. -## It incrementally constructs TextBox objects +## Maintains a set of LTTextBox objects. +## It incrementally constructs LTTextBox objects ## and group them when necessary. It gives -## a sequence of TextBox objects that represent +## a sequence of LTTextBox objects that represent ## the text stream of that page. ## class ClusterSet(object): @@ -145,11 +145,13 @@ class ClusterSet(object): def __init__(self, klass): self.clusters = {} self.klass = klass + self.i = 0 return # add(objs): groups text objects if necessary. def add(self, objs): - group = self.klass(objs) + group = self.klass(objs, self.i) + self.i += 1 for obj in objs: if obj in self.clusters: group.merge(self.clusters[obj]) @@ -157,7 +159,7 @@ class ClusterSet(object): self.clusters[obj] = group return - # finish(): returns all the TextBoxes in a page. + # finish(): returns all the LTTextBoxes in a page. def finish(self): r = set(self.clusters.itervalues()) for group in r: @@ -169,9 +171,8 @@ class ClusterSet(object): ## class LayoutItem(object): - def __init__(self, id, bbox): + def __init__(self, bbox): #assert x0 <= x1 and y0 <= y1 - self.id = id self.set_bbox(bbox) return @@ -219,7 +220,8 @@ class LayoutItem(object): class LayoutContainer(LayoutItem): def __init__(self, id, bbox, objs=None): - LayoutItem.__init__(self, id, bbox) + LayoutItem.__init__(self, bbox) + self.id = id if objs: self.objs = set(objs) else: @@ -278,17 +280,38 @@ class LayoutContainer(LayoutItem): return direction -## FigureItem +## LTLine ## -class FigureItem(LayoutContainer): +class LTLine(LayoutItem): + + def __init__(self, linewidth, direction, bbox): + LayoutItem.__init__(self, bbox) + self.linewidth = linewidth + self.direction = direction + return + + +## LTRect +## +class LTRect(LayoutItem): + + def __init__(self, linewidth, bbox): + LayoutItem.__init__(self, bbox) + self.linewidth = linewidth + return + + +## LTFigure +## +class LTFigure(LayoutContainer): def __repr__(self): return ('
' % (self.id, self.get_bbox())) - -## TextItem + +## LTText ## -class TextItem(LayoutItem): +class LTText(LayoutItem): def __init__(self, matrix, font, fontsize, charspace, scaling, chars): assert chars @@ -318,7 +341,7 @@ class TextItem(LayoutItem): self.adv = (0, dy) bbox = (tx, ty+dy, tx+dx, ty) self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) - LayoutItem.__init__(self, None, bbox) + LayoutItem.__init__(self, bbox) return def __repr__(self): @@ -338,15 +361,15 @@ class TextItem(LayoutItem): return self.vertical -## TextBox +## LTTextBox ## -## A set of text objects that are clustered in +## A set of text objects that are grouped within ## a certain rectangular area. ## -class TextBox(LayoutContainer): +class LTTextBox(LayoutContainer): - def __init__(self, objs): - LayoutContainer.__init__(self, None, (0,0,0,0), objs) + def __init__(self, id, objs): + LayoutContainer.__init__(self, id, (0,0,0,0), objs) self.direction = None return @@ -385,7 +408,7 @@ class TextBox(LayoutContainer): s = '' x1 = INF for obj in line: - if not isinstance(obj, TextItem): continue + if not isinstance(obj, LTText): continue margin = obj.get_margin(ratio) if x1 < obj.x0-margin: s += ' ' @@ -397,7 +420,7 @@ class TextBox(LayoutContainer): s = '' y0 = -INF for obj in line: - if not isinstance(obj, TextItem): continue + if not isinstance(obj, LTText): continue margin = obj.get_margin(ratio) if obj.y1+margin < y0: s += ' ' @@ -407,9 +430,9 @@ class TextBox(LayoutContainer): return -## Page +## LTPage ## -class Page(LayoutContainer): +class LTPage(LayoutContainer): def __init__(self, id, bbox, rotate=0): LayoutContainer.__init__(self, id, bbox) @@ -423,7 +446,7 @@ class Page(LayoutContainer): return def group_text(self, ratio): - self.group_objs(ratio, TextBox) + self.group_objs(ratio, LTTextBox) if self.get_direction() == 'H': lines = reorder_vh(self.objs, +1) else: diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 6d770f4..a41ff4d 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -1,38 +1,125 @@ #!/usr/bin/env python import sys -from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect -from pdfinterp import PDFResourceManager, PDFPageInterpreter -from pdfdevice import PDFDevice, PDFPageAggregator +from pdfdevice import PDFDevice from pdffont import PDFUnicodeNotDefined -from page import Page, LayoutContainer, TextItem, FigureItem, TextBox +from page import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox +from utils import mult_matrix, translate_matrix, enc +from pdfparser import PDFDocument, PDFParser +from pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf from cmap import CMapDB -# e(x): encode string -def e(x, codec='ascii'): - x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"') - return x.encode(codec, 'xmlcharrefreplace') + +## PDFPageAggregator +## +class PDFPageAggregator(PDFDevice): + + def __init__(self, rsrc, pageno=1, cluster_margin=None): + PDFDevice.__init__(self, rsrc) + self.cluster_margin = cluster_margin + self.undefined_char = '?' + self.pageno = pageno + self.stack = [] + return + + def begin_page(self, page): + self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate) + return + + def end_page(self, _): + assert not self.stack + assert isinstance(self.cur_item, LTPage) + self.cur_item.fixate() + self.pageno += 1 + if self.cluster_margin: + self.cur_item.group_text(self.cluster_margin) + return self.cur_item + + def begin_figure(self, name, bbox): + self.stack.append(self.cur_item) + self.cur_item = LTFigure(name, bbox) + return + + def end_figure(self, _): + fig = self.cur_item + self.cur_item.fixate() + self.cur_item = self.stack.pop() + self.cur_item.add(fig) + return + + def handle_undefined_char(self, cidcoding, cid): + if self.debug: + print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) + return self.undefined_char + + def paint_path(self, gstate, matrix, stroke, fill, evenodd, path): + shape = ''.join(x[0] for x in path) + if shape == 'ml': # horizontal/vertical line + (_,x0,y0) = path[0] + (_,x1,y1) = path[1] + if y0 == y1: + # horizontal ruler + self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1))) + elif x0 == x1: + # vertical ruler + self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1))) + elif shape == 'mlllh': + # rectangle + (_,x0,y0) = path[0] + (_,x1,y1) = path[1] + (_,x2,y2) = path[2] + (_,x3,y3) = path[3] + if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or + (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): + self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) + return + + def render_chars(self, textmatrix, textstate, chars): + if not chars: return (0, 0) + item = LTText(textmatrix, textstate.font, textstate.fontsize, + textstate.charspace, textstate.scaling, chars) + self.cur_item.add(item) + return item.adv + + def render_string(self, textstate, textmatrix, seq): + font = textstate.font + textmatrix = mult_matrix(textmatrix, self.ctm) + chars = [] + for x in seq: + if isinstance(x, int) or isinstance(x, float): + (dx,dy) = self.render_chars(textmatrix, textstate, chars) + dx -= x * textstate.scaling * .0001 + textmatrix = translate_matrix(textmatrix, (dx, dy)) + chars = [] + else: + for cid in font.decode(x): + try: + char = font.to_unicode(cid) + except PDFUnicodeNotDefined, e: + (cidcoding, cid) = e.args + char = self.handle_undefined_char(cidcoding, cid) + chars.append((char, cid)) + if textstate.wordspace and not font.is_multibyte() and cid == 32: + (dx,dy) = self.render_chars(textmatrix, textstate, chars) + dx += textstate.wordspace * textstate.scaling * .01 + textmatrix = translate_matrix(textmatrix, (dx, dy)) + chars = [] + self.render_chars(textmatrix, textstate, chars) + return ## PDFConverter ## class PDFConverter(PDFPageAggregator): - def __init__(self, rsrc, outfp, codec='ascii', cluster_margin=None): - PDFPageAggregator.__init__(self, rsrc) - self.cluster_margin = cluster_margin + def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'): + PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin) self.outfp = outfp self.codec = codec return - def end_page(self, page): - page = PDFPageAggregator.end_page(self, page) - if self.cluster_margin: - page.group_text(self.cluster_margin) - return page - def write(self, text): - self.outfp.write(e(text, self.codec)) + self.outfp.write(enc(text, self.codec)) return @@ -61,7 +148,7 @@ class TagExtractor(PDFDevice): try: char = font.to_unicode(cid) text += char - except PDFUnicodeNotDefined, e: + except PDFUnicodeNotDefined: pass self.write(text) return @@ -81,15 +168,15 @@ class TagExtractor(PDFDevice): def begin_tag(self, tag, props=None): s = '' if props: - s = ''.join( ' %s="%s"' % (e(k), e(str(v))) for (k,v) + s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v) in sorted(props.iteritems()) ) - self.outfp.write('<%s%s>' % (e(tag.name), s)) + self.outfp.write('<%s%s>' % (enc(tag.name), s)) self.tag = tag return def end_tag(self): assert self.tag - self.outfp.write('' % e(self.tag.name)) + self.outfp.write('' % enc(self.tag.name)) self.tag = None return @@ -105,26 +192,29 @@ class SGMLConverter(PDFConverter): def end_page(self, page): def render(item): - if isinstance(item, Page): + if isinstance(item, LTPage): self.outfp.write('\n' % (item.id, item.get_bbox(), item.rotate)) for child in item: render(child) self.outfp.write('\n') - elif isinstance(item, TextItem): + elif isinstance(item, LTText): self.outfp.write('' % - (e(item.font.fontname), item.is_vertical(), + (enc(item.font.fontname), item.is_vertical(), item.get_bbox(), item.fontsize)) self.write(item.text) self.outfp.write('\n') - elif isinstance(item, FigureItem): + elif isinstance(item, LTLine): + self.outfp.write('' % (item.linewidth, item.direction, item.get_bbox())) + elif isinstance(item, LTRect): + self.outfp.write('' % (item.linewidth, item.get_bbox())) + elif isinstance(item, LTFigure): self.outfp.write('
\n' % (item.id, item.get_bbox())) for child in item: render(child) self.outfp.write('
\n') - elif isinstance(item, TextBox): + elif isinstance(item, LTTextBox): self.outfp.write('\n' % (item.id, item.get_bbox())) - print item for child in item: render(child) self.outfp.write('\n') @@ -138,10 +228,10 @@ class SGMLConverter(PDFConverter): ## class HTMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, - pagepad=50, scale=1, cluster_margin=None): - PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin) - self.pagenum = pagenum + def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8', + scale=1, showpageno=True, pagepad=50): + PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec) + self.showpageno = showpageno self.pagepad = pagepad self.scale = scale self.outfp.write('\n') @@ -152,23 +242,23 @@ class HTMLConverter(PDFConverter): self.show_text_border = False return - def write_rect(self, color, x, y, w, h): - self.outfp.write('\n' % - (color, x*self.scale, y*self.scale, w*self.scale, h*self.scale)) + (color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale)) return def end_page(self, page): def render(item): - if isinstance(item, Page): - self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height) - if self.pagenum: + if isinstance(item, LTPage): + self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height) + if self.showpageno: self.outfp.write('
' % ((self.yoffset-page.y1)*self.scale)) self.outfp.write('Page %s
\n' % (page.id, page.id)) for child in item: render(child) - elif isinstance(item, TextItem): + elif isinstance(item, LTText): if item.vertical: wmode = 'tb-rl' else: @@ -180,9 +270,11 @@ class HTMLConverter(PDFConverter): self.write(item.text) self.outfp.write('\n') if self.show_text_border: - self.write_rect('red', item.x0, self.yoffset-item.y1, item.width, item.height) + self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) + elif isinstance(item, LTLine) or isinstance(item, LTRect): + self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) elif isinstance(item, LayoutContainer): - self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height) + self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height) for child in item: render(child) return @@ -203,21 +295,21 @@ class HTMLConverter(PDFConverter): ## class TextConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, - cluster_margin=None, word_margin=0.2): + def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8', + showpageno=False, word_margin=0.2): if cluster_margin == None: cluster_margin = 0.5 - PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin) - self.pagenum = pagenum + PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec) + self.showpageno = showpageno self.word_margin = word_margin return def end_page(self, page): def render(item): - if isinstance(item, TextItem): + if isinstance(item, LTText): self.outfp.write(obj.text.encode(self.codec, 'replace')) self.outfp.write('\n') - elif isinstance(item, TextBox): + elif isinstance(item, LTTextBox): for line in item.get_lines(self.word_margin): self.outfp.write(line.encode(self.codec, 'replace')+'\n') self.outfp.write('\n') @@ -225,7 +317,7 @@ class TextConverter(PDFConverter): for child in item: render(child) page = PDFConverter.end_page(self, page) - if self.pagenum: + if self.showpageno: self.outfp.write('Page %d\n' % page.id) render(page) self.outfp.write('\f') @@ -235,29 +327,6 @@ class TextConverter(PDFConverter): return -# pdf2txt -class TextExtractionNotAllowed(RuntimeError): pass - -def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''): - doc = PDFDocument() - fp = file(fname, 'rb') - parser = PDFParser(doc, fp) - try: - doc.initialize(password) - except PDFPasswordIncorrect: - raise TextExtractionNotAllowed('Incorrect password') - if not doc.is_extractable: - raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname) - interpreter = PDFPageInterpreter(rsrc, device) - for (pageno,page) in enumerate(doc.get_pages()): - if pagenos and (pageno not in pagenos): continue - interpreter.process_page(page) - if maxpages and maxpages <= pageno+1: break - device.close() - fp.close() - return - - # main def main(argv): import getopt @@ -269,30 +338,35 @@ def main(argv): except getopt.GetoptError: return usage() if not args: return usage() + # debug option debug = 0 + # path option cmapdir = 'CMap' cdbcmapdir = 'CDBCMap' - codec = 'utf-8' + # input option + password = '' pagenos = set() maxpages = 0 + # output option outtype = 'html' - password = '' - pagenum = True - splitwords = False - cluster_margin = None + codec = 'utf-8' outfp = sys.stdout + cluster_margin = None + pageno = 1 + scale = 1 + showpageno = True for (k, v) in opts: if k == '-d': debug += 1 - elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) - elif k == '-P': password = v - elif k == '-c': codec = v - elif k == '-m': maxpages = int(v) elif k == '-C': cmapdir = v elif k == '-D': cdbcmapdir = v - elif k == '-T': cluster_margin = float(v) + elif k == '-P': password = v + elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) + elif k == '-m': maxpages = int(v) elif k == '-t': outtype = v + elif k == '-c': codec = v elif k == '-o': outfp = file(v, 'wb') - elif k == '-w': splitwords = True + elif k == '-s': scale = float(v) + elif k == '-T': cluster_margin = float(v) # CMapDB.debug = debug PDFResourceManager.debug = debug @@ -305,7 +379,7 @@ def main(argv): if outtype == 'sgml': device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) elif outtype == 'html': - device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) + device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale) elif outtype == 'text': device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) elif outtype == 'tag': @@ -313,8 +387,8 @@ def main(argv): else: return usage() for fname in args: - convert(rsrc, device, fname, pagenos, - maxpages=maxpages, password=password) + process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password) + device.close() return if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdflib/pdfcolor.py b/pdflib/pdfcolor.py index 72b770a..27758fb 100644 --- a/pdflib/pdfcolor.py +++ b/pdflib/pdfcolor.py @@ -1,16 +1,15 @@ #!/usr/bin/env python import sys -stderr = sys.stderr from psparser import PSLiteralTable -## ColorSpace +## PDFColorSpace ## LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') -class ColorSpace(object): +class PDFColorSpace(object): def __init__(self, name, ncomponents): self.name = name @@ -18,11 +17,11 @@ class ColorSpace(object): return def __repr__(self): - return '' % (self.name, self.ncomponents) + return '' % (self.name, self.ncomponents) PREDEFINED_COLORSPACE = dict( - (name, ColorSpace(name,n)) for (name,n) in { + (name, PDFColorSpace(name,n)) for (name,n) in { 'CalRGB': 3, 'CalGray': 1, 'Lab': 3, diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index 7ff2048..bf341cc 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -1,11 +1,4 @@ #!/usr/bin/env python -import sys -stdout = sys.stdout -stderr = sys.stderr -from pdffont import PDFUnicodeNotDefined -from page import Page, FigureItem, TextItem -from utils import mult_matrix, translate_matrix - ## PDFDevice ## @@ -50,92 +43,3 @@ class PDFDevice(object): return def render_image(self, stream, size, matrix): return - - -## PDFPageAggregator -## -class PDFPageAggregator(PDFDevice): - - def __init__(self, rsrc, pageno=1): - PDFDevice.__init__(self, rsrc) - self.pageno = pageno - self.stack = [] - return - - def begin_page(self, page): - self.cur_item = Page(self.pageno, page.mediabox, page.rotate) - return - - def end_page(self, _): - assert not self.stack - assert isinstance(self.cur_item, Page) - self.cur_item.fixate() - self.pageno += 1 - return self.cur_item - - def begin_figure(self, name, bbox): - self.stack.append(self.cur_item) - self.cur_item = FigureItem(name, bbox) - return - - def end_figure(self, _): - fig = self.cur_item - self.cur_item.fixate() - self.cur_item = self.stack.pop() - self.cur_item.add(fig) - return - - def handle_undefined_char(self, cidcoding, cid): - if self.debug: - print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) - return '?' - - def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path): - shape = ''.join(x[0] for x in path) - if shape == 'ml': # single line - if path[0][1] == path[1][1]: - #print 'vertical' - pass - elif path[0][2] == path[1][2]: - #print 'horizontal' - pass - elif shape == 'mlllh': # rectangle - if ((path[0][1] == path[1][1] and path[1][2] == path[2][2] and - path[2][1] == path[3][1] and path[3][2] == path[0][2]) or - (path[0][2] == path[1][2] and path[1][1] == path[2][1] and - path[2][2] == path[3][2] and path[3][1] == path[0][1])): - pass - return - - def render_chars(self, textmatrix, textstate, chars): - if not chars: return (0, 0) - item = TextItem(textmatrix, textstate.font, textstate.fontsize, - textstate.charspace, textstate.scaling, chars) - self.cur_item.add(item) - return item.adv - - def render_string(self, textstate, textmatrix, seq): - font = textstate.font - textmatrix = mult_matrix(textmatrix, self.ctm) - chars = [] - for x in seq: - if isinstance(x, int) or isinstance(x, float): - (dx,dy) = self.render_chars(textmatrix, textstate, chars) - dx -= x * textstate.scaling * .0001 - textmatrix = translate_matrix(textmatrix, (dx, dy)) - chars = [] - else: - for cid in font.decode(x): - try: - char = font.to_unicode(cid) - except PDFUnicodeNotDefined, e: - (cidcoding, cid) = e.args - char = self.handle_undefined_char(cidcoding, cid) - chars.append((char, cid)) - if textstate.wordspace and not font.is_multibyte() and cid == 32: - (dx,dy) = self.render_chars(textmatrix, textstate, chars) - dx += textstate.wordspace * textstate.scaling * .01 - textmatrix = translate_matrix(textmatrix, (dx, dy)) - chars = [] - self.render_chars(textmatrix, textstate, chars) - return diff --git a/pdflib/pdffont.py b/pdflib/pdffont.py index 4f0eece..f53482d 100644 --- a/pdflib/pdffont.py +++ b/pdflib/pdffont.py @@ -1,6 +1,5 @@ #!/usr/bin/env python import sys -stderr = sys.stderr from struct import pack, unpack try: from cStringIO import StringIO diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index cd680d2..6bf0b03 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -14,7 +14,8 @@ from pdftypes import PDFException, PDFStream, PDFObjRef, \ str_value, list_value, dict_value, stream_value from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont -from pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \ +from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect +from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \ LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK from cmap import CMapDB @@ -34,6 +35,56 @@ LITERAL_FORM = PSLiteralTable.intern('Form') LITERAL_IMAGE = PSLiteralTable.intern('Image') +## PDFTextState +## +class PDFTextState(object): + + def __init__(self): + self.font = None + self.fontsize = 0 + self.charspace = 0 + self.wordspace = 0 + self.scaling = 100 + self.leading = 0 + self.render = 0 + self.rise = 0 + self.reset() + return + + def __repr__(self): + return ('' % + (self.font, self.fontsize, self.charspace, self.wordspace, + self.scaling, self.leading, self.render, self.rise, + self.matrix, self.linematrix)) + + def reset(self): + self.matrix = MATRIX_IDENTITY + self.linematrix = (0, 0) + return + + +## PDFGraphicState +## +class PDFGraphicState(object): + + def __init__(self): + self.linewidth = 0 + self.linecap = None + self.linejoin = None + self.miterlimit = None + self.dash = None + self.intent = None + self.flatness = None + return + + def __repr__(self): + return ('' % + (self.linewidth, self.linecap, self.linejoin, + self.miterlimit, self.dash, self.intent, self.flatness)) + ## Resource Manager ## class PDFResourceManager(object): @@ -207,46 +258,6 @@ class PDFPageInterpreter(object): debug = 0 - class TextState(object): - def __init__(self): - self.font = None - self.fontsize = 0 - self.charspace = 0 - self.wordspace = 0 - self.scaling = 100 - self.leading = 0 - self.render = 0 - self.rise = 0 - self.reset() - return - def __repr__(self): - return ('' % - (self.font, self.fontsize, self.charspace, self.wordspace, - self.scaling, self.leading, self.render, self.rise, - self.matrix, self.linematrix)) - def reset(self): - self.matrix = MATRIX_IDENTITY - self.linematrix = (0, 0) - return - - class GraphicState(object): - def __init__(self): - self.linewidth = None - self.linecap = None - self.linejoin = None - self.miterlimit = None - self.dash = None - self.intent = None - self.flatness = None - return - def __repr__(self): - return ('' % - (self.linewidth, self.linecap, self.linejoin, - self.miterlimit, self.dash, self.intent, self.flatness)) - def __init__(self, rsrc, device): self.rsrc = rsrc self.device = device @@ -255,50 +266,53 @@ class PDFPageInterpreter(object): def dup(self): return PDFPageInterpreter(self.rsrc, self.device) + # init_resources(resources): + # Prepare the fonts and XObjects listed in the Resource attribute. def init_resources(self, resources): self.fontmap = {} self.xobjmap = {} self.csmap = PREDEFINED_COLORSPACE.copy() - # Handle resource declarations. + if not resources: return def get_colorspace(spec): if isinstance(spec, list): name = literal_name(spec[0]) else: name = literal_name(spec) if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec): - return ColorSpace(name, stream_value(spec[1]).dic['N']) + return PDFColorSpace(name, stream_value(spec[1]).dic['N']) elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec): - return ColorSpace(name, len(list_value(spec[1]))) + return PDFColorSpace(name, len(list_value(spec[1]))) else: return PREDEFINED_COLORSPACE[name] - if resources: - for (k,v) in dict_value(resources).iteritems(): - if 1 <= self.debug: - print >>stderr, 'Resource: %r: %r' % (k,v) - if k == 'Font': - for (fontid,spec) in dict_value(v).iteritems(): - objid = None - if isinstance(spec, PDFObjRef): - objid = spec.objid - spec = dict_value(spec) - self.fontmap[fontid] = self.rsrc.get_font(objid, spec) - elif k == 'ColorSpace': - for (csid,spec) in dict_value(v).iteritems(): - self.csmap[csid] = get_colorspace(resolve1(spec)) - elif k == 'ProcSet': - self.rsrc.get_procset(list_value(v)) - elif k == 'XObject': - for (xobjid,xobjstrm) in dict_value(v).iteritems(): - self.xobjmap[xobjid] = xobjstrm + for (k,v) in dict_value(resources).iteritems(): + if 1 <= self.debug: + print >>stderr, 'Resource: %r: %r' % (k,v) + if k == 'Font': + for (fontid,spec) in dict_value(v).iteritems(): + objid = None + if isinstance(spec, PDFObjRef): + objid = spec.objid + spec = dict_value(spec) + self.fontmap[fontid] = self.rsrc.get_font(objid, spec) + elif k == 'ColorSpace': + for (csid,spec) in dict_value(v).iteritems(): + self.csmap[csid] = get_colorspace(resolve1(spec)) + elif k == 'ProcSet': + self.rsrc.get_procset(list_value(v)) + elif k == 'XObject': + for (xobjid,xobjstrm) in dict_value(v).iteritems(): + self.xobjmap[xobjid] = xobjstrm return - + + # init_state(ctm) + # Initialize the text and graphic states for rendering a page. def init_state(self, ctm): # gstack: stack for graphical states. self.gstack = [] self.ctm = ctm self.device.set_ctm(self.ctm) - self.textstate = self.TextState() - self.graphicstate = self.GraphicState() + self.textstate = PDFTextState() + self.graphicstate = PDFGraphicState() self.curpath = [] # argstack: stack for command arguments. self.argstack = [] @@ -700,10 +714,13 @@ class PDFPageInterpreter(object): self.device.end_page(page) return - def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY): + # render_contents(resources, streams, ctm) + # Render the content streams. + # This method may be called recursively. + def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): self.init_resources(resources) self.init_state(ctm) - self.execute(list_value(contents)) + self.execute(list_value(streams)) return def execute(self, streams): @@ -738,3 +755,26 @@ class PDFPageInterpreter(object): else: self.push(obj) return + + +## process_pdf +## +class TextExtractionNotAllowed(RuntimeError): pass + +def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''): + doc = PDFDocument() + fp = file(fname, 'rb') + parser = PDFParser(doc, fp) + try: + doc.initialize(password) + except PDFPasswordIncorrect: + raise TextExtractionNotAllowed('Incorrect password') + if not doc.is_extractable: + raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname) + interpreter = PDFPageInterpreter(rsrc, device) + for (pageno,page) in enumerate(doc.get_pages()): + if pagenos and (pageno not in pagenos): continue + interpreter.process_page(page) + if maxpages and maxpages <= pageno+1: break + fp.close() + return diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py index 3345065..60b3985 100644 --- a/pdflib/pdfparser.py +++ b/pdflib/pdfparser.py @@ -205,6 +205,10 @@ class PDFXRefStream(PDFBaseXRef): ## PDFPage ## +## A PDFPage object is nothing more than a bunch of keys and values +## that describe the properties of the page and point to its contents, +## and has nothing to do with a real graphical entity. +## class PDFPage(object): def __init__(self, doc, pageid, attrs): diff --git a/pdflib/utils.py b/pdflib/utils.py index e2849a7..c07dd10 100644 --- a/pdflib/utils.py +++ b/pdflib/utils.py @@ -91,3 +91,8 @@ def decode_text(s): return unicode(s[2:], 'utf-16be', 'ignore') else: return ''.join( PDFDocEncoding[ord(c)] for c in s ) + +# enc(x): encode string in SGML/XML/HTML +def enc(x, codec='ascii'): + x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"') + return x.encode(codec, 'xmlcharrefreplace')