another major restructuring...

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@99 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-15 14:25:32 +00:00
parent dfac85360b
commit 0a8aae5aa3
9 changed files with 332 additions and 283 deletions

View File

@ -22,9 +22,10 @@ test:
cd samples && make cd samples && make
clean: clean:
cd pdflib && make clean -cd pdflib && make clean
cd tools && make clean -cd tools && make clean
cd samples && make clean -cd samples && make clean
-rm -rf build
# Maintenance: # Maintenance:
@ -32,7 +33,7 @@ pack: clean
$(SVN) cleanup $(SVN) cleanup
$(SVN) export . $(WORKDIR)/$(DISTNAME) $(SVN) export . $(WORKDIR)/$(DISTNAME)
$(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner $(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
rm -rf $(WORKDIR)/$(DISTNAME) -rm -rf $(WORKDIR)/$(DISTNAME)
check: check:
-pychecker --limit=0 *.py -pychecker --limit=0 *.py

View File

@ -134,10 +134,10 @@ class Plane(object):
## ClusterSet ## ClusterSet
## ##
## Maintains a set of TextBox objects. ## Maintains a set of LTTextBox objects.
## It incrementally constructs TextBox objects ## It incrementally constructs LTTextBox objects
## and group them when necessary. It gives ## and group them when necessary. It gives
## a sequence of TextBox objects that represent ## a sequence of LTTextBox objects that represent
## the text stream of that page. ## the text stream of that page.
## ##
class ClusterSet(object): class ClusterSet(object):
@ -145,11 +145,13 @@ class ClusterSet(object):
def __init__(self, klass): def __init__(self, klass):
self.clusters = {} self.clusters = {}
self.klass = klass self.klass = klass
self.i = 0
return return
# add(objs): groups text objects if necessary. # add(objs): groups text objects if necessary.
def add(self, objs): def add(self, objs):
group = self.klass(objs) group = self.klass(objs, self.i)
self.i += 1
for obj in objs: for obj in objs:
if obj in self.clusters: if obj in self.clusters:
group.merge(self.clusters[obj]) group.merge(self.clusters[obj])
@ -157,7 +159,7 @@ class ClusterSet(object):
self.clusters[obj] = group self.clusters[obj] = group
return return
# finish(): returns all the TextBoxes in a page. # finish(): returns all the LTTextBoxes in a page.
def finish(self): def finish(self):
r = set(self.clusters.itervalues()) r = set(self.clusters.itervalues())
for group in r: for group in r:
@ -169,9 +171,8 @@ class ClusterSet(object):
## ##
class LayoutItem(object): class LayoutItem(object):
def __init__(self, id, bbox): def __init__(self, bbox):
#assert x0 <= x1 and y0 <= y1 #assert x0 <= x1 and y0 <= y1
self.id = id
self.set_bbox(bbox) self.set_bbox(bbox)
return return
@ -219,7 +220,8 @@ class LayoutItem(object):
class LayoutContainer(LayoutItem): class LayoutContainer(LayoutItem):
def __init__(self, id, bbox, objs=None): def __init__(self, id, bbox, objs=None):
LayoutItem.__init__(self, id, bbox) LayoutItem.__init__(self, bbox)
self.id = id
if objs: if objs:
self.objs = set(objs) self.objs = set(objs)
else: else:
@ -278,17 +280,38 @@ class LayoutContainer(LayoutItem):
return direction return direction
## FigureItem ## LTLine
## ##
class FigureItem(LayoutContainer): class LTLine(LayoutItem):
def __init__(self, linewidth, direction, bbox):
    """Create a line item.

    bbox is forwarded to LayoutItem.__init__; linewidth and direction
    are stored on the instance as given.
    """
    LayoutItem.__init__(self, bbox)
    self.direction = direction
    self.linewidth = linewidth
## LTRect
##
class LTRect(LayoutItem):
    """Layout item for a rectangle; keeps the line width it was drawn with."""

    def __init__(self, linewidth, bbox):
        """Forward bbox to LayoutItem.__init__ and remember linewidth."""
        LayoutItem.__init__(self, bbox)
        self.linewidth = linewidth
## LTFigure
##
class LTFigure(LayoutContainer):
def __repr__(self): def __repr__(self):
return ('<figure id=%r bbox=%s>' % (self.id, self.get_bbox())) return ('<figure id=%r bbox=%s>' % (self.id, self.get_bbox()))
## TextItem ## LTText
## ##
class TextItem(LayoutItem): class LTText(LayoutItem):
def __init__(self, matrix, font, fontsize, charspace, scaling, chars): def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
assert chars assert chars
@ -318,7 +341,7 @@ class TextItem(LayoutItem):
self.adv = (0, dy) self.adv = (0, dy)
bbox = (tx, ty+dy, tx+dx, ty) bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
LayoutItem.__init__(self, None, bbox) LayoutItem.__init__(self, bbox)
return return
def __repr__(self): def __repr__(self):
@ -338,15 +361,15 @@ class TextItem(LayoutItem):
return self.vertical return self.vertical
## TextBox ## LTTextBox
## ##
## A set of text objects that are clustered in ## A set of text objects that are grouped within
## a certain rectangular area. ## a certain rectangular area.
## ##
class TextBox(LayoutContainer): class LTTextBox(LayoutContainer):
def __init__(self, objs): def __init__(self, id, objs):
LayoutContainer.__init__(self, None, (0,0,0,0), objs) LayoutContainer.__init__(self, id, (0,0,0,0), objs)
self.direction = None self.direction = None
return return
@ -385,7 +408,7 @@ class TextBox(LayoutContainer):
s = '' s = ''
x1 = INF x1 = INF
for obj in line: for obj in line:
if not isinstance(obj, TextItem): continue if not isinstance(obj, LTText): continue
margin = obj.get_margin(ratio) margin = obj.get_margin(ratio)
if x1 < obj.x0-margin: if x1 < obj.x0-margin:
s += ' ' s += ' '
@ -397,7 +420,7 @@ class TextBox(LayoutContainer):
s = '' s = ''
y0 = -INF y0 = -INF
for obj in line: for obj in line:
if not isinstance(obj, TextItem): continue if not isinstance(obj, LTText): continue
margin = obj.get_margin(ratio) margin = obj.get_margin(ratio)
if obj.y1+margin < y0: if obj.y1+margin < y0:
s += ' ' s += ' '
@ -407,9 +430,9 @@ class TextBox(LayoutContainer):
return return
## Page ## LTPage
## ##
class Page(LayoutContainer): class LTPage(LayoutContainer):
def __init__(self, id, bbox, rotate=0): def __init__(self, id, bbox, rotate=0):
LayoutContainer.__init__(self, id, bbox) LayoutContainer.__init__(self, id, bbox)
@ -423,7 +446,7 @@ class Page(LayoutContainer):
return return
def group_text(self, ratio): def group_text(self, ratio):
self.group_objs(ratio, TextBox) self.group_objs(ratio, LTTextBox)
if self.get_direction() == 'H': if self.get_direction() == 'H':
lines = reorder_vh(self.objs, +1) lines = reorder_vh(self.objs, +1)
else: else:

View File

@ -1,38 +1,125 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfdevice import PDFDevice
from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, PDFPageAggregator
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from page import Page, LayoutContainer, TextItem, FigureItem, TextBox from page import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
from utils import mult_matrix, translate_matrix, enc
from pdfparser import PDFDocument, PDFParser
from pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from cmap import CMapDB from cmap import CMapDB
# e(x): encode string
def e(x, codec='ascii'): ## PDFPageAggregator
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') ##
return x.encode(codec, 'xmlcharrefreplace') class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1, cluster_margin=None):
    """Initialize the aggregator.

    rsrc is handed to PDFDevice.__init__ unchanged.
    pageno is the id given to the next LTPage built by begin_page.
    cluster_margin, when truthy, is the ratio end_page passes to the
    page's group_text(); a falsy value leaves the page ungrouped.
    """
    PDFDevice.__init__(self, rsrc)
    # Containers that enclose the item currently being filled
    # (pushed by begin_figure, popped by end_figure).
    self.stack = []
    self.pageno = pageno
    self.cluster_margin = cluster_margin
    # Value handed back by handle_undefined_char.
    self.undefined_char = '?'
def begin_page(self, page):
    """Start a new page.

    Builds an LTPage from the current page number and the given page's
    mediabox and rotate attributes, and makes it the current item.
    """
    layout = LTPage(self.pageno, page.mediabox, page.rotate)
    self.cur_item = layout
def end_page(self, _):
    """Finish the current page and return it.

    The argument is ignored.  The figure stack must be empty and the
    current item must be an LTPage (both checked with assert).  The page
    is fixated, the page counter is advanced, and, when cluster_margin
    is truthy, the page's text is grouped with that ratio.
    """
    assert not self.stack
    page = self.cur_item
    assert isinstance(page, LTPage)
    page.fixate()
    self.pageno += 1
    margin = self.cluster_margin
    if margin:
        page.group_text(margin)
    return page
def begin_figure(self, name, bbox):
    """Open a figure.

    The current item is pushed onto the stack and a fresh
    LTFigure(name, bbox) becomes the current item.
    """
    self.stack.append(self.cur_item)
    figure = LTFigure(name, bbox)
    self.cur_item = figure
def end_figure(self, _):
    """Close the current figure.

    The argument is ignored.  The current item is fixated, the item
    saved by begin_figure is popped from the stack and made current
    again, and the finished figure is added to it.
    """
    finished = self.cur_item
    finished.fixate()
    enclosing = self.stack.pop()
    self.cur_item = enclosing
    enclosing.add(finished)
def handle_undefined_char(self, cidcoding, cid):
    """Return the replacement for a character with no Unicode mapping.

    When self.debug is truthy, a line naming cidcoding and cid is
    written to standard error first.  The return value is always
    self.undefined_char.
    """
    if self.debug:
        # Use sys.stderr explicitly: this module only does "import sys"
        # and defines no bare "stderr" name, so the former
        # "print >>stderr" raised NameError whenever debugging was on.
        sys.stderr.write('undefined: %r, %r\n' % (cidcoding, cid))
    return self.undefined_char
def paint_path(self, gstate, matrix, stroke, fill, evenodd, path):
    """Record simple rulers and rectangles found in a painted path.

    The first element of every path entry is joined into a shape string.
    'ml' with equal y (or, failing that, equal x) coordinates adds an
    LTLine marked 'H' (or 'V'); 'mlllh' whose four points form an
    axis-aligned rectangle adds an LTRect spanning the first and third
    points.  Every other path is ignored.  matrix, stroke, fill and
    evenodd are not used; only gstate.linewidth is read from gstate.
    """
    ops = ''.join(seg[0] for seg in path)
    if ops == 'ml':
        (_, ax, ay) = path[0]
        (_, bx, by) = path[1]
        if ay == by:
            direction = 'H'
        elif ax == bx:
            direction = 'V'
        else:
            # Slanted segment: nothing is recorded.
            return
        self.cur_item.add(LTLine(gstate.linewidth, direction, (ax, ay, bx, by)))
        return
    if ops != 'mlllh':
        return
    (_, x0, y0) = path[0]
    (_, x1, y1) = path[1]
    (_, x2, y2) = path[2]
    (_, x3, y3) = path[3]
    # The outline may start along either axis, hence two orderings.
    axis_aligned = ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                    (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0))
    if axis_aligned:
        self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
    return
def render_chars(self, textmatrix, textstate, chars):
    """Add one LTText for chars to the current item and return its advance.

    An empty (falsy) chars adds nothing and yields (0, 0).  Otherwise
    the LTText is built from textmatrix and the font, fontsize,
    charspace and scaling attributes of textstate, and its adv
    attribute is returned.
    """
    if chars:
        run = LTText(textmatrix, textstate.font, textstate.fontsize,
                     textstate.charspace, textstate.scaling, chars)
        self.cur_item.add(run)
        return run.adv
    return (0, 0)
# render_string(textstate, textmatrix, seq):
# Walks seq, buffering (char, cid) pairs in `chars` and flushing the
# buffer through render_chars whenever the text position has to move.
# NOTE(review): indentation was lost in this view; the nesting described
# below is inferred from the statements themselves -- confirm against
# the real file.
def render_string(self, textstate, textmatrix, seq):
font = textstate.font
# textmatrix is multiplied with the device's current self.ctm.
textmatrix = mult_matrix(textmatrix, self.ctm)
chars = []
for x in seq:
# Numeric entry: flush what is buffered, then translate the matrix by
# the returned advance with x * scaling * .0001 subtracted from dx.
if isinstance(x, int) or isinstance(x, float):
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx -= x * textstate.scaling * .0001
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = []
else:
# Any other entry is decoded by the font into a sequence of cids.
for cid in font.decode(x):
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
# cid is rebound from the exception arguments here, so the append and
# the cid == 32 test below see that value.
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
# Word spacing: only when wordspace is non-zero, the font is not
# multibyte and the cid is 32; the buffer is flushed and dx is widened
# by wordspace * scaling * .01.
if textstate.wordspace and not font.is_multibyte() and cid == 32:
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = []
# Flush whatever is still buffered; the returned advance is discarded.
self.render_chars(textmatrix, textstate, chars)
return
## PDFConverter ## PDFConverter
## ##
class PDFConverter(PDFPageAggregator): class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='ascii', cluster_margin=None): def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'):
PDFPageAggregator.__init__(self, rsrc) PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
self.cluster_margin = cluster_margin
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
return return
def end_page(self, page):
page = PDFPageAggregator.end_page(self, page)
if self.cluster_margin:
page.group_text(self.cluster_margin)
return page
def write(self, text): def write(self, text):
self.outfp.write(e(text, self.codec)) self.outfp.write(enc(text, self.codec))
return return
@ -61,7 +148,7 @@ class TagExtractor(PDFDevice):
try: try:
char = font.to_unicode(cid) char = font.to_unicode(cid)
text += char text += char
except PDFUnicodeNotDefined, e: except PDFUnicodeNotDefined:
pass pass
self.write(text) self.write(text)
return return
@ -81,15 +168,15 @@ class TagExtractor(PDFDevice):
def begin_tag(self, tag, props=None): def begin_tag(self, tag, props=None):
s = '' s = ''
if props: if props:
s = ''.join( ' %s="%s"' % (e(k), e(str(v))) for (k,v) s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
in sorted(props.iteritems()) ) in sorted(props.iteritems()) )
self.outfp.write('<%s%s>' % (e(tag.name), s)) self.outfp.write('<%s%s>' % (enc(tag.name), s))
self.tag = tag self.tag = tag
return return
def end_tag(self): def end_tag(self):
assert self.tag assert self.tag
self.outfp.write('</%s>' % e(self.tag.name)) self.outfp.write('</%s>' % enc(self.tag.name))
self.tag = None self.tag = None
return return
@ -105,26 +192,29 @@ class SGMLConverter(PDFConverter):
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, Page): if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, item.get_bbox(), item.rotate)) (item.id, item.get_bbox(), item.rotate))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</page>\n') self.outfp.write('</page>\n')
elif isinstance(item, TextItem): elif isinstance(item, LTText):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' % self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(e(item.font.fontname), item.is_vertical(), (enc(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize)) item.get_bbox(), item.fontsize))
self.write(item.text) self.write(item.text)
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, FigureItem): elif isinstance(item, LTLine):
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</figure>\n') self.outfp.write('</figure>\n')
elif isinstance(item, TextBox): elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
print item
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</textbox>\n') self.outfp.write('</textbox>\n')
@ -138,10 +228,10 @@ class SGMLConverter(PDFConverter):
## ##
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
pagepad=50, scale=1, cluster_margin=None): scale=1, showpageno=True, pagepad=50):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin) PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
self.pagenum = pagenum self.showpageno = showpageno
self.pagepad = pagepad self.pagepad = pagepad
self.scale = scale self.scale = scale
self.outfp.write('<html><head>\n') self.outfp.write('<html><head>\n')
@ -152,23 +242,23 @@ class HTMLConverter(PDFConverter):
self.show_text_border = False self.show_text_border = False
return return
def write_rect(self, color, x, y, w, h): def write_rect(self, color, width, x, y, w, h):
self.outfp.write('<span style="position:absolute; border: 1px solid %s; ' self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(color, x*self.scale, y*self.scale, w*self.scale, h*self.scale)) (color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
return return
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, Page): if isinstance(item, LTPage):
self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
if self.pagenum: if self.showpageno:
self.outfp.write('<div style="position:absolute; top:%dpx;">' % self.outfp.write('<div style="position:absolute; top:%dpx;">' %
((self.yoffset-page.y1)*self.scale)) ((self.yoffset-page.y1)*self.scale))
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id)) self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, TextItem): elif isinstance(item, LTText):
if item.vertical: if item.vertical:
wmode = 'tb-rl' wmode = 'tb-rl'
else: else:
@ -180,9 +270,11 @@ class HTMLConverter(PDFConverter):
self.write(item.text) self.write(item.text)
self.outfp.write('</span>\n') self.outfp.write('</span>\n')
if self.show_text_border: if self.show_text_border:
self.write_rect('red', item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LayoutContainer): elif isinstance(item, LayoutContainer):
self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item: for child in item:
render(child) render(child)
return return
@ -203,21 +295,21 @@ class HTMLConverter(PDFConverter):
## ##
class TextConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
cluster_margin=None, word_margin=0.2): showpageno=False, word_margin=0.2):
if cluster_margin == None: if cluster_margin == None:
cluster_margin = 0.5 cluster_margin = 0.5
PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin) PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
self.pagenum = pagenum self.showpageno = showpageno
self.word_margin = word_margin self.word_margin = word_margin
return return
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, TextItem): if isinstance(item, LTText):
self.outfp.write(obj.text.encode(self.codec, 'replace')) self.outfp.write(obj.text.encode(self.codec, 'replace'))
self.outfp.write('\n') self.outfp.write('\n')
elif isinstance(item, TextBox): elif isinstance(item, LTTextBox):
for line in item.get_lines(self.word_margin): for line in item.get_lines(self.word_margin):
self.outfp.write(line.encode(self.codec, 'replace')+'\n') self.outfp.write(line.encode(self.codec, 'replace')+'\n')
self.outfp.write('\n') self.outfp.write('\n')
@ -225,7 +317,7 @@ class TextConverter(PDFConverter):
for child in item: for child in item:
render(child) render(child)
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
if self.pagenum: if self.showpageno:
self.outfp.write('Page %d\n' % page.id) self.outfp.write('Page %d\n' % page.id)
render(page) render(page)
self.outfp.write('\f') self.outfp.write('\f')
@ -235,29 +327,6 @@ class TextConverter(PDFConverter):
return return
# pdf2txt
class TextExtractionNotAllowed(RuntimeError): pass
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
doc = PDFDocument()
fp = file(fname, 'rb')
parser = PDFParser(doc, fp)
try:
doc.initialize(password)
except PDFPasswordIncorrect:
raise TextExtractionNotAllowed('Incorrect password')
if not doc.is_extractable:
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
interpreter = PDFPageInterpreter(rsrc, device)
for (pageno,page) in enumerate(doc.get_pages()):
if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page)
if maxpages and maxpages <= pageno+1: break
device.close()
fp.close()
return
# main # main
def main(argv): def main(argv):
import getopt import getopt
@ -269,30 +338,35 @@ def main(argv):
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
# debug option
debug = 0 debug = 0
# path option
cmapdir = 'CMap' cmapdir = 'CMap'
cdbcmapdir = 'CDBCMap' cdbcmapdir = 'CDBCMap'
codec = 'utf-8' # input option
password = ''
pagenos = set() pagenos = set()
maxpages = 0 maxpages = 0
# output option
outtype = 'html' outtype = 'html'
password = '' codec = 'utf-8'
pagenum = True
splitwords = False
cluster_margin = None
outfp = sys.stdout outfp = sys.stdout
cluster_margin = None
pageno = 1
scale = 1
showpageno = True
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v
elif k == '-c': codec = v
elif k == '-m': maxpages = int(v)
elif k == '-C': cmapdir = v elif k == '-C': cmapdir = v
elif k == '-D': cdbcmapdir = v elif k == '-D': cdbcmapdir = v
elif k == '-T': cluster_margin = float(v) elif k == '-P': password = v
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
elif k == '-t': outtype = v elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-o': outfp = file(v, 'wb') elif k == '-o': outfp = file(v, 'wb')
elif k == '-w': splitwords = True elif k == '-s': scale = float(v)
elif k == '-T': cluster_margin = float(v)
# #
CMapDB.debug = debug CMapDB.debug = debug
PDFResourceManager.debug = debug PDFResourceManager.debug = debug
@ -305,7 +379,7 @@ def main(argv):
if outtype == 'sgml': if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale)
elif outtype == 'text': elif outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'tag': elif outtype == 'tag':
@ -313,8 +387,8 @@ def main(argv):
else: else:
return usage() return usage()
for fname in args: for fname in args:
convert(rsrc, device, fname, pagenos, process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
maxpages=maxpages, password=password) device.close()
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -1,16 +1,15 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
stderr = sys.stderr
from psparser import PSLiteralTable from psparser import PSLiteralTable
## ColorSpace ## PDFColorSpace
## ##
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
class ColorSpace(object): class PDFColorSpace(object):
def __init__(self, name, ncomponents): def __init__(self, name, ncomponents):
self.name = name self.name = name
@ -18,11 +17,11 @@ class ColorSpace(object):
return return
def __repr__(self): def __repr__(self):
return '<ColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents) return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
PREDEFINED_COLORSPACE = dict( PREDEFINED_COLORSPACE = dict(
(name, ColorSpace(name,n)) for (name,n) in { (name, PDFColorSpace(name,n)) for (name,n) in {
'CalRGB': 3, 'CalRGB': 3,
'CalGray': 1, 'CalGray': 1,
'Lab': 3, 'Lab': 3,

View File

@ -1,11 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined
from page import Page, FigureItem, TextItem
from utils import mult_matrix, translate_matrix
## PDFDevice ## PDFDevice
## ##
@ -50,92 +43,3 @@ class PDFDevice(object):
return return
def render_image(self, stream, size, matrix): def render_image(self, stream, size, matrix):
return return
## PDFPageAggregator
##
class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1):
PDFDevice.__init__(self, rsrc)
self.pageno = pageno
self.stack = []
return
def begin_page(self, page):
self.cur_item = Page(self.pageno, page.mediabox, page.rotate)
return
def end_page(self, _):
assert not self.stack
assert isinstance(self.cur_item, Page)
self.cur_item.fixate()
self.pageno += 1
return self.cur_item
def begin_figure(self, name, bbox):
self.stack.append(self.cur_item)
self.cur_item = FigureItem(name, bbox)
return
def end_figure(self, _):
fig = self.cur_item
self.cur_item.fixate()
self.cur_item = self.stack.pop()
self.cur_item.add(fig)
return
def handle_undefined_char(self, cidcoding, cid):
if self.debug:
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
return '?'
def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path):
shape = ''.join(x[0] for x in path)
if shape == 'ml': # single line
if path[0][1] == path[1][1]:
#print 'vertical'
pass
elif path[0][2] == path[1][2]:
#print 'horizontal'
pass
elif shape == 'mlllh': # rectangle
if ((path[0][1] == path[1][1] and path[1][2] == path[2][2] and
path[2][1] == path[3][1] and path[3][2] == path[0][2]) or
(path[0][2] == path[1][2] and path[1][1] == path[2][1] and
path[2][2] == path[3][2] and path[3][1] == path[0][1])):
pass
return
def render_chars(self, textmatrix, textstate, chars):
if not chars: return (0, 0)
item = TextItem(textmatrix, textstate.font, textstate.fontsize,
textstate.charspace, textstate.scaling, chars)
self.cur_item.add(item)
return item.adv
def render_string(self, textstate, textmatrix, seq):
font = textstate.font
textmatrix = mult_matrix(textmatrix, self.ctm)
chars = []
for x in seq:
if isinstance(x, int) or isinstance(x, float):
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx -= x * textstate.scaling * .0001
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = []
else:
for cid in font.decode(x):
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if textstate.wordspace and not font.is_multibyte() and cid == 32:
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = []
self.render_chars(textmatrix, textstate, chars)
return

View File

@ -1,6 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
stderr = sys.stderr
from struct import pack, unpack from struct import pack, unpack
try: try:
from cStringIO import StringIO from cStringIO import StringIO

View File

@ -14,7 +14,8 @@ from pdftypes import PDFException, PDFStream, PDFObjRef, \
str_value, list_value, dict_value, stream_value str_value, list_value, dict_value, stream_value
from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \ from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
from cmap import CMapDB from cmap import CMapDB
@ -34,6 +35,56 @@ LITERAL_FORM = PSLiteralTable.intern('Form')
LITERAL_IMAGE = PSLiteralTable.intern('Image') LITERAL_IMAGE = PSLiteralTable.intern('Image')
## PDFTextState
##
class PDFTextState(object):
    """Holds the text parameters and the two text matrices."""

    def __init__(self):
        """Set every parameter to its initial value and reset the matrices."""
        self.font = None
        self.fontsize = self.charspace = self.wordspace = 0
        self.scaling = 100
        self.leading = self.render = self.rise = 0
        self.reset()

    def __repr__(self):
        """Return a one-line dump of all parameters and both matrices."""
        fields = (self.font, self.fontsize, self.charspace, self.wordspace,
                  self.scaling, self.leading, self.render, self.rise,
                  self.matrix, self.linematrix)
        return ('<PDFTextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
                ' scaling=%r, leading=%r, render=%r, rise=%r, '
                ' matrix=%r, linematrix=%r>' % fields)

    def reset(self):
        """Restore matrix to MATRIX_IDENTITY and linematrix to (0, 0)."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)
## PDFGraphicState
##
class PDFGraphicState(object):
    """Holds the graphics parameters set by the page interpreter."""

    def __init__(self):
        """Start with linewidth 0 and every other parameter unset (None)."""
        self.linewidth = 0
        self.linecap = self.linejoin = self.miterlimit = None
        self.dash = self.intent = self.flatness = None

    def __repr__(self):
        """Return a one-line dump of all parameters."""
        fields = (self.linewidth, self.linecap, self.linejoin,
                  self.miterlimit, self.dash, self.intent, self.flatness)
        return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
                ' miterlimit=%r, dash=%r, intent=%r, flatness=%r>' % fields)
## Resource Manager ## Resource Manager
## ##
class PDFResourceManager(object): class PDFResourceManager(object):
@ -207,46 +258,6 @@ class PDFPageInterpreter(object):
debug = 0 debug = 0
class TextState(object):
def __init__(self):
self.font = None
self.fontsize = 0
self.charspace = 0
self.wordspace = 0
self.scaling = 100
self.leading = 0
self.render = 0
self.rise = 0
self.reset()
return
def __repr__(self):
return ('<TextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
' scaling=%r, leading=%r, render=%r, rise=%r, '
' matrix=%r, linematrix=%r>' %
(self.font, self.fontsize, self.charspace, self.wordspace,
self.scaling, self.leading, self.render, self.rise,
self.matrix, self.linematrix))
def reset(self):
self.matrix = MATRIX_IDENTITY
self.linematrix = (0, 0)
return
class GraphicState(object):
def __init__(self):
self.linewidth = None
self.linecap = None
self.linejoin = None
self.miterlimit = None
self.dash = None
self.intent = None
self.flatness = None
return
def __repr__(self):
return ('<GraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
' miterlimit=%r, dash=%r, intent=%r, flatness=%r>' %
(self.linewidth, self.linecap, self.linejoin,
self.miterlimit, self.dash, self.intent, self.flatness))
def __init__(self, rsrc, device): def __init__(self, rsrc, device):
self.rsrc = rsrc self.rsrc = rsrc
self.device = device self.device = device
@ -255,50 +266,53 @@ class PDFPageInterpreter(object):
def dup(self): def dup(self):
return PDFPageInterpreter(self.rsrc, self.device) return PDFPageInterpreter(self.rsrc, self.device)
# init_resources(resources):
# Prepare the fonts and XObjects listed in the Resource attribute.
def init_resources(self, resources): def init_resources(self, resources):
self.fontmap = {} self.fontmap = {}
self.xobjmap = {} self.xobjmap = {}
self.csmap = PREDEFINED_COLORSPACE.copy() self.csmap = PREDEFINED_COLORSPACE.copy()
# Handle resource declarations. if not resources: return
def get_colorspace(spec): def get_colorspace(spec):
if isinstance(spec, list): if isinstance(spec, list):
name = literal_name(spec[0]) name = literal_name(spec[0])
else: else:
name = literal_name(spec) name = literal_name(spec)
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec): if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
return ColorSpace(name, stream_value(spec[1]).dic['N']) return PDFColorSpace(name, stream_value(spec[1]).dic['N'])
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec): elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
return ColorSpace(name, len(list_value(spec[1]))) return PDFColorSpace(name, len(list_value(spec[1])))
else: else:
return PREDEFINED_COLORSPACE[name] return PREDEFINED_COLORSPACE[name]
if resources: for (k,v) in dict_value(resources).iteritems():
for (k,v) in dict_value(resources).iteritems(): if 1 <= self.debug:
if 1 <= self.debug: print >>stderr, 'Resource: %r: %r' % (k,v)
print >>stderr, 'Resource: %r: %r' % (k,v) if k == 'Font':
if k == 'Font': for (fontid,spec) in dict_value(v).iteritems():
for (fontid,spec) in dict_value(v).iteritems(): objid = None
objid = None if isinstance(spec, PDFObjRef):
if isinstance(spec, PDFObjRef): objid = spec.objid
objid = spec.objid spec = dict_value(spec)
spec = dict_value(spec) self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
self.fontmap[fontid] = self.rsrc.get_font(objid, spec) elif k == 'ColorSpace':
elif k == 'ColorSpace': for (csid,spec) in dict_value(v).iteritems():
for (csid,spec) in dict_value(v).iteritems(): self.csmap[csid] = get_colorspace(resolve1(spec))
self.csmap[csid] = get_colorspace(resolve1(spec)) elif k == 'ProcSet':
elif k == 'ProcSet': self.rsrc.get_procset(list_value(v))
self.rsrc.get_procset(list_value(v)) elif k == 'XObject':
elif k == 'XObject': for (xobjid,xobjstrm) in dict_value(v).iteritems():
for (xobjid,xobjstrm) in dict_value(v).iteritems(): self.xobjmap[xobjid] = xobjstrm
self.xobjmap[xobjid] = xobjstrm
return return
# init_state(ctm)
# Initialize the text and graphic states for rendering a page.
def init_state(self, ctm): def init_state(self, ctm):
# gstack: stack for graphical states. # gstack: stack for graphical states.
self.gstack = [] self.gstack = []
self.ctm = ctm self.ctm = ctm
self.device.set_ctm(self.ctm) self.device.set_ctm(self.ctm)
self.textstate = self.TextState() self.textstate = PDFTextState()
self.graphicstate = self.GraphicState() self.graphicstate = PDFGraphicState()
self.curpath = [] self.curpath = []
# argstack: stack for command arguments. # argstack: stack for command arguments.
self.argstack = [] self.argstack = []
@ -700,10 +714,13 @@ class PDFPageInterpreter(object):
self.device.end_page(page) self.device.end_page(page)
return return
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY): # render_contents(resources, streams, ctm)
# Render the content streams.
# This method may be called recursively.
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
self.init_resources(resources) self.init_resources(resources)
self.init_state(ctm) self.init_state(ctm)
self.execute(list_value(contents)) self.execute(list_value(streams))
return return
def execute(self, streams): def execute(self, streams):
@ -738,3 +755,26 @@ class PDFPageInterpreter(object):
else: else:
self.push(obj) self.push(obj)
return return
## process_pdf
##
class TextExtractionNotAllowed(RuntimeError):
    """Raised by process_pdf for a wrong password or a non-extractable document."""
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
    """Run every selected page of the PDF file fname through device.

    rsrc and device are handed to PDFPageInterpreter unchanged.
    pagenos, when non-empty, restricts processing to pages whose
    zero-based position in the document is in it.
    maxpages, when non-zero, stops processing once a processed page's
    one-based position in the document reaches it.
    password is passed to PDFDocument.initialize.

    Raises TextExtractionNotAllowed when the password is rejected or
    when the document's is_extractable flag is false.  The file is
    closed on every exit path, including when one of those errors (or
    an error from the interpreter) is raised; previously it was closed
    only after a fully successful run.
    """
    doc = PDFDocument()
    # open() is the same built-in as the Python-2-only file().
    fp = open(fname, 'rb')
    try:
        # NOTE(review): parser is never referenced again; presumably
        # constructing it ties doc to fp -- confirm in pdfparser.
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
        except PDFPasswordIncorrect:
            raise TextExtractionNotAllowed('Incorrect password')
        if not doc.is_extractable:
            raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
        interpreter = PDFPageInterpreter(rsrc, device)
        for (pageno,page) in enumerate(doc.get_pages()):
            if pagenos and (pageno not in pagenos): continue
            interpreter.process_page(page)
            if maxpages and maxpages <= pageno+1: break
    finally:
        fp.close()
    return

View File

@ -205,6 +205,10 @@ class PDFXRefStream(PDFBaseXRef):
## PDFPage ## PDFPage
## ##
## A PDFPage object is nothing more than a bunch of keys and values
## that describe the properties of the page and point to its contents,
## and has nothing to do with a real graphical entity.
##
class PDFPage(object): class PDFPage(object):
def __init__(self, doc, pageid, attrs): def __init__(self, doc, pageid, attrs):

View File

@ -91,3 +91,8 @@ def decode_text(s):
return unicode(s[2:], 'utf-16be', 'ignore') return unicode(s[2:], 'utf-16be', 'ignore')
else: else:
return ''.join( PDFDocEncoding[ord(c)] for c in s ) return ''.join( PDFDocEncoding[ord(c)] for c in s )
# enc(x): encode string in SGML/XML/HTML
def enc(x, codec='ascii'):
    """Escape &, >, < and " as entities, then encode x with codec.

    Characters the codec cannot represent are emitted as numeric
    character references ('xmlcharrefreplace').
    """
    # '&' must be handled first so the entities added afterwards are
    # not escaped a second time.
    for (raw, entity) in (('&', '&amp;'), ('>', '&gt;'),
                          ('<', '&lt;'), ('"', '&quot;')):
        x = x.replace(raw, entity)
    return x.encode(codec, 'xmlcharrefreplace')