another major restructuring...

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@99 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-15 14:25:32 +00:00
parent dfac85360b
commit 0a8aae5aa3
9 changed files with 332 additions and 283 deletions

View File

@ -22,9 +22,10 @@ test:
cd samples && make
clean:
cd pdflib && make clean
cd tools && make clean
cd samples && make clean
-cd pdflib && make clean
-cd tools && make clean
-cd samples && make clean
-rm -rf build
# Maintenance:
@ -32,7 +33,7 @@ pack: clean
$(SVN) cleanup
$(SVN) export . $(WORKDIR)/$(DISTNAME)
$(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
rm -rf $(WORKDIR)/$(DISTNAME)
-rm -rf $(WORKDIR)/$(DISTNAME)
check:
-pychecker --limit=0 *.py

View File

@ -134,10 +134,10 @@ class Plane(object):
## ClusterSet
##
## Maintains a set of TextBox objects.
## It incrementally constructs TextBox objects
## Maintains a set of LTTextBox objects.
## It incrementally constructs LTTextBox objects
## and groups them when necessary. It gives
## a sequence of TextBox objects that represent
## a sequence of LTTextBox objects that represent
## the text stream of that page.
##
class ClusterSet(object):
@ -145,11 +145,13 @@ class ClusterSet(object):
def __init__(self, klass):
self.clusters = {}
self.klass = klass
self.i = 0
return
# add(objs): groups text objects if necessary.
def add(self, objs):
group = self.klass(objs)
group = self.klass(objs, self.i)
self.i += 1
for obj in objs:
if obj in self.clusters:
group.merge(self.clusters[obj])
@ -157,7 +159,7 @@ class ClusterSet(object):
self.clusters[obj] = group
return
# finish(): returns all the TextBoxes in a page.
# finish(): returns all the LTTextBoxes in a page.
def finish(self):
r = set(self.clusters.itervalues())
for group in r:
@ -169,9 +171,8 @@ class ClusterSet(object):
##
class LayoutItem(object):
def __init__(self, id, bbox):
def __init__(self, bbox):
#assert x0 <= x1 and y0 <= y1
self.id = id
self.set_bbox(bbox)
return
@ -219,7 +220,8 @@ class LayoutItem(object):
class LayoutContainer(LayoutItem):
def __init__(self, id, bbox, objs=None):
LayoutItem.__init__(self, id, bbox)
LayoutItem.__init__(self, bbox)
self.id = id
if objs:
self.objs = set(objs)
else:
@ -278,17 +280,38 @@ class LayoutContainer(LayoutItem):
return direction
## FigureItem
## LTLine
##
class FigureItem(LayoutContainer):
class LTLine(LayoutItem):
    """A ruler line placed on a page layout.

    linewidth -- stroke width supplied by the caller.
    direction -- orientation tag supplied by the caller (the callers
                 visible in this change pass 'H' or 'V').
    bbox      -- bounding box, handed unchanged to LayoutItem.
    """

    def __init__(self, linewidth, direction, bbox):
        # Let the base class record the bounding box first, then attach
        # the line-specific attributes.
        LayoutItem.__init__(self, bbox)
        (self.linewidth, self.direction) = (linewidth, direction)
## LTRect
##
class LTRect(LayoutItem):
    """A rectangle placed on a page layout.

    linewidth -- stroke width supplied by the caller.
    bbox      -- bounding box, handed unchanged to LayoutItem.
    """

    def __init__(self, linewidth, bbox):
        # The base class owns the geometry; only the stroke width is
        # specific to this item.
        LayoutItem.__init__(self, bbox)
        self.linewidth = linewidth
## LTFigure
##
class LTFigure(LayoutContainer):
    """A LayoutContainer that represents a figure; it only adds a repr."""

    def __repr__(self):
        # Show the figure id and its bounding box.
        template = '<figure id=%r bbox=%s>'
        return template % (self.id, self.get_bbox())
## TextItem
## LTText
##
class TextItem(LayoutItem):
class LTText(LayoutItem):
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
assert chars
@ -318,7 +341,7 @@ class TextItem(LayoutItem):
self.adv = (0, dy)
bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
LayoutItem.__init__(self, None, bbox)
LayoutItem.__init__(self, bbox)
return
def __repr__(self):
@ -338,15 +361,15 @@ class TextItem(LayoutItem):
return self.vertical
## TextBox
## LTTextBox
##
## A set of text objects that are clustered in
## A set of text objects that are grouped within
## a certain rectangular area.
##
class TextBox(LayoutContainer):
class LTTextBox(LayoutContainer):
def __init__(self, objs):
LayoutContainer.__init__(self, None, (0,0,0,0), objs)
def __init__(self, id, objs):
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
self.direction = None
return
@ -385,7 +408,7 @@ class TextBox(LayoutContainer):
s = ''
x1 = INF
for obj in line:
if not isinstance(obj, TextItem): continue
if not isinstance(obj, LTText): continue
margin = obj.get_margin(ratio)
if x1 < obj.x0-margin:
s += ' '
@ -397,7 +420,7 @@ class TextBox(LayoutContainer):
s = ''
y0 = -INF
for obj in line:
if not isinstance(obj, TextItem): continue
if not isinstance(obj, LTText): continue
margin = obj.get_margin(ratio)
if obj.y1+margin < y0:
s += ' '
@ -407,9 +430,9 @@ class TextBox(LayoutContainer):
return
## Page
## LTPage
##
class Page(LayoutContainer):
class LTPage(LayoutContainer):
def __init__(self, id, bbox, rotate=0):
LayoutContainer.__init__(self, id, bbox)
@ -423,7 +446,7 @@ class Page(LayoutContainer):
return
def group_text(self, ratio):
self.group_objs(ratio, TextBox)
self.group_objs(ratio, LTTextBox)
if self.get_direction() == 'H':
lines = reorder_vh(self.objs, +1)
else:

View File

@ -1,38 +1,125 @@
#!/usr/bin/env python
import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, PDFPageAggregator
from pdfdevice import PDFDevice
from pdffont import PDFUnicodeNotDefined
from page import Page, LayoutContainer, TextItem, FigureItem, TextBox
from page import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
from utils import mult_matrix, translate_matrix, enc
from pdfparser import PDFDocument, PDFParser
from pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from cmap import CMapDB
# e(x): encode string
def e(x, codec='ascii'):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
return x.encode(codec, 'xmlcharrefreplace')
## PDFPageAggregator
##
class PDFPageAggregator(PDFDevice):
    """PDFDevice that collects everything drawn on a page into an LTPage.

    Text runs become LTText items, simple straight paths become LTLine
    or LTRect items, and figures become nested LTFigure containers.
    end_page() returns the finished LTPage.

    rsrc           -- resource manager, passed on to PDFDevice.
    pageno         -- id given to the first page; incremented per page.
    cluster_margin -- when truthy, end_page() calls
                      LTPage.group_text() with this value.
    """

    def __init__(self, rsrc, pageno=1, cluster_margin=None):
        PDFDevice.__init__(self, rsrc)
        self.cluster_margin = cluster_margin
        # Replacement returned by handle_undefined_char().
        self.undefined_char = '?'
        self.pageno = pageno
        # Containers that are suspended while a nested figure is built.
        self.stack = []
        return

    def begin_page(self, page):
        """Start a fresh LTPage for the given page."""
        self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
        return

    def end_page(self, _):
        """Finish the current LTPage and return it."""
        # Every begin_figure() must have been matched by end_figure().
        assert not self.stack
        assert isinstance(self.cur_item, LTPage)
        self.cur_item.fixate()
        self.pageno += 1
        if self.cluster_margin:
            self.cur_item.group_text(self.cluster_margin)
        return self.cur_item

    def begin_figure(self, name, bbox):
        """Suspend the current container and start a nested LTFigure."""
        self.stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox)
        return

    def end_figure(self, _):
        """Finish the current figure and add it to its parent container."""
        fig = self.cur_item
        self.cur_item.fixate()
        self.cur_item = self.stack.pop()
        self.cur_item.add(fig)
        return

    def handle_undefined_char(self, cidcoding, cid):
        """Return the placeholder for a character with no Unicode mapping.

        When self.debug is set, the offending code is reported on stderr.
        """
        if self.debug:
            # This module only does "import sys"; a bare "stderr" name is
            # not defined here and raised NameError in debug mode.
            sys.stderr.write('undefined: %r, %r\n' % (cidcoding, cid))
        return self.undefined_char

    def paint_path(self, gstate, matrix, stroke, fill, evenodd, path):
        """Record axis-aligned lines and rectangles found in a path.

        Only two path shapes are recognised: 'ml' (a single segment) and
        'mlllh' (a closed four-corner outline). Anything else, and any
        segment that is neither horizontal nor vertical, is ignored.
        matrix, stroke, fill and evenodd are not used here.
        """
        # Concatenate the operator letter of every path element.
        shape = ''.join(x[0] for x in path)
        if shape == 'ml':
            # horizontal/vertical line
            (_,x0,y0) = path[0]
            (_,x1,y1) = path[1]
            if y0 == y1:
                # horizontal ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
            elif x0 == x1:
                # vertical ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
        elif shape == 'mlllh':
            # rectangle
            (_,x0,y0) = path[0]
            (_,x1,y1) = path[1]
            (_,x2,y2) = path[2]
            (_,x3,y3) = path[3]
            # Accept the outline only when consecutive corners alternate
            # between sharing an x and sharing a y coordinate.
            if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                # Corners 0 and 2 are diagonally opposite.
                self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
        return

    def render_chars(self, textmatrix, textstate, chars):
        """Add one LTText for chars and return its advance (item.adv).

        Returns (0, 0) without adding anything when chars is empty.
        """
        if not chars:
            return (0, 0)
        item = LTText(textmatrix, textstate.font, textstate.fontsize,
                      textstate.charspace, textstate.scaling, chars)
        self.cur_item.add(item)
        return item.adv

    def render_string(self, textstate, textmatrix, seq):
        """Render a sequence of strings and numeric position adjustments.

        Numbers in seq flush the pending characters and shift the text
        matrix; other entries are decoded through the current font.
        """
        font = textstate.font
        textmatrix = mult_matrix(textmatrix, self.ctm)
        chars = []
        for x in seq:
            if isinstance(x, int) or isinstance(x, float):
                # Positioning adjustment: flush what we have, then move.
                (dx,dy) = self.render_chars(textmatrix, textstate, chars)
                dx -= x * textstate.scaling * .0001
                textmatrix = translate_matrix(textmatrix, (dx, dy))
                chars = []
            else:
                for cid in font.decode(x):
                    try:
                        char = font.to_unicode(cid)
                    except PDFUnicodeNotDefined:
                        # sys.exc_info() fetches the exception without
                        # version-specific "except X, e" / "as e" syntax.
                        (cidcoding, cid) = sys.exc_info()[1].args
                        char = self.handle_undefined_char(cidcoding, cid)
                    chars.append((char, cid))
                    # Code 32 in a single-byte font ends a word: flush and
                    # apply the extra word spacing.
                    if textstate.wordspace and not font.is_multibyte() and cid == 32:
                        (dx,dy) = self.render_chars(textmatrix, textstate, chars)
                        dx += textstate.wordspace * textstate.scaling * .01
                        textmatrix = translate_matrix(textmatrix, (dx, dy))
                        chars = []
        # Flush whatever is left after the last element.
        self.render_chars(textmatrix, textstate, chars)
        return
## PDFConverter
##
class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='ascii', cluster_margin=None):
PDFPageAggregator.__init__(self, rsrc)
self.cluster_margin = cluster_margin
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
self.outfp = outfp
self.codec = codec
return
def end_page(self, page):
page = PDFPageAggregator.end_page(self, page)
if self.cluster_margin:
page.group_text(self.cluster_margin)
return page
def write(self, text):
self.outfp.write(e(text, self.codec))
self.outfp.write(enc(text, self.codec))
return
@ -61,7 +148,7 @@ class TagExtractor(PDFDevice):
try:
char = font.to_unicode(cid)
text += char
except PDFUnicodeNotDefined, e:
except PDFUnicodeNotDefined:
pass
self.write(text)
return
@ -81,15 +168,15 @@ class TagExtractor(PDFDevice):
def begin_tag(self, tag, props=None):
s = ''
if props:
s = ''.join( ' %s="%s"' % (e(k), e(str(v))) for (k,v)
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
in sorted(props.iteritems()) )
self.outfp.write('<%s%s>' % (e(tag.name), s))
self.outfp.write('<%s%s>' % (enc(tag.name), s))
self.tag = tag
return
def end_tag(self):
assert self.tag
self.outfp.write('</%s>' % e(self.tag.name))
self.outfp.write('</%s>' % enc(self.tag.name))
self.tag = None
return
@ -105,26 +192,29 @@ class SGMLConverter(PDFConverter):
def end_page(self, page):
def render(item):
if isinstance(item, Page):
if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, item.get_bbox(), item.rotate))
for child in item:
render(child)
self.outfp.write('</page>\n')
elif isinstance(item, TextItem):
elif isinstance(item, LTText):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(e(item.font.fontname), item.is_vertical(),
(enc(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, FigureItem):
elif isinstance(item, LTLine):
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item:
render(child)
self.outfp.write('</figure>\n')
elif isinstance(item, TextBox):
elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
print item
for child in item:
render(child)
self.outfp.write('</textbox>\n')
@ -138,10 +228,10 @@ class SGMLConverter(PDFConverter):
##
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True,
pagepad=50, scale=1, cluster_margin=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
self.pagenum = pagenum
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
scale=1, showpageno=True, pagepad=50):
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
self.showpageno = showpageno
self.pagepad = pagepad
self.scale = scale
self.outfp.write('<html><head>\n')
@ -152,23 +242,23 @@ class HTMLConverter(PDFConverter):
self.show_text_border = False
return
def write_rect(self, color, x, y, w, h):
self.outfp.write('<span style="position:absolute; border: 1px solid %s; '
def write_rect(self, color, width, x, y, w, h):
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(color, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
return
def end_page(self, page):
def render(item):
if isinstance(item, Page):
self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height)
if self.pagenum:
if isinstance(item, LTPage):
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
if self.showpageno:
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
((self.yoffset-page.y1)*self.scale))
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
for child in item:
render(child)
elif isinstance(item, TextItem):
elif isinstance(item, LTText):
if item.vertical:
wmode = 'tb-rl'
else:
@ -180,9 +270,11 @@ class HTMLConverter(PDFConverter):
self.write(item.text)
self.outfp.write('</span>\n')
if self.show_text_border:
self.write_rect('red', item.x0, self.yoffset-item.y1, item.width, item.height)
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LayoutContainer):
self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height)
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item:
render(child)
return
@ -203,21 +295,21 @@ class HTMLConverter(PDFConverter):
##
class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False,
cluster_margin=None, word_margin=0.2):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
showpageno=False, word_margin=0.2):
if cluster_margin == None:
cluster_margin = 0.5
PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
self.pagenum = pagenum
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
self.showpageno = showpageno
self.word_margin = word_margin
return
def end_page(self, page):
def render(item):
if isinstance(item, TextItem):
if isinstance(item, LTText):
self.outfp.write(obj.text.encode(self.codec, 'replace'))
self.outfp.write('\n')
elif isinstance(item, TextBox):
elif isinstance(item, LTTextBox):
for line in item.get_lines(self.word_margin):
self.outfp.write(line.encode(self.codec, 'replace')+'\n')
self.outfp.write('\n')
@ -225,7 +317,7 @@ class TextConverter(PDFConverter):
for child in item:
render(child)
page = PDFConverter.end_page(self, page)
if self.pagenum:
if self.showpageno:
self.outfp.write('Page %d\n' % page.id)
render(page)
self.outfp.write('\f')
@ -235,29 +327,6 @@ class TextConverter(PDFConverter):
return
# pdf2txt
class TextExtractionNotAllowed(RuntimeError): pass
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
doc = PDFDocument()
fp = file(fname, 'rb')
parser = PDFParser(doc, fp)
try:
doc.initialize(password)
except PDFPasswordIncorrect:
raise TextExtractionNotAllowed('Incorrect password')
if not doc.is_extractable:
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
interpreter = PDFPageInterpreter(rsrc, device)
for (pageno,page) in enumerate(doc.get_pages()):
if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page)
if maxpages and maxpages <= pageno+1: break
device.close()
fp.close()
return
# main
def main(argv):
import getopt
@ -269,30 +338,35 @@ def main(argv):
except getopt.GetoptError:
return usage()
if not args: return usage()
# debug option
debug = 0
# path option
cmapdir = 'CMap'
cdbcmapdir = 'CDBCMap'
codec = 'utf-8'
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
outtype = 'html'
password = ''
pagenum = True
splitwords = False
cluster_margin = None
codec = 'utf-8'
outfp = sys.stdout
cluster_margin = None
pageno = 1
scale = 1
showpageno = True
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v
elif k == '-c': codec = v
elif k == '-m': maxpages = int(v)
elif k == '-C': cmapdir = v
elif k == '-D': cdbcmapdir = v
elif k == '-T': cluster_margin = float(v)
elif k == '-P': password = v
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-o': outfp = file(v, 'wb')
elif k == '-w': splitwords = True
elif k == '-s': scale = float(v)
elif k == '-T': cluster_margin = float(v)
#
CMapDB.debug = debug
PDFResourceManager.debug = debug
@ -305,7 +379,7 @@ def main(argv):
if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale)
elif outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'tag':
@ -313,8 +387,8 @@ def main(argv):
else:
return usage()
for fname in args:
convert(rsrc, device, fname, pagenos,
maxpages=maxpages, password=password)
process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
device.close()
return
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -1,16 +1,15 @@
#!/usr/bin/env python
import sys
stderr = sys.stderr
from psparser import PSLiteralTable
## ColorSpace
## PDFColorSpace
##
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
class ColorSpace(object):
class PDFColorSpace(object):
def __init__(self, name, ncomponents):
self.name = name
@ -18,11 +17,11 @@ class ColorSpace(object):
return
def __repr__(self):
return '<ColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
PREDEFINED_COLORSPACE = dict(
(name, ColorSpace(name,n)) for (name,n) in {
(name, PDFColorSpace(name,n)) for (name,n) in {
'CalRGB': 3,
'CalGray': 1,
'Lab': 3,

View File

@ -1,11 +1,4 @@
#!/usr/bin/env python
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined
from page import Page, FigureItem, TextItem
from utils import mult_matrix, translate_matrix
## PDFDevice
##
@ -50,92 +43,3 @@ class PDFDevice(object):
return
def render_image(self, stream, size, matrix):
return
## PDFPageAggregator
##
class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1):
PDFDevice.__init__(self, rsrc)
self.pageno = pageno
self.stack = []
return
def begin_page(self, page):
self.cur_item = Page(self.pageno, page.mediabox, page.rotate)
return
def end_page(self, _):
assert not self.stack
assert isinstance(self.cur_item, Page)
self.cur_item.fixate()
self.pageno += 1
return self.cur_item
def begin_figure(self, name, bbox):
self.stack.append(self.cur_item)
self.cur_item = FigureItem(name, bbox)
return
def end_figure(self, _):
fig = self.cur_item
self.cur_item.fixate()
self.cur_item = self.stack.pop()
self.cur_item.add(fig)
return
def handle_undefined_char(self, cidcoding, cid):
if self.debug:
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
return '?'
def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path):
shape = ''.join(x[0] for x in path)
if shape == 'ml': # single line
if path[0][1] == path[1][1]:
#print 'vertical'
pass
elif path[0][2] == path[1][2]:
#print 'horizontal'
pass
elif shape == 'mlllh': # rectangle
if ((path[0][1] == path[1][1] and path[1][2] == path[2][2] and
path[2][1] == path[3][1] and path[3][2] == path[0][2]) or
(path[0][2] == path[1][2] and path[1][1] == path[2][1] and
path[2][2] == path[3][2] and path[3][1] == path[0][1])):
pass
return
def render_chars(self, textmatrix, textstate, chars):
if not chars: return (0, 0)
item = TextItem(textmatrix, textstate.font, textstate.fontsize,
textstate.charspace, textstate.scaling, chars)
self.cur_item.add(item)
return item.adv
def render_string(self, textstate, textmatrix, seq):
font = textstate.font
textmatrix = mult_matrix(textmatrix, self.ctm)
chars = []
for x in seq:
if isinstance(x, int) or isinstance(x, float):
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx -= x * textstate.scaling * .0001
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = []
else:
for cid in font.decode(x):
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if textstate.wordspace and not font.is_multibyte() and cid == 32:
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = []
self.render_chars(textmatrix, textstate, chars)
return

View File

@ -1,6 +1,5 @@
#!/usr/bin/env python
import sys
stderr = sys.stderr
from struct import pack, unpack
try:
from cStringIO import StringIO

View File

@ -14,7 +14,8 @@ from pdftypes import PDFException, PDFStream, PDFObjRef, \
str_value, list_value, dict_value, stream_value
from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
from cmap import CMapDB
@ -34,6 +35,56 @@ LITERAL_FORM = PSLiteralTable.intern('Form')
LITERAL_IMAGE = PSLiteralTable.intern('Image')
## PDFTextState
##
class PDFTextState(object):
    """Holder for the text-related rendering parameters.

    All parameters start at fixed defaults; matrix and linematrix are
    (re)initialised by reset().
    """

    def __init__(self):
        self.font = None
        # Numeric parameters that start at zero.
        self.fontsize = 0
        self.charspace = 0
        self.wordspace = 0
        self.leading = 0
        self.render = 0
        self.rise = 0
        self.scaling = 100
        # matrix / linematrix are set up by reset().
        self.reset()

    def __repr__(self):
        template = ('<PDFTextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
                    ' scaling=%r, leading=%r, render=%r, rise=%r, '
                    ' matrix=%r, linematrix=%r>')
        values = (self.font, self.fontsize, self.charspace, self.wordspace,
                  self.scaling, self.leading, self.render, self.rise,
                  self.matrix, self.linematrix)
        return template % values

    def reset(self):
        """Put matrix and linematrix back to their initial values."""
        (self.matrix, self.linematrix) = (MATRIX_IDENTITY, (0, 0))
## PDFGraphicState
##
class PDFGraphicState(object):
    """Holder for the graphics-related rendering parameters.

    linewidth starts at 0; every other parameter starts as None.
    """

    def __init__(self):
        self.linewidth = 0
        # The remaining parameters have no default value.
        self.linecap = self.linejoin = self.miterlimit = None
        self.dash = self.intent = self.flatness = None

    def __repr__(self):
        template = ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
                    ' miterlimit=%r, dash=%r, intent=%r, flatness=%r>')
        values = (self.linewidth, self.linecap, self.linejoin,
                  self.miterlimit, self.dash, self.intent, self.flatness)
        return template % values
## Resource Manager
##
class PDFResourceManager(object):
@ -207,46 +258,6 @@ class PDFPageInterpreter(object):
debug = 0
class TextState(object):
def __init__(self):
self.font = None
self.fontsize = 0
self.charspace = 0
self.wordspace = 0
self.scaling = 100
self.leading = 0
self.render = 0
self.rise = 0
self.reset()
return
def __repr__(self):
return ('<TextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
' scaling=%r, leading=%r, render=%r, rise=%r, '
' matrix=%r, linematrix=%r>' %
(self.font, self.fontsize, self.charspace, self.wordspace,
self.scaling, self.leading, self.render, self.rise,
self.matrix, self.linematrix))
def reset(self):
self.matrix = MATRIX_IDENTITY
self.linematrix = (0, 0)
return
class GraphicState(object):
def __init__(self):
self.linewidth = None
self.linecap = None
self.linejoin = None
self.miterlimit = None
self.dash = None
self.intent = None
self.flatness = None
return
def __repr__(self):
return ('<GraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
' miterlimit=%r, dash=%r, intent=%r, flatness=%r>' %
(self.linewidth, self.linecap, self.linejoin,
self.miterlimit, self.dash, self.intent, self.flatness))
def __init__(self, rsrc, device):
self.rsrc = rsrc
self.device = device
@ -255,23 +266,24 @@ class PDFPageInterpreter(object):
def dup(self):
return PDFPageInterpreter(self.rsrc, self.device)
# init_resources(resources):
# Prepare the fonts and XObjects listed in the Resource attribute.
def init_resources(self, resources):
self.fontmap = {}
self.xobjmap = {}
self.csmap = PREDEFINED_COLORSPACE.copy()
# Handle resource declarations.
if not resources: return
def get_colorspace(spec):
if isinstance(spec, list):
name = literal_name(spec[0])
else:
name = literal_name(spec)
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
return ColorSpace(name, stream_value(spec[1]).dic['N'])
return PDFColorSpace(name, stream_value(spec[1]).dic['N'])
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
return ColorSpace(name, len(list_value(spec[1])))
return PDFColorSpace(name, len(list_value(spec[1])))
else:
return PREDEFINED_COLORSPACE[name]
if resources:
for (k,v) in dict_value(resources).iteritems():
if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v)
@ -292,13 +304,15 @@ class PDFPageInterpreter(object):
self.xobjmap[xobjid] = xobjstrm
return
# init_state(ctm)
# Initialize the text and graphic states for rendering a page.
def init_state(self, ctm):
# gstack: stack for graphical states.
self.gstack = []
self.ctm = ctm
self.device.set_ctm(self.ctm)
self.textstate = self.TextState()
self.graphicstate = self.GraphicState()
self.textstate = PDFTextState()
self.graphicstate = PDFGraphicState()
self.curpath = []
# argstack: stack for command arguments.
self.argstack = []
@ -700,10 +714,13 @@ class PDFPageInterpreter(object):
self.device.end_page(page)
return
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
# render_contents(resources, streams, ctm)
# Render the content streams.
# This method may be called recursively.
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
self.init_resources(resources)
self.init_state(ctm)
self.execute(list_value(contents))
self.execute(list_value(streams))
return
def execute(self, streams):
@ -738,3 +755,26 @@ class PDFPageInterpreter(object):
else:
self.push(obj)
return
## process_pdf
##
class TextExtractionNotAllowed(RuntimeError):
    """Raised by process_pdf() when the password is incorrect or the
    document does not allow text extraction."""
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
    """Open a PDF file and feed its pages to device through an interpreter.

    rsrc     -- resource manager handed to PDFPageInterpreter.
    device   -- output device handed to PDFPageInterpreter. It is NOT
                closed here; that is left to the caller.
    fname    -- path of the PDF file to read.
    pagenos  -- optional collection of zero-based page indexes; when it
                is non-empty, pages not listed are skipped.
    maxpages -- when non-zero, stop once the zero-based index of the
                page just processed, plus one, reaches this value.
    password -- password passed to PDFDocument.initialize().

    Raises TextExtractionNotAllowed when the password is rejected or the
    document reports that it is not extractable.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    # The file used to stay open whenever one of the exceptions below (or
    # any interpreter error) was raised; try/finally closes it on every
    # exit path.
    try:
        # parser is not referenced again in this function; it is kept
        # because constructing it presumably attaches doc to the stream
        # (NOTE(review): confirm against PDFParser).
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
        except PDFPasswordIncorrect:
            raise TextExtractionNotAllowed('Incorrect password')
        if not doc.is_extractable:
            raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
        interpreter = PDFPageInterpreter(rsrc, device)
        for (pageno,page) in enumerate(doc.get_pages()):
            if pagenos and (pageno not in pagenos): continue
            interpreter.process_page(page)
            if maxpages and maxpages <= pageno+1: break
    finally:
        fp.close()
    return

View File

@ -205,6 +205,10 @@ class PDFXRefStream(PDFBaseXRef):
## PDFPage
##
## A PDFPage object is nothing more than a bunch of keys and values
## that describe the properties of the page and point to its contents,
## and has nothing to do with a real graphical entity.
##
class PDFPage(object):
def __init__(self, doc, pageid, attrs):

View File

@ -91,3 +91,8 @@ def decode_text(s):
return unicode(s[2:], 'utf-16be', 'ignore')
else:
return ''.join( PDFDocEncoding[ord(c)] for c in s )
# enc(x): escape a string for SGML/XML/HTML output, then encode it.
def enc(x, codec='ascii'):
    """Escape &, >, < and " as entities and encode the result with codec.

    Characters the codec cannot represent are emitted as numeric
    character references ('xmlcharrefreplace').
    """
    # '&' must be handled first so the entities inserted afterwards are
    # not escaped a second time.
    substitutions = (
        ('&', '&amp;'),
        ('>', '&gt;'),
        ('<', '&lt;'),
        ('"', '&quot;'),
    )
    escaped = x
    for (raw, entity) in substitutions:
        escaped = escaped.replace(raw, entity)
    return escaped.encode(codec, 'xmlcharrefreplace')