another major restructuring...
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@99 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
dfac85360b
commit
0a8aae5aa3
9
Makefile
9
Makefile
|
@ -22,9 +22,10 @@ test:
|
||||||
cd samples && make
|
cd samples && make
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
cd pdflib && make clean
|
-cd pdflib && make clean
|
||||||
cd tools && make clean
|
-cd tools && make clean
|
||||||
cd samples && make clean
|
-cd samples && make clean
|
||||||
|
-rm -rf build
|
||||||
|
|
||||||
# Maintainance:
|
# Maintainance:
|
||||||
|
|
||||||
|
@ -32,7 +33,7 @@ pack: clean
|
||||||
$(SVN) cleanup
|
$(SVN) cleanup
|
||||||
$(SVN) export . $(WORKDIR)/$(DISTNAME)
|
$(SVN) export . $(WORKDIR)/$(DISTNAME)
|
||||||
$(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
|
$(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
|
||||||
rm -rf $(WORKDIR)/$(DISTNAME)
|
-rm -rf $(WORKDIR)/$(DISTNAME)
|
||||||
|
|
||||||
check:
|
check:
|
||||||
-pychecker --limit=0 *.py
|
-pychecker --limit=0 *.py
|
||||||
|
|
|
@ -134,10 +134,10 @@ class Plane(object):
|
||||||
|
|
||||||
## ClusterSet
|
## ClusterSet
|
||||||
##
|
##
|
||||||
## Maintains a set of TextBox objects.
|
## Maintains a set of LTTextBox objects.
|
||||||
## It incrementally constructs TextBox objects
|
## It incrementally constructs LTTextBox objects
|
||||||
## and group them when necessary. It gives
|
## and group them when necessary. It gives
|
||||||
## a sequence of TextBox objects that represent
|
## a sequence of LTTextBox objects that represent
|
||||||
## the text stream of that page.
|
## the text stream of that page.
|
||||||
##
|
##
|
||||||
class ClusterSet(object):
|
class ClusterSet(object):
|
||||||
|
@ -145,11 +145,13 @@ class ClusterSet(object):
|
||||||
def __init__(self, klass):
|
def __init__(self, klass):
|
||||||
self.clusters = {}
|
self.clusters = {}
|
||||||
self.klass = klass
|
self.klass = klass
|
||||||
|
self.i = 0
|
||||||
return
|
return
|
||||||
|
|
||||||
# add(objs): groups text objects if necessary.
|
# add(objs): groups text objects if necessary.
|
||||||
def add(self, objs):
|
def add(self, objs):
|
||||||
group = self.klass(objs)
|
group = self.klass(objs, self.i)
|
||||||
|
self.i += 1
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
if obj in self.clusters:
|
if obj in self.clusters:
|
||||||
group.merge(self.clusters[obj])
|
group.merge(self.clusters[obj])
|
||||||
|
@ -157,7 +159,7 @@ class ClusterSet(object):
|
||||||
self.clusters[obj] = group
|
self.clusters[obj] = group
|
||||||
return
|
return
|
||||||
|
|
||||||
# finish(): returns all the TextBoxes in a page.
|
# finish(): returns all the LTTextBoxes in a page.
|
||||||
def finish(self):
|
def finish(self):
|
||||||
r = set(self.clusters.itervalues())
|
r = set(self.clusters.itervalues())
|
||||||
for group in r:
|
for group in r:
|
||||||
|
@ -169,9 +171,8 @@ class ClusterSet(object):
|
||||||
##
|
##
|
||||||
class LayoutItem(object):
|
class LayoutItem(object):
|
||||||
|
|
||||||
def __init__(self, id, bbox):
|
def __init__(self, bbox):
|
||||||
#assert x0 <= x1 and y0 <= y1
|
#assert x0 <= x1 and y0 <= y1
|
||||||
self.id = id
|
|
||||||
self.set_bbox(bbox)
|
self.set_bbox(bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -219,7 +220,8 @@ class LayoutItem(object):
|
||||||
class LayoutContainer(LayoutItem):
|
class LayoutContainer(LayoutItem):
|
||||||
|
|
||||||
def __init__(self, id, bbox, objs=None):
|
def __init__(self, id, bbox, objs=None):
|
||||||
LayoutItem.__init__(self, id, bbox)
|
LayoutItem.__init__(self, bbox)
|
||||||
|
self.id = id
|
||||||
if objs:
|
if objs:
|
||||||
self.objs = set(objs)
|
self.objs = set(objs)
|
||||||
else:
|
else:
|
||||||
|
@ -278,17 +280,38 @@ class LayoutContainer(LayoutItem):
|
||||||
return direction
|
return direction
|
||||||
|
|
||||||
|
|
||||||
## FigureItem
|
## LTLine
|
||||||
##
|
##
|
||||||
class FigureItem(LayoutContainer):
|
class LTLine(LayoutItem):
|
||||||
|
|
||||||
|
def __init__(self, linewidth, direction, bbox):
|
||||||
|
LayoutItem.__init__(self, bbox)
|
||||||
|
self.linewidth = linewidth
|
||||||
|
self.direction = direction
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## LTRect
|
||||||
|
##
|
||||||
|
class LTRect(LayoutItem):
|
||||||
|
|
||||||
|
def __init__(self, linewidth, bbox):
|
||||||
|
LayoutItem.__init__(self, bbox)
|
||||||
|
self.linewidth = linewidth
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## LTFigure
|
||||||
|
##
|
||||||
|
class LTFigure(LayoutContainer):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<figure id=%r bbox=%s>' % (self.id, self.get_bbox()))
|
return ('<figure id=%r bbox=%s>' % (self.id, self.get_bbox()))
|
||||||
|
|
||||||
|
|
||||||
## TextItem
|
## LTText
|
||||||
##
|
##
|
||||||
class TextItem(LayoutItem):
|
class LTText(LayoutItem):
|
||||||
|
|
||||||
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||||
assert chars
|
assert chars
|
||||||
|
@ -318,7 +341,7 @@ class TextItem(LayoutItem):
|
||||||
self.adv = (0, dy)
|
self.adv = (0, dy)
|
||||||
bbox = (tx, ty+dy, tx+dx, ty)
|
bbox = (tx, ty+dy, tx+dx, ty)
|
||||||
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
||||||
LayoutItem.__init__(self, None, bbox)
|
LayoutItem.__init__(self, bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -338,15 +361,15 @@ class TextItem(LayoutItem):
|
||||||
return self.vertical
|
return self.vertical
|
||||||
|
|
||||||
|
|
||||||
## TextBox
|
## LTTextBox
|
||||||
##
|
##
|
||||||
## A set of text objects that are clustered in
|
## A set of text objects that are grouped within
|
||||||
## a certain rectangular area.
|
## a certain rectangular area.
|
||||||
##
|
##
|
||||||
class TextBox(LayoutContainer):
|
class LTTextBox(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, objs):
|
def __init__(self, id, objs):
|
||||||
LayoutContainer.__init__(self, None, (0,0,0,0), objs)
|
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
||||||
self.direction = None
|
self.direction = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -385,7 +408,7 @@ class TextBox(LayoutContainer):
|
||||||
s = ''
|
s = ''
|
||||||
x1 = INF
|
x1 = INF
|
||||||
for obj in line:
|
for obj in line:
|
||||||
if not isinstance(obj, TextItem): continue
|
if not isinstance(obj, LTText): continue
|
||||||
margin = obj.get_margin(ratio)
|
margin = obj.get_margin(ratio)
|
||||||
if x1 < obj.x0-margin:
|
if x1 < obj.x0-margin:
|
||||||
s += ' '
|
s += ' '
|
||||||
|
@ -397,7 +420,7 @@ class TextBox(LayoutContainer):
|
||||||
s = ''
|
s = ''
|
||||||
y0 = -INF
|
y0 = -INF
|
||||||
for obj in line:
|
for obj in line:
|
||||||
if not isinstance(obj, TextItem): continue
|
if not isinstance(obj, LTText): continue
|
||||||
margin = obj.get_margin(ratio)
|
margin = obj.get_margin(ratio)
|
||||||
if obj.y1+margin < y0:
|
if obj.y1+margin < y0:
|
||||||
s += ' '
|
s += ' '
|
||||||
|
@ -407,9 +430,9 @@ class TextBox(LayoutContainer):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## Page
|
## LTPage
|
||||||
##
|
##
|
||||||
class Page(LayoutContainer):
|
class LTPage(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, bbox, rotate=0):
|
def __init__(self, id, bbox, rotate=0):
|
||||||
LayoutContainer.__init__(self, id, bbox)
|
LayoutContainer.__init__(self, id, bbox)
|
||||||
|
@ -423,7 +446,7 @@ class Page(LayoutContainer):
|
||||||
return
|
return
|
||||||
|
|
||||||
def group_text(self, ratio):
|
def group_text(self, ratio):
|
||||||
self.group_objs(ratio, TextBox)
|
self.group_objs(ratio, LTTextBox)
|
||||||
if self.get_direction() == 'H':
|
if self.get_direction() == 'H':
|
||||||
lines = reorder_vh(self.objs, +1)
|
lines = reorder_vh(self.objs, +1)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -1,38 +1,125 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
from pdfdevice import PDFDevice
|
||||||
from pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
||||||
from pdfdevice import PDFDevice, PDFPageAggregator
|
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
from page import Page, LayoutContainer, TextItem, FigureItem, TextBox
|
from page import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
|
||||||
|
from utils import mult_matrix, translate_matrix, enc
|
||||||
|
from pdfparser import PDFDocument, PDFParser
|
||||||
|
from pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
||||||
from cmap import CMapDB
|
from cmap import CMapDB
|
||||||
|
|
||||||
|
|
||||||
# e(x): encode string
|
|
||||||
def e(x, codec='ascii'):
|
## PDFPageAggregator
|
||||||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
##
|
||||||
return x.encode(codec, 'xmlcharrefreplace')
|
class PDFPageAggregator(PDFDevice):
|
||||||
|
|
||||||
|
def __init__(self, rsrc, pageno=1, cluster_margin=None):
|
||||||
|
PDFDevice.__init__(self, rsrc)
|
||||||
|
self.cluster_margin = cluster_margin
|
||||||
|
self.undefined_char = '?'
|
||||||
|
self.pageno = pageno
|
||||||
|
self.stack = []
|
||||||
|
return
|
||||||
|
|
||||||
|
def begin_page(self, page):
|
||||||
|
self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
|
||||||
|
return
|
||||||
|
|
||||||
|
def end_page(self, _):
|
||||||
|
assert not self.stack
|
||||||
|
assert isinstance(self.cur_item, LTPage)
|
||||||
|
self.cur_item.fixate()
|
||||||
|
self.pageno += 1
|
||||||
|
if self.cluster_margin:
|
||||||
|
self.cur_item.group_text(self.cluster_margin)
|
||||||
|
return self.cur_item
|
||||||
|
|
||||||
|
def begin_figure(self, name, bbox):
|
||||||
|
self.stack.append(self.cur_item)
|
||||||
|
self.cur_item = LTFigure(name, bbox)
|
||||||
|
return
|
||||||
|
|
||||||
|
def end_figure(self, _):
|
||||||
|
fig = self.cur_item
|
||||||
|
self.cur_item.fixate()
|
||||||
|
self.cur_item = self.stack.pop()
|
||||||
|
self.cur_item.add(fig)
|
||||||
|
return
|
||||||
|
|
||||||
|
def handle_undefined_char(self, cidcoding, cid):
|
||||||
|
if self.debug:
|
||||||
|
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||||
|
return self.undefined_char
|
||||||
|
|
||||||
|
def paint_path(self, gstate, matrix, stroke, fill, evenodd, path):
|
||||||
|
shape = ''.join(x[0] for x in path)
|
||||||
|
if shape == 'ml': # horizontal/vertical line
|
||||||
|
(_,x0,y0) = path[0]
|
||||||
|
(_,x1,y1) = path[1]
|
||||||
|
if y0 == y1:
|
||||||
|
# horizontal ruler
|
||||||
|
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
|
||||||
|
elif x0 == x1:
|
||||||
|
# vertical ruler
|
||||||
|
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
|
||||||
|
elif shape == 'mlllh':
|
||||||
|
# rectangle
|
||||||
|
(_,x0,y0) = path[0]
|
||||||
|
(_,x1,y1) = path[1]
|
||||||
|
(_,x2,y2) = path[2]
|
||||||
|
(_,x3,y3) = path[3]
|
||||||
|
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
||||||
|
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
||||||
|
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
||||||
|
return
|
||||||
|
|
||||||
|
def render_chars(self, textmatrix, textstate, chars):
|
||||||
|
if not chars: return (0, 0)
|
||||||
|
item = LTText(textmatrix, textstate.font, textstate.fontsize,
|
||||||
|
textstate.charspace, textstate.scaling, chars)
|
||||||
|
self.cur_item.add(item)
|
||||||
|
return item.adv
|
||||||
|
|
||||||
|
def render_string(self, textstate, textmatrix, seq):
|
||||||
|
font = textstate.font
|
||||||
|
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||||
|
chars = []
|
||||||
|
for x in seq:
|
||||||
|
if isinstance(x, int) or isinstance(x, float):
|
||||||
|
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||||
|
dx -= x * textstate.scaling * .0001
|
||||||
|
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||||
|
chars = []
|
||||||
|
else:
|
||||||
|
for cid in font.decode(x):
|
||||||
|
try:
|
||||||
|
char = font.to_unicode(cid)
|
||||||
|
except PDFUnicodeNotDefined, e:
|
||||||
|
(cidcoding, cid) = e.args
|
||||||
|
char = self.handle_undefined_char(cidcoding, cid)
|
||||||
|
chars.append((char, cid))
|
||||||
|
if textstate.wordspace and not font.is_multibyte() and cid == 32:
|
||||||
|
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||||
|
dx += textstate.wordspace * textstate.scaling * .01
|
||||||
|
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||||
|
chars = []
|
||||||
|
self.render_chars(textmatrix, textstate, chars)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## PDFConverter
|
## PDFConverter
|
||||||
##
|
##
|
||||||
class PDFConverter(PDFPageAggregator):
|
class PDFConverter(PDFPageAggregator):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='ascii', cluster_margin=None):
|
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'):
|
||||||
PDFPageAggregator.__init__(self, rsrc)
|
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
|
||||||
self.cluster_margin = cluster_margin
|
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
|
||||||
page = PDFPageAggregator.end_page(self, page)
|
|
||||||
if self.cluster_margin:
|
|
||||||
page.group_text(self.cluster_margin)
|
|
||||||
return page
|
|
||||||
|
|
||||||
def write(self, text):
|
def write(self, text):
|
||||||
self.outfp.write(e(text, self.codec))
|
self.outfp.write(enc(text, self.codec))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -61,7 +148,7 @@ class TagExtractor(PDFDevice):
|
||||||
try:
|
try:
|
||||||
char = font.to_unicode(cid)
|
char = font.to_unicode(cid)
|
||||||
text += char
|
text += char
|
||||||
except PDFUnicodeNotDefined, e:
|
except PDFUnicodeNotDefined:
|
||||||
pass
|
pass
|
||||||
self.write(text)
|
self.write(text)
|
||||||
return
|
return
|
||||||
|
@ -81,15 +168,15 @@ class TagExtractor(PDFDevice):
|
||||||
def begin_tag(self, tag, props=None):
|
def begin_tag(self, tag, props=None):
|
||||||
s = ''
|
s = ''
|
||||||
if props:
|
if props:
|
||||||
s = ''.join( ' %s="%s"' % (e(k), e(str(v))) for (k,v)
|
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
|
||||||
in sorted(props.iteritems()) )
|
in sorted(props.iteritems()) )
|
||||||
self.outfp.write('<%s%s>' % (e(tag.name), s))
|
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
||||||
self.tag = tag
|
self.tag = tag
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_tag(self):
|
def end_tag(self):
|
||||||
assert self.tag
|
assert self.tag
|
||||||
self.outfp.write('</%s>' % e(self.tag.name))
|
self.outfp.write('</%s>' % enc(self.tag.name))
|
||||||
self.tag = None
|
self.tag = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -105,26 +192,29 @@ class SGMLConverter(PDFConverter):
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, Page):
|
if isinstance(item, LTPage):
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||||
(item.id, item.get_bbox(), item.rotate))
|
(item.id, item.get_bbox(), item.rotate))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</page>\n')
|
self.outfp.write('</page>\n')
|
||||||
elif isinstance(item, TextItem):
|
elif isinstance(item, LTText):
|
||||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||||
(e(item.font.fontname), item.is_vertical(),
|
(enc(item.font.fontname), item.is_vertical(),
|
||||||
item.get_bbox(), item.fontsize))
|
item.get_bbox(), item.fontsize))
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
self.outfp.write('</text>\n')
|
self.outfp.write('</text>\n')
|
||||||
elif isinstance(item, FigureItem):
|
elif isinstance(item, LTLine):
|
||||||
|
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
||||||
|
elif isinstance(item, LTRect):
|
||||||
|
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
|
||||||
|
elif isinstance(item, LTFigure):
|
||||||
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</figure>\n')
|
self.outfp.write('</figure>\n')
|
||||||
elif isinstance(item, TextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
print item
|
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textbox>\n')
|
self.outfp.write('</textbox>\n')
|
||||||
|
@ -138,10 +228,10 @@ class SGMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class HTMLConverter(PDFConverter):
|
class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True,
|
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
||||||
pagepad=50, scale=1, cluster_margin=None):
|
scale=1, showpageno=True, pagepad=50):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
|
||||||
self.pagenum = pagenum
|
self.showpageno = showpageno
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.outfp.write('<html><head>\n')
|
self.outfp.write('<html><head>\n')
|
||||||
|
@ -152,23 +242,23 @@ class HTMLConverter(PDFConverter):
|
||||||
self.show_text_border = False
|
self.show_text_border = False
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_rect(self, color, x, y, w, h):
|
def write_rect(self, color, width, x, y, w, h):
|
||||||
self.outfp.write('<span style="position:absolute; border: 1px solid %s; '
|
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
|
||||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||||
(color, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
|
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, Page):
|
if isinstance(item, LTPage):
|
||||||
self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
if self.pagenum:
|
if self.showpageno:
|
||||||
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
||||||
((self.yoffset-page.y1)*self.scale))
|
((self.yoffset-page.y1)*self.scale))
|
||||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, TextItem):
|
elif isinstance(item, LTText):
|
||||||
if item.vertical:
|
if item.vertical:
|
||||||
wmode = 'tb-rl'
|
wmode = 'tb-rl'
|
||||||
else:
|
else:
|
||||||
|
@ -180,9 +270,11 @@ class HTMLConverter(PDFConverter):
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
self.outfp.write('</span>\n')
|
self.outfp.write('</span>\n')
|
||||||
if self.show_text_border:
|
if self.show_text_border:
|
||||||
self.write_rect('red', item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
|
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||||
|
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LayoutContainer):
|
elif isinstance(item, LayoutContainer):
|
||||||
self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
return
|
return
|
||||||
|
@ -203,21 +295,21 @@ class HTMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class TextConverter(PDFConverter):
|
class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False,
|
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
||||||
cluster_margin=None, word_margin=0.2):
|
showpageno=False, word_margin=0.2):
|
||||||
if cluster_margin == None:
|
if cluster_margin == None:
|
||||||
cluster_margin = 0.5
|
cluster_margin = 0.5
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
|
||||||
self.pagenum = pagenum
|
self.showpageno = showpageno
|
||||||
self.word_margin = word_margin
|
self.word_margin = word_margin
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, TextItem):
|
if isinstance(item, LTText):
|
||||||
self.outfp.write(obj.text.encode(self.codec, 'replace'))
|
self.outfp.write(obj.text.encode(self.codec, 'replace'))
|
||||||
self.outfp.write('\n')
|
self.outfp.write('\n')
|
||||||
elif isinstance(item, TextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
for line in item.get_lines(self.word_margin):
|
for line in item.get_lines(self.word_margin):
|
||||||
self.outfp.write(line.encode(self.codec, 'replace')+'\n')
|
self.outfp.write(line.encode(self.codec, 'replace')+'\n')
|
||||||
self.outfp.write('\n')
|
self.outfp.write('\n')
|
||||||
|
@ -225,7 +317,7 @@ class TextConverter(PDFConverter):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
if self.pagenum:
|
if self.showpageno:
|
||||||
self.outfp.write('Page %d\n' % page.id)
|
self.outfp.write('Page %d\n' % page.id)
|
||||||
render(page)
|
render(page)
|
||||||
self.outfp.write('\f')
|
self.outfp.write('\f')
|
||||||
|
@ -235,29 +327,6 @@ class TextConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# pdf2txt
|
|
||||||
class TextExtractionNotAllowed(RuntimeError): pass
|
|
||||||
|
|
||||||
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
|
||||||
doc = PDFDocument()
|
|
||||||
fp = file(fname, 'rb')
|
|
||||||
parser = PDFParser(doc, fp)
|
|
||||||
try:
|
|
||||||
doc.initialize(password)
|
|
||||||
except PDFPasswordIncorrect:
|
|
||||||
raise TextExtractionNotAllowed('Incorrect password')
|
|
||||||
if not doc.is_extractable:
|
|
||||||
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
|
|
||||||
interpreter = PDFPageInterpreter(rsrc, device)
|
|
||||||
for (pageno,page) in enumerate(doc.get_pages()):
|
|
||||||
if pagenos and (pageno not in pagenos): continue
|
|
||||||
interpreter.process_page(page)
|
|
||||||
if maxpages and maxpages <= pageno+1: break
|
|
||||||
device.close()
|
|
||||||
fp.close()
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
|
@ -269,30 +338,35 @@ def main(argv):
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
# debug option
|
||||||
debug = 0
|
debug = 0
|
||||||
|
# path option
|
||||||
cmapdir = 'CMap'
|
cmapdir = 'CMap'
|
||||||
cdbcmapdir = 'CDBCMap'
|
cdbcmapdir = 'CDBCMap'
|
||||||
codec = 'utf-8'
|
# input option
|
||||||
|
password = ''
|
||||||
pagenos = set()
|
pagenos = set()
|
||||||
maxpages = 0
|
maxpages = 0
|
||||||
|
# output option
|
||||||
outtype = 'html'
|
outtype = 'html'
|
||||||
password = ''
|
codec = 'utf-8'
|
||||||
pagenum = True
|
|
||||||
splitwords = False
|
|
||||||
cluster_margin = None
|
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
|
cluster_margin = None
|
||||||
|
pageno = 1
|
||||||
|
scale = 1
|
||||||
|
showpageno = True
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
|
||||||
elif k == '-P': password = v
|
|
||||||
elif k == '-c': codec = v
|
|
||||||
elif k == '-m': maxpages = int(v)
|
|
||||||
elif k == '-C': cmapdir = v
|
elif k == '-C': cmapdir = v
|
||||||
elif k == '-D': cdbcmapdir = v
|
elif k == '-D': cdbcmapdir = v
|
||||||
elif k == '-T': cluster_margin = float(v)
|
elif k == '-P': password = v
|
||||||
|
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||||
|
elif k == '-m': maxpages = int(v)
|
||||||
elif k == '-t': outtype = v
|
elif k == '-t': outtype = v
|
||||||
|
elif k == '-c': codec = v
|
||||||
elif k == '-o': outfp = file(v, 'wb')
|
elif k == '-o': outfp = file(v, 'wb')
|
||||||
elif k == '-w': splitwords = True
|
elif k == '-s': scale = float(v)
|
||||||
|
elif k == '-T': cluster_margin = float(v)
|
||||||
#
|
#
|
||||||
CMapDB.debug = debug
|
CMapDB.debug = debug
|
||||||
PDFResourceManager.debug = debug
|
PDFResourceManager.debug = debug
|
||||||
|
@ -305,7 +379,7 @@ def main(argv):
|
||||||
if outtype == 'sgml':
|
if outtype == 'sgml':
|
||||||
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale)
|
||||||
elif outtype == 'text':
|
elif outtype == 'text':
|
||||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
|
@ -313,8 +387,8 @@ def main(argv):
|
||||||
else:
|
else:
|
||||||
return usage()
|
return usage()
|
||||||
for fname in args:
|
for fname in args:
|
||||||
convert(rsrc, device, fname, pagenos,
|
process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
|
||||||
maxpages=maxpages, password=password)
|
device.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -1,16 +1,15 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
stderr = sys.stderr
|
|
||||||
from psparser import PSLiteralTable
|
from psparser import PSLiteralTable
|
||||||
|
|
||||||
|
|
||||||
## ColorSpace
|
## PDFColorSpace
|
||||||
##
|
##
|
||||||
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
||||||
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
||||||
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
||||||
|
|
||||||
class ColorSpace(object):
|
class PDFColorSpace(object):
|
||||||
|
|
||||||
def __init__(self, name, ncomponents):
|
def __init__(self, name, ncomponents):
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -18,11 +17,11 @@ class ColorSpace(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<ColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||||
|
|
||||||
|
|
||||||
PREDEFINED_COLORSPACE = dict(
|
PREDEFINED_COLORSPACE = dict(
|
||||||
(name, ColorSpace(name,n)) for (name,n) in {
|
(name, PDFColorSpace(name,n)) for (name,n) in {
|
||||||
'CalRGB': 3,
|
'CalRGB': 3,
|
||||||
'CalGray': 1,
|
'CalGray': 1,
|
||||||
'Lab': 3,
|
'Lab': 3,
|
||||||
|
|
|
@ -1,11 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
|
||||||
stdout = sys.stdout
|
|
||||||
stderr = sys.stderr
|
|
||||||
from pdffont import PDFUnicodeNotDefined
|
|
||||||
from page import Page, FigureItem, TextItem
|
|
||||||
from utils import mult_matrix, translate_matrix
|
|
||||||
|
|
||||||
|
|
||||||
## PDFDevice
|
## PDFDevice
|
||||||
##
|
##
|
||||||
|
@ -50,92 +43,3 @@ class PDFDevice(object):
|
||||||
return
|
return
|
||||||
def render_image(self, stream, size, matrix):
|
def render_image(self, stream, size, matrix):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## PDFPageAggregator
|
|
||||||
##
|
|
||||||
class PDFPageAggregator(PDFDevice):
|
|
||||||
|
|
||||||
def __init__(self, rsrc, pageno=1):
|
|
||||||
PDFDevice.__init__(self, rsrc)
|
|
||||||
self.pageno = pageno
|
|
||||||
self.stack = []
|
|
||||||
return
|
|
||||||
|
|
||||||
def begin_page(self, page):
|
|
||||||
self.cur_item = Page(self.pageno, page.mediabox, page.rotate)
|
|
||||||
return
|
|
||||||
|
|
||||||
def end_page(self, _):
|
|
||||||
assert not self.stack
|
|
||||||
assert isinstance(self.cur_item, Page)
|
|
||||||
self.cur_item.fixate()
|
|
||||||
self.pageno += 1
|
|
||||||
return self.cur_item
|
|
||||||
|
|
||||||
def begin_figure(self, name, bbox):
|
|
||||||
self.stack.append(self.cur_item)
|
|
||||||
self.cur_item = FigureItem(name, bbox)
|
|
||||||
return
|
|
||||||
|
|
||||||
def end_figure(self, _):
|
|
||||||
fig = self.cur_item
|
|
||||||
self.cur_item.fixate()
|
|
||||||
self.cur_item = self.stack.pop()
|
|
||||||
self.cur_item.add(fig)
|
|
||||||
return
|
|
||||||
|
|
||||||
def handle_undefined_char(self, cidcoding, cid):
|
|
||||||
if self.debug:
|
|
||||||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
|
||||||
return '?'
|
|
||||||
|
|
||||||
def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path):
|
|
||||||
shape = ''.join(x[0] for x in path)
|
|
||||||
if shape == 'ml': # single line
|
|
||||||
if path[0][1] == path[1][1]:
|
|
||||||
#print 'vertical'
|
|
||||||
pass
|
|
||||||
elif path[0][2] == path[1][2]:
|
|
||||||
#print 'horizontal'
|
|
||||||
pass
|
|
||||||
elif shape == 'mlllh': # rectangle
|
|
||||||
if ((path[0][1] == path[1][1] and path[1][2] == path[2][2] and
|
|
||||||
path[2][1] == path[3][1] and path[3][2] == path[0][2]) or
|
|
||||||
(path[0][2] == path[1][2] and path[1][1] == path[2][1] and
|
|
||||||
path[2][2] == path[3][2] and path[3][1] == path[0][1])):
|
|
||||||
pass
|
|
||||||
return
|
|
||||||
|
|
||||||
def render_chars(self, textmatrix, textstate, chars):
|
|
||||||
if not chars: return (0, 0)
|
|
||||||
item = TextItem(textmatrix, textstate.font, textstate.fontsize,
|
|
||||||
textstate.charspace, textstate.scaling, chars)
|
|
||||||
self.cur_item.add(item)
|
|
||||||
return item.adv
|
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, seq):
|
|
||||||
font = textstate.font
|
|
||||||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
|
||||||
chars = []
|
|
||||||
for x in seq:
|
|
||||||
if isinstance(x, int) or isinstance(x, float):
|
|
||||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
|
||||||
dx -= x * textstate.scaling * .0001
|
|
||||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
|
||||||
chars = []
|
|
||||||
else:
|
|
||||||
for cid in font.decode(x):
|
|
||||||
try:
|
|
||||||
char = font.to_unicode(cid)
|
|
||||||
except PDFUnicodeNotDefined, e:
|
|
||||||
(cidcoding, cid) = e.args
|
|
||||||
char = self.handle_undefined_char(cidcoding, cid)
|
|
||||||
chars.append((char, cid))
|
|
||||||
if textstate.wordspace and not font.is_multibyte() and cid == 32:
|
|
||||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
|
||||||
dx += textstate.wordspace * textstate.scaling * .01
|
|
||||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
|
||||||
chars = []
|
|
||||||
self.render_chars(textmatrix, textstate, chars)
|
|
||||||
return
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
stderr = sys.stderr
|
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
try:
|
try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
|
|
|
@ -14,7 +14,8 @@ from pdftypes import PDFException, PDFStream, PDFObjRef, \
|
||||||
str_value, list_value, dict_value, stream_value
|
str_value, list_value, dict_value, stream_value
|
||||||
from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
|
from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
|
||||||
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||||
from pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
|
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||||
|
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
|
||||||
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||||
from cmap import CMapDB
|
from cmap import CMapDB
|
||||||
|
|
||||||
|
@ -34,6 +35,56 @@ LITERAL_FORM = PSLiteralTable.intern('Form')
|
||||||
LITERAL_IMAGE = PSLiteralTable.intern('Image')
|
LITERAL_IMAGE = PSLiteralTable.intern('Image')
|
||||||
|
|
||||||
|
|
||||||
|
## PDFTextState
|
||||||
|
##
|
||||||
|
class PDFTextState(object):
    """Text parameters kept by the page interpreter.

    Stores the font, font size, character and word spacing, scaling,
    leading, render and rise values, together with the matrix and
    linematrix that reset() reinitialises.
    """

    def __init__(self):
        # No font is selected initially.
        self.font = None
        # Scaling starts at 100; the other numeric fields start at 0.
        self.fontsize = self.charspace = self.wordspace = 0
        self.scaling = 100
        self.leading = self.render = self.rise = 0
        self.reset()

    def __repr__(self):
        fields = (self.font, self.fontsize, self.charspace, self.wordspace,
                  self.scaling, self.leading, self.render, self.rise,
                  self.matrix, self.linematrix)
        template = ('<PDFTextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
                    ' scaling=%r, leading=%r, render=%r, rise=%r, '
                    ' matrix=%r, linematrix=%r>')
        return template % fields

    def reset(self):
        """Set matrix back to MATRIX_IDENTITY and linematrix back to (0, 0)."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
## PDFGraphicState
|
||||||
|
##
|
||||||
|
class PDFGraphicState(object):
    """Graphics parameters kept by the page interpreter.

    Stores the line width, line cap, line join, miter limit, dash,
    intent and flatness values.
    """

    def __init__(self):
        # Line width starts at 0; every other field starts unset.
        self.linewidth = 0
        self.linecap = self.linejoin = self.miterlimit = None
        self.dash = self.intent = self.flatness = None

    def __repr__(self):
        fields = (self.linewidth, self.linecap, self.linejoin,
                  self.miterlimit, self.dash, self.intent, self.flatness)
        template = ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
                    ' miterlimit=%r, dash=%r, intent=%r, flatness=%r>')
        return template % fields
|
||||||
|
|
||||||
## Resource Manager
|
## Resource Manager
|
||||||
##
|
##
|
||||||
class PDFResourceManager(object):
|
class PDFResourceManager(object):
|
||||||
|
@ -207,46 +258,6 @@ class PDFPageInterpreter(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
class TextState(object):
    """Text parameters: font, sizes, spacings, plus matrix and linematrix."""

    def __init__(self):
        # No font is selected initially.
        self.font = None
        # Scaling starts at 100; the other numeric fields start at 0.
        self.fontsize = self.charspace = self.wordspace = 0
        self.scaling = 100
        self.leading = self.render = self.rise = 0
        self.reset()

    def __repr__(self):
        fields = (self.font, self.fontsize, self.charspace, self.wordspace,
                  self.scaling, self.leading, self.render, self.rise,
                  self.matrix, self.linematrix)
        template = ('<TextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
                    ' scaling=%r, leading=%r, render=%r, rise=%r, '
                    ' matrix=%r, linematrix=%r>')
        return template % fields

    def reset(self):
        """Set matrix back to MATRIX_IDENTITY and linematrix back to (0, 0)."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)
|
|
||||||
|
|
||||||
class GraphicState(object):
    """Graphics parameters: line width/cap/join, miter limit, dash, intent, flatness."""

    def __init__(self):
        # Every field starts unset.
        self.linewidth = self.linecap = self.linejoin = None
        self.miterlimit = self.dash = None
        self.intent = self.flatness = None

    def __repr__(self):
        fields = (self.linewidth, self.linecap, self.linejoin,
                  self.miterlimit, self.dash, self.intent, self.flatness)
        template = ('<GraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
                    ' miterlimit=%r, dash=%r, intent=%r, flatness=%r>')
        return template % fields
|
|
||||||
|
|
||||||
def __init__(self, rsrc, device):
|
def __init__(self, rsrc, device):
|
||||||
self.rsrc = rsrc
|
self.rsrc = rsrc
|
||||||
self.device = device
|
self.device = device
|
||||||
|
@ -255,50 +266,53 @@ class PDFPageInterpreter(object):
|
||||||
def dup(self):
|
def dup(self):
|
||||||
return PDFPageInterpreter(self.rsrc, self.device)
|
return PDFPageInterpreter(self.rsrc, self.device)
|
||||||
|
|
||||||
|
# init_resources(resources):
|
||||||
|
# Prepare the fonts and XObjects listed in the Resource attribute.
|
||||||
def init_resources(self, resources):
|
def init_resources(self, resources):
|
||||||
self.fontmap = {}
|
self.fontmap = {}
|
||||||
self.xobjmap = {}
|
self.xobjmap = {}
|
||||||
self.csmap = PREDEFINED_COLORSPACE.copy()
|
self.csmap = PREDEFINED_COLORSPACE.copy()
|
||||||
# Handle resource declarations.
|
if not resources: return
|
||||||
def get_colorspace(spec):
|
def get_colorspace(spec):
|
||||||
if isinstance(spec, list):
|
if isinstance(spec, list):
|
||||||
name = literal_name(spec[0])
|
name = literal_name(spec[0])
|
||||||
else:
|
else:
|
||||||
name = literal_name(spec)
|
name = literal_name(spec)
|
||||||
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
|
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
|
||||||
return ColorSpace(name, stream_value(spec[1]).dic['N'])
|
return PDFColorSpace(name, stream_value(spec[1]).dic['N'])
|
||||||
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
|
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
|
||||||
return ColorSpace(name, len(list_value(spec[1])))
|
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||||
else:
|
else:
|
||||||
return PREDEFINED_COLORSPACE[name]
|
return PREDEFINED_COLORSPACE[name]
|
||||||
if resources:
|
for (k,v) in dict_value(resources).iteritems():
|
||||||
for (k,v) in dict_value(resources).iteritems():
|
if 1 <= self.debug:
|
||||||
if 1 <= self.debug:
|
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
if k == 'Font':
|
||||||
if k == 'Font':
|
for (fontid,spec) in dict_value(v).iteritems():
|
||||||
for (fontid,spec) in dict_value(v).iteritems():
|
objid = None
|
||||||
objid = None
|
if isinstance(spec, PDFObjRef):
|
||||||
if isinstance(spec, PDFObjRef):
|
objid = spec.objid
|
||||||
objid = spec.objid
|
spec = dict_value(spec)
|
||||||
spec = dict_value(spec)
|
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
|
||||||
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
|
elif k == 'ColorSpace':
|
||||||
elif k == 'ColorSpace':
|
for (csid,spec) in dict_value(v).iteritems():
|
||||||
for (csid,spec) in dict_value(v).iteritems():
|
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
elif k == 'ProcSet':
|
||||||
elif k == 'ProcSet':
|
self.rsrc.get_procset(list_value(v))
|
||||||
self.rsrc.get_procset(list_value(v))
|
elif k == 'XObject':
|
||||||
elif k == 'XObject':
|
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
||||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
self.xobjmap[xobjid] = xobjstrm
|
||||||
self.xobjmap[xobjid] = xobjstrm
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# init_state(ctm)
|
||||||
|
# Initialize the text and graphic states for rendering a page.
|
||||||
def init_state(self, ctm):
|
def init_state(self, ctm):
|
||||||
# gstack: stack for graphical states.
|
# gstack: stack for graphical states.
|
||||||
self.gstack = []
|
self.gstack = []
|
||||||
self.ctm = ctm
|
self.ctm = ctm
|
||||||
self.device.set_ctm(self.ctm)
|
self.device.set_ctm(self.ctm)
|
||||||
self.textstate = self.TextState()
|
self.textstate = PDFTextState()
|
||||||
self.graphicstate = self.GraphicState()
|
self.graphicstate = PDFGraphicState()
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
# argstack: stack for command arguments.
|
# argstack: stack for command arguments.
|
||||||
self.argstack = []
|
self.argstack = []
|
||||||
|
@ -700,10 +714,13 @@ class PDFPageInterpreter(object):
|
||||||
self.device.end_page(page)
|
self.device.end_page(page)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
|
# render_contents(resources, streams, ctm)
|
||||||
|
# Render the content streams.
|
||||||
|
# This method may be called recursively.
|
||||||
|
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
||||||
self.init_resources(resources)
|
self.init_resources(resources)
|
||||||
self.init_state(ctm)
|
self.init_state(ctm)
|
||||||
self.execute(list_value(contents))
|
self.execute(list_value(streams))
|
||||||
return
|
return
|
||||||
|
|
||||||
def execute(self, streams):
|
def execute(self, streams):
|
||||||
|
@ -738,3 +755,26 @@ class PDFPageInterpreter(object):
|
||||||
else:
|
else:
|
||||||
self.push(obj)
|
self.push(obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## process_pdf
|
||||||
|
##
|
||||||
|
class TextExtractionNotAllowed(RuntimeError):
    """Raised by process_pdf() when the password is wrong or extraction is not permitted."""
|
||||||
|
|
||||||
|
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
    """Open the PDF file *fname* and feed its pages to *device*.

    rsrc and device are handed to a new PDFPageInterpreter, which
    processes each selected page in document order.

    pagenos:  if non-empty, only pages whose zero-based index is contained
              in it are processed.
    maxpages: if non-zero, processing stops once the zero-based page index
              plus one reaches this value.
    password: passed to the document's initialize() method.

    Raises TextExtractionNotAllowed if the password is rejected or the
    document reports that it is not extractable.  The file is closed on
    every exit path, including when an exception propagates.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        # Kept bound for the whole run; presumably the parser attaches
        # itself to doc -- TODO confirm against PDFParser.
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
        except PDFPasswordIncorrect:
            raise TextExtractionNotAllowed('Incorrect password')
        if not doc.is_extractable:
            raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
        interpreter = PDFPageInterpreter(rsrc, device)
        for (pageno, page) in enumerate(doc.get_pages()):
            if pagenos and (pageno not in pagenos): continue
            interpreter.process_page(page)
            if maxpages and maxpages <= pageno+1: break
    finally:
        # Previously the handle leaked whenever anything above raised.
        fp.close()
    return
|
||||||
|
|
|
@ -205,6 +205,10 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
|
|
||||||
## PDFPage
|
## PDFPage
|
||||||
##
|
##
|
||||||
|
## A PDFPage object is nothing more than a bunch of keys and values
|
||||||
|
## that describe the properties of the page and point to its contents,
|
||||||
|
## and has nothing to do with a real graphical entity.
|
||||||
|
##
|
||||||
class PDFPage(object):
|
class PDFPage(object):
|
||||||
|
|
||||||
def __init__(self, doc, pageid, attrs):
|
def __init__(self, doc, pageid, attrs):
|
||||||
|
|
|
@ -91,3 +91,8 @@ def decode_text(s):
|
||||||
return unicode(s[2:], 'utf-16be', 'ignore')
|
return unicode(s[2:], 'utf-16be', 'ignore')
|
||||||
else:
|
else:
|
||||||
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
||||||
|
|
||||||
|
# enc(x): encode string in SGML/XML/HTML
|
||||||
|
def enc(x, codec='ascii'):
    """Escape *x* for SGML/XML/HTML and encode it with *codec*.

    The characters &, >, < and " are replaced by the entities &amp;,
    &gt;, &lt; and &quot;.  Characters that *codec* cannot represent are
    emitted as numeric character references (xmlcharrefreplace).

    Returns the encoded string.
    """
    # '&' must be escaped first so the ampersands introduced by the
    # other entities are not escaped a second time.
    x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
    return x.encode(codec, 'xmlcharrefreplace')
|
||||||
|
|
Loading…
Reference in New Issue