another major restructuring...
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@99 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
dfac85360b
commit
0a8aae5aa3
9
Makefile
9
Makefile
|
@ -22,9 +22,10 @@ test:
|
|||
cd samples && make
|
||||
|
||||
clean:
|
||||
cd pdflib && make clean
|
||||
cd tools && make clean
|
||||
cd samples && make clean
|
||||
-cd pdflib && make clean
|
||||
-cd tools && make clean
|
||||
-cd samples && make clean
|
||||
-rm -rf build
|
||||
|
||||
# Maintainance:
|
||||
|
||||
|
@ -32,7 +33,7 @@ pack: clean
|
|||
$(SVN) cleanup
|
||||
$(SVN) export . $(WORKDIR)/$(DISTNAME)
|
||||
$(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
|
||||
rm -rf $(WORKDIR)/$(DISTNAME)
|
||||
-rm -rf $(WORKDIR)/$(DISTNAME)
|
||||
|
||||
check:
|
||||
-pychecker --limit=0 *.py
|
||||
|
|
|
@ -134,10 +134,10 @@ class Plane(object):
|
|||
|
||||
## ClusterSet
|
||||
##
|
||||
## Maintains a set of TextBox objects.
|
||||
## It incrementally constructs TextBox objects
|
||||
## Maintains a set of LTTextBox objects.
|
||||
## It incrementally constructs LTTextBox objects
|
||||
## and group them when necessary. It gives
|
||||
## a sequence of TextBox objects that represent
|
||||
## a sequence of LTTextBox objects that represent
|
||||
## the text stream of that page.
|
||||
##
|
||||
class ClusterSet(object):
|
||||
|
@ -145,11 +145,13 @@ class ClusterSet(object):
|
|||
def __init__(self, klass):
|
||||
self.clusters = {}
|
||||
self.klass = klass
|
||||
self.i = 0
|
||||
return
|
||||
|
||||
# add(objs): groups text objects if necessary.
|
||||
def add(self, objs):
|
||||
group = self.klass(objs)
|
||||
group = self.klass(objs, self.i)
|
||||
self.i += 1
|
||||
for obj in objs:
|
||||
if obj in self.clusters:
|
||||
group.merge(self.clusters[obj])
|
||||
|
@ -157,7 +159,7 @@ class ClusterSet(object):
|
|||
self.clusters[obj] = group
|
||||
return
|
||||
|
||||
# finish(): returns all the TextBoxes in a page.
|
||||
# finish(): returns all the LTTextBoxes in a page.
|
||||
def finish(self):
|
||||
r = set(self.clusters.itervalues())
|
||||
for group in r:
|
||||
|
@ -169,9 +171,8 @@ class ClusterSet(object):
|
|||
##
|
||||
class LayoutItem(object):
|
||||
|
||||
def __init__(self, id, bbox):
|
||||
def __init__(self, bbox):
|
||||
#assert x0 <= x1 and y0 <= y1
|
||||
self.id = id
|
||||
self.set_bbox(bbox)
|
||||
return
|
||||
|
||||
|
@ -219,7 +220,8 @@ class LayoutItem(object):
|
|||
class LayoutContainer(LayoutItem):
|
||||
|
||||
def __init__(self, id, bbox, objs=None):
|
||||
LayoutItem.__init__(self, id, bbox)
|
||||
LayoutItem.__init__(self, bbox)
|
||||
self.id = id
|
||||
if objs:
|
||||
self.objs = set(objs)
|
||||
else:
|
||||
|
@ -278,17 +280,38 @@ class LayoutContainer(LayoutItem):
|
|||
return direction
|
||||
|
||||
|
||||
## FigureItem
|
||||
## LTLine
|
||||
##
|
||||
class FigureItem(LayoutContainer):
|
||||
class LTLine(LayoutItem):
|
||||
|
||||
def __init__(self, linewidth, direction, bbox):
|
||||
LayoutItem.__init__(self, bbox)
|
||||
self.linewidth = linewidth
|
||||
self.direction = direction
|
||||
return
|
||||
|
||||
|
||||
## LTRect
|
||||
##
|
||||
class LTRect(LayoutItem):
|
||||
|
||||
def __init__(self, linewidth, bbox):
|
||||
LayoutItem.__init__(self, bbox)
|
||||
self.linewidth = linewidth
|
||||
return
|
||||
|
||||
|
||||
## LTFigure
|
||||
##
|
||||
class LTFigure(LayoutContainer):
|
||||
|
||||
def __repr__(self):
|
||||
return ('<figure id=%r bbox=%s>' % (self.id, self.get_bbox()))
|
||||
|
||||
|
||||
## TextItem
|
||||
## LTText
|
||||
##
|
||||
class TextItem(LayoutItem):
|
||||
class LTText(LayoutItem):
|
||||
|
||||
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||
assert chars
|
||||
|
@ -318,7 +341,7 @@ class TextItem(LayoutItem):
|
|||
self.adv = (0, dy)
|
||||
bbox = (tx, ty+dy, tx+dx, ty)
|
||||
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
||||
LayoutItem.__init__(self, None, bbox)
|
||||
LayoutItem.__init__(self, bbox)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -338,15 +361,15 @@ class TextItem(LayoutItem):
|
|||
return self.vertical
|
||||
|
||||
|
||||
## TextBox
|
||||
## LTTextBox
|
||||
##
|
||||
## A set of text objects that are clustered in
|
||||
## A set of text objects that are grouped within
|
||||
## a certain rectangular area.
|
||||
##
|
||||
class TextBox(LayoutContainer):
|
||||
class LTTextBox(LayoutContainer):
|
||||
|
||||
def __init__(self, objs):
|
||||
LayoutContainer.__init__(self, None, (0,0,0,0), objs)
|
||||
def __init__(self, id, objs):
|
||||
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
||||
self.direction = None
|
||||
return
|
||||
|
||||
|
@ -385,7 +408,7 @@ class TextBox(LayoutContainer):
|
|||
s = ''
|
||||
x1 = INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, TextItem): continue
|
||||
if not isinstance(obj, LTText): continue
|
||||
margin = obj.get_margin(ratio)
|
||||
if x1 < obj.x0-margin:
|
||||
s += ' '
|
||||
|
@ -397,7 +420,7 @@ class TextBox(LayoutContainer):
|
|||
s = ''
|
||||
y0 = -INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, TextItem): continue
|
||||
if not isinstance(obj, LTText): continue
|
||||
margin = obj.get_margin(ratio)
|
||||
if obj.y1+margin < y0:
|
||||
s += ' '
|
||||
|
@ -407,9 +430,9 @@ class TextBox(LayoutContainer):
|
|||
return
|
||||
|
||||
|
||||
## Page
|
||||
## LTPage
|
||||
##
|
||||
class Page(LayoutContainer):
|
||||
class LTPage(LayoutContainer):
|
||||
|
||||
def __init__(self, id, bbox, rotate=0):
|
||||
LayoutContainer.__init__(self, id, bbox)
|
||||
|
@ -423,7 +446,7 @@ class Page(LayoutContainer):
|
|||
return
|
||||
|
||||
def group_text(self, ratio):
|
||||
self.group_objs(ratio, TextBox)
|
||||
self.group_objs(ratio, LTTextBox)
|
||||
if self.get_direction() == 'H':
|
||||
lines = reorder_vh(self.objs, +1)
|
||||
else:
|
||||
|
|
|
@ -1,38 +1,125 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfdevice import PDFDevice, PDFPageAggregator
|
||||
from pdfdevice import PDFDevice
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from page import Page, LayoutContainer, TextItem, FigureItem, TextBox
|
||||
from page import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
|
||||
from utils import mult_matrix, translate_matrix, enc
|
||||
from pdfparser import PDFDocument, PDFParser
|
||||
from pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
||||
from cmap import CMapDB
|
||||
|
||||
|
||||
# e(x): encode string
|
||||
def e(x, codec='ascii'):
|
||||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
||||
return x.encode(codec, 'xmlcharrefreplace')
|
||||
|
||||
## PDFPageAggregator
|
||||
##
|
||||
class PDFPageAggregator(PDFDevice):
|
||||
|
||||
def __init__(self, rsrc, pageno=1, cluster_margin=None):
|
||||
PDFDevice.__init__(self, rsrc)
|
||||
self.cluster_margin = cluster_margin
|
||||
self.undefined_char = '?'
|
||||
self.pageno = pageno
|
||||
self.stack = []
|
||||
return
|
||||
|
||||
def begin_page(self, page):
|
||||
self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
|
||||
return
|
||||
|
||||
def end_page(self, _):
|
||||
assert not self.stack
|
||||
assert isinstance(self.cur_item, LTPage)
|
||||
self.cur_item.fixate()
|
||||
self.pageno += 1
|
||||
if self.cluster_margin:
|
||||
self.cur_item.group_text(self.cluster_margin)
|
||||
return self.cur_item
|
||||
|
||||
def begin_figure(self, name, bbox):
|
||||
self.stack.append(self.cur_item)
|
||||
self.cur_item = LTFigure(name, bbox)
|
||||
return
|
||||
|
||||
def end_figure(self, _):
|
||||
fig = self.cur_item
|
||||
self.cur_item.fixate()
|
||||
self.cur_item = self.stack.pop()
|
||||
self.cur_item.add(fig)
|
||||
return
|
||||
|
||||
def handle_undefined_char(self, cidcoding, cid):
|
||||
if self.debug:
|
||||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||
return self.undefined_char
|
||||
|
||||
def paint_path(self, gstate, matrix, stroke, fill, evenodd, path):
|
||||
shape = ''.join(x[0] for x in path)
|
||||
if shape == 'ml': # horizontal/vertical line
|
||||
(_,x0,y0) = path[0]
|
||||
(_,x1,y1) = path[1]
|
||||
if y0 == y1:
|
||||
# horizontal ruler
|
||||
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
|
||||
elif x0 == x1:
|
||||
# vertical ruler
|
||||
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
|
||||
elif shape == 'mlllh':
|
||||
# rectangle
|
||||
(_,x0,y0) = path[0]
|
||||
(_,x1,y1) = path[1]
|
||||
(_,x2,y2) = path[2]
|
||||
(_,x3,y3) = path[3]
|
||||
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
||||
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
||||
return
|
||||
|
||||
def render_chars(self, textmatrix, textstate, chars):
|
||||
if not chars: return (0, 0)
|
||||
item = LTText(textmatrix, textstate.font, textstate.fontsize,
|
||||
textstate.charspace, textstate.scaling, chars)
|
||||
self.cur_item.add(item)
|
||||
return item.adv
|
||||
|
||||
def render_string(self, textstate, textmatrix, seq):
|
||||
font = textstate.font
|
||||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||
chars = []
|
||||
for x in seq:
|
||||
if isinstance(x, int) or isinstance(x, float):
|
||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||
dx -= x * textstate.scaling * .0001
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
chars = []
|
||||
else:
|
||||
for cid in font.decode(x):
|
||||
try:
|
||||
char = font.to_unicode(cid)
|
||||
except PDFUnicodeNotDefined, e:
|
||||
(cidcoding, cid) = e.args
|
||||
char = self.handle_undefined_char(cidcoding, cid)
|
||||
chars.append((char, cid))
|
||||
if textstate.wordspace and not font.is_multibyte() and cid == 32:
|
||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||
dx += textstate.wordspace * textstate.scaling * .01
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
chars = []
|
||||
self.render_chars(textmatrix, textstate, chars)
|
||||
return
|
||||
|
||||
|
||||
## PDFConverter
|
||||
##
|
||||
class PDFConverter(PDFPageAggregator):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='ascii', cluster_margin=None):
|
||||
PDFPageAggregator.__init__(self, rsrc)
|
||||
self.cluster_margin = cluster_margin
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'):
|
||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
page = PDFPageAggregator.end_page(self, page)
|
||||
if self.cluster_margin:
|
||||
page.group_text(self.cluster_margin)
|
||||
return page
|
||||
|
||||
def write(self, text):
|
||||
self.outfp.write(e(text, self.codec))
|
||||
self.outfp.write(enc(text, self.codec))
|
||||
return
|
||||
|
||||
|
||||
|
@ -61,7 +148,7 @@ class TagExtractor(PDFDevice):
|
|||
try:
|
||||
char = font.to_unicode(cid)
|
||||
text += char
|
||||
except PDFUnicodeNotDefined, e:
|
||||
except PDFUnicodeNotDefined:
|
||||
pass
|
||||
self.write(text)
|
||||
return
|
||||
|
@ -81,15 +168,15 @@ class TagExtractor(PDFDevice):
|
|||
def begin_tag(self, tag, props=None):
|
||||
s = ''
|
||||
if props:
|
||||
s = ''.join( ' %s="%s"' % (e(k), e(str(v))) for (k,v)
|
||||
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
|
||||
in sorted(props.iteritems()) )
|
||||
self.outfp.write('<%s%s>' % (e(tag.name), s))
|
||||
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
||||
self.tag = tag
|
||||
return
|
||||
|
||||
def end_tag(self):
|
||||
assert self.tag
|
||||
self.outfp.write('</%s>' % e(self.tag.name))
|
||||
self.outfp.write('</%s>' % enc(self.tag.name))
|
||||
self.tag = None
|
||||
return
|
||||
|
||||
|
@ -105,26 +192,29 @@ class SGMLConverter(PDFConverter):
|
|||
|
||||
def end_page(self, page):
|
||||
def render(item):
|
||||
if isinstance(item, Page):
|
||||
if isinstance(item, LTPage):
|
||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||
(item.id, item.get_bbox(), item.rotate))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</page>\n')
|
||||
elif isinstance(item, TextItem):
|
||||
elif isinstance(item, LTText):
|
||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||
(e(item.font.fontname), item.is_vertical(),
|
||||
(enc(item.font.fontname), item.is_vertical(),
|
||||
item.get_bbox(), item.fontsize))
|
||||
self.write(item.text)
|
||||
self.outfp.write('</text>\n')
|
||||
elif isinstance(item, FigureItem):
|
||||
elif isinstance(item, LTLine):
|
||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
||||
elif isinstance(item, LTRect):
|
||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
|
||||
elif isinstance(item, LTFigure):
|
||||
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</figure>\n')
|
||||
elif isinstance(item, TextBox):
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||
print item
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</textbox>\n')
|
||||
|
@ -138,10 +228,10 @@ class SGMLConverter(PDFConverter):
|
|||
##
|
||||
class HTMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True,
|
||||
pagepad=50, scale=1, cluster_margin=None):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
self.pagenum = pagenum
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
||||
scale=1, showpageno=True, pagepad=50):
|
||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
|
||||
self.showpageno = showpageno
|
||||
self.pagepad = pagepad
|
||||
self.scale = scale
|
||||
self.outfp.write('<html><head>\n')
|
||||
|
@ -152,23 +242,23 @@ class HTMLConverter(PDFConverter):
|
|||
self.show_text_border = False
|
||||
return
|
||||
|
||||
def write_rect(self, color, x, y, w, h):
|
||||
self.outfp.write('<span style="position:absolute; border: 1px solid %s; '
|
||||
def write_rect(self, color, width, x, y, w, h):
|
||||
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||
(color, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
|
||||
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
def render(item):
|
||||
if isinstance(item, Page):
|
||||
self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
if self.pagenum:
|
||||
if isinstance(item, LTPage):
|
||||
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
if self.showpageno:
|
||||
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
||||
((self.yoffset-page.y1)*self.scale))
|
||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
||||
for child in item:
|
||||
render(child)
|
||||
elif isinstance(item, TextItem):
|
||||
elif isinstance(item, LTText):
|
||||
if item.vertical:
|
||||
wmode = 'tb-rl'
|
||||
else:
|
||||
|
@ -180,9 +270,11 @@ class HTMLConverter(PDFConverter):
|
|||
self.write(item.text)
|
||||
self.outfp.write('</span>\n')
|
||||
if self.show_text_border:
|
||||
self.write_rect('red', item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LayoutContainer):
|
||||
self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
for child in item:
|
||||
render(child)
|
||||
return
|
||||
|
@ -203,21 +295,21 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False,
|
||||
cluster_margin=None, word_margin=0.2):
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
||||
showpageno=False, word_margin=0.2):
|
||||
if cluster_margin == None:
|
||||
cluster_margin = 0.5
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
self.pagenum = pagenum
|
||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
|
||||
self.showpageno = showpageno
|
||||
self.word_margin = word_margin
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
def render(item):
|
||||
if isinstance(item, TextItem):
|
||||
if isinstance(item, LTText):
|
||||
self.outfp.write(obj.text.encode(self.codec, 'replace'))
|
||||
self.outfp.write('\n')
|
||||
elif isinstance(item, TextBox):
|
||||
elif isinstance(item, LTTextBox):
|
||||
for line in item.get_lines(self.word_margin):
|
||||
self.outfp.write(line.encode(self.codec, 'replace')+'\n')
|
||||
self.outfp.write('\n')
|
||||
|
@ -225,7 +317,7 @@ class TextConverter(PDFConverter):
|
|||
for child in item:
|
||||
render(child)
|
||||
page = PDFConverter.end_page(self, page)
|
||||
if self.pagenum:
|
||||
if self.showpageno:
|
||||
self.outfp.write('Page %d\n' % page.id)
|
||||
render(page)
|
||||
self.outfp.write('\f')
|
||||
|
@ -235,29 +327,6 @@ class TextConverter(PDFConverter):
|
|||
return
|
||||
|
||||
|
||||
# pdf2txt
|
||||
class TextExtractionNotAllowed(RuntimeError): pass
|
||||
|
||||
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
||||
doc = PDFDocument()
|
||||
fp = file(fname, 'rb')
|
||||
parser = PDFParser(doc, fp)
|
||||
try:
|
||||
doc.initialize(password)
|
||||
except PDFPasswordIncorrect:
|
||||
raise TextExtractionNotAllowed('Incorrect password')
|
||||
if not doc.is_extractable:
|
||||
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
|
||||
interpreter = PDFPageInterpreter(rsrc, device)
|
||||
for (pageno,page) in enumerate(doc.get_pages()):
|
||||
if pagenos and (pageno not in pagenos): continue
|
||||
interpreter.process_page(page)
|
||||
if maxpages and maxpages <= pageno+1: break
|
||||
device.close()
|
||||
fp.close()
|
||||
return
|
||||
|
||||
|
||||
# main
|
||||
def main(argv):
|
||||
import getopt
|
||||
|
@ -269,30 +338,35 @@ def main(argv):
|
|||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
# debug option
|
||||
debug = 0
|
||||
# path option
|
||||
cmapdir = 'CMap'
|
||||
cdbcmapdir = 'CDBCMap'
|
||||
codec = 'utf-8'
|
||||
# input option
|
||||
password = ''
|
||||
pagenos = set()
|
||||
maxpages = 0
|
||||
# output option
|
||||
outtype = 'html'
|
||||
password = ''
|
||||
pagenum = True
|
||||
splitwords = False
|
||||
cluster_margin = None
|
||||
codec = 'utf-8'
|
||||
outfp = sys.stdout
|
||||
cluster_margin = None
|
||||
pageno = 1
|
||||
scale = 1
|
||||
showpageno = True
|
||||
for (k, v) in opts:
|
||||
if k == '-d': debug += 1
|
||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||
elif k == '-P': password = v
|
||||
elif k == '-c': codec = v
|
||||
elif k == '-m': maxpages = int(v)
|
||||
elif k == '-C': cmapdir = v
|
||||
elif k == '-D': cdbcmapdir = v
|
||||
elif k == '-T': cluster_margin = float(v)
|
||||
elif k == '-P': password = v
|
||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||
elif k == '-m': maxpages = int(v)
|
||||
elif k == '-t': outtype = v
|
||||
elif k == '-c': codec = v
|
||||
elif k == '-o': outfp = file(v, 'wb')
|
||||
elif k == '-w': splitwords = True
|
||||
elif k == '-s': scale = float(v)
|
||||
elif k == '-T': cluster_margin = float(v)
|
||||
#
|
||||
CMapDB.debug = debug
|
||||
PDFResourceManager.debug = debug
|
||||
|
@ -305,7 +379,7 @@ def main(argv):
|
|||
if outtype == 'sgml':
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale)
|
||||
elif outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
elif outtype == 'tag':
|
||||
|
@ -313,8 +387,8 @@ def main(argv):
|
|||
else:
|
||||
return usage()
|
||||
for fname in args:
|
||||
convert(rsrc, device, fname, pagenos,
|
||||
maxpages=maxpages, password=password)
|
||||
process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
|
||||
device.close()
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
|
|
|
@ -1,16 +1,15 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stderr = sys.stderr
|
||||
from psparser import PSLiteralTable
|
||||
|
||||
|
||||
## ColorSpace
|
||||
## PDFColorSpace
|
||||
##
|
||||
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
||||
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
||||
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
||||
|
||||
class ColorSpace(object):
|
||||
class PDFColorSpace(object):
|
||||
|
||||
def __init__(self, name, ncomponents):
|
||||
self.name = name
|
||||
|
@ -18,11 +17,11 @@ class ColorSpace(object):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<ColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||
|
||||
|
||||
PREDEFINED_COLORSPACE = dict(
|
||||
(name, ColorSpace(name,n)) for (name,n) in {
|
||||
(name, PDFColorSpace(name,n)) for (name,n) in {
|
||||
'CalRGB': 3,
|
||||
'CalGray': 1,
|
||||
'Lab': 3,
|
||||
|
|
|
@ -1,11 +1,4 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from page import Page, FigureItem, TextItem
|
||||
from utils import mult_matrix, translate_matrix
|
||||
|
||||
|
||||
## PDFDevice
|
||||
##
|
||||
|
@ -50,92 +43,3 @@ class PDFDevice(object):
|
|||
return
|
||||
def render_image(self, stream, size, matrix):
|
||||
return
|
||||
|
||||
|
||||
## PDFPageAggregator
|
||||
##
|
||||
class PDFPageAggregator(PDFDevice):
|
||||
|
||||
def __init__(self, rsrc, pageno=1):
|
||||
PDFDevice.__init__(self, rsrc)
|
||||
self.pageno = pageno
|
||||
self.stack = []
|
||||
return
|
||||
|
||||
def begin_page(self, page):
|
||||
self.cur_item = Page(self.pageno, page.mediabox, page.rotate)
|
||||
return
|
||||
|
||||
def end_page(self, _):
|
||||
assert not self.stack
|
||||
assert isinstance(self.cur_item, Page)
|
||||
self.cur_item.fixate()
|
||||
self.pageno += 1
|
||||
return self.cur_item
|
||||
|
||||
def begin_figure(self, name, bbox):
|
||||
self.stack.append(self.cur_item)
|
||||
self.cur_item = FigureItem(name, bbox)
|
||||
return
|
||||
|
||||
def end_figure(self, _):
|
||||
fig = self.cur_item
|
||||
self.cur_item.fixate()
|
||||
self.cur_item = self.stack.pop()
|
||||
self.cur_item.add(fig)
|
||||
return
|
||||
|
||||
def handle_undefined_char(self, cidcoding, cid):
|
||||
if self.debug:
|
||||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||
return '?'
|
||||
|
||||
def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path):
|
||||
shape = ''.join(x[0] for x in path)
|
||||
if shape == 'ml': # single line
|
||||
if path[0][1] == path[1][1]:
|
||||
#print 'vertical'
|
||||
pass
|
||||
elif path[0][2] == path[1][2]:
|
||||
#print 'horizontal'
|
||||
pass
|
||||
elif shape == 'mlllh': # rectangle
|
||||
if ((path[0][1] == path[1][1] and path[1][2] == path[2][2] and
|
||||
path[2][1] == path[3][1] and path[3][2] == path[0][2]) or
|
||||
(path[0][2] == path[1][2] and path[1][1] == path[2][1] and
|
||||
path[2][2] == path[3][2] and path[3][1] == path[0][1])):
|
||||
pass
|
||||
return
|
||||
|
||||
def render_chars(self, textmatrix, textstate, chars):
|
||||
if not chars: return (0, 0)
|
||||
item = TextItem(textmatrix, textstate.font, textstate.fontsize,
|
||||
textstate.charspace, textstate.scaling, chars)
|
||||
self.cur_item.add(item)
|
||||
return item.adv
|
||||
|
||||
def render_string(self, textstate, textmatrix, seq):
|
||||
font = textstate.font
|
||||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||
chars = []
|
||||
for x in seq:
|
||||
if isinstance(x, int) or isinstance(x, float):
|
||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||
dx -= x * textstate.scaling * .0001
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
chars = []
|
||||
else:
|
||||
for cid in font.decode(x):
|
||||
try:
|
||||
char = font.to_unicode(cid)
|
||||
except PDFUnicodeNotDefined, e:
|
||||
(cidcoding, cid) = e.args
|
||||
char = self.handle_undefined_char(cidcoding, cid)
|
||||
chars.append((char, cid))
|
||||
if textstate.wordspace and not font.is_multibyte() and cid == 32:
|
||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||
dx += textstate.wordspace * textstate.scaling * .01
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
chars = []
|
||||
self.render_chars(textmatrix, textstate, chars)
|
||||
return
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stderr = sys.stderr
|
||||
from struct import pack, unpack
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
|
|
|
@ -14,7 +14,8 @@ from pdftypes import PDFException, PDFStream, PDFObjRef, \
|
|||
str_value, list_value, dict_value, stream_value
|
||||
from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
|
||||
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||
from pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
|
||||
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||
from cmap import CMapDB
|
||||
|
||||
|
@ -34,6 +35,56 @@ LITERAL_FORM = PSLiteralTable.intern('Form')
|
|||
LITERAL_IMAGE = PSLiteralTable.intern('Image')
|
||||
|
||||
|
||||
## PDFTextState
|
||||
##
|
||||
class PDFTextState(object):
|
||||
|
||||
def __init__(self):
|
||||
self.font = None
|
||||
self.fontsize = 0
|
||||
self.charspace = 0
|
||||
self.wordspace = 0
|
||||
self.scaling = 100
|
||||
self.leading = 0
|
||||
self.render = 0
|
||||
self.rise = 0
|
||||
self.reset()
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<PDFTextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
|
||||
' scaling=%r, leading=%r, render=%r, rise=%r, '
|
||||
' matrix=%r, linematrix=%r>' %
|
||||
(self.font, self.fontsize, self.charspace, self.wordspace,
|
||||
self.scaling, self.leading, self.render, self.rise,
|
||||
self.matrix, self.linematrix))
|
||||
|
||||
def reset(self):
|
||||
self.matrix = MATRIX_IDENTITY
|
||||
self.linematrix = (0, 0)
|
||||
return
|
||||
|
||||
|
||||
## PDFGraphicState
|
||||
##
|
||||
class PDFGraphicState(object):
|
||||
|
||||
def __init__(self):
|
||||
self.linewidth = 0
|
||||
self.linecap = None
|
||||
self.linejoin = None
|
||||
self.miterlimit = None
|
||||
self.dash = None
|
||||
self.intent = None
|
||||
self.flatness = None
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
|
||||
' miterlimit=%r, dash=%r, intent=%r, flatness=%r>' %
|
||||
(self.linewidth, self.linecap, self.linejoin,
|
||||
self.miterlimit, self.dash, self.intent, self.flatness))
|
||||
|
||||
## Resource Manager
|
||||
##
|
||||
class PDFResourceManager(object):
|
||||
|
@ -207,46 +258,6 @@ class PDFPageInterpreter(object):
|
|||
|
||||
debug = 0
|
||||
|
||||
class TextState(object):
|
||||
def __init__(self):
|
||||
self.font = None
|
||||
self.fontsize = 0
|
||||
self.charspace = 0
|
||||
self.wordspace = 0
|
||||
self.scaling = 100
|
||||
self.leading = 0
|
||||
self.render = 0
|
||||
self.rise = 0
|
||||
self.reset()
|
||||
return
|
||||
def __repr__(self):
|
||||
return ('<TextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
|
||||
' scaling=%r, leading=%r, render=%r, rise=%r, '
|
||||
' matrix=%r, linematrix=%r>' %
|
||||
(self.font, self.fontsize, self.charspace, self.wordspace,
|
||||
self.scaling, self.leading, self.render, self.rise,
|
||||
self.matrix, self.linematrix))
|
||||
def reset(self):
|
||||
self.matrix = MATRIX_IDENTITY
|
||||
self.linematrix = (0, 0)
|
||||
return
|
||||
|
||||
class GraphicState(object):
|
||||
def __init__(self):
|
||||
self.linewidth = None
|
||||
self.linecap = None
|
||||
self.linejoin = None
|
||||
self.miterlimit = None
|
||||
self.dash = None
|
||||
self.intent = None
|
||||
self.flatness = None
|
||||
return
|
||||
def __repr__(self):
|
||||
return ('<GraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
|
||||
' miterlimit=%r, dash=%r, intent=%r, flatness=%r>' %
|
||||
(self.linewidth, self.linecap, self.linejoin,
|
||||
self.miterlimit, self.dash, self.intent, self.flatness))
|
||||
|
||||
def __init__(self, rsrc, device):
|
||||
self.rsrc = rsrc
|
||||
self.device = device
|
||||
|
@ -255,23 +266,24 @@ class PDFPageInterpreter(object):
|
|||
def dup(self):
|
||||
return PDFPageInterpreter(self.rsrc, self.device)
|
||||
|
||||
# init_resources(resources):
|
||||
# Prepare the fonts and XObjects listed in the Resource attribute.
|
||||
def init_resources(self, resources):
|
||||
self.fontmap = {}
|
||||
self.xobjmap = {}
|
||||
self.csmap = PREDEFINED_COLORSPACE.copy()
|
||||
# Handle resource declarations.
|
||||
if not resources: return
|
||||
def get_colorspace(spec):
|
||||
if isinstance(spec, list):
|
||||
name = literal_name(spec[0])
|
||||
else:
|
||||
name = literal_name(spec)
|
||||
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
|
||||
return ColorSpace(name, stream_value(spec[1]).dic['N'])
|
||||
return PDFColorSpace(name, stream_value(spec[1]).dic['N'])
|
||||
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
|
||||
return ColorSpace(name, len(list_value(spec[1])))
|
||||
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||
else:
|
||||
return PREDEFINED_COLORSPACE[name]
|
||||
if resources:
|
||||
for (k,v) in dict_value(resources).iteritems():
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||
|
@ -292,13 +304,15 @@ class PDFPageInterpreter(object):
|
|||
self.xobjmap[xobjid] = xobjstrm
|
||||
return
|
||||
|
||||
# init_state(ctm)
|
||||
# Initialize the text and graphic states for rendering a page.
|
||||
def init_state(self, ctm):
|
||||
# gstack: stack for graphical states.
|
||||
self.gstack = []
|
||||
self.ctm = ctm
|
||||
self.device.set_ctm(self.ctm)
|
||||
self.textstate = self.TextState()
|
||||
self.graphicstate = self.GraphicState()
|
||||
self.textstate = PDFTextState()
|
||||
self.graphicstate = PDFGraphicState()
|
||||
self.curpath = []
|
||||
# argstack: stack for command arguments.
|
||||
self.argstack = []
|
||||
|
@ -700,10 +714,13 @@ class PDFPageInterpreter(object):
|
|||
self.device.end_page(page)
|
||||
return
|
||||
|
||||
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
|
||||
# render_contents(resources, streams, ctm)
|
||||
# Render the content streams.
|
||||
# This method may be called recursively.
|
||||
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
||||
self.init_resources(resources)
|
||||
self.init_state(ctm)
|
||||
self.execute(list_value(contents))
|
||||
self.execute(list_value(streams))
|
||||
return
|
||||
|
||||
def execute(self, streams):
|
||||
|
@ -738,3 +755,26 @@ class PDFPageInterpreter(object):
|
|||
else:
|
||||
self.push(obj)
|
||||
return
|
||||
|
||||
|
||||
## process_pdf
|
||||
##
|
||||
class TextExtractionNotAllowed(RuntimeError): pass
|
||||
|
||||
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
||||
doc = PDFDocument()
|
||||
fp = file(fname, 'rb')
|
||||
parser = PDFParser(doc, fp)
|
||||
try:
|
||||
doc.initialize(password)
|
||||
except PDFPasswordIncorrect:
|
||||
raise TextExtractionNotAllowed('Incorrect password')
|
||||
if not doc.is_extractable:
|
||||
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
|
||||
interpreter = PDFPageInterpreter(rsrc, device)
|
||||
for (pageno,page) in enumerate(doc.get_pages()):
|
||||
if pagenos and (pageno not in pagenos): continue
|
||||
interpreter.process_page(page)
|
||||
if maxpages and maxpages <= pageno+1: break
|
||||
fp.close()
|
||||
return
|
||||
|
|
|
@ -205,6 +205,10 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
|
||||
## PDFPage
|
||||
##
|
||||
## A PDFPage object is nothing more than a bunch of keys and values
|
||||
## that describe the properties of the page and point to its contents,
|
||||
## and has nothing to do with a real graphical entity.
|
||||
##
|
||||
class PDFPage(object):
|
||||
|
||||
def __init__(self, doc, pageid, attrs):
|
||||
|
|
|
@ -91,3 +91,8 @@ def decode_text(s):
|
|||
return unicode(s[2:], 'utf-16be', 'ignore')
|
||||
else:
|
||||
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
||||
|
||||
# enc(x): encode string in SGML/XML/HTML
|
||||
def enc(x, codec='ascii'):
|
||||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
||||
return x.encode(codec, 'xmlcharrefreplace')
|
||||
|
|
Loading…
Reference in New Issue