Add HTML exact layout mode; default changed.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@272 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
08c5c66917
commit
476ecf7e32
|
@ -91,10 +91,22 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
||||||
item = LTChar(matrix, font, fontsize, scaling, rise, cid)
|
try:
|
||||||
|
text = font.to_unichr(cid)
|
||||||
|
assert isinstance(text, unicode), text
|
||||||
|
except PDFUnicodeNotDefined:
|
||||||
|
text = self.handle_undefined_char(font, cid)
|
||||||
|
textwidth = font.char_width(cid)
|
||||||
|
textdisp = font.char_disp(cid)
|
||||||
|
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp)
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return item.adv
|
return item.adv
|
||||||
|
|
||||||
|
def handle_undefined_char(self, font, cid):
|
||||||
|
if self.debug:
|
||||||
|
print >>sys.stderr, 'undefined: %r, %r' % (font, cid)
|
||||||
|
return '?'
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -126,10 +138,6 @@ class PDFConverter(PDFLayoutAnalyzer):
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
|
||||||
self.outfp.write(enc(text, self.codec))
|
|
||||||
return
|
|
||||||
|
|
||||||
def write_image(self, image):
|
def write_image(self, image):
|
||||||
stream = image.stream
|
stream = image.stream
|
||||||
filters = stream.get_filters()
|
filters = stream.get_filters()
|
||||||
|
@ -163,7 +171,7 @@ class TextConverter(PDFConverter):
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
def write_text(self, text):
|
||||||
self.outfp.write(text.encode(self.codec, 'ignore'))
|
self.outfp.write(text.encode(self.codec, 'ignore'))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -173,13 +181,13 @@ class TextConverter(PDFConverter):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.write(item.text)
|
self.write_text(item.text)
|
||||||
if isinstance(item, LTTextBox):
|
if isinstance(item, LTTextBox):
|
||||||
self.write('\n')
|
self.write_text('\n')
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.write('Page %s\n' % ltpage.pageid)
|
self.write_text('Page %s\n' % ltpage.pageid)
|
||||||
render(ltpage)
|
render(ltpage)
|
||||||
self.write('\f')
|
self.write_text('\f')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -191,92 +199,152 @@ class HTMLConverter(PDFConverter):
|
||||||
#'char': 'green',
|
#'char': 'green',
|
||||||
#'figure': 'yellow',
|
#'figure': 'yellow',
|
||||||
#'textline': 'magenta',
|
#'textline': 'magenta',
|
||||||
'textbox': 'cyan',
|
#'textbox': 'cyan',
|
||||||
'textgroup': 'red',
|
#'textgroup': 'red',
|
||||||
'polygon': 'black',
|
'polygon': 'black',
|
||||||
'page': 'gray',
|
'page': 'gray',
|
||||||
}
|
}
|
||||||
TEXT_COLORS = {
|
TEXT_COLORS = {
|
||||||
'textbox': 'blue',
|
#'textbox': 'blue',
|
||||||
'char': 'black',
|
'char': 'black',
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
scale=1, showpageno=True, pagepad=50, outdir=None):
|
scale=1, exact=False, showpageno=True, pagepad=50, outdir=None):
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
|
self.exact = exact
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
self.outdir = outdir
|
self.outdir = outdir
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.outfp.write('<html><head>\n')
|
self.fontscale = 0.7
|
||||||
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
|
self.write('<html><head>\n')
|
||||||
self.codec)
|
self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
|
||||||
self.outfp.write('</head><body>\n')
|
self.write('</head><body>\n')
|
||||||
self.yoffset = self.pagepad
|
self.yoffset = self.pagepad
|
||||||
|
self._font = None
|
||||||
|
self._fontstack = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_rect(self, color, width, x, y, w, h):
|
def write(self, text):
|
||||||
|
self.outfp.write(text)
|
||||||
|
return
|
||||||
|
|
||||||
|
def write_text(self, text):
|
||||||
|
self.write(enc(text, self.codec))
|
||||||
|
return
|
||||||
|
|
||||||
|
def place_rect(self, color, borderwidth, x, y, w, h):
|
||||||
color = self.RECT_COLORS.get(color)
|
color = self.RECT_COLORS.get(color)
|
||||||
if color is not None:
|
if color is not None:
|
||||||
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
|
self.write('<span style="position:absolute; border: %s %dpx solid; '
|
||||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||||
(color, width,
|
(color, borderwidth,
|
||||||
x*self.scale, (self.yoffset-y)*self.scale,
|
x*self.scale, (self.yoffset-y)*self.scale,
|
||||||
w*self.scale, h*self.scale))
|
w*self.scale, h*self.scale))
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_text(self, color, text, x, y, size):
|
def place_image(self, item, borderwidth, x, y, w, h):
|
||||||
|
if self.outdir is not None:
|
||||||
|
name = self.write_image(item)
|
||||||
|
self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
|
||||||
|
'width="%d" height="%d" />\n' %
|
||||||
|
(enc(name), borderwidth,
|
||||||
|
x*self.scale, (self.yoffset-y)*self.scale,
|
||||||
|
w*self.scale, h*self.scale))
|
||||||
|
return
|
||||||
|
|
||||||
|
def place_text(self, color, text, x, y, size):
|
||||||
color = self.TEXT_COLORS.get(color)
|
color = self.TEXT_COLORS.get(color)
|
||||||
if color is not None:
|
if color is not None:
|
||||||
self.outfp.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||||
(color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
|
(color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
|
||||||
self.write(text)
|
self.write_text(text)
|
||||||
self.outfp.write('</span>\n')
|
self.write('</span>\n')
|
||||||
|
return
|
||||||
|
|
||||||
|
def begin_textbox(self, color, borderwidth, x, y, w, h, writing_mode):
|
||||||
|
self._fontstack.append(self._font)
|
||||||
|
self._font = None
|
||||||
|
self.write('<div style="position:absolute; border: %s %dpx solid; writing-mode:%s; '
|
||||||
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;">' %
|
||||||
|
(color, borderwidth, writing_mode,
|
||||||
|
x*self.scale, (self.yoffset-y)*self.scale,
|
||||||
|
w*self.scale, h*self.scale))
|
||||||
|
return
|
||||||
|
|
||||||
|
def put_text(self, text, fontname, fontsize):
|
||||||
|
font = (fontname, fontsize)
|
||||||
|
if font != self._font:
|
||||||
|
if self._font is not None:
|
||||||
|
self.write('</span>')
|
||||||
|
self.write('<span style="font-size:%dpx">' %
|
||||||
|
(fontsize * self.scale * self.fontscale))
|
||||||
|
self._font = font
|
||||||
|
self.write_text(text)
|
||||||
|
return
|
||||||
|
|
||||||
|
def end_textbox(self, color):
|
||||||
|
if self._font is not None:
|
||||||
|
self.write('</span>')
|
||||||
|
self._font = self._fontstack.pop()
|
||||||
|
self.write('</div>')
|
||||||
return
|
return
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self.yoffset += item.y1
|
self.yoffset += item.y1
|
||||||
self.write_rect('page', 1, item.x0, item.y1, item.width, item.height)
|
self.place_rect('page', 1, item.x0, item.y1, item.width, item.height)
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
self.write('<div style="position:absolute; top:%dpx;">' %
|
||||||
((self.yoffset-item.y1)*self.scale))
|
((self.yoffset-item.y1)*self.scale))
|
||||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (item.pageid, item.pageid))
|
self.write('<a name="%s">Page %s</a></div>\n' % (item.pageid, item.pageid))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTChar):
|
|
||||||
self.write_rect('char', 1, item.x0, item.y1, item.width, item.height)
|
|
||||||
self.write_text('char', item.text, item.x0, item.y1, item.size)
|
|
||||||
elif isinstance(item, LTPolygon):
|
elif isinstance(item, LTPolygon):
|
||||||
self.write_rect('polygon', 1, item.x0, item.y1, item.width, item.height)
|
self.place_rect('polygon', 1, item.x0, item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTTextLine):
|
|
||||||
self.write_rect('textline', 1, item.x0, item.y1, item.width, item.height)
|
|
||||||
for child in item:
|
|
||||||
render(child)
|
|
||||||
elif isinstance(item, LTTextBox):
|
|
||||||
self.write_rect('textbox', 1, item.x0, item.y1, item.width, item.height)
|
|
||||||
for child in item:
|
|
||||||
render(child)
|
|
||||||
self.write_text('textbox', str(item.index+1), item.x0, item.y1, 20)
|
|
||||||
elif isinstance(item, LTFigure):
|
elif isinstance(item, LTFigure):
|
||||||
self.write_rect('figure', 1, item.x0, item.y1, item.width, item.height)
|
self.place_rect('figure', 1, item.x0, item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
if self.outdir:
|
self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
|
||||||
name = self.write_image(item)
|
else:
|
||||||
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
|
if self.exact:
|
||||||
'width="%d" height="%d" />\n' %
|
if isinstance(item, LTTextLine):
|
||||||
(enc(name),
|
self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height)
|
||||||
item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
for child in item:
|
||||||
item.width*self.scale, item.height*self.scale))
|
render(child)
|
||||||
|
elif isinstance(item, LTTextBox):
|
||||||
|
self.place_rect('textbox', 1, item.x0, item.y1, item.width, item.height)
|
||||||
|
self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20)
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
elif isinstance(item, LTChar):
|
||||||
|
self.place_rect('char', 1, item.x0, item.y1, item.width, item.height)
|
||||||
|
self.place_text('char', item.text, item.x0, item.y1, item.size)
|
||||||
|
else:
|
||||||
|
if isinstance(item, LTTextLine):
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
self.write('<br>')
|
||||||
|
elif isinstance(item, LTTextBox):
|
||||||
|
self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height,
|
||||||
|
item.get_writing_mode())
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
self.end_textbox('textbox')
|
||||||
|
elif isinstance(item, LTChar):
|
||||||
|
self.put_text(item.text, item.fontname, item.size)
|
||||||
|
elif isinstance(item, LTText):
|
||||||
|
self.write_text(item.text)
|
||||||
return
|
return
|
||||||
render(ltpage)
|
render(ltpage)
|
||||||
if ltpage.layout:
|
if ltpage.layout:
|
||||||
def show_layout(item):
|
def show_layout(item):
|
||||||
if isinstance(item, LTTextGroup):
|
if isinstance(item, LTTextGroup):
|
||||||
self.write_rect('textgroup', 1, item.x0, item.y1, item.width, item.height)
|
self.place_rect('textgroup', 1, item.x0, item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
show_layout(child)
|
show_layout(child)
|
||||||
return
|
return
|
||||||
|
@ -285,9 +353,9 @@ class HTMLConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
||||||
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
|
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
|
||||||
self.outfp.write('</body></html>\n')
|
self.write('</body></html>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -302,6 +370,10 @@ class XMLConverter(PDFConverter):
|
||||||
self.outfp.write('<pages>\n')
|
self.outfp.write('<pages>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def write_text(self, text):
|
||||||
|
self.outfp.write(enc(text, self.codec))
|
||||||
|
return
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
|
@ -341,7 +413,7 @@ class XMLConverter(PDFConverter):
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
|
self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
|
||||||
(enc(item.font.fontname), bbox2str(item.bbox), item.size))
|
(enc(item.font.fontname), bbox2str(item.bbox), item.size))
|
||||||
self.write(item.text)
|
self.write_text(item.text)
|
||||||
self.outfp.write('</text>\n')
|
self.outfp.write('</text>\n')
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.outfp.write('<text>%s</text>\n' % item.text)
|
self.outfp.write('<text>%s</text>\n' % item.text)
|
||||||
|
|
|
@ -183,22 +183,16 @@ class LTChar(LTItem, LTText):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, matrix, font, fontsize, scaling, rise, cid):
|
def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth, textdisp):
|
||||||
self.matrix = matrix
|
|
||||||
self.font = font
|
|
||||||
self.fontsize = fontsize
|
|
||||||
self.adv = font.char_width(cid) * fontsize * scaling
|
|
||||||
try:
|
|
||||||
text = font.to_unichr(cid)
|
|
||||||
assert isinstance(text, unicode), text
|
|
||||||
except PDFUnicodeNotDefined:
|
|
||||||
text = '?'
|
|
||||||
LTText.__init__(self, text)
|
LTText.__init__(self, text)
|
||||||
|
self.matrix = matrix
|
||||||
|
self.fontname = font.fontname
|
||||||
|
self.adv = textwidth * fontsize * scaling
|
||||||
# compute the boundary rectangle.
|
# compute the boundary rectangle.
|
||||||
if self.font.is_vertical():
|
if font.is_vertical():
|
||||||
# vertical
|
# vertical
|
||||||
width = font.get_width() * fontsize
|
width = font.get_width() * fontsize
|
||||||
(vx,vy) = font.char_disp(cid)
|
(vx,vy) = textdisp
|
||||||
if vx is None:
|
if vx is None:
|
||||||
vx = width/2
|
vx = width/2
|
||||||
else:
|
else:
|
||||||
|
@ -224,7 +218,7 @@ class LTChar(LTItem, LTText):
|
||||||
if y1 < y0:
|
if y1 < y0:
|
||||||
(y0,y1) = (y1,y0)
|
(y0,y1) = (y1,y0)
|
||||||
LTItem.__init__(self, (x0,y0,x1,y1))
|
LTItem.__init__(self, (x0,y0,x1,y1))
|
||||||
if self.font.is_vertical():
|
if font.is_vertical():
|
||||||
self.size = self.width
|
self.size = self.width
|
||||||
else:
|
else:
|
||||||
self.size = self.height
|
self.size = self.height
|
||||||
|
@ -232,9 +226,9 @@ class LTChar(LTItem, LTText):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
if self.debug:
|
if self.debug:
|
||||||
return ('<%s %s matrix=%s font=%r fontsize=%.1f adv=%s text=%r>' %
|
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
|
||||||
(self.__class__.__name__, bbox2str(self.bbox),
|
(self.__class__.__name__, bbox2str(self.bbox),
|
||||||
matrix2str(self.matrix), self.font, self.fontsize,
|
matrix2str(self.matrix), self.fontname,
|
||||||
self.adv, self.text))
|
self.adv, self.text))
|
||||||
else:
|
else:
|
||||||
return '<char %r>' % self.text
|
return '<char %r>' % self.text
|
||||||
|
@ -378,12 +372,18 @@ class LTTextBoxHorizontal(LTTextBox):
|
||||||
self._objs = csort(self._objs, key=lambda obj: -obj.y1)
|
self._objs = csort(self._objs, key=lambda obj: -obj.y1)
|
||||||
return LTTextBox.finish(self)
|
return LTTextBox.finish(self)
|
||||||
|
|
||||||
|
def get_writing_mode(self):
|
||||||
|
return 'lr-tb'
|
||||||
|
|
||||||
class LTTextBoxVertical(LTTextBox):
|
class LTTextBoxVertical(LTTextBox):
|
||||||
|
|
||||||
def finish(self):
|
def finish(self):
|
||||||
self._objs = csort(self._objs, key=lambda obj: -obj.x1)
|
self._objs = csort(self._objs, key=lambda obj: -obj.x1)
|
||||||
return LTTextBox.finish(self)
|
return LTTextBox.finish(self)
|
||||||
|
|
||||||
|
def get_writing_mode(self):
|
||||||
|
return 'tb-rl'
|
||||||
|
|
||||||
|
|
||||||
## LTTextGroup
|
## LTTextGroup
|
||||||
##
|
##
|
||||||
|
|
Loading…
Reference in New Issue