diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 9480b22..23091be 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -91,10 +91,22 @@ class PDFLayoutAnalyzer(PDFTextDevice):
return
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
- item = LTChar(matrix, font, fontsize, scaling, rise, cid)
+ try:
+ text = font.to_unichr(cid)
+ assert isinstance(text, unicode), text
+ except PDFUnicodeNotDefined:
+ text = self.handle_undefined_char(font, cid)
+ textwidth = font.char_width(cid)
+ textdisp = font.char_disp(cid)
+ item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp)
self.cur_item.add(item)
return item.adv
+ def handle_undefined_char(self, font, cid):
+ if self.debug:
+ print >>sys.stderr, 'undefined: %r, %r' % (font, cid)
+ return '?'
+
def receive_layout(self, ltpage):
return
@@ -126,10 +138,6 @@ class PDFConverter(PDFLayoutAnalyzer):
self.codec = codec
return
- def write(self, text):
- self.outfp.write(enc(text, self.codec))
- return
-
def write_image(self, image):
stream = image.stream
filters = stream.get_filters()
@@ -163,7 +171,7 @@ class TextConverter(PDFConverter):
self.showpageno = showpageno
return
- def write(self, text):
+ def write_text(self, text):
self.outfp.write(text.encode(self.codec, 'ignore'))
return
@@ -173,13 +181,13 @@ class TextConverter(PDFConverter):
for child in item:
render(child)
elif isinstance(item, LTText):
- self.write(item.text)
+ self.write_text(item.text)
if isinstance(item, LTTextBox):
- self.write('\n')
+ self.write_text('\n')
if self.showpageno:
- self.write('Page %s\n' % ltpage.pageid)
+ self.write_text('Page %s\n' % ltpage.pageid)
render(ltpage)
- self.write('\f')
+ self.write_text('\f')
return
@@ -191,92 +199,152 @@ class HTMLConverter(PDFConverter):
#'char': 'green',
#'figure': 'yellow',
#'textline': 'magenta',
- 'textbox': 'cyan',
- 'textgroup': 'red',
+ #'textbox': 'cyan',
+ #'textgroup': 'red',
'polygon': 'black',
'page': 'gray',
}
TEXT_COLORS = {
- 'textbox': 'blue',
+ #'textbox': 'blue',
'char': 'black',
}
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
- scale=1, showpageno=True, pagepad=50, outdir=None):
+ scale=1, exact=False, showpageno=True, pagepad=50, outdir=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
+ self.exact = exact
self.showpageno = showpageno
self.pagepad = pagepad
self.outdir = outdir
self.scale = scale
- self.outfp.write('
\n')
- self.outfp.write('\n' %
- self.codec)
- self.outfp.write('\n')
+ self.fontscale = 0.7
+ self.write('\n')
+ self.write('\n' % self.codec)
+ self.write('\n')
self.yoffset = self.pagepad
+ self._font = None
+ self._fontstack = []
return
- def write_rect(self, color, width, x, y, w, h):
+ def write(self, text):
+ self.outfp.write(text)
+ return
+
+ def write_text(self, text):
+ self.write(enc(text, self.codec))
+ return
+
+ def place_rect(self, color, borderwidth, x, y, w, h):
color = self.RECT_COLORS.get(color)
if color is not None:
- self.outfp.write('\n' %
- (color, width,
- x*self.scale, (self.yoffset-y)*self.scale,
- w*self.scale, h*self.scale))
+ self.write('\n' %
+ (color, borderwidth,
+ x*self.scale, (self.yoffset-y)*self.scale,
+ w*self.scale, h*self.scale))
return
- def write_text(self, color, text, x, y, size):
+ def place_image(self, item, borderwidth, x, y, w, h):
+ if self.outdir is not None:
+ name = self.write_image(item)
+ self.write('\n' %
+ (enc(name), borderwidth,
+ x*self.scale, (self.yoffset-y)*self.scale,
+ w*self.scale, h*self.scale))
+ return
+
+ def place_text(self, color, text, x, y, size):
color = self.TEXT_COLORS.get(color)
if color is not None:
- self.outfp.write('' %
- (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
- self.write(text)
- self.outfp.write('\n')
+ self.write('' %
+ (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
+ self.write_text(text)
+ self.write('\n')
+ return
+
+ def begin_textbox(self, color, borderwidth, x, y, w, h, writing_mode):
+ self._fontstack.append(self._font)
+ self._font = None
+ self.write('' %
+ (color, borderwidth, writing_mode,
+ x*self.scale, (self.yoffset-y)*self.scale,
+ w*self.scale, h*self.scale))
+ return
+
+ def put_text(self, text, fontname, fontsize):
+ font = (fontname, fontsize)
+ if font != self._font:
+ if self._font is not None:
+ self.write('')
+ self.write('' %
+ (fontsize * self.scale * self.fontscale))
+ self._font = font
+ self.write_text(text)
+ return
+
+ def end_textbox(self, color):
+ if self._font is not None:
+ self.write('')
+ self._font = self._fontstack.pop()
+ self.write('
')
return
def receive_layout(self, ltpage):
def render(item):
if isinstance(item, LTPage):
self.yoffset += item.y1
- self.write_rect('page', 1, item.x0, item.y1, item.width, item.height)
+ self.place_rect('page', 1, item.x0, item.y1, item.width, item.height)
if self.showpageno:
- self.outfp.write('' %
- ((self.yoffset-item.y1)*self.scale))
- self.outfp.write('
Page %s \n' % (item.pageid, item.pageid))
+ self.write('' %
+ ((self.yoffset-item.y1)*self.scale))
+ self.write('
Page %s \n' % (item.pageid, item.pageid))
for child in item:
render(child)
- elif isinstance(item, LTChar):
- self.write_rect('char', 1, item.x0, item.y1, item.width, item.height)
- self.write_text('char', item.text, item.x0, item.y1, item.size)
elif isinstance(item, LTPolygon):
- self.write_rect('polygon', 1, item.x0, item.y1, item.width, item.height)
- elif isinstance(item, LTTextLine):
- self.write_rect('textline', 1, item.x0, item.y1, item.width, item.height)
- for child in item:
- render(child)
- elif isinstance(item, LTTextBox):
- self.write_rect('textbox', 1, item.x0, item.y1, item.width, item.height)
- for child in item:
- render(child)
- self.write_text('textbox', str(item.index+1), item.x0, item.y1, 20)
+ self.place_rect('polygon', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTFigure):
- self.write_rect('figure', 1, item.x0, item.y1, item.width, item.height)
+ self.place_rect('figure', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
elif isinstance(item, LTImage):
- if self.outdir:
- name = self.write_image(item)
- self.outfp.write('\n' %
- (enc(name),
- item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
- item.width*self.scale, item.height*self.scale))
+ self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
+ else:
+ if self.exact:
+ if isinstance(item, LTTextLine):
+ self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height)
+ for child in item:
+ render(child)
+ elif isinstance(item, LTTextBox):
+ self.place_rect('textbox', 1, item.x0, item.y1, item.width, item.height)
+ self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20)
+ for child in item:
+ render(child)
+ elif isinstance(item, LTChar):
+ self.place_rect('char', 1, item.x0, item.y1, item.width, item.height)
+ self.place_text('char', item.text, item.x0, item.y1, item.size)
+ else:
+ if isinstance(item, LTTextLine):
+ for child in item:
+ render(child)
+ self.write('
')
+ elif isinstance(item, LTTextBox):
+ self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height,
+ item.get_writing_mode())
+ for child in item:
+ render(child)
+ self.end_textbox('textbox')
+ elif isinstance(item, LTChar):
+ self.put_text(item.text, item.fontname, item.size)
+ elif isinstance(item, LTText):
+ self.write_text(item.text)
return
render(ltpage)
if ltpage.layout:
def show_layout(item):
if isinstance(item, LTTextGroup):
- self.write_rect('textgroup', 1, item.x0, item.y1, item.width, item.height)
+ self.place_rect('textgroup', 1, item.x0, item.y1, item.width, item.height)
for child in item:
show_layout(child)
return
@@ -285,9 +353,9 @@ class HTMLConverter(PDFConverter):
return
def close(self):
- self.outfp.write('Page: %s
\n' %
- ', '.join('%s' % (i,i) for i in xrange(1,self.pageno)))
- self.outfp.write('\n')
+ self.write('Page: %s
\n' %
+ ', '.join('%s' % (i,i) for i in xrange(1,self.pageno)))
+ self.write('\n')
return
@@ -302,6 +370,10 @@ class XMLConverter(PDFConverter):
self.outfp.write('\n')
return
+ def write_text(self, text):
+ self.outfp.write(enc(text, self.codec))
+ return
+
def receive_layout(self, ltpage):
def render(item):
if isinstance(item, LTPage):
@@ -341,7 +413,7 @@ class XMLConverter(PDFConverter):
elif isinstance(item, LTChar):
self.outfp.write('' %
(enc(item.font.fontname), bbox2str(item.bbox), item.size))
- self.write(item.text)
+ self.write_text(item.text)
self.outfp.write('\n')
elif isinstance(item, LTText):
self.outfp.write('%s\n' % item.text)
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index ee6f869..6daa592 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -183,22 +183,16 @@ class LTChar(LTItem, LTText):
debug = 0
- def __init__(self, matrix, font, fontsize, scaling, rise, cid):
- self.matrix = matrix
- self.font = font
- self.fontsize = fontsize
- self.adv = font.char_width(cid) * fontsize * scaling
- try:
- text = font.to_unichr(cid)
- assert isinstance(text, unicode), text
- except PDFUnicodeNotDefined:
- text = '?'
+ def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth, textdisp):
LTText.__init__(self, text)
+ self.matrix = matrix
+ self.fontname = font.fontname
+ self.adv = textwidth * fontsize * scaling
# compute the boundary rectangle.
- if self.font.is_vertical():
+ if font.is_vertical():
# vertical
width = font.get_width() * fontsize
- (vx,vy) = font.char_disp(cid)
+ (vx,vy) = textdisp
if vx is None:
vx = width/2
else:
@@ -224,7 +218,7 @@ class LTChar(LTItem, LTText):
if y1 < y0:
(y0,y1) = (y1,y0)
LTItem.__init__(self, (x0,y0,x1,y1))
- if self.font.is_vertical():
+ if font.is_vertical():
self.size = self.width
else:
self.size = self.height
@@ -232,9 +226,9 @@ class LTChar(LTItem, LTText):
def __repr__(self):
if self.debug:
- return ('<%s %s matrix=%s font=%r fontsize=%.1f adv=%s text=%r>' %
+ return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
(self.__class__.__name__, bbox2str(self.bbox),
- matrix2str(self.matrix), self.font, self.fontsize,
+ matrix2str(self.matrix), self.fontname,
self.adv, self.text))
else:
return '' % self.text
@@ -378,12 +372,18 @@ class LTTextBoxHorizontal(LTTextBox):
self._objs = csort(self._objs, key=lambda obj: -obj.y1)
return LTTextBox.finish(self)
+ def get_writing_mode(self):
+ return 'lr-tb'
+
class LTTextBoxVertical(LTTextBox):
def finish(self):
self._objs = csort(self._objs, key=lambda obj: -obj.x1)
return LTTextBox.finish(self)
+ def get_writing_mode(self):
+ return 'tb-rl'
+
## LTTextGroup
##