charspace bug fixed.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@139 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-10-23 14:51:40 +00:00
parent ee97f18d4e
commit a1591f6a4d
3 changed files with 19 additions and 5 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sun Oct 4 12:47:18 JST 2009
Last Modified: Thu Oct 15 19:01:07 JST 2009
<!-- hhmts end -->
</div>
@ -141,6 +141,7 @@ PDFMiner comes with two handy tools:
<p>
<code>pdf2txt.py</code> extracts text contents from a PDF file.
It extracts all the texts that are to be rendered programmatically,
i.e., text represented as ASCII or Unicode strings.
It cannot recognize texts drawn as images that would require optical character recognition.
It also extracts the corresponding locations, font names, font sizes, writing
direction (horizontal or vertical) for each text portion.
@ -183,9 +184,9 @@ By default, it extracts texts from all the pages.
<dt> <code>-t <em>type</em></code>
<dd> Specifies the output format. The following formats are currently supported.
<ul>
<li> <code>html</code> : HTML format. (Default)
<li> <code>text</code> : TEXT format.
<li> <code>sgml</code> : SGML format.
<li> <code>text</code> : TEXT format. (Default)
<li> <code>html</code> : HTML format. Not recommended for extraction purposes because the markup is very messy.
<li> <code>sgml</code> : SGML format. Provides the most information available.
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").

View File

@ -269,7 +269,7 @@ class LTTextItem(LayoutItem, LTText):
self.vertical = font.is_vertical()
self.text = ''.join( char for (char,_) in chars )
adv = sum( font.char_width(cid) for (_,cid) in chars )
adv = (adv * fontsize + len(chars)*charspace) * scaling
adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
#size = (font.get_ascent() - font.get_descent()) * fontsize
size = font.get_size() * fontsize
(_,_,_,_,tx,ty) = self.matrix

View File

@ -71,6 +71,7 @@ class PDFTextDevice(PDFDevice):
wordspace = textstate.wordspace * scaling
dxscale = .001 * fontsize * scaling
chars = []
needspace = False
(x,y) = textstate.linematrix
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
@ -84,6 +85,7 @@ class PDFTextDevice(PDFDevice):
else:
x += d
chars = []
needspace = False
else:
for cid in font.decode(obj):
try:
@ -93,8 +95,14 @@ class PDFTextDevice(PDFDevice):
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if cid == 32 and textstate.wordspace and not font.is_multibyte():
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
needspace = True
x += dx
y += dy
if font.is_vertical():
@ -103,6 +111,11 @@ class PDFTextDevice(PDFDevice):
x += wordspace
chars = []
if chars:
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx