charspace bug fixed.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@139 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-10-23 14:51:40 +00:00
parent ee97f18d4e
commit a1591f6a4d
3 changed files with 19 additions and 5 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Oct 4 12:47:18 JST 2009 Last Modified: Thu Oct 15 19:01:07 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -141,6 +141,7 @@ PDFMiner comes with two handy tools:
<p> <p>
<code>pdf2txt.py</code> extracts text contents from a PDF file. <code>pdf2txt.py</code> extracts text contents from a PDF file.
It extracts all the texts that are to be rendered programmatically, It extracts all the texts that are to be rendered programmatically,
i.e. text represented as ASCII or Unicode strings.
It cannot recognize texts drawn as images that would require optical character recognition. It cannot recognize texts drawn as images that would require optical character recognition.
It also extracts the corresponding locations, font names, font sizes, writing It also extracts the corresponding locations, font names, font sizes, writing
direction (horizontal or vertical) for each text portion. direction (horizontal or vertical) for each text portion.
@ -183,9 +184,9 @@ By default, it extracts texts from all the pages.
<dt> <code>-t <em>type</em></code> <dt> <code>-t <em>type</em></code>
<dd> Specifies the output format. The following formats are currently supported. <dd> Specifies the output format. The following formats are currently supported.
<ul> <ul>
<li> <code>html</code> : HTML format. (Default) <li> <code>text</code> : TEXT format. (Default)
<li> <code>text</code> : TEXT format. <li> <code>html</code> : HTML format. Not recommended for extraction purposes because the markup is very messy.
<li> <code>sgml</code> : SGML format. <li> <code>sgml</code> : SGML format. Provides the most information available.
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations. HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>"). Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").

View File

@ -269,7 +269,7 @@ class LTTextItem(LayoutItem, LTText):
self.vertical = font.is_vertical() self.vertical = font.is_vertical()
self.text = ''.join( char for (char,_) in chars ) self.text = ''.join( char for (char,_) in chars )
adv = sum( font.char_width(cid) for (_,cid) in chars ) adv = sum( font.char_width(cid) for (_,cid) in chars )
adv = (adv * fontsize + len(chars)*charspace) * scaling adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
#size = (font.get_ascent() - font.get_descent()) * fontsize #size = (font.get_ascent() - font.get_descent()) * fontsize
size = font.get_size() * fontsize size = font.get_size() * fontsize
(_,_,_,_,tx,ty) = self.matrix (_,_,_,_,tx,ty) = self.matrix

View File

@ -71,6 +71,7 @@ class PDFTextDevice(PDFDevice):
wordspace = textstate.wordspace * scaling wordspace = textstate.wordspace * scaling
dxscale = .001 * fontsize * scaling dxscale = .001 * fontsize * scaling
chars = [] chars = []
needspace = False
(x,y) = textstate.linematrix (x,y) = textstate.linematrix
for obj in seq: for obj in seq:
if isinstance(obj, int) or isinstance(obj, float): if isinstance(obj, int) or isinstance(obj, float):
@ -84,6 +85,7 @@ class PDFTextDevice(PDFDevice):
else: else:
x += d x += d
chars = [] chars = []
needspace = False
else: else:
for cid in font.decode(obj): for cid in font.decode(obj):
try: try:
@ -93,8 +95,14 @@ class PDFTextDevice(PDFDevice):
char = self.handle_undefined_char(cidcoding, cid) char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid)) chars.append((char, cid))
if cid == 32 and textstate.wordspace and not font.is_multibyte(): if cid == 32 and textstate.wordspace and not font.is_multibyte():
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars) fontsize, charspace, scaling, chars)
needspace = True
x += dx x += dx
y += dy y += dy
if font.is_vertical(): if font.is_vertical():
@ -103,6 +111,11 @@ class PDFTextDevice(PDFDevice):
x += wordspace x += wordspace
chars = [] chars = []
if chars: if chars:
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars) fontsize, charspace, scaling, chars)
x += dx x += dx