charspace bug fixed.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@139 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
ee97f18d4e
commit
a1591f6a4d
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sun Oct 4 12:47:18 JST 2009
|
||||
Last Modified: Thu Oct 15 19:01:07 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -141,6 +141,7 @@ PDFMiner comes with two handy tools:
|
|||
<p>
|
||||
<code>pdf2txt.py</code> extracts text contents from a PDF file.
|
||||
It extracts all the texts that are to be rendered programmatically,
|
||||
i.e., text represented as ASCII or Unicode strings.
|
||||
It cannot recognize texts drawn as images that would require optical character recognition.
|
||||
It also extracts the corresponding locations, font names, font sizes, and writing
|
||||
direction (horizontal or vertical) for each text portion.
|
||||
|
@ -183,9 +184,9 @@ By default, it extracts texts from all the pages.
|
|||
<dt> <code>-t <em>type</em></code>
|
||||
<dd> Specifies the output format. The following formats are currently supported.
|
||||
<ul>
|
||||
<li> <code>html</code> : HTML format. (Default)
|
||||
<li> <code>text</code> : TEXT format.
|
||||
<li> <code>sgml</code> : SGML format.
|
||||
<li> <code>text</code> : TEXT format. (Default)
|
||||
<li> <code>html</code> : HTML format. Not recommended for extraction purposes because the markup is very messy.
|
||||
<li> <code>sgml</code> : SGML format. Provides the most information available.
|
||||
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
|
||||
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
|
||||
Tags used here are defined in the PDF specification (see §10.7 "<em>Tagged PDF</em>").
|
||||
|
|
|
@ -269,7 +269,7 @@ class LTTextItem(LayoutItem, LTText):
|
|||
self.vertical = font.is_vertical()
|
||||
self.text = ''.join( char for (char,_) in chars )
|
||||
adv = sum( font.char_width(cid) for (_,cid) in chars )
|
||||
adv = (adv * fontsize + len(chars)*charspace) * scaling
|
||||
adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
|
||||
#size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||
size = font.get_size() * fontsize
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
|
|
|
@ -71,6 +71,7 @@ class PDFTextDevice(PDFDevice):
|
|||
wordspace = textstate.wordspace * scaling
|
||||
dxscale = .001 * fontsize * scaling
|
||||
chars = []
|
||||
needspace = False
|
||||
(x,y) = textstate.linematrix
|
||||
for obj in seq:
|
||||
if isinstance(obj, int) or isinstance(obj, float):
|
||||
|
@ -84,6 +85,7 @@ class PDFTextDevice(PDFDevice):
|
|||
else:
|
||||
x += d
|
||||
chars = []
|
||||
needspace = False
|
||||
else:
|
||||
for cid in font.decode(obj):
|
||||
try:
|
||||
|
@ -93,8 +95,14 @@ class PDFTextDevice(PDFDevice):
|
|||
char = self.handle_undefined_char(cidcoding, cid)
|
||||
chars.append((char, cid))
|
||||
if cid == 32 and textstate.wordspace and not font.is_multibyte():
|
||||
if needspace:
|
||||
if font.is_vertical():
|
||||
y += charspace
|
||||
else:
|
||||
x += charspace
|
||||
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||
fontsize, charspace, scaling, chars)
|
||||
needspace = True
|
||||
x += dx
|
||||
y += dy
|
||||
if font.is_vertical():
|
||||
|
@ -103,6 +111,11 @@ class PDFTextDevice(PDFDevice):
|
|||
x += wordspace
|
||||
chars = []
|
||||
if chars:
|
||||
if needspace:
|
||||
if font.is_vertical():
|
||||
y += charspace
|
||||
else:
|
||||
x += charspace
|
||||
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||
fontsize, charspace, scaling, chars)
|
||||
x += dx
|
||||
|
|
Loading…
Reference in New Issue