charspace bug fixed.
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@139 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
ee97f18d4e
commit
a1591f6a4d
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Oct 4 12:47:18 JST 2009
|
Last Modified: Thu Oct 15 19:01:07 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -141,6 +141,7 @@ PDFMiner comes with two handy tools:
|
||||||
<p>
|
<p>
|
||||||
<code>pdf2txt.py</code> extracts text contents from a PDF file.
|
<code>pdf2txt.py</code> extracts text contents from a PDF file.
|
||||||
It extracts all the texts that are to be rendered programmatically,
|
It extracts all the texts that are to be rendered programmatically,
|
||||||
|
i.e. text represented as ASCII or Unicode strings.
|
||||||
It cannot recognize texts drawn as images that would require optical character recognition.
|
It cannot recognize texts drawn as images that would require optical character recognition.
|
||||||
It also extracts the corresponding locations, font names, font sizes, writing
|
It also extracts the corresponding locations, font names, font sizes, writing
|
||||||
direction (horizontal or vertical) for each text portion.
|
direction (horizontal or vertical) for each text portion.
|
||||||
|
@ -183,9 +184,9 @@ By default, it extracts texts from all the pages.
|
||||||
<dt> <code>-t <em>type</em></code>
|
<dt> <code>-t <em>type</em></code>
|
||||||
<dd> Specifies the output format. The following formats are currently supported.
|
<dd> Specifies the output format. The following formats are currently supported.
|
||||||
<ul>
|
<ul>
|
||||||
<li> <code>html</code> : HTML format. (Default)
|
<li> <code>text</code> : TEXT format. (Default)
|
||||||
<li> <code>text</code> : TEXT format.
|
<li> <code>html</code> : HTML format. Not recommended for extraction purposes because the markup is very messy.
|
||||||
<li> <code>sgml</code> : SGML format.
|
<li> <code>sgml</code> : SGML format. Provides the most information available.
|
||||||
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
|
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
|
||||||
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
|
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
|
||||||
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
||||||
|
|
|
@ -269,7 +269,7 @@ class LTTextItem(LayoutItem, LTText):
|
||||||
self.vertical = font.is_vertical()
|
self.vertical = font.is_vertical()
|
||||||
self.text = ''.join( char for (char,_) in chars )
|
self.text = ''.join( char for (char,_) in chars )
|
||||||
adv = sum( font.char_width(cid) for (_,cid) in chars )
|
adv = sum( font.char_width(cid) for (_,cid) in chars )
|
||||||
adv = (adv * fontsize + len(chars)*charspace) * scaling
|
adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
|
||||||
#size = (font.get_ascent() - font.get_descent()) * fontsize
|
#size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||||
size = font.get_size() * fontsize
|
size = font.get_size() * fontsize
|
||||||
(_,_,_,_,tx,ty) = self.matrix
|
(_,_,_,_,tx,ty) = self.matrix
|
||||||
|
|
|
@ -71,6 +71,7 @@ class PDFTextDevice(PDFDevice):
|
||||||
wordspace = textstate.wordspace * scaling
|
wordspace = textstate.wordspace * scaling
|
||||||
dxscale = .001 * fontsize * scaling
|
dxscale = .001 * fontsize * scaling
|
||||||
chars = []
|
chars = []
|
||||||
|
needspace = False
|
||||||
(x,y) = textstate.linematrix
|
(x,y) = textstate.linematrix
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
if isinstance(obj, int) or isinstance(obj, float):
|
if isinstance(obj, int) or isinstance(obj, float):
|
||||||
|
@ -84,6 +85,7 @@ class PDFTextDevice(PDFDevice):
|
||||||
else:
|
else:
|
||||||
x += d
|
x += d
|
||||||
chars = []
|
chars = []
|
||||||
|
needspace = False
|
||||||
else:
|
else:
|
||||||
for cid in font.decode(obj):
|
for cid in font.decode(obj):
|
||||||
try:
|
try:
|
||||||
|
@ -93,8 +95,14 @@ class PDFTextDevice(PDFDevice):
|
||||||
char = self.handle_undefined_char(cidcoding, cid)
|
char = self.handle_undefined_char(cidcoding, cid)
|
||||||
chars.append((char, cid))
|
chars.append((char, cid))
|
||||||
if cid == 32 and textstate.wordspace and not font.is_multibyte():
|
if cid == 32 and textstate.wordspace and not font.is_multibyte():
|
||||||
|
if needspace:
|
||||||
|
if font.is_vertical():
|
||||||
|
y += charspace
|
||||||
|
else:
|
||||||
|
x += charspace
|
||||||
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||||
fontsize, charspace, scaling, chars)
|
fontsize, charspace, scaling, chars)
|
||||||
|
needspace = True
|
||||||
x += dx
|
x += dx
|
||||||
y += dy
|
y += dy
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
|
@ -103,6 +111,11 @@ class PDFTextDevice(PDFDevice):
|
||||||
x += wordspace
|
x += wordspace
|
||||||
chars = []
|
chars = []
|
||||||
if chars:
|
if chars:
|
||||||
|
if needspace:
|
||||||
|
if font.is_vertical():
|
||||||
|
y += charspace
|
||||||
|
else:
|
||||||
|
x += charspace
|
||||||
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||||
fontsize, charspace, scaling, chars)
|
fontsize, charspace, scaling, chars)
|
||||||
x += dx
|
x += dx
|
||||||
|
|
Loading…
Reference in New Issue