text rise support added

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@217 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-05-18 14:57:04 +00:00
parent 479c920ec7
commit fe3bdbfce0
6 changed files with 28 additions and 21 deletions

View File

@ -4,7 +4,6 @@
PACKAGE=pdfminer
PREFIX=/usr/local
SVN=svn
PYTHON=python
RM=rm -f
CP=cp -f
@ -22,8 +21,6 @@ clean:
distclean: clean test_clean cmap_clean
commit: distclean
$(SVN) commit
pack: distclean
$(PYTHON) setup.py sdist
register: distclean

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sat Apr 24 04:30:10 UTC 2010
Last Modified: Mon May 10 23:02:20 UTC 2010
<!-- hhmts end -->
</div>
@ -64,7 +64,8 @@ PDF parser that can be used for other purposes instead of text analysis.
<li> Reconstruct the original layout by grouping text chunks.
</ul>
<p>
On the performance, PDFMiner is about 20 times slower than
On the performance side,
PDFMiner is about 20 times slower than
other C/C++-based software such as XPdf.
<a name="source"></a>

View File

@ -56,7 +56,7 @@ for page in doc.get_pages():
</pre></blockquote>
<p>
In PDFMiner, there are several objects involved in parsing a PDF file,
In PDFMiner, there are several Python classes involved in parsing a PDF file,
as shown in Figure 1.
<div>
@ -68,7 +68,12 @@ as shown in Figure 1.
<hr noshade>
<h2>Accessing Layout Objects</h2>
<p>
PDFMiner performs a basic layout analysis.
PDF documents are more like graphics than text documents.
In most cases, they present no logical structure such as sentences or paragraphs.
PDFMiner tries to reconstruct the original structure by performing
basic layout analysis.
<p>
<blockquote><pre>
from pdfminer.layout import LAParams

View File

@ -91,8 +91,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
self.cur_item.add(LTPolygon(gstate.linewidth, pts))
return
def render_char(self, matrix, font, fontsize, scaling, cid):
item = LTChar(matrix, font, fontsize, scaling, cid)
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
item = LTChar(matrix, font, fontsize, scaling, rise, cid)
self.cur_item.add(item)
return item.adv

View File

@ -204,7 +204,7 @@ class LTChar(LTItem, LTText):
debug = 0
def __init__(self, matrix, font, fontsize, scaling, cid):
def __init__(self, matrix, font, fontsize, scaling, rise, cid):
self.matrix = matrix
self.font = font
self.fontsize = fontsize
@ -214,6 +214,8 @@ class LTChar(LTItem, LTText):
text = font.to_unichr(cid)
except PDFUnicodeNotDefined:
text = '?'
(a,b,c,d,e,f) = self.matrix
self.upright = (0 < a*d*scaling and b*c <= 0)
LTText.__init__(self, text)
# compute the boundary rectangle.
if self.vertical:
@ -224,7 +226,7 @@ class LTChar(LTItem, LTText):
(dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
(_,_,_,_,tx,ty) = self.matrix
tx -= dx/2
ty += displacement
ty += displacement + rise
bbox = (tx, ty+dy, tx+dx, ty)
else:
# horizontal
@ -233,7 +235,7 @@ class LTChar(LTItem, LTText):
(_,descent) = apply_matrix_norm(self.matrix, (0, descent))
(dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
(_,_,_,_,tx,ty) = self.matrix
ty += descent
ty += descent + rise
bbox = (tx, ty, tx+dx, ty+dy)
LTItem.__init__(self, bbox)
return
@ -253,8 +255,7 @@ class LTChar(LTItem, LTText):
return self.vertical
def is_upright(self):
(a,b,c,d,e,f) = self.matrix
return 0 < a*d and b*c <= 0
return self.upright
## LTContainer

View File

@ -66,19 +66,22 @@ class PDFTextDevice(PDFDevice):
scaling = textstate.scaling * .01
charspace = textstate.charspace * scaling
wordspace = textstate.wordspace * scaling
rise = textstate.rise
if font.is_multibyte():
wordspace = 0
dxscale = .001 * fontsize * scaling
if font.is_vertical():
textstate.linematrix = self.render_string_vertical(
seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale)
else:
textstate.linematrix = self.render_string_horizontal(
seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale)
return
def render_string_horizontal(self, seq, matrix, (x,y),
font, fontsize, scaling, charspace, wordspace, dxscale):
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
@ -89,14 +92,14 @@ class PDFTextDevice(PDFDevice):
if needcharspace:
x += charspace
x += self.render_char(translate_matrix(matrix, (x,y)),
font, fontsize, scaling, cid)
font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace:
x += wordspace
needcharspace = True
return (x, y)
def render_string_vertical(self, seq, matrix, (x,y),
font, fontsize, scaling, charspace, wordspace, dxscale):
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
@ -107,13 +110,13 @@ class PDFTextDevice(PDFDevice):
if needcharspace:
y += charspace
y += self.render_char(translate_matrix(matrix, (x,y)),
font, fontsize, scaling, cid)
font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace:
y += wordspace
needcharspace = True
return (x, y)
def render_char(self, matrix, font, fontsize, scaling, cid):
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
return 0