text rise support added
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@217 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
479c920ec7
commit
fe3bdbfce0
3
Makefile
3
Makefile
|
@ -4,7 +4,6 @@
|
|||
PACKAGE=pdfminer
|
||||
PREFIX=/usr/local
|
||||
|
||||
SVN=svn
|
||||
PYTHON=python
|
||||
RM=rm -f
|
||||
CP=cp -f
|
||||
|
@ -22,8 +21,6 @@ clean:
|
|||
|
||||
distclean: clean test_clean cmap_clean
|
||||
|
||||
commit: distclean
|
||||
$(SVN) commit
|
||||
pack: distclean
|
||||
$(PYTHON) setup.py sdist
|
||||
register: distclean
|
||||
|
|
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sat Apr 24 04:30:10 UTC 2010
|
||||
Last Modified: Mon May 10 23:02:20 UTC 2010
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -64,7 +64,8 @@ PDF parser that can be used for other purposes instead of text analysis.
|
|||
<li> Reconstruct the original layout by grouping text chunks.
|
||||
</ul>
|
||||
<p>
|
||||
On the performance, PDFMiner is about 20 times slower than
|
||||
On the performance side,
|
||||
PDFMiner is about 20 times slower than
|
||||
other C/C++-based software such as XPdf.
|
||||
|
||||
<a name="source"></a>
|
||||
|
|
|
@ -56,7 +56,7 @@ for page in doc.get_pages():
|
|||
</pre></blockquote>
|
||||
|
||||
<p>
|
||||
In PDFMiner, there are several objects involved in parsing a PDF file,
|
||||
In PDFMiner, there are several Python classes involved in parsing a PDF file,
|
||||
as shown in Figure 1.
|
||||
|
||||
<div>
|
||||
|
@ -68,7 +68,12 @@ as shown in Figure 1.
|
|||
<hr noshade>
|
||||
<h2>Accessing Layout Objects</h2>
|
||||
<p>
|
||||
PDFMiner performs a basic layout analysis.
|
||||
PDF documents are more like graphics than text documents.
|
||||
They present no logical structure such as sentences or paragraphs (in most cases).
|
||||
PDFMiner tries to reconstruct the original structure by performing
|
||||
basic layout analysis.
|
||||
<p>
|
||||
|
||||
|
||||
<blockquote><pre>
|
||||
from pdfminer.layout import LAParams
|
||||
|
|
|
@ -91,8 +91,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
self.cur_item.add(LTPolygon(gstate.linewidth, pts))
|
||||
return
|
||||
|
||||
def render_char(self, matrix, font, fontsize, scaling, cid):
|
||||
item = LTChar(matrix, font, fontsize, scaling, cid)
|
||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
||||
item = LTChar(matrix, font, fontsize, scaling, rise, cid)
|
||||
self.cur_item.add(item)
|
||||
return item.adv
|
||||
|
||||
|
|
|
@ -204,7 +204,7 @@ class LTChar(LTItem, LTText):
|
|||
|
||||
debug = 0
|
||||
|
||||
def __init__(self, matrix, font, fontsize, scaling, cid):
|
||||
def __init__(self, matrix, font, fontsize, scaling, rise, cid):
|
||||
self.matrix = matrix
|
||||
self.font = font
|
||||
self.fontsize = fontsize
|
||||
|
@ -214,6 +214,8 @@ class LTChar(LTItem, LTText):
|
|||
text = font.to_unichr(cid)
|
||||
except PDFUnicodeNotDefined:
|
||||
text = '?'
|
||||
(a,b,c,d,e,f) = self.matrix
|
||||
self.upright = (0 < a*d*scaling and b*c <= 0)
|
||||
LTText.__init__(self, text)
|
||||
# compute the boundary rectangle.
|
||||
if self.vertical:
|
||||
|
@ -224,7 +226,7 @@ class LTChar(LTItem, LTText):
|
|||
(dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
tx -= dx/2
|
||||
ty += displacement
|
||||
ty += displacement + rise
|
||||
bbox = (tx, ty+dy, tx+dx, ty)
|
||||
else:
|
||||
# horizontal
|
||||
|
@ -233,7 +235,7 @@ class LTChar(LTItem, LTText):
|
|||
(_,descent) = apply_matrix_norm(self.matrix, (0, descent))
|
||||
(dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
ty += descent
|
||||
ty += descent + rise
|
||||
bbox = (tx, ty, tx+dx, ty+dy)
|
||||
LTItem.__init__(self, bbox)
|
||||
return
|
||||
|
@ -253,8 +255,7 @@ class LTChar(LTItem, LTText):
|
|||
return self.vertical
|
||||
|
||||
def is_upright(self):
|
||||
(a,b,c,d,e,f) = self.matrix
|
||||
return 0 < a*d and b*c <= 0
|
||||
return self.upright
|
||||
|
||||
|
||||
## LTContainer
|
||||
|
|
|
@ -66,19 +66,22 @@ class PDFTextDevice(PDFDevice):
|
|||
scaling = textstate.scaling * .01
|
||||
charspace = textstate.charspace * scaling
|
||||
wordspace = textstate.wordspace * scaling
|
||||
rise = textstate.rise
|
||||
if font.is_multibyte():
|
||||
wordspace = 0
|
||||
dxscale = .001 * fontsize * scaling
|
||||
if font.is_vertical():
|
||||
textstate.linematrix = self.render_string_vertical(
|
||||
seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
|
||||
seq, matrix, textstate.linematrix, font, fontsize,
|
||||
scaling, charspace, wordspace, rise, dxscale)
|
||||
else:
|
||||
textstate.linematrix = self.render_string_horizontal(
|
||||
seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
|
||||
seq, matrix, textstate.linematrix, font, fontsize,
|
||||
scaling, charspace, wordspace, rise, dxscale)
|
||||
return
|
||||
|
||||
def render_string_horizontal(self, seq, matrix, (x,y),
|
||||
font, fontsize, scaling, charspace, wordspace, dxscale):
|
||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||
needcharspace = False
|
||||
for obj in seq:
|
||||
if isinstance(obj, int) or isinstance(obj, float):
|
||||
|
@ -89,14 +92,14 @@ class PDFTextDevice(PDFDevice):
|
|||
if needcharspace:
|
||||
x += charspace
|
||||
x += self.render_char(translate_matrix(matrix, (x,y)),
|
||||
font, fontsize, scaling, cid)
|
||||
font, fontsize, scaling, rise, cid)
|
||||
if cid == 32 and wordspace:
|
||||
x += wordspace
|
||||
needcharspace = True
|
||||
return (x, y)
|
||||
|
||||
def render_string_vertical(self, seq, matrix, (x,y),
|
||||
font, fontsize, scaling, charspace, wordspace, dxscale):
|
||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||
needcharspace = False
|
||||
for obj in seq:
|
||||
if isinstance(obj, int) or isinstance(obj, float):
|
||||
|
@ -107,13 +110,13 @@ class PDFTextDevice(PDFDevice):
|
|||
if needcharspace:
|
||||
y += charspace
|
||||
y += self.render_char(translate_matrix(matrix, (x,y)),
|
||||
font, fontsize, scaling, cid)
|
||||
font, fontsize, scaling, rise, cid)
|
||||
if cid == 32 and wordspace:
|
||||
y += wordspace
|
||||
needcharspace = True
|
||||
return (x, y)
|
||||
|
||||
def render_char(self, matrix, font, fontsize, scaling, cid):
|
||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
||||
return 0
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue