text rise support added
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@217 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
479c920ec7
commit
fe3bdbfce0
3
Makefile
3
Makefile
|
@ -4,7 +4,6 @@
|
||||||
PACKAGE=pdfminer
|
PACKAGE=pdfminer
|
||||||
PREFIX=/usr/local
|
PREFIX=/usr/local
|
||||||
|
|
||||||
SVN=svn
|
|
||||||
PYTHON=python
|
PYTHON=python
|
||||||
RM=rm -f
|
RM=rm -f
|
||||||
CP=cp -f
|
CP=cp -f
|
||||||
|
@ -22,8 +21,6 @@ clean:
|
||||||
|
|
||||||
distclean: clean test_clean cmap_clean
|
distclean: clean test_clean cmap_clean
|
||||||
|
|
||||||
commit: distclean
|
|
||||||
$(SVN) commit
|
|
||||||
pack: distclean
|
pack: distclean
|
||||||
$(PYTHON) setup.py sdist
|
$(PYTHON) setup.py sdist
|
||||||
register: distclean
|
register: distclean
|
||||||
|
|
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sat Apr 24 04:30:10 UTC 2010
|
Last Modified: Mon May 10 23:02:20 UTC 2010
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -64,7 +64,8 @@ PDF parser that can be used for other purposes instead of text analysis.
|
||||||
<li> Reconstruct the original layout by grouping text chunks.
|
<li> Reconstruct the original layout by grouping text chunks.
|
||||||
</ul>
|
</ul>
|
||||||
<p>
|
<p>
|
||||||
On the performance, PDFMiner is about 20 times slower than
|
On the performance side,
|
||||||
|
PDFMiner is about 20 times slower than
|
||||||
other C/C++-based software such as XPdf.
|
other C/C++-based software such as XPdf.
|
||||||
|
|
||||||
<a name="source"></a>
|
<a name="source"></a>
|
||||||
|
|
|
@ -56,7 +56,7 @@ for page in doc.get_pages():
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
In PDFMiner, there are several objects involved in parsing a PDF file,
|
In PDFMiner, there are several Python classes involved in parsing a PDF file,
|
||||||
as shown in Figure 1.
|
as shown in Figure 1.
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
|
@ -68,7 +68,12 @@ as shown in Figure 1.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Accessing Layout Objects</h2>
|
<h2>Accessing Layout Objects</h2>
|
||||||
<p>
|
<p>
|
||||||
PDFMiner performs a basic layout analysis.
|
PDF documents are more like graphics than text documents.
|
||||||
|
They present no logical structure such as sentences or paragraphs (in most cases).
|
||||||
|
PDFMiner tries to reconstruct the original structure by performing
|
||||||
|
basic layout analysis.
|
||||||
|
<p>
|
||||||
|
|
||||||
|
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
|
|
|
@ -91,8 +91,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
self.cur_item.add(LTPolygon(gstate.linewidth, pts))
|
self.cur_item.add(LTPolygon(gstate.linewidth, pts))
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_char(self, matrix, font, fontsize, scaling, cid):
|
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
||||||
item = LTChar(matrix, font, fontsize, scaling, cid)
|
item = LTChar(matrix, font, fontsize, scaling, rise, cid)
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return item.adv
|
return item.adv
|
||||||
|
|
||||||
|
|
|
@ -204,7 +204,7 @@ class LTChar(LTItem, LTText):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, matrix, font, fontsize, scaling, cid):
|
def __init__(self, matrix, font, fontsize, scaling, rise, cid):
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
self.font = font
|
self.font = font
|
||||||
self.fontsize = fontsize
|
self.fontsize = fontsize
|
||||||
|
@ -214,6 +214,8 @@ class LTChar(LTItem, LTText):
|
||||||
text = font.to_unichr(cid)
|
text = font.to_unichr(cid)
|
||||||
except PDFUnicodeNotDefined:
|
except PDFUnicodeNotDefined:
|
||||||
text = '?'
|
text = '?'
|
||||||
|
(a,b,c,d,e,f) = self.matrix
|
||||||
|
self.upright = (0 < a*d*scaling and b*c <= 0)
|
||||||
LTText.__init__(self, text)
|
LTText.__init__(self, text)
|
||||||
# compute the boundary rectangle.
|
# compute the boundary rectangle.
|
||||||
if self.vertical:
|
if self.vertical:
|
||||||
|
@ -224,7 +226,7 @@ class LTChar(LTItem, LTText):
|
||||||
(dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
|
(dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
|
||||||
(_,_,_,_,tx,ty) = self.matrix
|
(_,_,_,_,tx,ty) = self.matrix
|
||||||
tx -= dx/2
|
tx -= dx/2
|
||||||
ty += displacement
|
ty += displacement + rise
|
||||||
bbox = (tx, ty+dy, tx+dx, ty)
|
bbox = (tx, ty+dy, tx+dx, ty)
|
||||||
else:
|
else:
|
||||||
# horizontal
|
# horizontal
|
||||||
|
@ -233,7 +235,7 @@ class LTChar(LTItem, LTText):
|
||||||
(_,descent) = apply_matrix_norm(self.matrix, (0, descent))
|
(_,descent) = apply_matrix_norm(self.matrix, (0, descent))
|
||||||
(dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
|
(dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
|
||||||
(_,_,_,_,tx,ty) = self.matrix
|
(_,_,_,_,tx,ty) = self.matrix
|
||||||
ty += descent
|
ty += descent + rise
|
||||||
bbox = (tx, ty, tx+dx, ty+dy)
|
bbox = (tx, ty, tx+dx, ty+dy)
|
||||||
LTItem.__init__(self, bbox)
|
LTItem.__init__(self, bbox)
|
||||||
return
|
return
|
||||||
|
@ -253,8 +255,7 @@ class LTChar(LTItem, LTText):
|
||||||
return self.vertical
|
return self.vertical
|
||||||
|
|
||||||
def is_upright(self):
|
def is_upright(self):
|
||||||
(a,b,c,d,e,f) = self.matrix
|
return self.upright
|
||||||
return 0 < a*d and b*c <= 0
|
|
||||||
|
|
||||||
|
|
||||||
## LTContainer
|
## LTContainer
|
||||||
|
|
|
@ -66,19 +66,22 @@ class PDFTextDevice(PDFDevice):
|
||||||
scaling = textstate.scaling * .01
|
scaling = textstate.scaling * .01
|
||||||
charspace = textstate.charspace * scaling
|
charspace = textstate.charspace * scaling
|
||||||
wordspace = textstate.wordspace * scaling
|
wordspace = textstate.wordspace * scaling
|
||||||
|
rise = textstate.rise
|
||||||
if font.is_multibyte():
|
if font.is_multibyte():
|
||||||
wordspace = 0
|
wordspace = 0
|
||||||
dxscale = .001 * fontsize * scaling
|
dxscale = .001 * fontsize * scaling
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
textstate.linematrix = self.render_string_vertical(
|
textstate.linematrix = self.render_string_vertical(
|
||||||
seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
|
seq, matrix, textstate.linematrix, font, fontsize,
|
||||||
|
scaling, charspace, wordspace, rise, dxscale)
|
||||||
else:
|
else:
|
||||||
textstate.linematrix = self.render_string_horizontal(
|
textstate.linematrix = self.render_string_horizontal(
|
||||||
seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
|
seq, matrix, textstate.linematrix, font, fontsize,
|
||||||
|
scaling, charspace, wordspace, rise, dxscale)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string_horizontal(self, seq, matrix, (x,y),
|
def render_string_horizontal(self, seq, matrix, (x,y),
|
||||||
font, fontsize, scaling, charspace, wordspace, dxscale):
|
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
if isinstance(obj, int) or isinstance(obj, float):
|
if isinstance(obj, int) or isinstance(obj, float):
|
||||||
|
@ -89,14 +92,14 @@ class PDFTextDevice(PDFDevice):
|
||||||
if needcharspace:
|
if needcharspace:
|
||||||
x += charspace
|
x += charspace
|
||||||
x += self.render_char(translate_matrix(matrix, (x,y)),
|
x += self.render_char(translate_matrix(matrix, (x,y)),
|
||||||
font, fontsize, scaling, cid)
|
font, fontsize, scaling, rise, cid)
|
||||||
if cid == 32 and wordspace:
|
if cid == 32 and wordspace:
|
||||||
x += wordspace
|
x += wordspace
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
return (x, y)
|
return (x, y)
|
||||||
|
|
||||||
def render_string_vertical(self, seq, matrix, (x,y),
|
def render_string_vertical(self, seq, matrix, (x,y),
|
||||||
font, fontsize, scaling, charspace, wordspace, dxscale):
|
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
if isinstance(obj, int) or isinstance(obj, float):
|
if isinstance(obj, int) or isinstance(obj, float):
|
||||||
|
@ -107,13 +110,13 @@ class PDFTextDevice(PDFDevice):
|
||||||
if needcharspace:
|
if needcharspace:
|
||||||
y += charspace
|
y += charspace
|
||||||
y += self.render_char(translate_matrix(matrix, (x,y)),
|
y += self.render_char(translate_matrix(matrix, (x,y)),
|
||||||
font, fontsize, scaling, cid)
|
font, fontsize, scaling, rise, cid)
|
||||||
if cid == 32 and wordspace:
|
if cid == 32 and wordspace:
|
||||||
y += wordspace
|
y += wordspace
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
return (x, y)
|
return (x, y)
|
||||||
|
|
||||||
def render_char(self, matrix, font, fontsize, scaling, cid):
|
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue