text rise support added

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@217 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-05-18 14:57:04 +00:00
parent 479c920ec7
commit fe3bdbfce0
6 changed files with 28 additions and 21 deletions

View File

@ -4,7 +4,6 @@
PACKAGE=pdfminer PACKAGE=pdfminer
PREFIX=/usr/local PREFIX=/usr/local
SVN=svn
PYTHON=python PYTHON=python
RM=rm -f RM=rm -f
CP=cp -f CP=cp -f
@ -22,8 +21,6 @@ clean:
distclean: clean test_clean cmap_clean distclean: clean test_clean cmap_clean
commit: distclean
$(SVN) commit
pack: distclean pack: distclean
$(PYTHON) setup.py sdist $(PYTHON) setup.py sdist
register: distclean register: distclean

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sat Apr 24 04:30:10 UTC 2010 Last Modified: Mon May 10 23:02:20 UTC 2010
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -64,7 +64,8 @@ PDF parser that can be used for other purposes instead of text analysis.
<li> Reconstruct the original layout by grouping text chunks. <li> Reconstruct the original layout by grouping text chunks.
</ul> </ul>
<p> <p>
On the performance, PDFMiner is about 20 times slower than On the performance side,
PDFMiner is about 20 times slower than
other C/C++-based software such as XPdf. other C/C++-based software such as XPdf.
<a name="source"></a> <a name="source"></a>

View File

@ -56,7 +56,7 @@ for page in doc.get_pages():
</pre></blockquote> </pre></blockquote>
<p> <p>
In PDFMiner, there are several objects involved in parsing a PDF file, In PDFMiner, there are several Python classes involved in parsing a PDF file,
as shown in Figure 1. as shown in Figure 1.
<div> <div>
@ -68,7 +68,12 @@ as shown in Figure 1.
<hr noshade> <hr noshade>
<h2>Accessing Layout Objects</h2> <h2>Accessing Layout Objects</h2>
<p> <p>
PDFMiner performs a basic layout analysis. PDF documents are more like graphics than text documents.
They present no logical structure such as sentences or paragraphs (in most cases).
PDFMiner tries to reconstruct the original structure by performing
basic layout analysis.
<p>
<blockquote><pre> <blockquote><pre>
from pdfminer.layout import LAParams from pdfminer.layout import LAParams

View File

@ -91,8 +91,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
self.cur_item.add(LTPolygon(gstate.linewidth, pts)) self.cur_item.add(LTPolygon(gstate.linewidth, pts))
return return
def render_char(self, matrix, font, fontsize, scaling, cid): def render_char(self, matrix, font, fontsize, scaling, rise, cid):
item = LTChar(matrix, font, fontsize, scaling, cid) item = LTChar(matrix, font, fontsize, scaling, rise, cid)
self.cur_item.add(item) self.cur_item.add(item)
return item.adv return item.adv

View File

@ -204,7 +204,7 @@ class LTChar(LTItem, LTText):
debug = 0 debug = 0
def __init__(self, matrix, font, fontsize, scaling, cid): def __init__(self, matrix, font, fontsize, scaling, rise, cid):
self.matrix = matrix self.matrix = matrix
self.font = font self.font = font
self.fontsize = fontsize self.fontsize = fontsize
@ -214,6 +214,8 @@ class LTChar(LTItem, LTText):
text = font.to_unichr(cid) text = font.to_unichr(cid)
except PDFUnicodeNotDefined: except PDFUnicodeNotDefined:
text = '?' text = '?'
(a,b,c,d,e,f) = self.matrix
self.upright = (0 < a*d*scaling and b*c <= 0)
LTText.__init__(self, text) LTText.__init__(self, text)
# compute the boundary rectangle. # compute the boundary rectangle.
if self.vertical: if self.vertical:
@ -224,7 +226,7 @@ class LTChar(LTItem, LTText):
(dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv)) (dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
(_,_,_,_,tx,ty) = self.matrix (_,_,_,_,tx,ty) = self.matrix
tx -= dx/2 tx -= dx/2
ty += displacement ty += displacement + rise
bbox = (tx, ty+dy, tx+dx, ty) bbox = (tx, ty+dy, tx+dx, ty)
else: else:
# horizontal # horizontal
@ -233,7 +235,7 @@ class LTChar(LTItem, LTText):
(_,descent) = apply_matrix_norm(self.matrix, (0, descent)) (_,descent) = apply_matrix_norm(self.matrix, (0, descent))
(dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size)) (dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
(_,_,_,_,tx,ty) = self.matrix (_,_,_,_,tx,ty) = self.matrix
ty += descent ty += descent + rise
bbox = (tx, ty, tx+dx, ty+dy) bbox = (tx, ty, tx+dx, ty+dy)
LTItem.__init__(self, bbox) LTItem.__init__(self, bbox)
return return
@ -253,8 +255,7 @@ class LTChar(LTItem, LTText):
return self.vertical return self.vertical
def is_upright(self): def is_upright(self):
(a,b,c,d,e,f) = self.matrix return self.upright
return 0 < a*d and b*c <= 0
## LTContainer ## LTContainer

View File

@ -66,19 +66,22 @@ class PDFTextDevice(PDFDevice):
scaling = textstate.scaling * .01 scaling = textstate.scaling * .01
charspace = textstate.charspace * scaling charspace = textstate.charspace * scaling
wordspace = textstate.wordspace * scaling wordspace = textstate.wordspace * scaling
rise = textstate.rise
if font.is_multibyte(): if font.is_multibyte():
wordspace = 0 wordspace = 0
dxscale = .001 * fontsize * scaling dxscale = .001 * fontsize * scaling
if font.is_vertical(): if font.is_vertical():
textstate.linematrix = self.render_string_vertical( textstate.linematrix = self.render_string_vertical(
seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale) seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale)
else: else:
textstate.linematrix = self.render_string_horizontal( textstate.linematrix = self.render_string_horizontal(
seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale) seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale)
return return
def render_string_horizontal(self, seq, matrix, (x,y), def render_string_horizontal(self, seq, matrix, (x,y),
font, fontsize, scaling, charspace, wordspace, dxscale): font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False needcharspace = False
for obj in seq: for obj in seq:
if isinstance(obj, int) or isinstance(obj, float): if isinstance(obj, int) or isinstance(obj, float):
@ -89,14 +92,14 @@ class PDFTextDevice(PDFDevice):
if needcharspace: if needcharspace:
x += charspace x += charspace
x += self.render_char(translate_matrix(matrix, (x,y)), x += self.render_char(translate_matrix(matrix, (x,y)),
font, fontsize, scaling, cid) font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace: if cid == 32 and wordspace:
x += wordspace x += wordspace
needcharspace = True needcharspace = True
return (x, y) return (x, y)
def render_string_vertical(self, seq, matrix, (x,y), def render_string_vertical(self, seq, matrix, (x,y),
font, fontsize, scaling, charspace, wordspace, dxscale): font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False needcharspace = False
for obj in seq: for obj in seq:
if isinstance(obj, int) or isinstance(obj, float): if isinstance(obj, int) or isinstance(obj, float):
@ -107,13 +110,13 @@ class PDFTextDevice(PDFDevice):
if needcharspace: if needcharspace:
y += charspace y += charspace
y += self.render_char(translate_matrix(matrix, (x,y)), y += self.render_char(translate_matrix(matrix, (x,y)),
font, fontsize, scaling, cid) font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace: if cid == 32 and wordspace:
y += wordspace y += wordspace
needcharspace = True needcharspace = True
return (x, y) return (x, y)
def render_char(self, matrix, font, fontsize, scaling, cid): def render_char(self, matrix, font, fontsize, scaling, rise, cid):
return 0 return 0