diff --git a/Makefile b/Makefile index 2583f3b..ae574c4 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,6 @@ PACKAGE=pdfminer PREFIX=/usr/local -SVN=svn PYTHON=python RM=rm -f CP=cp -f @@ -22,8 +21,6 @@ clean: distclean: clean test_clean cmap_clean -commit: distclean - $(SVN) commit pack: distclean $(PYTHON) setup.py sdist register: distclean diff --git a/docs/index.html b/docs/index.html index 227732d..08b4a85 100644 --- a/docs/index.html +++ b/docs/index.html @@ -19,7 +19,7 @@ Python PDF parser and analyzer
-On the performance, PDFMiner is about 20 times slower than +On the performance side, +PDFMiner is about 20 times slower than other C/C++-based software such as XPdf. diff --git a/docs/usage.html b/docs/usage.html index b1b5677..6d1cdf5 100644 --- a/docs/usage.html +++ b/docs/usage.html @@ -56,7 +56,7 @@ for page in doc.get_pages():
-In PDFMiner, there are several objects involved in parsing a PDF file, +In PDFMiner, there are several Python classes involved in parsing a PDF file, as shown in Figure 1.
-PDFMiner performs a basic layout analysis. +PDF documents are more like graphics than text documents. +They present no logical structure such as sentences or paragraphs (in most cases). +PDFMiner tries to reconstruct the original structure by performing +basic layout analysis. +
+
from pdfminer.layout import LAParams diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 6264750..a6df07c 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -91,8 +91,8 @@ class PDFLayoutAnalyzer(PDFTextDevice): self.cur_item.add(LTPolygon(gstate.linewidth, pts)) return - def render_char(self, matrix, font, fontsize, scaling, cid): - item = LTChar(matrix, font, fontsize, scaling, cid) + def render_char(self, matrix, font, fontsize, scaling, rise, cid): + item = LTChar(matrix, font, fontsize, scaling, rise, cid) self.cur_item.add(item) return item.adv diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 6c33978..79e8f9d 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -204,7 +204,7 @@ class LTChar(LTItem, LTText): debug = 0 - def __init__(self, matrix, font, fontsize, scaling, cid): + def __init__(self, matrix, font, fontsize, scaling, rise, cid): self.matrix = matrix self.font = font self.fontsize = fontsize @@ -214,6 +214,8 @@ class LTChar(LTItem, LTText): text = font.to_unichr(cid) except PDFUnicodeNotDefined: text = '?' + (a,b,c,d,e,f) = self.matrix + self.upright = (0 < a*d*scaling and b*c <= 0) LTText.__init__(self, text) # compute the boundary rectangle. 
if self.vertical: @@ -224,7 +226,7 @@ class LTChar(LTItem, LTText): (dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv)) (_,_,_,_,tx,ty) = self.matrix tx -= dx/2 - ty += displacement + ty += displacement + rise bbox = (tx, ty+dy, tx+dx, ty) else: # horizontal @@ -233,7 +235,7 @@ class LTChar(LTItem, LTText): (_,descent) = apply_matrix_norm(self.matrix, (0, descent)) (dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size)) (_,_,_,_,tx,ty) = self.matrix - ty += descent + ty += descent + rise bbox = (tx, ty, tx+dx, ty+dy) LTItem.__init__(self, bbox) return @@ -253,8 +255,7 @@ class LTChar(LTItem, LTText): return self.vertical def is_upright(self): - (a,b,c,d,e,f) = self.matrix - return 0 < a*d and b*c <= 0 + return self.upright ## LTContainer diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index a0fd371..2e3c347 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -66,19 +66,22 @@ class PDFTextDevice(PDFDevice): scaling = textstate.scaling * .01 charspace = textstate.charspace * scaling wordspace = textstate.wordspace * scaling + rise = textstate.rise if font.is_multibyte(): wordspace = 0 dxscale = .001 * fontsize * scaling if font.is_vertical(): textstate.linematrix = self.render_string_vertical( - seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale) + seq, matrix, textstate.linematrix, font, fontsize, + scaling, charspace, wordspace, rise, dxscale) else: textstate.linematrix = self.render_string_horizontal( - seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale) + seq, matrix, textstate.linematrix, font, fontsize, + scaling, charspace, wordspace, rise, dxscale) return def render_string_horizontal(self, seq, matrix, (x,y), - font, fontsize, scaling, charspace, wordspace, dxscale): + font, fontsize, scaling, charspace, wordspace, rise, dxscale): needcharspace = False for obj in seq: if isinstance(obj, int) or isinstance(obj, float): @@ -89,14 +92,14 @@ class 
PDFTextDevice(PDFDevice): if needcharspace: x += charspace x += self.render_char(translate_matrix(matrix, (x,y)), - font, fontsize, scaling, cid) + font, fontsize, scaling, rise, cid) if cid == 32 and wordspace: x += wordspace needcharspace = True return (x, y) def render_string_vertical(self, seq, matrix, (x,y), - font, fontsize, scaling, charspace, wordspace, dxscale): + font, fontsize, scaling, charspace, wordspace, rise, dxscale): needcharspace = False for obj in seq: if isinstance(obj, int) or isinstance(obj, float): @@ -107,13 +110,13 @@ class PDFTextDevice(PDFDevice): if needcharspace: y += charspace y += self.render_char(translate_matrix(matrix, (x,y)), - font, fontsize, scaling, cid) + font, fontsize, scaling, rise, cid) if cid == 32 and wordspace: y += wordspace needcharspace = True return (x, y) - def render_char(self, matrix, font, fontsize, scaling, cid): + def render_char(self, matrix, font, fontsize, scaling, rise, cid): return 0