diff --git a/docs/index.html b/docs/index.html index 062c53b..61e75d2 100644 --- a/docs/index.html +++ b/docs/index.html @@ -18,7 +18,7 @@ Python PDF parser and analyzer
pdf2txt.py
extracts text contents from a PDF file.
It extracts all the texts that are to be rendered programmatically,
+i.e. text represented as ASCII or Unicode strings.
It cannot recognize texts drawn as images that would require optical character recognition.
It also extracts the corresponding locations, font names, font sizes, writing
direction (horizontal or vertical) for each text portion.
@@ -183,9 +184,9 @@ By default, it extracts texts from all the pages.
-t type
html
: HTML format. (Default)
-text
: TEXT format.
-sgml
: SGML format.
+text
: TEXT format. (Default)
+html
: HTML format. Not recommended for extraction purposes because the markup is very messy.
+sgml
: SGML format. Provides the most information available.
tag
: "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
Tags used here are defined in the PDF specification (See §10.7 "Tagged PDF").
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index de6c452..147f981 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -269,7 +269,7 @@ class LTTextItem(LayoutItem, LTText):
self.vertical = font.is_vertical()
self.text = ''.join( char for (char,_) in chars )
adv = sum( font.char_width(cid) for (_,cid) in chars )
- adv = (adv * fontsize + len(chars)*charspace) * scaling
+ adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
#size = (font.get_ascent() - font.get_descent()) * fontsize
size = font.get_size() * fontsize
(_,_,_,_,tx,ty) = self.matrix
diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py
index 03242f4..2add9c2 100644
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@@ -71,6 +71,7 @@ class PDFTextDevice(PDFDevice):
wordspace = textstate.wordspace * scaling
dxscale = .001 * fontsize * scaling
chars = []
+ needspace = False
(x,y) = textstate.linematrix
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
@@ -84,6 +85,7 @@ class PDFTextDevice(PDFDevice):
else:
x += d
chars = []
+ needspace = False
else:
for cid in font.decode(obj):
try:
@@ -93,8 +95,14 @@ class PDFTextDevice(PDFDevice):
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if cid == 32 and textstate.wordspace and not font.is_multibyte():
+ if needspace:
+ if font.is_vertical():
+ y += charspace
+ else:
+ x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
+ needspace = True
x += dx
y += dy
if font.is_vertical():
@@ -103,6 +111,11 @@ class PDFTextDevice(PDFDevice):
x += wordspace
chars = []
if chars:
+ if needspace:
+ if font.is_vertical():
+ y += charspace
+ else:
+ x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx