diff --git a/README.html b/README.html
index da23724..ed974f6 100644
--- a/README.html
+++ b/README.html
@@ -18,7 +18,7 @@ Python PDF parser and analyzer
-Last Modified: Sat Jun 20 19:51:02 JST 2009
+Last Modified: Sun Jul 12 00:27:23 JST 2009
@@ -51,8 +51,8 @@ PDF parser that can be used for other purpoes instead of text analysis.
Download:
-
-http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz
+
+http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz
(1.8Mbytes)
@@ -191,23 +191,63 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
Tags used here are defined in the PDF specification (See §10.7 "Tagged PDF").
-
-T cluster_margin
-
-
+
-M char_margin
+ -L line_margin
-W word_margin
-
+ These are the parameters used for layout analysis.
+In an actual PDF file, text might be split into several chunks
+in the middle of a run, depending on the authoring software.
+Therefore, text extraction needs to splice the text chunks together.
+In the figure below, two text chunks whose distance is closer than
+the char_margin (shown as M) are considered
+continuous and are grouped into one. Also, two lines whose distance is closer than
+the line_margin (L) are grouped
+as a text box, which is a rectangular area that contains a "cluster" of text.
+Furthermore, it may be necessary to insert blank characters (spaces)
+if the distance between two words is greater than the word_margin
+(W), as a blank between words might not be
+represented as a space, but indicated by the positioning of each word.
+
+Each value is specified not as an absolute length, but as a ratio
+relative to the size of the character in question. The default values
+are M = 1.0, L = 0.3, and W = 0.2, respectively.
+
+→ |
+← M |
+ |
+
+Q u i |
+c k |
+ |
+b r o w |
+n f o x |
+↓ |
+
+→ | |
+← W |
+L |
+
+
+
+
+ |
+↑ |
+
-s scale
-
+ Specifies the output scale. Can be used in HTML format only.
-m maxpages
-
+ Specifies the maximum number of pages to extract.
+By default, it extracts all the pages in a document.
-P password
- Provides the user password to open the PDF file.
+ Provides the user password to access PDF contents.
-C CMap directory
-
+ Specifies the path of the CMap directory. A CMap is needed when extracting
+non-ASCII text (especially in Asian languages). The CMap location can
+also be specified with the CMAP_PATH
environment variable.
-d
Increases the debug level.
@@ -242,12 +282,13 @@ Options:
By default, it only prints the document trailer (like a header).
-i objno,objno, ...
-
+ Specifies PDF object IDs to display.
+Comma-separated IDs, or multiple -i
options are accepted.
-p pageno,pageno, ...
Specifies the page number to be extracted.
-Multiple -p
options are allowed.
-Note that page numbers start from one.
+Comma-separated page numbers, or multiple -p
options are accepted.
+Note that page numbers start from one, not zero.
-r
(raw)
-b
(binary)
@@ -263,11 +304,11 @@ similar to repr()
manner. When
-r
or -b
option is given,
no stream header is displayed for the ease of saving it to a file.
-
-P password
- Provides the user password to open the PDF file.
-
-T
-
+ Shows the table of contents.
+
+
-P password
+ Provides the user password to access PDF contents.
-d
Increases the debug level.
@@ -277,6 +318,7 @@ no stream header is displayed for the ease of saving it to a file.
Changes
+- 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
- 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
- 2009/03/30: Text output mode added.
- 2009/03/25: Encoding problems fixed. Word splitting option added.
diff --git a/TODO b/TODO
index 36b9aeb..bdfdff9 100644
--- a/TODO
+++ b/TODO
@@ -1,4 +1,5 @@
TODOs:
+ - Better text extraction / layout analysis.
- Better API Documentation.
- Robust error handling.
- Any special handling for linearized PDFs?
diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py
index 26ae14b..150bcd1 100644
--- a/pdfminer/__init__.py
+++ b/pdfminer/__init__.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python
-__version__ = '20090517'
+__version__ = '20090711'
if __name__ == '__main__': print __version__
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index 7e575cf..fc09278 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -1,42 +1,9 @@
#!/usr/bin/env python
import sys
-from pdfminer.utils import apply_matrix_norm
+from pdfminer.utils import apply_matrix_norm, bsearch
INF = sys.maxint
-## pick
-##
-def pick(seq, func, maxobj=None):
- maxscore = None
- for obj in seq:
- score = func(obj)
- if maxscore == None or maxscore < score:
- (maxscore,maxobj) = (score,obj)
- return maxobj
-
-
-## bsearch
-##
-## Finds objects whose coordinates overlap with [v0,v1].
-## It performs binary search so that the processing time
-## should be around O(log n).
-##
-def bsearch(objs, v0):
- i0 = 0
- i1 = len(objs)
- while i0 < i1:
- i = (i0+i1)/2
- (v, obj) = objs[i]
- if v0 == v:
- (i0,i1) = (i,i+1)
- break
- elif v0 < v:
- i1 = i
- else:
- i0 = i+1
- return (i0,i1)
-
-
## reorder_hv, reorder_vh
## chop_hv, chop_vh
##
@@ -387,10 +354,8 @@ class LTTextBox(LayoutContainer):
def fixate(self, direction='H'):
LayoutContainer.fixate(self, direction=direction)
if not direction:
- for obj in self.objs:
- if obj.is_vertical():
- direction = 'V'
- break
+ if any( obj.is_vertical() for obj in self.objs ):
+ direction = 'V'
if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index ed5afc8..368b146 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -20,12 +20,41 @@ def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
- '''equiv to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))'''
+ '''Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))'''
return (a*p+c*q, b*p+d*q)
-## Utilities
+## Utility functions
##
+
+# pick
+def pick(seq, func, maxobj=None):
+ '''Picks the object that has the highest value of func(obj).'''
+ maxscore = None
+ for obj in seq:
+ score = func(obj)
+ if maxscore == None or maxscore < score:
+ (maxscore,maxobj) = (score,obj)
+ return maxobj
+
+# bsearch
+def bsearch(objs, v0):
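+ '''Binary-searches objs, a list of (value, obj) pairs assumed sorted by value, for v0. Returns an index range (i0,i1): (i,i+1) on an exact match, otherwise an empty range at the insertion point.'''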
+ i0 = 0
+ i1 = len(objs)
+ while i0 < i1:
+ i = (i0+i1)/2
+ (v, obj) = objs[i]
+ if v0 == v:
+ (i0,i1) = (i,i+1)
+ break
+ elif v0 < v:
+ i1 = i
+ else:
+ i0 = i+1
+ return (i0,i1)
+
+# choplist
def choplist(n, seq):
'''Groups every n elements of the list.'''
r = []
@@ -36,6 +65,7 @@ def choplist(n, seq):
r = []
return
+# nunpack
def nunpack(s, default=0):
'''Unpacks up to 4 bytes big endian.'''
l = len(s)
@@ -52,6 +82,7 @@ def nunpack(s, default=0):
else:
return TypeError('invalid length: %d' % l)
+# decode_text
PDFDocEncoding = ''.join( unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
@@ -87,12 +118,14 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
def decode_text(s):
+ '''Decodes a PDFDocEncoding string to Unicode.'''
if s.startswith('\xfe\xff'):
return unicode(s[2:], 'utf-16be', 'ignore')
else:
return ''.join( PDFDocEncoding[ord(c)] for c in s )
-# enc(x): encode string in SGML/XML/HTML
+# enc
def enc(x, codec='ascii'):
+ '''Encodes a string for SGML/XML/HTML'''
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
return x.encode(codec, 'xmlcharrefreplace')