release-20090711

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@118 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-07-11 15:28:12 +00:00
parent 787ae4f814
commit af63784305
5 changed files with 101 additions and 60 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sat Jun 20 19:51:02 JST 2009
Last Modified: Sun Jul 12 00:27:23 JST 2009
<!-- hhmts end -->
</div>
@ -51,8 +51,8 @@ PDF parser that can be used for other purpoes instead of text analysis.
<a name="source"></a>
<p>
<strong>Download:</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz
</a>
(1.8Mbytes)
@ -191,23 +191,63 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
</ul>
<p>
<dt> <code>-T <em>cluster_margin</em></code>
<dd>
<p>
<dt> <code>-M <em>char_margin</em></code>
<dt> <code>-L <em>line_margin</em></code>
<dt> <code>-W <em>word_margin</em></code>
<dd>
<dd> These are the parameters used for layout analysis.
In an actual PDF file, text might be split into several chunks
in the middle of a run, depending on the authoring software.
Therefore, text extraction needs to splice text chunks together.
In the figure below, two text chunks whose distance is closer than
the <em>char_margin</em> (shown as <em><font color="red">M</font></em>) are considered
continuous and are grouped into one. Also, two lines whose distance is closer than
the <em>line_margin</em> (<em><font color="blue">L</font></em>) are grouped
as a text box, which is a rectangular area that contains a "cluster" of text.
Furthermore, it may be required to insert blank characters (spaces) as necessary
if the distance between two words is greater than the <em>word_margin</em>
(<em><font color="green">W</font></em>), as a blank between words might not be
represented as a space, but indicated by the positioning of each word.
<p>
Each value is specified not as an actual length, but as a proportion of
the length to the size of each character in question. The default values
are M = 1.0, L = 0.3, and W = 0.2, respectively.
<table style="border:2px gray solid; margin: 10px; padding: 10px;"><tr>
<td style="border-right:1px red solid" align=right>&rarr;</td>
<td style="border-left:1px red solid" colspan="4" align=left>&larr; <em><font color="red">M</font></em></td>
<td></td>
</tr><tr>
<td style="border:1px solid"><code>Q u i</code></td>
<td style="border:1px solid"><code>c k</code></td>
<td width="10px"></td>
<td style="border:1px solid"><code>b r o w</code></td>
<td style="border:1px solid"><code>n &nbsp; f o x</code></td>
<td style="border-bottom:1px blue solid" align=right>&darr;</td>
</tr><tr>
<td style="border-right:1px green solid" colspan="2" align=right>&rarr;</td><td></td>
<td style="border-left:1px green solid" colspan="2" align=left>&larr; <em><font color="green">W</font></em></td>
<td rowspan="2" valign=middle align=center><em><font color="blue">L</font></em></td>
</tr><tr height="10px">
</tr><tr>
<td style="padding:0px;" colspan="5">
<table style="border:1px solid"><tr><td><code>j u m p s</code></td><td>...</td></tr></table>
</td>
<td style="border-top:1px blue solid" align=right>&uarr;</td>
</tr></table>
<p>
<dt> <code>-s <em>scale</em></code>
<dd>
<dd> Specifies the output scale. Can be used in HTML format only.
<p>
<dt> <code>-m <em>maxpages</em></code>
<dd>
<dd> Specifies the maximum number of pages to extract.
By default, it extracts all the pages in a document.
<p>
<dt> <code>-P <em>password</em></code>
<dd> Provides the user password to open the PDF file.
<dd> Provides the user password to access PDF contents.
<p>
<dt> <code>-C <em>CMap directory</em></code>
<dd>
<dd> Specifies the path of the CMap directory. CMap is needed when extracting
non-ASCII texts (especially in Asian languages). The CMap location can
also be specified with the <code>CMAP_PATH</code> environment variable.
<p>
<dt> <code>-d</code>
<dd> Increases the debug level.
@ -242,12 +282,13 @@ Options:
By default, it only prints the document trailer (like a header).
<p>
<dt> <code>-i <em>objno,objno, ...</em></code>
<dd>
<dd> Specifies PDF object IDs to display.
Comma-separated IDs, or multiple <code>-i</code> options are accepted.
<p>
<dt> <code>-p <em>pageno,pageno, ...</em></code>
<dd> Specifies the page number to be extracted.
Multiple <code>-p</code> options are allowed.
Note that page numbers start from one.
Comma-separated page numbers, or multiple <code>-p</code> options are accepted.
Note that page numbers start from one, not zero.
<p>
<dt> <code>-r</code> (raw)
<dt> <code>-b</code> (binary)
@ -263,11 +304,11 @@ similar to <code>repr()</code> manner. When
<code>-r</code> or <code>-b</code> option is given,
no stream header is displayed for the ease of saving it to a file.
<p>
<dt> <code>-P <em>password</em></code>
<dd> Provides the user password to open the PDF file.
<p>
<dt> <code>-T</code>
<dd>
<dd> Shows the table of contents.
<p>
<dt> <code>-P <em>password</em></code>
<dd> Provides the user password to access PDF contents.
<p>
<dt> <code>-d</code>
<dd> Increases the debug level.
@ -277,6 +318,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
<li> 2009/03/30: Text output mode added.
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.

1
TODO
View File

@ -1,4 +1,5 @@
TODOs:
- Better text extraction / layout analysis.
- Better API Documentation.
- Robust error handling.
- Any special handling for linearized PDFs?

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python
# Version string of the package; the value looks like a release date
# in YYYYMMDD form. When assigned more than once, the last assignment
# is the one that takes effect.
__version__ = '20090517'
__version__ = '20090711'
# Print the version when this file is executed directly as a script.
if __name__ == '__main__': print __version__

View File

@ -1,42 +1,9 @@
#!/usr/bin/env python
import sys
from pdfminer.utils import apply_matrix_norm
from pdfminer.utils import apply_matrix_norm, bsearch
INF = sys.maxint
## pick
##
def pick(seq, func, maxobj=None):
    '''Picks the object in seq that has the highest value of func(obj).

    seq:    an iterable of candidate objects.
    func:   a callable that maps an object to a comparable score.
    maxobj: the value returned when seq is empty (None by default).

    When several objects share the highest score, the first one
    encountered is returned, because a later object only replaces the
    current best if its score is strictly greater.
    '''
    maxscore = None
    for obj in seq:
        score = func(obj)
        # "is None" rather than "== None": an identity test cannot be
        # fooled by score types that override equality.
        if maxscore is None or maxscore < score:
            (maxscore, maxobj) = (score, obj)
    return maxobj
## bsearch
##
## Finds the position of v0 in a list of (value, obj) pairs
## sorted by value. It performs binary search so that the
## processing time should be around O(log n).
##
def bsearch(objs, v0):
    '''Binary-searches a list of (value, obj) pairs for the value v0.

    objs: a list of (value, obj) tuples, expected to be sorted by value
          in ascending order (the search moves left when v0 < value).
    v0:   the value to look for.

    Returns a pair of indices (i0, i1):
      - (i, i+1) if objs[i] has a value equal to v0 (when several
        entries are equal, whichever one the search lands on);
      - (i, i) otherwise, where i is the position at which v0 would
        have to be inserted to keep the list sorted.
    '''
    i0 = 0
    i1 = len(objs)
    while i0 < i1:
        # Floor division keeps the midpoint an integer index regardless
        # of whether "/" means integer or true division.
        i = (i0 + i1) // 2
        (v, _) = objs[i]
        if v0 == v:
            (i0, i1) = (i, i + 1)
            break
        elif v0 < v:
            i1 = i
        else:
            i0 = i + 1
    return (i0, i1)
## reorder_hv, reorder_vh
## chop_hv, chop_vh
##
@ -387,10 +354,8 @@ class LTTextBox(LayoutContainer):
def fixate(self, direction='H'):
LayoutContainer.fixate(self, direction=direction)
if not direction:
for obj in self.objs:
if obj.is_vertical():
direction = 'V'
break
if any( obj.is_vertical() for obj in self.objs ):
direction = 'V'
if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:

View File

@ -20,12 +20,41 @@ def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm(m, v):
    '''Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0)).

    m: a 6-element matrix (a,b,c,d,e,f).
    v: a 2-element vector (p,q).

    Returns the pair (a*p+c*q, b*p+d*q). The last two matrix elements
    (e, f) do not contribute to the result.
    '''
    # Unpack inside the body instead of in the parameter list: callers
    # still pass the same two positional sequences, and the definition
    # no longer relies on tuple-parameter syntax.
    (a, b, c, d, _, _) = m
    (p, q) = v
    return (a*p + c*q, b*p + d*q)
## Utilities
## Utility functions
##
# pick
def pick(seq, func, maxobj=None):
    '''Picks the object that has the highest value of func(obj).

    seq:    an iterable of candidate objects.
    func:   a callable that maps an object to a comparable score.
    maxobj: the value returned when seq is empty (None by default).

    On a tie the earliest object wins, since a later object replaces
    the current best only when its score is strictly greater.
    '''
    maxscore = None
    for obj in seq:
        score = func(obj)
        # Identity test against None; "== None" would invoke the score
        # type's own equality, which may not return a plain boolean.
        if maxscore is None or maxscore < score:
            (maxscore, maxobj) = (score, obj)
    return maxobj
# bsearch
def bsearch(objs, v0):
    '''Tries to find the position of the value v0 by binary search.

    objs: a list of (value, obj) tuples, expected to be sorted by value
          in ascending order (the search moves left when v0 < value).
    v0:   the value to look for.

    Returns a pair of indices (i0, i1):
      - (i, i+1) if objs[i] has a value equal to v0 (when several
        entries are equal, whichever one the search lands on);
      - (i, i) otherwise, where i is the position at which v0 would
        have to be inserted to keep the list sorted.
    '''
    i0 = 0
    i1 = len(objs)
    while i0 < i1:
        # Use floor division so that the midpoint is always an integer
        # and can be used as a list index.
        i = (i0 + i1) // 2
        (v, _) = objs[i]
        if v0 == v:
            (i0, i1) = (i, i + 1)
            break
        elif v0 < v:
            i1 = i
        else:
            i0 = i + 1
    return (i0, i1)
# choplist
def choplist(n, seq):
'''Groups every n elements of the list.'''
r = []
@ -36,6 +65,7 @@ def choplist(n, seq):
r = []
return
# nunpack
def nunpack(s, default=0):
'''Unpacks up to 4 bytes big endian.'''
l = len(s)
@ -52,6 +82,7 @@ def nunpack(s, default=0):
else:
return TypeError('invalid length: %d' % l)
# decode_text
PDFDocEncoding = ''.join( unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
@ -87,12 +118,14 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
def decode_text(s):
    '''Converts a PDF text string to a Unicode string.

    A string that begins with the two bytes FE FF is decoded as
    UTF-16BE (the two leading bytes are dropped and undecodable input
    is ignored). Any other string is translated one character at a
    time through the PDFDocEncoding table.
    '''
    if not s.startswith('\xfe\xff'):
        return ''.join([ PDFDocEncoding[ord(ch)] for ch in s ])
    return unicode(s[2:], 'utf-16be', 'ignore')
# enc(x): encode string in SGML/XML/HTML
# enc
def enc(x, codec='ascii'):
    '''Escapes a string for SGML/XML/HTML and encodes it with codec.

    The characters &, >, < and " are replaced with their entities
    (the ampersand first, so that the entities themselves are not
    escaped again). Characters that codec cannot represent are emitted
    as numeric character references.
    '''
    entities = (
        ('&', '&amp;'),
        ('>', '&gt;'),
        ('<', '&lt;'),
        ('"', '&quot;'),
    )
    for (char, entity) in entities:
        x = x.replace(char, entity)
    return x.encode(codec, 'xmlcharrefreplace')