release-20090711
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@118 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
787ae4f814
commit
af63784305
78
README.html
78
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sat Jun 20 19:51:02 JST 2009
|
Last Modified: Sun Jul 12 00:27:23 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -51,8 +51,8 @@ PDF parser that can be used for other purposes instead of text analysis.
|
||||||
<a name="source"></a>
|
<a name="source"></a>
|
||||||
<p>
|
<p>
|
||||||
<strong>Download:</strong><br>
|
<strong>Download:</strong><br>
|
||||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz">
|
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz">
|
||||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz
|
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz
|
||||||
</a>
|
</a>
|
||||||
(1.8Mbytes)
|
(1.8Mbytes)
|
||||||
|
|
||||||
|
@ -191,23 +191,63 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
|
||||||
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
||||||
</ul>
|
</ul>
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-T <em>cluster_margin</em></code>
|
<dt> <code>-M <em>char_margin</em></code>
|
||||||
<dd>
|
<dt> <code>-L <em>line_margin</em></code>
|
||||||
<p>
|
|
||||||
<dt> <code>-W <em>word_margin</em></code>
|
<dt> <code>-W <em>word_margin</em></code>
|
||||||
<dd>
|
<dd> These are the parameters used for layout analysis.
|
||||||
|
In an actual PDF file, texts might be split into several chunks
|
||||||
|
in the middle of its running, depending on the authoring software.
|
||||||
|
Therefore, text extraction needs to splice text chunks.
|
||||||
|
In the figure below, two text chunks whose distance is closer than
|
||||||
|
the <em>char_margin</em> (shown as <em><font color="red">M</font></em>) are considered
|
||||||
|
continuous and get grouped into one. Also, two lines whose distance is closer than
|
||||||
|
the <em>line_margin</em> (<em><font color="blue">L</font></em>) are grouped
|
||||||
|
as a text box, which is a rectangular area that contains a "cluster" of texts.
|
||||||
|
Furthermore, it may be required to insert blank characters (spaces) as necessary
|
||||||
|
if the distance between two words is greater than the <em>word_margin</em>
|
||||||
|
(<em><font color="green">W</font></em>), as a blank between words might not be
|
||||||
|
represented as a space, but indicated by the positioning of each word.
|
||||||
|
<p>
|
||||||
|
Each value is specified not as an actual length, but as a proportion of
|
||||||
|
the length to the size of each character in question. The default values
|
||||||
|
are M = 1.0, L = 0.3, and W = 0.2, respectively.
|
||||||
|
<table style="border:2px gray solid; margin: 10px; padding: 10px;"><tr>
|
||||||
|
<td style="border-right:1px red solid" align=right>→</td>
|
||||||
|
<td style="border-left:1px red solid" colspan="4" align=left>← <em><font color="red">M</font></em></td>
|
||||||
|
<td></td>
|
||||||
|
</tr><tr>
|
||||||
|
<td style="border:1px solid"><code>Q u i</code></td>
|
||||||
|
<td style="border:1px solid"><code>c k</code></td>
|
||||||
|
<td width="10px"></td>
|
||||||
|
<td style="border:1px solid"><code>b r o w</code></td>
|
||||||
|
<td style="border:1px solid"><code>n f o x</code></td>
|
||||||
|
<td style="border-bottom:1px blue solid" align=right>↓</td>
|
||||||
|
</tr><tr>
|
||||||
|
<td style="border-right:1px green solid" colspan="2" align=right>→</td><td></td>
|
||||||
|
<td style="border-left:1px green solid" colspan="2" align=left>← <em><font color="green">W</font></em></td>
|
||||||
|
<td rowspan="2" valign=center align=center><em><font color="blue">L</font></em></td>
|
||||||
|
</tr><tr height="10px">
|
||||||
|
</tr><tr>
|
||||||
|
<td style="padding:0px;" colspan="5">
|
||||||
|
<table style="border:1px solid"><tr><td><code>j u m p s</code></td><td>...</td></tr></table>
|
||||||
|
</td>
|
||||||
|
<td style="border-top:1px blue solid" align=right>↑</td>
|
||||||
|
</tr></table>
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-s <em>scale</em></code>
|
<dt> <code>-s <em>scale</em></code>
|
||||||
<dd>
|
<dd> Specifies the output scale. Can be used in HTML format only.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-m <em>maxpages</em></code>
|
<dt> <code>-m <em>maxpages</em></code>
|
||||||
<dd>
|
<dd> Specifies the maximum number of pages to extract.
|
||||||
|
By default, it extracts all the pages in a document.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-P <em>password</em></code>
|
<dt> <code>-P <em>password</em></code>
|
||||||
<dd> Provides the user password to open the PDF file.
|
<dd> Provides the user password to access PDF contents.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-C <em>CMap directory</em></code>
|
<dt> <code>-C <em>CMap directory</em></code>
|
||||||
<dd>
|
<dd> Specifies the path of CMap directory. CMap is needed when extracting
|
||||||
|
non-ASCII texts (especially in Asian languages). The CMap location can be
|
||||||
|
also specified with <code>CMAP_PATH</code> environment variable.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-d</code>
|
<dt> <code>-d</code>
|
||||||
<dd> Increases the debug level.
|
<dd> Increases the debug level.
|
||||||
|
@ -242,12 +282,13 @@ Options:
|
||||||
By default, it only prints the document trailer (like a header).
|
By default, it only prints the document trailer (like a header).
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-i <em>objno,objno, ...</em></code>
|
<dt> <code>-i <em>objno,objno, ...</em></code>
|
||||||
<dd>
|
<dd> Specifies PDF object IDs to display.
|
||||||
|
Comma-separated IDs, or multiple <code>-i</code> options are accepted.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-p <em>pageno,pageno, ...</em></code>
|
<dt> <code>-p <em>pageno,pageno, ...</em></code>
|
||||||
<dd> Specifies the page number to be extracted.
|
<dd> Specifies the page number to be extracted.
|
||||||
Multiple <code>-p</code> options are allowed.
|
Comma-separated page numbers, or multiple <code>-p</code> options are accepted.
|
||||||
Note that page numbers start from one.
|
Note that page numbers start from one, not zero.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-r</code> (raw)
|
<dt> <code>-r</code> (raw)
|
||||||
<dt> <code>-b</code> (binary)
|
<dt> <code>-b</code> (binary)
|
||||||
|
@ -263,11 +304,11 @@ similar to <code>repr()</code> manner. When
|
||||||
<code>-r</code> or <code>-b</code> option is given,
|
<code>-r</code> or <code>-b</code> option is given,
|
||||||
no stream header is displayed for the ease of saving it to a file.
|
no stream header is displayed for the ease of saving it to a file.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-P <em>password</em></code>
|
|
||||||
<dd> Provides the user password to open the PDF file.
|
|
||||||
<p>
|
|
||||||
<dt> <code>-T</code>
|
<dt> <code>-T</code>
|
||||||
<dd>
|
<dd> Shows the table of contents.
|
||||||
|
<p>
|
||||||
|
<dt> <code>-P <em>password</em></code>
|
||||||
|
<dd> Provides the user password to access PDF contents.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-d</code>
|
<dt> <code>-d</code>
|
||||||
<dd> Increases the debug level.
|
<dd> Increases the debug level.
|
||||||
|
@ -277,6 +318,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
|
||||||
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
|
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
|
||||||
<li> 2009/03/30: Text output mode added.
|
<li> 2009/03/30: Text output mode added.
|
||||||
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
|
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
|
||||||
|
|
1
TODO
1
TODO
|
@ -1,4 +1,5 @@
|
||||||
TODOs:
|
TODOs:
|
||||||
|
- Better text extraction / layout analysis.
|
||||||
- Better API Documentation.
|
- Better API Documentation.
|
||||||
- Robust error handling.
|
- Robust error handling.
|
||||||
- Any special handling for linearized PDFs?
|
- Any special handling for linearized PDFs?
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
__version__ = '20090517'
|
__version__ = '20090711'
|
||||||
|
|
||||||
if __name__ == '__main__': print __version__
|
if __name__ == '__main__': print __version__
|
||||||
|
|
|
@ -1,42 +1,9 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
from pdfminer.utils import apply_matrix_norm
|
from pdfminer.utils import apply_matrix_norm, bsearch
|
||||||
INF = sys.maxint
|
INF = sys.maxint
|
||||||
|
|
||||||
|
|
||||||
## pick
|
|
||||||
##
|
|
||||||
def pick(seq, func, maxobj=None):
|
|
||||||
maxscore = None
|
|
||||||
for obj in seq:
|
|
||||||
score = func(obj)
|
|
||||||
if maxscore == None or maxscore < score:
|
|
||||||
(maxscore,maxobj) = (score,obj)
|
|
||||||
return maxobj
|
|
||||||
|
|
||||||
|
|
||||||
## bsearch
|
|
||||||
##
|
|
||||||
## Finds objects whose coordinates overlap with [v0,v1].
|
|
||||||
## It performs binary search so that the processing time
|
|
||||||
## should be around O(log n).
|
|
||||||
##
|
|
||||||
def bsearch(objs, v0):
|
|
||||||
i0 = 0
|
|
||||||
i1 = len(objs)
|
|
||||||
while i0 < i1:
|
|
||||||
i = (i0+i1)/2
|
|
||||||
(v, obj) = objs[i]
|
|
||||||
if v0 == v:
|
|
||||||
(i0,i1) = (i,i+1)
|
|
||||||
break
|
|
||||||
elif v0 < v:
|
|
||||||
i1 = i
|
|
||||||
else:
|
|
||||||
i0 = i+1
|
|
||||||
return (i0,i1)
|
|
||||||
|
|
||||||
|
|
||||||
## reorder_hv, reorder_vh
|
## reorder_hv, reorder_vh
|
||||||
## chop_hv, chop_vh
|
## chop_hv, chop_vh
|
||||||
##
|
##
|
||||||
|
@ -387,10 +354,8 @@ class LTTextBox(LayoutContainer):
|
||||||
def fixate(self, direction='H'):
|
def fixate(self, direction='H'):
|
||||||
LayoutContainer.fixate(self, direction=direction)
|
LayoutContainer.fixate(self, direction=direction)
|
||||||
if not direction:
|
if not direction:
|
||||||
for obj in self.objs:
|
if any( obj.is_vertical() for obj in self.objs ):
|
||||||
if obj.is_vertical():
|
direction = 'V'
|
||||||
direction = 'V'
|
|
||||||
break
|
|
||||||
if 2 <= len(self.objs):
|
if 2 <= len(self.objs):
|
||||||
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
||||||
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
|
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
|
||||||
|
|
|
@ -20,12 +20,41 @@ def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
|
||||||
return (a*x+c*y+e, b*x+d*y+f)
|
return (a*x+c*y+e, b*x+d*y+f)
|
||||||
|
|
||||||
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
||||||
'''equiv to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))'''
|
'''Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))'''
|
||||||
return (a*p+c*q, b*p+d*q)
|
return (a*p+c*q, b*p+d*q)
|
||||||
|
|
||||||
|
|
||||||
## Utilities
|
## Utility functions
|
||||||
##
|
##
|
||||||
|
|
||||||
|
# pick
|
||||||
|
def pick(seq, func, maxobj=None):
|
||||||
|
'''Picks the object that has the highest value of func(obj).'''
|
||||||
|
maxscore = None
|
||||||
|
for obj in seq:
|
||||||
|
score = func(obj)
|
||||||
|
if maxscore == None or maxscore < score:
|
||||||
|
(maxscore,maxobj) = (score,obj)
|
||||||
|
return maxobj
|
||||||
|
|
||||||
|
# bsearch
|
||||||
|
def bsearch(objs, v0):
|
||||||
|
'''Tries to find the closest value to v0.'''
|
||||||
|
i0 = 0
|
||||||
|
i1 = len(objs)
|
||||||
|
while i0 < i1:
|
||||||
|
i = (i0+i1)/2
|
||||||
|
(v, obj) = objs[i]
|
||||||
|
if v0 == v:
|
||||||
|
(i0,i1) = (i,i+1)
|
||||||
|
break
|
||||||
|
elif v0 < v:
|
||||||
|
i1 = i
|
||||||
|
else:
|
||||||
|
i0 = i+1
|
||||||
|
return (i0,i1)
|
||||||
|
|
||||||
|
# choplist
|
||||||
def choplist(n, seq):
|
def choplist(n, seq):
|
||||||
'''Groups every n elements of the list.'''
|
'''Groups every n elements of the list.'''
|
||||||
r = []
|
r = []
|
||||||
|
@ -36,6 +65,7 @@ def choplist(n, seq):
|
||||||
r = []
|
r = []
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# nunpack
|
||||||
def nunpack(s, default=0):
|
def nunpack(s, default=0):
|
||||||
'''Unpacks up to 4 bytes big endian.'''
|
'''Unpacks up to 4 bytes big endian.'''
|
||||||
l = len(s)
|
l = len(s)
|
||||||
|
@ -52,6 +82,7 @@ def nunpack(s, default=0):
|
||||||
else:
|
else:
|
||||||
return TypeError('invalid length: %d' % l)
|
return TypeError('invalid length: %d' % l)
|
||||||
|
|
||||||
|
# decode_text
|
||||||
PDFDocEncoding = ''.join( unichr(x) for x in (
|
PDFDocEncoding = ''.join( unichr(x) for x in (
|
||||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||||
|
@ -87,12 +118,14 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
|
||||||
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
||||||
))
|
))
|
||||||
def decode_text(s):
|
def decode_text(s):
|
||||||
|
'''Decodes a PDFDocEncoding string to Unicode.'''
|
||||||
if s.startswith('\xfe\xff'):
|
if s.startswith('\xfe\xff'):
|
||||||
return unicode(s[2:], 'utf-16be', 'ignore')
|
return unicode(s[2:], 'utf-16be', 'ignore')
|
||||||
else:
|
else:
|
||||||
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
||||||
|
|
||||||
# enc(x): encode string in SGML/XML/HTML
|
# enc
|
||||||
def enc(x, codec='ascii'):
|
def enc(x, codec='ascii'):
|
||||||
|
'''Encodes a string for SGML/XML/HTML'''
|
||||||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
||||||
return x.encode(codec, 'xmlcharrefreplace')
|
return x.encode(codec, 'xmlcharrefreplace')
|
||||||
|
|
Loading…
Reference in New Issue