release-20090711
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@118 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
787ae4f814
commit
af63784305
78
README.html
78
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sat Jun 20 19:51:02 JST 2009
|
||||
Last Modified: Sun Jul 12 00:27:23 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -51,8 +51,8 @@ PDF parser that can be used for other purpoes instead of text analysis.
|
|||
<a name="source"></a>
|
||||
<p>
|
||||
<strong>Download:</strong><br>
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz
|
||||
</a>
|
||||
(1.8Mbytes)
|
||||
|
||||
|
@ -191,23 +191,63 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
|
|||
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
||||
</ul>
|
||||
<p>
|
||||
<dt> <code>-T <em>cluster_margin</em></code>
|
||||
<dd>
|
||||
<p>
|
||||
<dt> <code>-M <em>char_margin</em></code>
|
||||
<dt> <code>-L <em>line_margin</em></code>
|
||||
<dt> <code>-W <em>word_margin</em></code>
|
||||
<dd>
|
||||
<dd> These are the parameters used for layout analysis.
|
||||
In an actual PDF file, texts might be split into several chunks
|
||||
in the middle of a run of text, depending on the authoring software.
|
||||
Therefore, text extraction needs to splice text chunks.
|
||||
In the figure below, two text chunks whose distance is closer than
|
||||
the <em>char_margin</em> (shown as <em><font color="red">M</font></em>) are considered
|
||||
continuous and get grouped into one. Also, two lines whose distance is closer than
|
||||
the <em>line_margin</em> (<em><font color="blue">L</font></em>) are grouped
|
||||
as a text box, which is a rectangular area that contains a "cluster" of texts.
|
||||
Furthermore, it may be required to insert blank characters (spaces) as necessary
|
||||
if the distance between two words is greater than the <em>word_margin</em>
|
||||
(<em><font color="green">W</font></em>), as a blank between words might not be
|
||||
represented as a space, but indicated by the positioning of each word.
|
||||
<p>
|
||||
Each value is specified not as an actual length, but as a proportion of
|
||||
the length to the size of each character in question. The default values
|
||||
are M = 1.0, L = 0.3, and W = 0.2, respectively.
|
||||
<table style="border:2px gray solid; margin: 10px; padding: 10px;"><tr>
|
||||
<td style="border-right:1px red solid" align=right>→</td>
|
||||
<td style="border-left:1px red solid" colspan="4" align=left>← <em><font color="red">M</font></em></td>
|
||||
<td></td>
|
||||
</tr><tr>
|
||||
<td style="border:1px solid"><code>Q u i</code></td>
|
||||
<td style="border:1px solid"><code>c k</code></td>
|
||||
<td width="10px"></td>
|
||||
<td style="border:1px solid"><code>b r o w</code></td>
|
||||
<td style="border:1px solid"><code>n f o x</code></td>
|
||||
<td style="border-bottom:1px blue solid" align=right>↓</td>
|
||||
</tr><tr>
|
||||
<td style="border-right:1px green solid" colspan="2" align=right>→</td><td></td>
|
||||
<td style="border-left:1px green solid" colspan="2" align=left>← <em><font color="green">W</font></em></td>
|
||||
<td rowspan="2" valign=center align=center><em><font color="blue">L</font></em></td>
|
||||
</tr><tr height="10px">
|
||||
</tr><tr>
|
||||
<td style="padding:0px;" colspan="5">
|
||||
<table style="border:1px solid"><tr><td><code>j u m p s</code></td><td>...</td></tr></table>
|
||||
</td>
|
||||
<td style="border-top:1px blue solid" align=right>↑</td>
|
||||
</tr></table>
|
||||
<p>
|
||||
<dt> <code>-s <em>scale</em></code>
|
||||
<dd>
|
||||
<dd> Specifies the output scale. Can be used in HTML format only.
|
||||
<p>
|
||||
<dt> <code>-m <em>maxpages</em></code>
|
||||
<dd>
|
||||
<dd> Specifies the maximum number of pages to extract.
|
||||
By default, it extracts all the pages in a document.
|
||||
<p>
|
||||
<dt> <code>-P <em>password</em></code>
|
||||
<dd> Provides the user password to open the PDF file.
|
||||
<dd> Provides the user password to access PDF contents.
|
||||
<p>
|
||||
<dt> <code>-C <em>CMap directory</em></code>
|
||||
<dd>
|
||||
<dd> Specifies the path of CMap directory. CMap is needed when extracting
|
||||
non-ASCII texts (especially in Asian languages). The CMap location can be
|
||||
also specified with <code>CMAP_PATH</code> environment variable.
|
||||
<p>
|
||||
<dt> <code>-d</code>
|
||||
<dd> Increases the debug level.
|
||||
|
@ -242,12 +282,13 @@ Options:
|
|||
By default, it only prints the document trailer (like a header).
|
||||
<p>
|
||||
<dt> <code>-i <em>objno,objno, ...</em></code>
|
||||
<dd>
|
||||
<dd> Specifies PDF object IDs to display.
|
||||
Comma-separated IDs, or multiple <code>-i</code> options are accepted.
|
||||
<p>
|
||||
<dt> <code>-p <em>pageno,pageno, ...</em></code>
|
||||
<dd> Specifies the page number to be extracted.
|
||||
Multiple <code>-p</code> options are allowed.
|
||||
Note that page numbers start from one.
|
||||
Comma-separated page numbers, or multiple <code>-p</code> options are accepted.
|
||||
Note that page numbers start from one, not zero.
|
||||
<p>
|
||||
<dt> <code>-r</code> (raw)
|
||||
<dt> <code>-b</code> (binary)
|
||||
|
@ -263,11 +304,11 @@ similar to <code>repr()</code> manner. When
|
|||
<code>-r</code> or <code>-b</code> option is given,
|
||||
no stream header is displayed for the ease of saving it to a file.
|
||||
<p>
|
||||
<dt> <code>-P <em>password</em></code>
|
||||
<dd> Provides the user password to open the PDF file.
|
||||
<p>
|
||||
<dt> <code>-T</code>
|
||||
<dd>
|
||||
<dd> Shows the table of contents.
|
||||
<p>
|
||||
<dt> <code>-P <em>password</em></code>
|
||||
<dd> Provides the user password to access PDF contents.
|
||||
<p>
|
||||
<dt> <code>-d</code>
|
||||
<dd> Increases the debug level.
|
||||
|
@ -277,6 +318,7 @@ no stream header is displayed for the ease of saving it to a file.
|
|||
<hr noshade>
|
||||
<h2>Changes</h2>
|
||||
<ul>
|
||||
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
|
||||
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
|
||||
<li> 2009/03/30: Text output mode added.
|
||||
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
|
||||
|
|
1
TODO
1
TODO
|
@ -1,4 +1,5 @@
|
|||
TODOs:
|
||||
- Better text extraction / layout analysis.
|
||||
- Better API Documentation.
|
||||
- Robust error handling.
|
||||
- Any special handling for linearized PDFs?
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python
|
||||
__version__ = '20090517'
|
||||
__version__ = '20090711'
|
||||
|
||||
if __name__ == '__main__': print __version__
|
||||
|
|
|
@ -1,42 +1,9 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from pdfminer.utils import apply_matrix_norm
|
||||
from pdfminer.utils import apply_matrix_norm, bsearch
|
||||
INF = sys.maxint
|
||||
|
||||
|
||||
## pick
|
||||
##
|
||||
def pick(seq, func, maxobj=None):
|
||||
maxscore = None
|
||||
for obj in seq:
|
||||
score = func(obj)
|
||||
if maxscore == None or maxscore < score:
|
||||
(maxscore,maxobj) = (score,obj)
|
||||
return maxobj
|
||||
|
||||
|
||||
## bsearch
|
||||
##
|
||||
## Finds objects whose coordinates overlap with [v0,v1].
|
||||
## It performs binary search so that the processing time
|
||||
## should be around O(log n).
|
||||
##
|
||||
def bsearch(objs, v0):
|
||||
i0 = 0
|
||||
i1 = len(objs)
|
||||
while i0 < i1:
|
||||
i = (i0+i1)/2
|
||||
(v, obj) = objs[i]
|
||||
if v0 == v:
|
||||
(i0,i1) = (i,i+1)
|
||||
break
|
||||
elif v0 < v:
|
||||
i1 = i
|
||||
else:
|
||||
i0 = i+1
|
||||
return (i0,i1)
|
||||
|
||||
|
||||
## reorder_hv, reorder_vh
|
||||
## chop_hv, chop_vh
|
||||
##
|
||||
|
@ -387,10 +354,8 @@ class LTTextBox(LayoutContainer):
|
|||
def fixate(self, direction='H'):
|
||||
LayoutContainer.fixate(self, direction=direction)
|
||||
if not direction:
|
||||
for obj in self.objs:
|
||||
if obj.is_vertical():
|
||||
direction = 'V'
|
||||
break
|
||||
if any( obj.is_vertical() for obj in self.objs ):
|
||||
direction = 'V'
|
||||
if 2 <= len(self.objs):
|
||||
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
||||
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
|
||||
|
|
|
@ -20,12 +20,41 @@ def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
|
|||
return (a*x+c*y+e, b*x+d*y+f)
|
||||
|
||||
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
||||
'''equiv to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))'''
|
||||
'''Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))'''
|
||||
return (a*p+c*q, b*p+d*q)
|
||||
|
||||
|
||||
## Utilities
|
||||
## Utility functions
|
||||
##
|
||||
|
||||
# pick
|
||||
def pick(seq, func, maxobj=None):
    '''Picks the object that has the highest value of func(obj).

    seq:    iterable of candidate objects.
    func:   scoring function, called exactly once per candidate.
    maxobj: value returned when seq yields no candidates.

    When several candidates share the highest score, the earliest one
    is returned, because a later candidate only wins on a strict '<'.
    '''
    maxscore = None
    for obj in seq:
        score = func(obj)
        # Identity test on purpose: "no score seen yet" must not be confused
        # with a score object whose __eq__ happens to compare equal to None.
        if maxscore is None or maxscore < score:
            (maxscore, maxobj) = (score, obj)
    return maxobj
|
||||
|
||||
# bsearch
|
||||
def bsearch(objs, v0):
    '''Binary-searches a list of (value, obj) pairs for the value v0.

    objs: list of 2-tuples (value, obj), sorted by value in ascending order.
    v0:   the value to look for.

    Returns a pair of indices (i0, i1):
      * (i, i+1) when objs[i] holds exactly v0 (if v0 occurs several
        times, whichever occurrence the search lands on);
      * (i, i) when v0 is absent, where i is the index at which v0
        would have to be inserted to keep objs sorted.
    '''
    i0 = 0
    i1 = len(objs)
    while i0 < i1:
        # Floor division keeps the midpoint an integer even where '/' is
        # true division (Python 3, or "from __future__ import division").
        i = (i0+i1)//2
        (v, _) = objs[i]
        if v0 == v:
            (i0,i1) = (i,i+1)
            break
        elif v0 < v:
            i1 = i
        else:
            i0 = i+1
    return (i0,i1)
|
||||
|
||||
# choplist
|
||||
def choplist(n, seq):
|
||||
'''Groups every n elements of the list.'''
|
||||
r = []
|
||||
|
@ -36,6 +65,7 @@ def choplist(n, seq):
|
|||
r = []
|
||||
return
|
||||
|
||||
# nunpack
|
||||
def nunpack(s, default=0):
|
||||
'''Unpacks up to 4 bytes big endian.'''
|
||||
l = len(s)
|
||||
|
@ -52,6 +82,7 @@ def nunpack(s, default=0):
|
|||
else:
|
||||
return TypeError('invalid length: %d' % l)
|
||||
|
||||
# decode_text
|
||||
PDFDocEncoding = ''.join( unichr(x) for x in (
|
||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||
|
@ -87,12 +118,14 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
|
|||
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
||||
))
|
||||
def decode_text(s):
    '''Converts a PDF text string to a Unicode string.

    A string starting with the two bytes FE FF is decoded as UTF-16BE
    (undecodable sequences are ignored); any other string is translated
    character by character through the PDFDocEncoding table.
    '''
    if not s.startswith('\xfe\xff'):
        chars = [ PDFDocEncoding[ord(c)] for c in s ]
        return ''.join(chars)
    return unicode(s[2:], 'utf-16be', 'ignore')
|
||||
|
||||
# enc(x): encode string in SGML/XML/HTML
|
||||
# enc
|
||||
def enc(x, codec='ascii'):
    '''Encodes a string for SGML/XML/HTML.

    x:     the text to escape.
    codec: name of the output encoding (default 'ascii').

    The four markup-significant characters & > < " are replaced with
    their named entities, then the text is encoded with the given codec.
    Characters the codec cannot represent become numeric character
    references (&#NNNN;) instead of raising an error.
    '''
    # '&' has to be escaped first; otherwise the ampersands introduced by
    # the other three replacements would themselves be escaped again.
    x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
    return x.encode(codec, 'xmlcharrefreplace')
|
||||
|
|
Loading…
Reference in New Issue