fix typos (patches by sm)
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@183 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
aad921b382
commit
2555b38836
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Feb 7 12:13:27 JST 2010
|
Last Modified: Mon Feb 15 14:41:49 UTC 2010
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ Last Modified: Sun Feb 7 12:13:27 JST 2010
|
||||||
<li> <a href="#intro">What's It?</a>
|
<li> <a href="#intro">What's It?</a>
|
||||||
<li> <a href="#source">Download</a>
|
<li> <a href="#source">Download</a>
|
||||||
<li> <a href="#install">Install</a>
|
<li> <a href="#install">Install</a>
|
||||||
<small>(<a href="#cmap">for East Asian languages</a>)</small>
|
<small>(<a href="#cmap">for CJK languages</a>)</small>
|
||||||
<li> <a href="#usage">How to Use</a>
|
<li> <a href="#usage">How to Use</a>
|
||||||
<small>(<a href="#pdf2txt">pdf2txt.py</a>, <a href="#dumppdf">dumppdf.py</a>)</small>
|
<small>(<a href="#pdf2txt">pdf2txt.py</a>, <a href="#dumppdf">dumppdf.py</a>)</small>
|
||||||
<li> <a href="#todos">TODOs</a>
|
<li> <a href="#todos">TODOs</a>
|
||||||
|
@ -54,7 +54,7 @@ PDF parser that can be used for other purposes instead of text analysis.
|
||||||
<ul>
|
<ul>
|
||||||
<li> Written entirely in Python. (for version 2.4 or newer)
|
<li> Written entirely in Python. (for version 2.4 or newer)
|
||||||
<li> PDF-1.7 specification support. (well, almost)
|
<li> PDF-1.7 specification support. (well, almost)
|
||||||
<li> East Asian languages and vertical writing scripts support.
|
<li> CJK languages and vertical writing scripts support.
|
||||||
<li> Various font types (Type1, TrueType, Type3, and CID) support.
|
<li> Various font types (Type1, TrueType, Type3, and CID) support.
|
||||||
<li> Basic encryption (RC4) support.
|
<li> Basic encryption (RC4) support.
|
||||||
<li> PDF to HTML conversion (with a sample converter web app).
|
<li> PDF to HTML conversion (with a sample converter web app).
|
||||||
|
@ -94,6 +94,7 @@ http://pdf2html.tabesugi.net:8080/
|
||||||
|
|
||||||
<ol>
|
<ol>
|
||||||
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
|
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
|
||||||
|
(<font color=red><strong>Python 3 is not supported.</strong></font>)
|
||||||
<li> Download the <a href="#source">PDFMiner source</a>.
|
<li> Download the <a href="#source">PDFMiner source</a>.
|
||||||
<li> Unpack it.
|
<li> Unpack it.
|
||||||
<li> Run <code>setup.py</code> to install:<br>
|
<li> Run <code>setup.py</code> to install:<br>
|
||||||
|
@ -125,8 +126,8 @@ W o r l d
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
<a name="cmap"></a>
|
<a name="cmap"></a>
|
||||||
<h3>For East Asian languages</h3>
|
<h3>For CJK languages</h3>
|
||||||
In order to handle East Asian languages (Chinese or Japanese, etc.),
|
In order to handle CJK languages,
|
||||||
an additional data called <code>CMap</code> is required.
|
an additional data called <code>CMap</code> is required.
|
||||||
CMap files are not installed by default.
|
CMap files are not installed by default.
|
||||||
<p>
|
<p>
|
||||||
|
@ -347,6 +348,8 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2010/02/15: Bugfixes. Thanks to Sean.
|
||||||
|
<li> 2010/02/13: Bugfix and enhancement. Thanks to André Auzi.
|
||||||
<li> 2010/02/07: Several bugfixes. Thanks to Hiroshi Manabe.
|
<li> 2010/02/07: Several bugfixes. Thanks to Hiroshi Manabe.
|
||||||
<li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed.
|
<li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed.
|
||||||
<li> 2010/01/04: Python 2.6 warning removal. More doctest conversion.
|
<li> 2010/01/04: Python 2.6 warning removal. More doctest conversion.
|
||||||
|
|
|
@ -53,7 +53,7 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
def render_image(self, name, stream):
|
def render_image(self, name, stream):
|
||||||
assert isinstance(self.cur_item, LTFigure)
|
assert isinstance(self.cur_item, LTFigure)
|
||||||
ismask = stream.get_any(('IM', 'ImageMask'))
|
ismask = stream.get_any(('IM', 'ImageMask'))
|
||||||
bits = stream.get_any(('BPC', 'BitsPerCompoment'), 1)
|
bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
|
||||||
csp = stream.get_any(('CS', 'ColorSpace'))
|
csp = stream.get_any(('CS', 'ColorSpace'))
|
||||||
if not isinstance(csp, list):
|
if not isinstance(csp, list):
|
||||||
csp = [csp]
|
csp = [csp]
|
||||||
|
|
|
@ -294,7 +294,7 @@ class LTImage(LayoutItem):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
(w,h) = self.srcsize
|
(w,h) = self.srcsize
|
||||||
return '<image %s %s %dx%d>' % (self.id, self.type, w, h)
|
return '<image %s %s %dx%d>' % (self.name, self.type, w, h)
|
||||||
|
|
||||||
def get_weight(self):
|
def get_weight(self):
|
||||||
return 0
|
return 0
|
||||||
|
|
|
@ -188,7 +188,7 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
|
|
||||||
def get_objids(self):
|
def get_objids(self):
|
||||||
for objid_range in self.objid_ranges:
|
for objid_range in self.objid_ranges:
|
||||||
for x in xrange(objid_range.get_start_id(), objid <= objid_range.get_end_id()+1):
|
for x in xrange(objid_range.get_start_id(), objid_range.get_end_id()+1):
|
||||||
yield x
|
yield x
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -165,7 +165,12 @@ class PDFStream(PDFObject):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
if self.data is None:
|
||||||
|
assert self.rawdata is not None
|
||||||
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
|
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
|
||||||
|
else:
|
||||||
|
assert self.data is not None
|
||||||
|
return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.attrs)
|
||||||
|
|
||||||
def __contains__(self, name):
|
def __contains__(self, name):
|
||||||
return name in self.attrs
|
return name in self.attrs
|
||||||
|
@ -203,7 +208,8 @@ class PDFStream(PDFObject):
|
||||||
data = self.decipher(self.objid, self.genno, data)
|
data = self.decipher(self.objid, self.genno, data)
|
||||||
filters = self.get_any(('F', 'Filter'))
|
filters = self.get_any(('F', 'Filter'))
|
||||||
if not filters:
|
if not filters:
|
||||||
self.rawdata = self.data = data
|
self.data = data
|
||||||
|
self.rawdata = None
|
||||||
return
|
return
|
||||||
if not isinstance(filters, list):
|
if not isinstance(filters, list):
|
||||||
filters = [ filters ]
|
filters = [ filters ]
|
||||||
|
|
|
@ -116,6 +116,13 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||||
if isinstance(dest, dict):
|
if isinstance(dest, dict):
|
||||||
dest = dest['D']
|
dest = dest['D']
|
||||||
pageno = pages[dest[0].objid]
|
pageno = pages[dest[0].objid]
|
||||||
|
elif a:
|
||||||
|
action = a.resolve()
|
||||||
|
if isinstance(action, dict):
|
||||||
|
subtype = action.get('S')
|
||||||
|
if subtype and repr(subtype) == '/GoTo' and action.get('D'):
|
||||||
|
dest = action['D']
|
||||||
|
pageno = pages[dest[0].objid]
|
||||||
outfp.write(repr((level,title,dest,pageno))+'\n')
|
outfp.write(repr((level,title,dest,pageno))+'\n')
|
||||||
parser.close()
|
parser.close()
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
Loading…
Reference in New Issue