git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@129 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
16ddc94c77
commit
b8c6cb8367
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Mon Aug 24 15:56:08 JST 2009
|
Last Modified: Thu Aug 27 00:20:29 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -101,6 +101,7 @@ World
|
||||||
</ol>
|
</ol>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
|
<a name="cmap"></a>
|
||||||
<h3>For non-ASCII languages</h3>
|
<h3>For non-ASCII languages</h3>
|
||||||
In order to handle non-ASCII languages (e.g. Japanese),
|
In order to handle non-ASCII languages (e.g. Japanese),
|
||||||
you need to install an additional data called <code>CMap</code>,
|
you need to install an additional data called <code>CMap</code>,
|
||||||
|
@ -115,17 +116,16 @@ http://www.unixuser.org/~euske/pub/CMap.tar.bz2
|
||||||
</a>
|
</a>
|
||||||
<li> Expand the archive and put the <code>CMap</code> directory under the directory
|
<li> Expand the archive and put the <code>CMap</code> directory under the directory
|
||||||
where <code>pdfminer</code> is installed.
|
where <code>pdfminer</code> is installed.
|
||||||
(Normally this should be something like <code>/usr/lib/python2.5/site-packages</code>.)
|
(Normally this should be something like <code>/usr/lib/python2.5/site-packages/pdfminer</code>.)
|
||||||
For example:
|
For example:
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
$ <strong>cd /usr/lib/python2.5/site-packages</strong>
|
$ <strong>cd /usr/lib/python2.5/site-packages/pdfminer</strong>
|
||||||
$ <strong>tar jxf CMap.tar.bz2</strong>
|
$ <strong>tar jxf CMap.tar.bz2</strong>
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
<li> Do the following. (this is optional, but highly recommended)<br>
|
<li> Do the following. (this is optional and may take several minutes, but highly recommended!)<br>
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
$ <strong>python -m pdfminer.cmap</strong>
|
$ <strong>python -m pdfminer.cmap</strong>
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
This may take several minutes.
|
|
||||||
</ol>
|
</ol>
|
||||||
|
|
||||||
<a name="usage"></a>
|
<a name="usage"></a>
|
||||||
|
@ -319,6 +319,8 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2009/08/27: Fixed page rotation handling.
|
||||||
|
<li> 2009/08/26: Fixed zlib decoding bug. Thanks to Shon Urbas.
|
||||||
<li> 2009/08/24: Fixed a bug in character placing. Thanks to Pawan Jain.
|
<li> 2009/08/24: Fixed a bug in character placing. Thanks to Pawan Jain.
|
||||||
<li> 2009/07/21: Improvement in layout analysis.
|
<li> 2009/07/21: Improvement in layout analysis.
|
||||||
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
|
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
|
||||||
|
|
|
@ -33,7 +33,7 @@ class TagExtractor(PDFDevice):
|
||||||
self.outfp.write(enc(text, self.codec))
|
self.outfp.write(enc(text, self.codec))
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page):
|
def begin_page(self, page, ctm):
|
||||||
(x0, y0, x1, y1) = page.mediabox
|
(x0, y0, x1, y1) = page.mediabox
|
||||||
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
||||||
|
@ -77,8 +77,12 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
self.stack = []
|
self.stack = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page):
|
def begin_page(self, page, ctm):
|
||||||
self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
|
(x0,y0,x1,y1) = page.mediabox
|
||||||
|
(x0,y0) = apply_matrix_pt(ctm, (x0,y0))
|
||||||
|
(x1,y1) = apply_matrix_pt(ctm, (x1,y1))
|
||||||
|
mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
|
||||||
|
self.cur_item = LTPage(self.pageno, mediabox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, _):
|
def end_page(self, _):
|
||||||
|
@ -225,10 +229,11 @@ class HTMLConverter(PDFConverter):
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
|
self.yoffset += item.y1
|
||||||
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
||||||
((self.yoffset-page.y1)*self.scale))
|
((self.yoffset-item.y1)*self.scale))
|
||||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
|
@ -256,7 +261,6 @@ class HTMLConverter(PDFConverter):
|
||||||
render(child)
|
render(child)
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
self.yoffset += page.y1
|
|
||||||
render(page)
|
render(page)
|
||||||
self.yoffset += self.pagepad
|
self.yoffset += self.pagepad
|
||||||
return
|
return
|
||||||
|
|
|
@ -32,7 +32,7 @@ class PDFDevice(object):
|
||||||
def do_tag(self, tag, props=None):
|
def do_tag(self, tag, props=None):
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page):
|
def begin_page(self, page, ctm):
|
||||||
return
|
return
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
return
|
return
|
||||||
|
|
|
@ -690,8 +690,17 @@ class PDFPageInterpreter(object):
|
||||||
def process_page(self, page):
|
def process_page(self, page):
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing page: %r' % page
|
print >>stderr, 'Processing page: %r' % page
|
||||||
self.device.begin_page(page)
|
(x0,y0,x1,y1) = page.mediabox
|
||||||
self.render_contents(page.resources, page.contents)
|
if page.rotate == 90:
|
||||||
|
ctm = (0,-1,1,0, -y0,x1)
|
||||||
|
elif page.rotate == 180:
|
||||||
|
ctm = (-1,0,0,-1, x1,y1)
|
||||||
|
elif page.rotate == 270:
|
||||||
|
ctm = (0,1,-1,0, x0,-y1)
|
||||||
|
else:
|
||||||
|
ctm = (1,0,0,1, -x0,-y0)
|
||||||
|
self.device.begin_page(page, ctm)
|
||||||
|
self.render_contents(page.resources, page.contents, ctm=ctm)
|
||||||
self.device.end_page(page)
|
self.device.end_page(page)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue