for release 20091219
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@164 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
c07bef376d
commit
fb05e4b990
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Nov 29 16:20:36 JST 2009
|
Last Modified: Sun Dec 20 00:09:12 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -128,28 +128,20 @@ W o r l d
|
||||||
<h3>For non-ASCII languages</h3>
|
<h3>For non-ASCII languages</h3>
|
||||||
In order to handle non-ASCII languages (e.g. Japanese),
|
In order to handle non-ASCII languages (e.g. Japanese),
|
||||||
you need to install an additional data called <code>CMap</code>,
|
you need to install an additional data called <code>CMap</code>,
|
||||||
which is distributed from Adobe.
|
which was originally distributed by Adobe. CMap is now included
|
||||||
|
in the pdfminer package, but not installed by default.
|
||||||
<p>
|
<p>
|
||||||
Here is how:
|
Here is the additional step you need:
|
||||||
|
<blockquote><pre>
|
||||||
|
# <strong>make cmap</strong>
|
||||||
|
python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt cp950 big5
|
||||||
|
reading 'cmaprsrc/cid2code_Adobe_CNS1.txt'...
|
||||||
|
writing 'CNS1_H.py'...
|
||||||
|
...
|
||||||
|
<em>(this may take several minutes)</em>
|
||||||
|
|
||||||
<ol>
|
# <strong>python setup.py install</strong>
|
||||||
<li> Get CMap files from
|
|
||||||
<a href="http://www.unixuser.org/~euske/pub/CMap.tar.bz2">
|
|
||||||
http://www.unixuser.org/~euske/pub/CMap.tar.bz2
|
|
||||||
</a>
|
|
||||||
<li> Expand the archive and put the <code>CMap</code> directory under the directory
|
|
||||||
where <code>pdfminer</code> is installed.
|
|
||||||
(Normally this should be something like <code>/usr/lib/python2.5/site-packages/pdfminer</code>.)
|
|
||||||
For example:
|
|
||||||
<blockquote><pre>
|
|
||||||
$ <strong>cd /usr/lib/python2.5/site-packages/pdfminer</strong>
|
|
||||||
$ <strong>tar jxf CMap.tar.bz2</strong>
|
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
<li> Do the following. (This is optional and may take several minutes, but is highly recommended!)<br>
|
|
||||||
<blockquote><pre>
|
|
||||||
$ <strong>python -m pdfminer.cmap</strong>
|
|
||||||
</pre></blockquote>
|
|
||||||
</ol>
|
|
||||||
|
|
||||||
<a name="usage"></a>
|
<a name="usage"></a>
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
|
@ -272,11 +264,6 @@ By default, it extracts all the pages in a document.
|
||||||
<dt> <code>-P <em>password</em></code>
|
<dt> <code>-P <em>password</em></code>
|
||||||
<dd> Provides the user password to access PDF contents.
|
<dd> Provides the user password to access PDF contents.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-C <em>CMap directory</em></code>
|
|
||||||
<dd> Specifies the path of CMap directory. CMap is needed when extracting
|
|
||||||
non-ASCII texts (especially in Asian languages). The CMap location can
|
|
||||||
also be specified with the <code>CMAP_PATH</code> environment variable.
|
|
||||||
<p>
|
|
||||||
<dt> <code>-d</code>
|
<dt> <code>-d</code>
|
||||||
<dd> Increases the debug level.
|
<dd> Increases the debug level.
|
||||||
</dl>
|
</dl>
|
||||||
|
@ -357,6 +344,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2009/12/19: CMap resources are now part of the package. Thanks to Adobe for open-sourcing them.
|
||||||
<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
|
<li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
|
||||||
<li> 2009/10/31: SGML output format is changed and renamed as XML.
|
<li> 2009/10/31: SGML output format is changed and renamed as XML.
|
||||||
<li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
|
<li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
__version__ = '20091129'
|
__version__ = '20091219'
|
||||||
|
|
||||||
if __name__ == '__main__': print __version__
|
if __name__ == '__main__': print __version__
|
||||||
|
|
|
@ -22,7 +22,6 @@ import os, os.path, re, cgi, time, random, codecs, logging, traceback
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
|
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
|
||||||
from pdfminer.converter import HTMLConverter, TextConverter
|
from pdfminer.converter import HTMLConverter, TextConverter
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
from pdfminer.cmap import CMapDB
|
|
||||||
|
|
||||||
|
|
||||||
# quote HTML metacharacters
|
# quote HTML metacharacters
|
||||||
|
@ -58,7 +57,6 @@ def convert(outfp, infp, path, codec='utf-8', maxpages=10,
|
||||||
infp.close()
|
infp.close()
|
||||||
# perform conversion and
|
# perform conversion and
|
||||||
# send the results over the network.
|
# send the results over the network.
|
||||||
CMapDB.initialize()
|
|
||||||
rsrc = PDFResourceManager()
|
rsrc = PDFResourceManager()
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
if html:
|
if html:
|
||||||
|
|
Loading…
Reference in New Issue