diff --git a/docs/index.html b/docs/index.html index d0ed14f..63f92f8 100644 --- a/docs/index.html +++ b/docs/index.html @@ -19,7 +19,7 @@ Python PDF parser and analyzer
CMap
,
-which is distributed from Adobe.
+which was originally distributed by Adobe. CMap is now included
+in the pdfminer package, but is not installed by default.
-Here is how: +Here is the additional step you need: +
+# make cmap +python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt cp950 big5 +reading 'cmaprsrc/cid2code_Adobe_CNS1.txt'... +writing 'CNS1_H.py'... +... +(this may take several minutes) --
- Get CMap files from - -http://www.unixuser.org/~euske/pub/CMap.tar.bz2 - -
- Expand the archive and put the
CMap
directory under the directory -where pdfminer
is installed. -(Normally this should be something like /usr/lib/python2.5/site-packages/pdfminer
.) -For example: ---$ cd /usr/lib/python2.5/site-packages/pdfminer -$ tar jxf CMap.tar.bz2 +# python setup.py install- Do the following. (this is optional and may take several minutes, but highly recommended!)
---$ python -m pdfminer.cmap -
@@ -272,11 +264,6 @@ By default, it extracts all the pages in a document.-P password
Provides the user password to access PDF contents. -
-C CMap directory
-Specifies the path of CMap directory. CMap is needed when extracting -non-ASCII texts (especially in Asian languages). The CMap location can be -also specified with CMAP_PATH
environment variable. -
-d
Increases the debug level. @@ -357,6 +344,7 @@ no stream header is displayed for the ease of saving it to a file.
Changes
+
- 2009/12/19: CMap resources are now part of the package. Thanks to Adobe for open-sourcing them.
- 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
- 2009/10/31: SGML output format is changed and renamed as XML.
- 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation. diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index 37887d6..c97acc5 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -1,4 +1,4 @@ #!/usr/bin/env python -__version__ = '20091129' +__version__ = '20091219' if __name__ == '__main__': print __version__ diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi index 68de31c..47fc743 100755 --- a/tools/pdf2html.cgi +++ b/tools/pdf2html.cgi @@ -22,7 +22,6 @@ import os, os.path, re, cgi, time, random, codecs, logging, traceback from pdfminer.pdfinterp import PDFResourceManager, process_pdf from pdfminer.converter import HTMLConverter, TextConverter from pdfminer.layout import LAParams -from pdfminer.cmap import CMapDB # quote HTML metacharacters @@ -58,7 +57,6 @@ def convert(outfp, infp, path, codec='utf-8', maxpages=10, infp.close() # perform conversion and # send the results over the network. - CMapDB.initialize() rsrc = PDFResourceManager() laparams = LAParams() if html: