diff --git a/docs/index.html b/docs/index.html index d0ed14f..63f92f8 100644 --- a/docs/index.html +++ b/docs/index.html @@ -19,7 +19,7 @@ Python PDF parser and analyzer
-Last Modified: Sun Nov 29 16:20:36 JST 2009 +Last Modified: Sun Dec 20 00:09:12 JST 2009
@@ -128,28 +128,20 @@ W o r l d

For non-ASCII languages

In order to handle non-ASCII languages (e.g. Japanese), you need to install additional data called CMap, -which is distributed from Adobe. +which was originally distributed by Adobe. CMap is now included +in the pdfminer package, but is not installed by default.

-Here is how: +Here is the additional step you need: +

+# make cmap
+python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt cp950 big5
+reading 'cmaprsrc/cid2code_Adobe_CNS1.txt'...
+writing 'CNS1_H.py'...
+...
+(this may take several minutes)
 
-
    -
  1. Get CMap files from - -http://www.unixuser.org/~euske/pub/CMap.tar.bz2 - -
  2. Expand the archive and put the CMap directory under the directory -where pdfminer is installed. -(Normally this should be something like /usr/lib/python2.5/site-packages/pdfminer.) -For example: -
    -$ cd /usr/lib/python2.5/site-packages/pdfminer
    -$ tar jxf CMap.tar.bz2
    +# python setup.py install
     
    -
  3. Do the following. (this is optional and may take several minutes, but highly recommended!)
    -
    -$ python -m pdfminer.cmap
    -
    -

@@ -272,11 +264,6 @@ By default, it extracts all the pages in a document.
-P password
Provides the user password to access PDF contents.

-

-C CMap directory -
Specifies the path of CMap directory. CMap is needed when extracting -non-ASCII texts (especially in Asian languages). The CMap location can be -also specified with CMAP_PATH environment variable. -

-d
Increases the debug level. @@ -357,6 +344,7 @@ no stream header is displayed for the ease of saving it to a file.

Changes

    +
  • 2009/12/19: CMap resources are now part of the package. Thanks to Adobe for open-sourcing them.
  • 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
  • 2009/10/31: SGML output format is changed and renamed as XML.
  • 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation. diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index 37887d6..c97acc5 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -1,4 +1,4 @@ #!/usr/bin/env python -__version__ = '20091129' +__version__ = '20091219' if __name__ == '__main__': print __version__ diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi index 68de31c..47fc743 100755 --- a/tools/pdf2html.cgi +++ b/tools/pdf2html.cgi @@ -22,7 +22,6 @@ import os, os.path, re, cgi, time, random, codecs, logging, traceback from pdfminer.pdfinterp import PDFResourceManager, process_pdf from pdfminer.converter import HTMLConverter, TextConverter from pdfminer.layout import LAParams -from pdfminer.cmap import CMapDB # quote HTML metacharacters @@ -58,7 +57,6 @@ def convert(outfp, infp, path, codec='utf-8', maxpages=10, infp.close() # perform conversion and # send the results over the network. - CMapDB.initialize() rsrc = PDFResourceManager() laparams = LAParams() if html: