include cmap

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@162 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-12-19 14:17:00 +00:00
parent ed8a5362b9
commit e4b089e327
20 changed files with 92216 additions and 362 deletions

View File

@ -6,7 +6,8 @@ pdfminer/Makefile
pdfminer/__init__.py pdfminer/__init__.py
pdfminer/arcfour.py pdfminer/arcfour.py
pdfminer/ascii85.py pdfminer/ascii85.py
pdfminer/cmap.py pdfminer/cmapdb.py
pdfminer/encodingdb.py
pdfminer/converter.py pdfminer/converter.py
pdfminer/fontmetrics.py pdfminer/fontmetrics.py
pdfminer/glyphlist.py pdfminer/glyphlist.py
@ -24,6 +25,8 @@ pdfminer/psparser.py
pdfminer/pycdb.py pdfminer/pycdb.py
pdfminer/rijndael.py pdfminer/rijndael.py
pdfminer/utils.py pdfminer/utils.py
pdfminer/cmap/Makefile
pdfminer/cmap/__init__.py
tools/Makefile tools/Makefile
tools/dumppdf.py tools/dumppdf.py
tools/pdf2txt.py tools/pdf2txt.py
@ -40,3 +43,4 @@ samples/i1040nr.pdf
samples/kampo.pdf samples/kampo.pdf
samples/naacl06-shinyama.pdf samples/naacl06-shinyama.pdf
samples/nlp2004slides.pdf samples/nlp2004slides.pdf
cmaprsrc/README.txt

View File

@ -36,3 +36,15 @@ register: clean
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE) WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish: publish:
$(CP) docs/*.html $(WEBDIR) $(CP) docs/*.html $(WEBDIR)
# CMap generation.
#   CONV_CMAP : converter script, invoked as
#               conv_cmap.py output_dir regname cid2code.txt codecs ...
#   CMAPDIR   : directory that receives the generated Python modules
#   CMAPRSRC  : directory holding Adobe's cid2code_*.txt tables
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPDIR=pdfminer/cmap
CMAPRSRC=cmaprsrc

# Neither target is a file produced by its recipe, so both are phony;
# without this a stray file named "cmap" would silently disable the rule.
.PHONY: cmap cmap_clean

# One converter run per Adobe character collection; the trailing words
# are the codec names handed to conv_cmap.py.
cmap: cmaprsrc
	$(CONV_CMAP) $(CMAPDIR) Adobe-CNS1 $(CMAPRSRC)/cid2code_Adobe_CNS1.txt cp950 big5
	$(CONV_CMAP) $(CMAPDIR) Adobe-GB1 $(CMAPRSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
	$(CONV_CMAP) $(CMAPDIR) Adobe-Japan1 $(CMAPRSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
	$(CONV_CMAP) $(CMAPDIR) Adobe-Korea1 $(CMAPRSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr

# Delegate to the sub-Makefile; $(MAKE) (not a literal "make") keeps
# -n, -j and command-line variables working across the recursion.
cmap_clean:
	cd $(CMAPDIR) && $(MAKE) cmap_clean

2
TODO
View File

@ -4,4 +4,4 @@ TODOs:
- Better API Documentation. - Better API Documentation.
- Robust error handling. - Robust error handling.
- Any special handling for linearized PDFs? - Any special handling for linearized PDFs?
- Handle crypt filter. (I need more samples!) - Handle crypt filter. (More sample documents are needed!)

60
cmaprsrc/README.txt Normal file
View File

@ -0,0 +1,60 @@
README.txt for cmaprsrc
This directory contains Adobe CMap resources. CMaps are required
to decode text data written in Chinese, Japanese or Korean.
CMap resources are now freely available from the Adobe web site:
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
The following files were extracted from the downloadable tarballs:
cid2code_Adobe_CNS1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z
cid2code_Adobe_GB1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z
cid2code_Adobe_Japan1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z
cid2code_Adobe_Korea1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z
Here is the license information in the original files:
%%Copyright: -----------------------------------------------------------
%%Copyright: Copyright 1990-20xx Adobe Systems Incorporated.
%%Copyright: All rights reserved.
%%Copyright:
%%Copyright: Redistribution and use in source and binary forms, with or
%%Copyright: without modification, are permitted provided that the
%%Copyright: following conditions are met:
%%Copyright:
%%Copyright: Redistributions of source code must retain the above
%%Copyright: copyright notice, this list of conditions and the following
%%Copyright: disclaimer.
%%Copyright:
%%Copyright: Redistributions in binary form must reproduce the above
%%Copyright: copyright notice, this list of conditions and the following
%%Copyright: disclaimer in the documentation and/or other materials
%%Copyright: provided with the distribution.
%%Copyright:
%%Copyright: Neither the name of Adobe Systems Incorporated nor the names
%%Copyright: of its contributors may be used to endorse or promote
%%Copyright: products derived from this software without specific prior
%%Copyright: written permission.
%%Copyright:
%%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
%%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
%%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
%%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
%%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
%%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
%%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
%%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
%%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
%%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
%%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
%%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
%%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%%Copyright: -----------------------------------------------------------

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -6,6 +6,7 @@ all:
clean: clean:
-rm *.pyc *.pyo -rm *.pyc *.pyo
cd cmap && make clean
check: check:
$(PYCHECKER) *.py $(PYCHECKER) *.py

10
pdfminer/cmap/Makefile Normal file
View File

@ -0,0 +1,10 @@
# Makefile for pdfminer.cmap

# All targets are commands, not files.
.PHONY: all clean cmap_clean

all:

# Remove compiled bytecode. $(RM) is "rm -f", so an unmatched glob is
# not reported as an error.
clean:
	$(RM) *.pyc *.pyo

# Remove every *.py file in this directory. The glob also matches
# __init__.py, so an empty one is recreated afterwards to keep the
# directory importable as a package.
cmap_clean:
	$(RM) *.py
	touch __init__.py

View File

View File

@ -20,240 +20,229 @@ from psparser import PSStackParser
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
from psparser import PSLiteral, PSKeyword from psparser import PSLiteral, PSKeyword
from psparser import literal_name, keyword_name from psparser import literal_name, keyword_name
from fontmetrics import FONT_METRICS from encodingdb import name2unicode
from latin_enc import ENCODING
from glyphlist import charname2unicode
from utils import choplist, nunpack from utils import choplist, nunpack
try:
import cdb
except ImportError:
import pdfminer.pycdb as cdb
class CMapError(Exception): pass class CMapError(Exception): pass
## find_cmap_path
##
def find_cmap_path():
"""Returns the location of CMap directory."""
for path in (os.environ.get('CMAP_PATH', '.'),
os.path.join(os.path.dirname(__file__), 'CMap')):
if os.path.isdir(path):
return path
raise IOError
## name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
"""Converts Adobe glyph names to Unicode numbers."""
if name in charname2unicode:
return charname2unicode[name]
m = STRIP_NAME.search(name)
if not m: raise KeyError(name)
return int(m.group(0))
## CMap ## CMap
## ##
class CMap(object): class CMap(object):
debug = 0 debug = 0
def __init__(self, code2cid=None):
self.code2cid = code2cid or {}
return
def is_vertical(self):
return False
def use_cmap(self, cmap):
assert isinstance(cmap, CMap)
def copy(dst, src):
for (k,v) in src.iteritems():
if isinstance(v, dict):
d = {}
dst[k] = d
copy(d, v)
else:
dst[k] = v
copy(self.code2cid, cmap.code2cid)
return
def decode(self, code):
if self.debug:
print >>sys.stderr, 'decode: %r, %r' % (self, code)
d = self.code2cid
for c in code:
c = ord(c)
if c in d:
d = d[c]
if isinstance(d, int):
yield d
d = self.code2cid
else:
d = self.code2cid
return
## IdentityCMap
##
class IdentityCMap(object):

    """CMap whose CIDs are the big-endian 16-bit values of the input itself.

    Used for the 'Identity-H' / 'Identity-V' encodings; `vertical` records
    which of the two was requested.
    """

    def __init__(self, vertical):
        # Stored as given and returned unchanged by is_vertical().
        self.vertical = vertical
        return

    def is_vertical(self):
        """Returns the `vertical` flag passed to the constructor."""
        return self.vertical

    def decode(self, code):
        """Splits `code` into unsigned big-endian 16-bit integers.

        Returns a tuple with one integer per complete 2-byte unit.
        A trailing odd byte cannot form a unit and is ignored; passing
        it to unpack() would raise struct.error because the format
        length and the data length must match exactly.
        """
        n = len(code) // 2
        return unpack('>%dH' % n, code[:n*2])
## UnicodeMap
##
class UnicodeMap(object):

    """Thin wrapper around a {cid: value} dictionary.

    NOTE(review): the stored values look like Unicode code points
    (integers) judging by the callers - confirm before relying on it.
    """

    # Set to a true value to trace every lookup on stderr.
    debug = 0

    def __init__(self, cid2unicode=None):
        # Any false argument (None, or an empty mapping) is replaced by a
        # fresh dictionary; a non-empty mapping is kept by reference.
        if not cid2unicode:
            cid2unicode = {}
        self.cid2unicode = cid2unicode
        return

    def get_unicode(self, cid):
        """Returns the entry stored for `cid`, or None when there is none."""
        if self.debug:
            print >>sys.stderr, 'get_unicode: %r, %r' % (self, cid)
        found = self.cid2unicode.get(cid)
        return found
## FileCMap
##
class FileCMap(CMap):
def __init__(self): def __init__(self):
self.code2cid = {} CMap.__init__(self)
self.cid2code = {}
self.attrs = {} self.attrs = {}
return return
def __repr__(self): def __repr__(self):
return '<CMap: %s>' % self.attrs.get('CMapName') return '<CMap: %s>' % self.attrs.get('CMapName')
def update(self, code2cid=None, cid2code=None):
if code2cid:
self.code2cid.update(code2cid)
if cid2code:
self.cid2code.update(cid2code)
return self
def copycmap(self, cmap):
self.code2cid.update(cmap.getall_code2cid())
self.cid2code.update(cmap.getall_cid2code())
return self
def register_code2cid(self, code, cid):
if isinstance(code, str) and isinstance(cid, int):
self.code2cid[code] = cid
return self
def register_cid2code(self, cid, code):
if isinstance(cid, int):
if isinstance(code, PSLiteral):
self.cid2code[cid] = pack('>H', name2unicode(code.name))
elif isinstance(code, str):
self.cid2code[cid] = code
return self
def decode(self, bytes):
if self.debug:
print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
x = ''
for c in bytes:
if x:
if x+c in self.code2cid:
yield self.code2cid[x+c]
x = ''
elif c in self.code2cid:
yield self.code2cid[c]
else:
x = c
return
def is_vertical(self): def is_vertical(self):
return self.attrs.get('WMode', 0) return self.attrs.get('WMode', 0)
def tocid(self, code): def set_attr(self, k, v):
return self.code2cid.get(code) self.attrs[k] = v
def tocode(self, cid): return
return self.cid2code.get(cid)
def getall_attrs(self): def add_code2cid(self, code, cid):
return self.attrs.iteritems() assert isinstance(code, str) and isinstance(cid, int)
def getall_code2cid(self): d = self.code2cid
return self.code2cid.iteritems() for c in code[:-1]:
def getall_cid2code(self): c = ord(c)
return self.cid2code.iteritems() if c in d:
d = d[c]
else:
t = {}
d[c] = t
d =t
c = ord(code[-1])
d[c] = cid
return
## CDBCMap ## FileUnicodeMap
## ##
class CDBCMap(CMap): class FileUnicodeMap(UnicodeMap):
def __init__(self, cdbname): def __init__(self):
CMap.__init__(self) UnicodeMap.__init__(self)
self.cdbname = cdbname self.attrs = {}
self.db = cdb.init(cdbname)
return return
def __repr__(self): def __repr__(self):
return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname) return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
def tocid(self, code): def set_attr(self, k, v):
k = 'c'+code self.attrs[k] = v
if not self.db.has_key(k): return
return None
return unpack('>L', self.db[k]) def add_cid2unicode(self, cid, code):
def tocode(self, cid): assert isinstance(cid, int)
k = 'i'+pack('>L', cid) if isinstance(code, PSLiteral):
if not self.db.has_key(k): # Interpret as an Adobe glyph name.
return None self.cid2unicode[cid] = name2unicode(code.name)
return self.db[k] elif isinstance(code, str):
# Interpret as UTF-16BE.
self.cid2unicode[cid] = unpack('>H', code)[0]
elif isinstance(code, int):
self.cid2unicode[cid] = code
else:
raise TypeError(code)
return
## PyCMap
##
class PyCMap(CMap):
def __init__(self, name, module):
CMap.__init__(self, module.CODE2CID)
self.name = name
self._is_vertical = module.IS_VERTICAL
return
def __repr__(self):
return '<PyCMap: %s>' % (self.name)
def is_vertical(self): def is_vertical(self):
return (self.db.has_key('/WMode') and return self._is_vertical
self.db['/WMode'] == '1')
def getall(self, c):
while 1: ## PyUnicodeMap
x = self.db.each() ##
if not x: break class PyUnicodeMap(UnicodeMap):
(k,v) = x
if k.startswith(c): def __init__(self, name, module, vertical):
yield (k[1:], unpack('>L', v)[0]) if vertical:
cid2unicode = module.CID2UNICODE_V
else:
cid2unicode = module.CID2UNICODE_H
UnicodeMap.__init__(self, cid2unicode)
self.name = name
return return
def getall_attrs(self): def __repr__(self):
while 1: return '<PyUnicodeMap: %s>' % (self.name)
x = self.db.each()
if not x: break
(k,v) = x
if k.startswith('/'):
yield (k[1:], eval(v)[0])
return
def getall_cid2code(self):
return self.getall('i')
def getall_code2cid(self):
return self.getall('c')
def decode(self, bytes):
if self.debug:
print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
x = ''
for c in bytes:
if x:
if x+c in self.code2cid:
yield self.code2cid[x+c]
elif self.db.has_key('c'+x+c):
(dest,) = unpack('>L', self.db['c'+x+c])
self.code2cid[x+c] = dest
yield dest
x = ''
elif c in self.code2cid:
yield self.code2cid[c]
elif self.db.has_key('c'+c):
(dest,) = unpack('>L', self.db['c'+c])
self.code2cid[c] = dest
yield dest
else:
x = c
return
## CMapDB ## CMapDB
## ##
class CMapDB(object): class CMapDB(object):
class CMapNotFound(CMapError): pass
CMAP_ALIAS = { }
debug = 0 debug = 0
def __init__(self, dirname=None, cdbdirname=None): class CMapNotFound(CMapError): pass
if not dirname:
dirname = find_cmap_path()
self.dirname = dirname
self.cdbdirname = cdbdirname or dirname
self.cmapdb = {}
return
def get_cmap(self, cmapname, strict=True): @classmethod
cmapname = self.CMAP_ALIAS.get(cmapname, cmapname) def get_cmap(klass, name):
if cmapname in self.cmapdb: if name == 'Identity-H':
cmap = self.cmapdb[cmapname] return IdentityCMap(False)
else: elif name == 'Identity-V':
fname = os.path.join(self.dirname, cmapname) return IdentityCMap(True)
cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb') modname = 'pdfminer.cmap.%s' % name.replace('-','_')
if os.path.exists(cdbname): if klass.debug:
if 1 <= self.debug: print >>sys.stderr, 'loading:', modname
print >>sys.stderr, 'Opening: CDBCMap %r...' % cdbname try:
cmap = CDBCMap(cdbname) module = __import__(modname, fromlist=['pdfminer.cmap'])
elif os.path.exists(fname): except ImportError:
if 1 <= self.debug: raise CMapDB.CMapNotFound(name)
print >>sys.stderr, 'Reading: CMap %r...' % fname return PyCMap(name, module)
cmap = CMap()
fp = file(fname, 'rb') @classmethod
CMapParser(self, cmap, fp).run() def get_unicode_map(klass, name, vertical=False):
fp.close() modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
elif not strict: if klass.debug:
cmap = CMap() # just create empty cmap print >>sys.stderr, 'loading:', modname, vertical
else: try:
raise CMapDB.CMapNotFound(cmapname) module = __import__(modname, fromlist=['pdfminer.cmap'])
self.cmapdb[cmapname] = cmap except ImportError:
return cmap raise CMapDB.CMapNotFound(name)
return PyUnicodeMap(name, module, vertical)
## CMapParser ## CMapParser
## ##
class CMapParser(PSStackParser): class CMapParser(PSStackParser):
def __init__(self, cmapdb, cmap, fp): def __init__(self, cmap, fp):
PSStackParser.__init__(self, fp) PSStackParser.__init__(self, fp)
self.cmapdb = cmapdb
self.cmap = cmap self.cmap = cmap
self.in_cmap = False self._in_cmap = False
return return
def run(self): def run(self):
@ -266,29 +255,30 @@ class CMapParser(PSStackParser):
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
name = token.name name = token.name
if name == 'begincmap': if name == 'begincmap':
self.in_cmap = True self._in_cmap = True
self.popall() self.popall()
return return
elif name == 'endcmap': elif name == 'endcmap':
self.in_cmap = False self._in_cmap = False
return return
if not self.in_cmap: return if not self._in_cmap: return
# #
if name == 'def': if name == 'def':
try: try:
((_,k),(_,v)) = self.pop(2) ((_,k),(_,v)) = self.pop(2)
self.cmap.attrs[literal_name(k)] = v self.cmap.set_attr(literal_name(k), v)
except PSSyntaxError: except PSSyntaxError:
pass pass
return return
if name == 'usecmap': if name == 'usecmap':
if self.cmapdb: try:
try: ((_,cmapname),) = self.pop(1)
((_,cmapname),) = self.pop(1) self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
self.cmap.copycmap(self.cmapdb.get_cmap(literal_name(cmapname))) except PSSyntaxError:
except PSSyntaxError: pass
pass except CMapDB.CMapNotFound:
pass
return return
if name == 'begincodespacerange': if name == 'begincodespacerange':
@ -317,7 +307,7 @@ class CMapParser(PSStackParser):
#assert s1 <= e1 #assert s1 <= e1
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
x = sprefix+pack('>L',s1+i)[-vlen:] x = sprefix+pack('>L',s1+i)[-vlen:]
self.cmap.register_code2cid(x, cid+i) self.cmap.add_code2cid(x, cid+i)
return return
if name == 'begincidchar': if name == 'begincidchar':
@ -327,7 +317,7 @@ class CMapParser(PSStackParser):
objs = [ obj for (_,obj) in self.popall() ] objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs): for (cid,code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str): if isinstance(code, str) and isinstance(cid, str):
self.cmap.register_code2cid(code, nunpack(cid)) self.cmap.add_code2cid(code, nunpack(cid))
return return
if name == 'beginbfrange': if name == 'beginbfrange':
@ -343,7 +333,7 @@ class CMapParser(PSStackParser):
#assert s1 <= e1 #assert s1 <= e1
if isinstance(code, list): if isinstance(code, list):
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
self.cmap.register_cid2code(s1+i, code[i]) self.cmap.add_cid2unicode(s1+i, code[i])
else: else:
var = code[-4:] var = code[-4:]
base = nunpack(var) base = nunpack(var)
@ -351,7 +341,7 @@ class CMapParser(PSStackParser):
vlen = len(var) vlen = len(var)
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
x = prefix+pack('>L',base+i)[-vlen:] x = prefix+pack('>L',base+i)[-vlen:]
self.cmap.register_cid2code(s1+i, x) self.cmap.add_cid2unicode(s1+i, x)
return return
if name == 'beginbfchar': if name == 'beginbfchar':
@ -361,7 +351,7 @@ class CMapParser(PSStackParser):
objs = [ obj for (_,obj) in self.popall() ] objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs): for (cid,code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str): if isinstance(cid, str) and isinstance(code, str):
self.cmap.register_cid2code(nunpack(cid), code) self.cmap.add_cid2unicode(nunpack(cid), code)
return return
if name == 'beginnotdefrange': if name == 'beginnotdefrange':
@ -373,123 +363,3 @@ class CMapParser(PSStackParser):
self.push((pos, token)) self.push((pos, token))
return return
## FontMetricsDB
##
class FontMetricsDB(object):
@classmethod
def get_metrics(klass, fontname):
return FONT_METRICS[fontname]
## EncodingDB
##
class EncodingDB(object):
std2unicode = {}
mac2unicode = {}
win2unicode = {}
pdf2unicode = {}
for (name,std,mac,win,pdf) in ENCODING:
c = unichr(name2unicode(name))
if std: std2unicode[std] = c
if mac: mac2unicode[mac] = c
if win: win2unicode[win] = c
if pdf: pdf2unicode[pdf] = c
encodings = {
'StandardEncoding': std2unicode,
'MacRomanEncoding': mac2unicode,
'WinAnsiEncoding': win2unicode,
'PDFDocEncoding': pdf2unicode,
}
@classmethod
def get_encoding(klass, name, diff=None):
cid2unicode = klass.encodings.get(name, klass.std2unicode)
if diff:
cid2unicode = cid2unicode.copy()
cid = 0
for x in diff:
if isinstance(x, int):
cid = x
elif isinstance(x, PSLiteral):
try:
cid2unicode[cid] = unichr(name2unicode(x.name))
except KeyError:
pass
cid += 1
return cid2unicode
## CMap -> CMapCDB conversion
##
def dump_cdb(cmap, cdbfile, verbose=1):
"""Writes a CMap object into a cdb file."""
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
if verbose:
print >>sys.stderr, 'Writing: %r...' % cdbfile
for (k,v) in cmap.getall_attrs():
m.add('/'+k, repr(v))
for (code,cid) in cmap.getall_code2cid():
m.add('c'+code, pack('>L',cid))
for (cid,code) in cmap.getall_cid2code():
m.add('i'+pack('>L',cid), code)
m.finish()
return
def convert_cmap(cmapdir, outputdir, force=False):
"""Convert all CMap source files in a directory into cdb files."""
cmapdb = CMapDB(cmapdir)
for fname in os.listdir(cmapdir):
if '.' in fname: continue
cmapname = os.path.basename(fname)
cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
if not force and os.path.exists(cdbname):
print >>sys.stderr, 'Skipping: %r' % cmapname
continue
print >>sys.stderr, 'Reading: %r...' % cmapname
cmap = cmapdb.get_cmap(cmapname)
dump_cdb(cmap, cdbname)
return
def main(argv):
"""Converts CMap files into cdb files.
usage: python -m pdfminer.cmap [-f] [cmap_dir [output_dir]]
"""
import getopt
def usage():
print 'usage: %s [-f] [cmap_dir [output_dir]]' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'f')
except getopt.GetoptError:
return usage()
if args:
cmapdir = args.pop(0)
else:
try:
cmapdir = find_cmap_path()
except IOError:
print >>sys.stderr, 'cannot find CMap directory'
return 1
if args:
outputdir = args.pop(0)
else:
outputdir = cmapdir
force = False
for (k, v) in opts:
if k == '-f': force = True
if not os.path.isdir(cmapdir):
print >>sys.stderr, 'directory does not exist: %r' % cmapdir
return 1
if not os.path.isdir(outputdir):
print >>sys.stderr, 'directory does not exist: %r' % outputdir
return 1
return convert_cmap(cmapdir, outputdir, force=force)
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -29,7 +29,7 @@ class TagExtractor(PDFDevice):
chars = font.decode(obj) chars = font.decode(obj)
for cid in chars: for cid in chars:
try: try:
char = font.to_unicode(cid) char = font.to_unichr(cid)
text += char text += char
except PDFUnicodeNotDefined: except PDFUnicodeNotDefined:
pass pass

58
pdfminer/encodingdb.py Normal file
View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
import re
from psparser import PSLiteral
from glyphlist import charname2unicode
from latin_enc import ENCODING
## name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
    """Converts an Adobe glyph name to a Unicode number.

    A name listed in charname2unicode is translated through that table.
    Any other name is searched for its first run of decimal digits
    (STRIP_NAME), which is returned as an integer.

    Raises KeyError(name) when the name is not in the table and
    contains no digits.
    """
    if name not in charname2unicode:
        digits = STRIP_NAME.search(name)
        if digits is None:
            raise KeyError(name)
        return int(digits.group(0))
    return charname2unicode[name]
## EncodingDB
##
class EncodingDB(object):

    """Character-code -> unicode-character tables for the named encodings.

    The four tables are filled once, at class-creation time, from the
    rows of ENCODING. Each row is (glyph name, std, mac, win, pdf); a
    false code (e.g. None or 0) means the glyph has no slot in that
    encoding and is skipped.
    """

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(name2unicode(name))
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> table built above.
    encodings = {
        'StandardEncoding': std2unicode,
        'MacRomanEncoding': mac2unicode,
        'WinAnsiEncoding': win2unicode,
        'PDFDocEncoding': pdf2unicode,
    }

    @classmethod
    def get_encoding(klass, name, diff=None):
        """Returns the table for encoding `name`, optionally amended by `diff`.

        An unknown `name` falls back to the StandardEncoding table.
        Without `diff` the shared class-level table itself is returned.
        With `diff`, a copy is modified: an int item sets the current
        code, and each PSLiteral item is assigned to the current code,
        which then advances by one. Glyph names that name2unicode
        rejects with KeyError are left unassigned but still advance
        the code. Items of any other type are ignored.
        """
        base = klass.encodings.get(name, klass.std2unicode)
        if not diff:
            return base
        table = base.copy()
        code = 0
        for item in diff:
            if isinstance(item, int):
                code = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                table[code] = unichr(name2unicode(item.name))
            except KeyError:
                pass
            code += 1
        return table

View File

@ -89,7 +89,7 @@ class PDFTextDevice(PDFDevice):
else: else:
for cid in font.decode(obj): for cid in font.decode(obj):
try: try:
char = font.to_unicode(cid) char = font.to_unichr(cid)
except PDFUnicodeNotDefined, e: except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args (cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid) char = self.handle_undefined_char(cidcoding, cid)

View File

@ -4,17 +4,27 @@ try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
from cmapdb import CMap, CMapDB, CMapParser from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
from cmapdb import FontMetricsDB, EncodingDB from encodingdb import EncodingDB
from struct import pack, unpack from struct import pack, unpack
from psparser import LIT, STRICT from psparser import LIT, STRICT
from psparser import PSLiteral, literal_name from psparser import PSLiteral, literal_name
from pdftypes import PDFException, resolve1 from pdftypes import PDFException, resolve1
from pdftypes import int_value, float_value, num_value from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value from pdftypes import str_value, list_value, dict_value, stream_value
from fontmetrics import FONT_METRICS
from utils import apply_matrix_norm, nunpack from utils import apply_matrix_norm, nunpack
## FontMetricsDB
##
class FontMetricsDB(object):

    """Name-based access to the FONT_METRICS table."""

    @classmethod
    def get_metrics(klass, fontname):
        """Returns FONT_METRICS[fontname]; an unknown name raises KeyError."""
        metrics = FONT_METRICS[fontname]
        return metrics
## CFFFont ## CFFFont
## (Format specified in Adobe Technical Note: #5176 ## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification") ## "The Compact Font Format Specification")
@ -249,7 +259,7 @@ class TrueTypeFont(object):
self.tables[name] = (offset, length) self.tables[name] = (offset, length)
return return
def create_cmap(self): def create_unicode_map(self):
if 'cmap' not in self.tables: if 'cmap' not in self.tables:
raise TrueTypeFont.CMapNotFound raise TrueTypeFont.CMapNotFound
(base_offset, length) = self.tables['cmap'] (base_offset, length) = self.tables['cmap']
@ -302,9 +312,11 @@ class TrueTypeFont(object):
else: else:
for c in xrange(sc, ec+1): for c in xrange(sc, ec+1):
char2gid[c] = (c + idd) & 0xffff char2gid[c] = (c + idd) & 0xffff
gid2char = dict( (gid, pack('>H', char)) # create unicode map
for (char,gid) in char2gid.iteritems() ) unicode_map = FileUnicodeMap()
return CMap().update(char2gid, gid2char) for (char,gid) in char2gid.iteritems():
unicode_map.add_cid2code(gid, char)
return unicode_map
## Fonts ## Fonts
@ -383,20 +395,19 @@ class PDFSimpleFont(PDFFont):
self.encoding = EncodingDB.get_encoding(name, diff) self.encoding = EncodingDB.get_encoding(name, diff)
else: else:
self.encoding = EncodingDB.get_encoding(literal_name(encoding)) self.encoding = EncodingDB.get_encoding(literal_name(encoding))
self.ucs2_cmap = None self.unicode_map = None
if 'ToUnicode' in spec: if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode']) strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap() self.unicode_map = FileUnicodeMap()
CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run() CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
PDFFont.__init__(self, descriptor, widths) PDFFont.__init__(self, descriptor, widths)
return return
def to_unicode(self, cid): def to_unichr(self, cid):
if self.ucs2_cmap: if self.unicode_map:
code = self.ucs2_cmap.tocode(cid) code = self.unicode_map.get_unicode(cid)
if code: if code is not None:
chars = unpack('>%dH' % (len(code)/2), code) return unichr(code)
return ''.join( unichr(c) for c in chars )
try: try:
return self.encoding[cid] return self.encoding[cid]
except KeyError: except KeyError:
@ -476,9 +487,11 @@ class PDFCIDFont(PDFFont):
raise PDFFontError('Encoding is unspecified') raise PDFFontError('Encoding is unspecified')
name = 'unknown' name = 'unknown'
try: try:
self.cmap = rsrc.get_cmap(name, strict=STRICT) self.cmap = CMapDB.get_cmap(name)
except CMapDB.CMapNotFound, e: except CMapDB.CMapNotFound, e:
raise PDFFontError(e) if STRICT:
raise PDFFontError(e)
self.cmap = CMap()
try: try:
descriptor = dict_value(spec['FontDescriptor']) descriptor = dict_value(spec['FontDescriptor'])
except KeyError: except KeyError:
@ -490,21 +503,20 @@ class PDFCIDFont(PDFFont):
self.fontfile = stream_value(descriptor.get('FontFile2')) self.fontfile = stream_value(descriptor.get('FontFile2'))
ttf = TrueTypeFont(self.basefont, ttf = TrueTypeFont(self.basefont,
StringIO(self.fontfile.get_data())) StringIO(self.fontfile.get_data()))
self.ucs2_cmap = None self.unicode_map = None
if 'ToUnicode' in spec: if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode']) strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap() self.unicode_map = FileUnicodeMap()
CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run() CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
elif self.cidcoding == 'Adobe-Identity': elif self.cidcoding == 'Adobe-Identity':
if ttf: if ttf:
try: try:
self.ucs2_cmap = ttf.create_cmap() self.unicode_map = ttf.create_unicode_map()
except TrueTypeFont.CMapNotFound: except TrueTypeFont.CMapNotFound:
pass pass
else: else:
try: try:
self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding, self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
strict=STRICT)
except CMapDB.CMapNotFound, e: except CMapDB.CMapNotFound, e:
raise PDFFontError(e) raise PDFFontError(e)
@ -558,14 +570,13 @@ class PDFCIDFont(PDFFont):
def char_disp(self, cid): def char_disp(self, cid):
return self.disps.get(cid, self.default_disp) return self.disps.get(cid, self.default_disp)
def to_unicode(self, cid): def to_unichr(self, cid):
if not self.ucs2_cmap: if not self.unicode_map:
raise PDFUnicodeNotDefined(self.cidcoding, cid) raise PDFUnicodeNotDefined(self.cidcoding, cid)
code = self.ucs2_cmap.tocode(cid) code = self.unicode_map.get_unicode(cid)
if not code: if code is not None:
raise PDFUnicodeNotDefined(self.cidcoding, cid) return unichr(code)
chars = unpack('>%dH' % (len(code)/2), code) raise PDFUnicodeNotDefined(self.cidcoding, cid)
return ''.join( unichr(c) for c in chars )
# main # main

View File

@ -6,7 +6,7 @@ try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
from cmapdb import CMapDB from cmapdb import CMapDB, CMap
from psparser import PSException, PSTypeError, PSEOF from psparser import PSException, PSTypeError, PSEOF
from psparser import PSKeyword, literal_name, keyword_name from psparser import PSKeyword, literal_name, keyword_name
from psparser import PSStackParser from psparser import PSStackParser
@ -106,9 +106,8 @@ class PDFResourceManager(object):
''' '''
debug = 0 debug = 0
def __init__(self, cmapdb): def __init__(self):
self.fonts = {} self.fonts = {}
self.cmapdb = cmapdb
return return
def get_procset(self, procs): def get_procset(self, procs):
@ -123,7 +122,11 @@ class PDFResourceManager(object):
return return
def get_cmap(self, cmapname, strict=False):
    """Returns the CMap named `cmapname`.

    When the CMap cannot be found, CMapDB.CMapNotFound is re-raised
    if `strict` is true; otherwise an empty CMap is returned.
    """
    try:
        return CMapDB.get_cmap(cmapname)
    except CMapDB.CMapNotFound:
        if strict: raise
        # CMap is the class imported from cmapdb at module level; CMapDB
        # has no attribute of that name, so CMapDB.CMap() would raise
        # AttributeError instead of producing the fallback.
        return CMap()
def get_font(self, objid, spec): def get_font(self, objid, spec):
if objid and objid in self.fonts: if objid and objid in self.fonts:

View File

@ -19,7 +19,8 @@ PDF parser that can be used for other purposes instead of text analysis.''',
author_email='yusuke at cs dot nyu dot edu', author_email='yusuke at cs dot nyu dot edu',
url='http://www.unixuser.org/~euske/python/pdfminer/index.html', url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
packages=[ packages=[
'pdfminer' 'pdfminer',
'pdfminer.cmap'
], ],
scripts=[ scripts=[
'tools/pdf2txt.py', 'tools/pdf2txt.py',

155
tools/conv_cmap.py Executable file
View File

@ -0,0 +1,155 @@
#!/usr/bin/env python
import sys
import os.path
def process_cid2code(fp, check_codecs=[]):
    """Parses a tab-separated cid2code table into code->CID and CID->unicode maps.

    fp: iterable of text lines. Lines starting with '#' and blank lines
        are skipped. A line starting with 'CID' names the columns. Every
        other line is "<cid> TAB <cell> TAB <cell> ...", where a cell is
        '*' (no mapping) or a comma-separated list of hex codes; a code
        followed by 'v' belongs to the vertical variant of the column.
    check_codecs: codec names used to rank candidate characters when a
        '-UTF8' column lists several codes for one CID. Only read,
        never modified.

    Returns (code2cid, is_vertical, cid2unicode_h, cid2unicode_v):
        code2cid       {cmap name: nested {byte: {byte: ... cid}}}
        is_vertical    {cmap name: True} for every vertical cmap name
        cid2unicode_h  {cid: code point} taken from '-UTF8' columns
        cid2unicode_v  same, for vertical writing
    """

    def get_canonicals(name):
        # Column name -> (horizontal cmap name, vertical cmap name or None).
        if name.endswith('-H'):
            return (name, None)
        elif name == 'H':
            return ('H', 'V')
        else:
            return (name+'-H', name+'-V')

    def get_unicode(codes):
        # determine the "most popular" candidate: the character that can
        # be encoded by the largest number of check_codecs.
        d = {}
        for code in codes:
            char = unicode(code, 'utf-8')
            if char not in d:
                d[char] = 0
            for codec in check_codecs:
                try:
                    char.encode(codec, 'strict')
                    d[char] += 1
                except UnicodeError:
                    pass
        chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
        return ord(chars[0])

    def put(dmap, code, cid, force=False):
        # Walk/create one nested dict per leading byte, then store cid
        # under the last byte. Without force an existing, different
        # entry is left untouched.
        for b in code[:-1]:
            b = ord(b)
            if b in dmap:
                dmap = dmap[b]
            else:
                d = {}
                dmap[b] = d
                dmap = d
        b = ord(code[-1])
        if force or ((b not in dmap) or dmap[b] == cid):
            dmap[b] = cid
        return

    names = []
    code2cid = {} # {'cmapname': ...}
    is_vertical = {}
    cid2unicode_h = {} # {cid: unicode}
    cid2unicode_v = {} # {cid: unicode}

    for line in fp:
        line = line.strip()
        # str.split() never returns an empty list, so blank lines must
        # be rejected here; otherwise int('') below raises ValueError.
        if not line: continue
        if line.startswith('#'): continue
        if line.startswith('CID'):
            names = line.split('\t')[1:]
            continue

        f = line.split('\t')
        cid = int(f[0])
        for (x,name) in zip(f[1:], names):
            if x == '*': continue
            (hmapname, vmapname) = get_canonicals(name)
            if hmapname in code2cid:
                hmap = code2cid[hmapname]
            else:
                hmap = {}
                code2cid[hmapname] = hmap
            vmap = None
            if vmapname:
                is_vertical[vmapname] = True
                if vmapname in code2cid:
                    vmap = code2cid[vmapname]
                else:
                    vmap = {}
                    code2cid[vmapname] = vmap

            hcodes = []
            vcodes = []
            for code in x.split(','):
                vertical = code.endswith('v')
                if vertical:
                    code = code[:-1]
                try:
                    code = code.decode('hex')
                except Exception:
                    # Not decodable as a hex string (e.g. odd number of
                    # digits): treat it as a single character code.
                    code = chr(int(code, 16))
                if vertical:
                    vcodes.append(code)
                else:
                    hcodes.append(code)

            if vcodes:
                # Explicit vertical codes: H and V maps get their own
                # codes, overriding anything recorded earlier.
                assert vmap is not None
                for code in vcodes:
                    put(vmap, code, cid, True)
                for code in hcodes:
                    put(hmap, code, cid, True)
                if name.endswith('-UTF8'):
                    if hcodes:
                        cid2unicode_h[cid] = get_unicode(hcodes)
                    if vcodes:
                        cid2unicode_v[cid] = get_unicode(vcodes)
            else:
                # No vertical variant listed: the horizontal codes serve
                # both maps, without overriding existing entries.
                for code in hcodes:
                    put(hmap, code, cid)
                    # Columns named '*-H' have no vertical map at all.
                    if vmap is not None:
                        put(vmap, code, cid)
                if name.endswith('-UTF8') and hcodes:
                    code = get_unicode(hcodes)
                    if cid not in cid2unicode_h:
                        cid2unicode_h[cid] = code
                    if cid not in cid2unicode_v:
                        cid2unicode_v[cid] = code

    return (code2cid, is_vertical, cid2unicode_h, cid2unicode_v)
# main
def main(argv):
def usage():
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
return 100
def pyname(name):
return name.replace('-','_')+'.py'
args = argv[1:]
if len(args) < 3: return usage()
(outdir, regname, src) = args[:3]
check_codecs = args[3:]
print >>sys.stderr, 'reading %r...' % src
fp = file(src)
(code2cid, is_vertical, cid2unicode_h, cid2unicode_v) = process_cid2code(fp, check_codecs)
fp.close()
for (name, cmap) in code2cid.iteritems():
fname = pyname(name)
print >>sys.stderr, 'writing %r...' % fname
fp = file(os.path.join(outdir, fname), 'w')
print >>fp, '#!/usr/bin/env python'
print >>fp, '#', fname
print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)
print >>fp, 'CODE2CID = %r' % cmap
fp.close()
fname = 'TO_UNICODE_'+pyname(regname)
print >>sys.stderr, 'writing %r...' % fname
fp = file(os.path.join(outdir, fname), 'w')
print >>fp, '#!/usr/bin/env python'
print >>fp, '#', fname
print >>fp, 'CID2UNICODE_H = %r' % cid2unicode_h
print >>fp, 'CID2UNICODE_V = %r' % cid2unicode_v
fp.close()
return 0
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -4,7 +4,7 @@ from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdfminer.cmapdb import CMapDB, find_cmap_path from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
# main # main
@ -22,8 +22,6 @@ def main(argv):
if not args: return usage() if not args: return usage()
# debug option # debug option
debug = 0 debug = 0
# path option
cmapdir = find_cmap_path()
# input option # input option
password = '' password = ''
pagenos = set() pagenos = set()
@ -38,7 +36,6 @@ def main(argv):
laparams = LAParams() laparams = LAParams()
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-C': cmapdir = v
elif k == '-P': password = v elif k == '-P': password = v
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v) elif k == '-m': maxpages = int(v)
@ -59,8 +56,7 @@ def main(argv):
PDFPageInterpreter.debug = debug PDFPageInterpreter.debug = debug
PDFDevice.debug = debug PDFDevice.debug = debug
# #
cmapdb = CMapDB(cmapdir) rsrc = PDFResourceManager()
rsrc = PDFResourceManager(cmapdb)
if not outtype: if not outtype:
outtype = 'text' outtype = 'text'
if outfile: if outfile: