include cmap

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@162 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-12-19 14:17:00 +00:00
parent ed8a5362b9
commit e4b089e327
20 changed files with 92216 additions and 362 deletions

View File

@ -6,7 +6,8 @@ pdfminer/Makefile
pdfminer/__init__.py
pdfminer/arcfour.py
pdfminer/ascii85.py
pdfminer/cmap.py
pdfminer/cmapdb.py
pdfminer/encodingdb.py
pdfminer/converter.py
pdfminer/fontmetrics.py
pdfminer/glyphlist.py
@ -24,6 +25,8 @@ pdfminer/psparser.py
pdfminer/pycdb.py
pdfminer/rijndael.py
pdfminer/utils.py
pdfminer/cmap/Makefile
pdfminer/cmap/__init__.py
tools/Makefile
tools/dumppdf.py
tools/pdf2txt.py
@ -40,3 +43,4 @@ samples/i1040nr.pdf
samples/kampo.pdf
samples/naacl06-shinyama.pdf
samples/nlp2004slides.pdf
cmaprsrc/README.txt

View File

@ -36,3 +36,15 @@ register: clean
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish:
$(CP) docs/*.html $(WEBDIR)
# Generate the Python CMap modules in $(CMAPDIR) from the Adobe cid2code
# tables kept in $(CMAPRSRC).
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPDIR=pdfminer/cmap
CMAPRSRC=cmaprsrc

# Neither target names a file that its recipe creates, so declare both phony;
# otherwise a stray file called "cmap" or "cmap_clean" would disable them.
.PHONY: cmap cmap_clean

# conv_cmap.py arguments: output_dir regname cid2code.txt codecs...
cmap: $(CMAPRSRC)
	$(CONV_CMAP) $(CMAPDIR) Adobe-CNS1 $(CMAPRSRC)/cid2code_Adobe_CNS1.txt cp950 big5
	$(CONV_CMAP) $(CMAPDIR) Adobe-GB1 $(CMAPRSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
	$(CONV_CMAP) $(CMAPDIR) Adobe-Japan1 $(CMAPRSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
	$(CONV_CMAP) $(CMAPDIR) Adobe-Korea1 $(CMAPRSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr

# Delegate to the cmap_clean target of $(CMAPDIR)/Makefile.  $(MAKE) rather
# than a literal "make" keeps -n, -j and command-line variables working in
# the sub-make.
cmap_clean:
	cd $(CMAPDIR) && $(MAKE) cmap_clean

2
TODO
View File

@ -4,4 +4,4 @@ TODOs:
- Better API Documentation.
- Robust error handling.
- Any special handling for linearized PDFs?
- Handle crypt filter. (I need more samples!)
- Handle crypt filter. (More sample documents are needed!)

60
cmaprsrc/README.txt Normal file
View File

@ -0,0 +1,60 @@
README.txt for cmaprsrc
This directory contains Adobe CMap resources. CMaps are required
to decode text data written in the Chinese, Japanese or Korean languages.
CMap resources are now freely available from the Adobe web site:
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
The following files were extracted from the downloadable tarballs:
cid2code_Adobe_CNS1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z
cid2code_Adobe_GB1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z
cid2code_Adobe_Japan1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z
cid2code_Adobe_Korea1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z
Here is the license information in the original files:
%%Copyright: -----------------------------------------------------------
%%Copyright: Copyright 1990-20xx Adobe Systems Incorporated.
%%Copyright: All rights reserved.
%%Copyright:
%%Copyright: Redistribution and use in source and binary forms, with or
%%Copyright: without modification, are permitted provided that the
%%Copyright: following conditions are met:
%%Copyright:
%%Copyright: Redistributions of source code must retain the above
%%Copyright: copyright notice, this list of conditions and the following
%%Copyright: disclaimer.
%%Copyright:
%%Copyright: Redistributions in binary form must reproduce the above
%%Copyright: copyright notice, this list of conditions and the following
%%Copyright: disclaimer in the documentation and/or other materials
%%Copyright: provided with the distribution.
%%Copyright:
%%Copyright: Neither the name of Adobe Systems Incorporated nor the names
%%Copyright: of its contributors may be used to endorse or promote
%%Copyright: products derived from this software without specific prior
%%Copyright: written permission.
%%Copyright:
%%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
%%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
%%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
%%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
%%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
%%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
%%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
%%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
%%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
%%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
%%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
%%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
%%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%%Copyright: -----------------------------------------------------------

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -6,6 +6,7 @@ all:
# Remove compiled Python files here, then clean the cmap subdirectory.
# "clean" is a command, not a file, so it is declared phony.
.PHONY: clean
clean:
	-rm *.pyc *.pyo
	cd cmap && $(MAKE) clean
check:
$(PYCHECKER) *.py

10
pdfminer/cmap/Makefile Normal file
View File

@ -0,0 +1,10 @@
# Makefile for pdfminer.cmap

# All targets are commands rather than files.
.PHONY: all clean cmap_clean

# Nothing to build here; the CMap modules are generated by the
# top-level "cmap" target.
all:

# Remove compiled Python files.
clean:
	$(RM) *.pyc *.pyo

# Remove every generated CMap module.  This also deletes __init__.py,
# so recreate it (empty) to keep the directory importable as a package.
cmap_clean:
	$(RM) *.py
	touch __init__.py

View File

View File

@ -20,240 +20,229 @@ from psparser import PSStackParser
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
from psparser import PSLiteral, PSKeyword
from psparser import literal_name, keyword_name
from fontmetrics import FONT_METRICS
from latin_enc import ENCODING
from glyphlist import charname2unicode
from encodingdb import name2unicode
from utils import choplist, nunpack
try:
import cdb
except ImportError:
import pdfminer.pycdb as cdb
class CMapError(Exception): pass
## find_cmap_path
##
def find_cmap_path():
"""Returns the location of CMap directory."""
for path in (os.environ.get('CMAP_PATH', '.'),
os.path.join(os.path.dirname(__file__), 'CMap')):
if os.path.isdir(path):
return path
raise IOError
## name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
"""Converts Adobe glyph names to Unicode numbers."""
if name in charname2unicode:
return charname2unicode[name]
m = STRIP_NAME.search(name)
if not m: raise KeyError(name)
return int(m.group(0))
## CMap
##
class CMap(object):
debug = 0
def __init__(self, code2cid=None):
self.code2cid = code2cid or {}
return
def is_vertical(self):
return False
def use_cmap(self, cmap):
assert isinstance(cmap, CMap)
def copy(dst, src):
for (k,v) in src.iteritems():
if isinstance(v, dict):
d = {}
dst[k] = d
copy(d, v)
else:
dst[k] = v
copy(self.code2cid, cmap.code2cid)
return
def decode(self, code):
if self.debug:
print >>sys.stderr, 'decode: %r, %r' % (self, code)
d = self.code2cid
for c in code:
c = ord(c)
if c in d:
d = d[c]
if isinstance(d, int):
yield d
d = self.code2cid
else:
d = self.code2cid
return
## IdentityCMap
##
class IdentityCMap(object):
def __init__(self, vertical):
self.vertical = vertical
return
def is_vertical(self):
return self.vertical
def decode(self, code):
return unpack('>%dH' % (len(code)/2), code)
## UnicodeMap
##
class UnicodeMap(object):
debug = 0
def __init__(self, cid2unicode=None):
self.cid2unicode = cid2unicode or {}
return
def get_unicode(self, cid):
if self.debug:
print >>sys.stderr, 'get_unicode: %r, %r' % (self, cid)
return self.cid2unicode.get(cid)
## FileCMap
##
class FileCMap(CMap):
def __init__(self):
self.code2cid = {}
self.cid2code = {}
CMap.__init__(self)
self.attrs = {}
return
def __repr__(self):
return '<CMap: %s>' % self.attrs.get('CMapName')
def update(self, code2cid=None, cid2code=None):
if code2cid:
self.code2cid.update(code2cid)
if cid2code:
self.cid2code.update(cid2code)
return self
def copycmap(self, cmap):
self.code2cid.update(cmap.getall_code2cid())
self.cid2code.update(cmap.getall_cid2code())
return self
def register_code2cid(self, code, cid):
if isinstance(code, str) and isinstance(cid, int):
self.code2cid[code] = cid
return self
def register_cid2code(self, cid, code):
if isinstance(cid, int):
if isinstance(code, PSLiteral):
self.cid2code[cid] = pack('>H', name2unicode(code.name))
elif isinstance(code, str):
self.cid2code[cid] = code
return self
def decode(self, bytes):
if self.debug:
print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
x = ''
for c in bytes:
if x:
if x+c in self.code2cid:
yield self.code2cid[x+c]
x = ''
elif c in self.code2cid:
yield self.code2cid[c]
else:
x = c
return
def is_vertical(self):
return self.attrs.get('WMode', 0)
def tocid(self, code):
return self.code2cid.get(code)
def tocode(self, cid):
return self.cid2code.get(cid)
def set_attr(self, k, v):
self.attrs[k] = v
return
def getall_attrs(self):
return self.attrs.iteritems()
def getall_code2cid(self):
return self.code2cid.iteritems()
def getall_cid2code(self):
return self.cid2code.iteritems()
def add_code2cid(self, code, cid):
assert isinstance(code, str) and isinstance(cid, int)
d = self.code2cid
for c in code[:-1]:
c = ord(c)
if c in d:
d = d[c]
else:
t = {}
d[c] = t
d =t
c = ord(code[-1])
d[c] = cid
return
## CDBCMap
## FileUnicodeMap
##
class CDBCMap(CMap):
class FileUnicodeMap(UnicodeMap):
def __init__(self, cdbname):
CMap.__init__(self)
self.cdbname = cdbname
self.db = cdb.init(cdbname)
def __init__(self):
UnicodeMap.__init__(self)
self.attrs = {}
return
def __repr__(self):
return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
def tocid(self, code):
k = 'c'+code
if not self.db.has_key(k):
return None
return unpack('>L', self.db[k])
def tocode(self, cid):
k = 'i'+pack('>L', cid)
if not self.db.has_key(k):
return None
return self.db[k]
def set_attr(self, k, v):
self.attrs[k] = v
return
def add_cid2unicode(self, cid, code):
assert isinstance(cid, int)
if isinstance(code, PSLiteral):
# Interpret as an Adobe glyph name.
self.cid2unicode[cid] = name2unicode(code.name)
elif isinstance(code, str):
# Interpret as UTF-16BE.
self.cid2unicode[cid] = unpack('>H', code)[0]
elif isinstance(code, int):
self.cid2unicode[cid] = code
else:
raise TypeError(code)
return
## PyCMap
##
class PyCMap(CMap):
def __init__(self, name, module):
CMap.__init__(self, module.CODE2CID)
self.name = name
self._is_vertical = module.IS_VERTICAL
return
def __repr__(self):
return '<PyCMap: %s>' % (self.name)
def is_vertical(self):
return (self.db.has_key('/WMode') and
self.db['/WMode'] == '1')
return self._is_vertical
def getall(self, c):
while 1:
x = self.db.each()
if not x: break
(k,v) = x
if k.startswith(c):
yield (k[1:], unpack('>L', v)[0])
## PyUnicodeMap
##
class PyUnicodeMap(UnicodeMap):
def __init__(self, name, module, vertical):
if vertical:
cid2unicode = module.CID2UNICODE_V
else:
cid2unicode = module.CID2UNICODE_H
UnicodeMap.__init__(self, cid2unicode)
self.name = name
return
def getall_attrs(self):
while 1:
x = self.db.each()
if not x: break
(k,v) = x
if k.startswith('/'):
yield (k[1:], eval(v)[0])
return
def getall_cid2code(self):
return self.getall('i')
def getall_code2cid(self):
return self.getall('c')
def decode(self, bytes):
if self.debug:
print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
x = ''
for c in bytes:
if x:
if x+c in self.code2cid:
yield self.code2cid[x+c]
elif self.db.has_key('c'+x+c):
(dest,) = unpack('>L', self.db['c'+x+c])
self.code2cid[x+c] = dest
yield dest
x = ''
elif c in self.code2cid:
yield self.code2cid[c]
elif self.db.has_key('c'+c):
(dest,) = unpack('>L', self.db['c'+c])
self.code2cid[c] = dest
yield dest
else:
x = c
return
def __repr__(self):
return '<PyUnicodeMap: %s>' % (self.name)
## CMapDB
##
class CMapDB(object):
class CMapNotFound(CMapError): pass
CMAP_ALIAS = { }
debug = 0
def __init__(self, dirname=None, cdbdirname=None):
if not dirname:
dirname = find_cmap_path()
self.dirname = dirname
self.cdbdirname = cdbdirname or dirname
self.cmapdb = {}
return
class CMapNotFound(CMapError): pass
def get_cmap(self, cmapname, strict=True):
cmapname = self.CMAP_ALIAS.get(cmapname, cmapname)
if cmapname in self.cmapdb:
cmap = self.cmapdb[cmapname]
else:
fname = os.path.join(self.dirname, cmapname)
cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb')
if os.path.exists(cdbname):
if 1 <= self.debug:
print >>sys.stderr, 'Opening: CDBCMap %r...' % cdbname
cmap = CDBCMap(cdbname)
elif os.path.exists(fname):
if 1 <= self.debug:
print >>sys.stderr, 'Reading: CMap %r...' % fname
cmap = CMap()
fp = file(fname, 'rb')
CMapParser(self, cmap, fp).run()
fp.close()
elif not strict:
cmap = CMap() # just create empty cmap
else:
raise CMapDB.CMapNotFound(cmapname)
self.cmapdb[cmapname] = cmap
return cmap
@classmethod
def get_cmap(klass, name):
if name == 'Identity-H':
return IdentityCMap(False)
elif name == 'Identity-V':
return IdentityCMap(True)
modname = 'pdfminer.cmap.%s' % name.replace('-','_')
if klass.debug:
print >>sys.stderr, 'loading:', modname
try:
module = __import__(modname, fromlist=['pdfminer.cmap'])
except ImportError:
raise CMapDB.CMapNotFound(name)
return PyCMap(name, module)
@classmethod
def get_unicode_map(klass, name, vertical=False):
modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
if klass.debug:
print >>sys.stderr, 'loading:', modname, vertical
try:
module = __import__(modname, fromlist=['pdfminer.cmap'])
except ImportError:
raise CMapDB.CMapNotFound(name)
return PyUnicodeMap(name, module, vertical)
## CMapParser
##
class CMapParser(PSStackParser):
def __init__(self, cmapdb, cmap, fp):
def __init__(self, cmap, fp):
PSStackParser.__init__(self, fp)
self.cmapdb = cmapdb
self.cmap = cmap
self.in_cmap = False
self._in_cmap = False
return
def run(self):
@ -266,29 +255,30 @@ class CMapParser(PSStackParser):
def do_keyword(self, pos, token):
name = token.name
if name == 'begincmap':
self.in_cmap = True
self._in_cmap = True
self.popall()
return
elif name == 'endcmap':
self.in_cmap = False
self._in_cmap = False
return
if not self.in_cmap: return
if not self._in_cmap: return
#
if name == 'def':
try:
((_,k),(_,v)) = self.pop(2)
self.cmap.attrs[literal_name(k)] = v
self.cmap.set_attr(literal_name(k), v)
except PSSyntaxError:
pass
return
if name == 'usecmap':
if self.cmapdb:
try:
((_,cmapname),) = self.pop(1)
self.cmap.copycmap(self.cmapdb.get_cmap(literal_name(cmapname)))
except PSSyntaxError:
pass
try:
((_,cmapname),) = self.pop(1)
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError:
pass
except CMapDB.CMapNotFound:
pass
return
if name == 'begincodespacerange':
@ -317,7 +307,7 @@ class CMapParser(PSStackParser):
#assert s1 <= e1
for i in xrange(e1-s1+1):
x = sprefix+pack('>L',s1+i)[-vlen:]
self.cmap.register_code2cid(x, cid+i)
self.cmap.add_code2cid(x, cid+i)
return
if name == 'begincidchar':
@ -327,7 +317,7 @@ class CMapParser(PSStackParser):
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.register_code2cid(code, nunpack(cid))
self.cmap.add_code2cid(code, nunpack(cid))
return
if name == 'beginbfrange':
@ -343,7 +333,7 @@ class CMapParser(PSStackParser):
#assert s1 <= e1
if isinstance(code, list):
for i in xrange(e1-s1+1):
self.cmap.register_cid2code(s1+i, code[i])
self.cmap.add_cid2unicode(s1+i, code[i])
else:
var = code[-4:]
base = nunpack(var)
@ -351,7 +341,7 @@ class CMapParser(PSStackParser):
vlen = len(var)
for i in xrange(e1-s1+1):
x = prefix+pack('>L',base+i)[-vlen:]
self.cmap.register_cid2code(s1+i, x)
self.cmap.add_cid2unicode(s1+i, x)
return
if name == 'beginbfchar':
@ -361,7 +351,7 @@ class CMapParser(PSStackParser):
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str):
self.cmap.register_cid2code(nunpack(cid), code)
self.cmap.add_cid2unicode(nunpack(cid), code)
return
if name == 'beginnotdefrange':
@ -373,123 +363,3 @@ class CMapParser(PSStackParser):
self.push((pos, token))
return
## FontMetricsDB
##
class FontMetricsDB(object):
@classmethod
def get_metrics(klass, fontname):
return FONT_METRICS[fontname]
## EncodingDB
##
class EncodingDB(object):
std2unicode = {}
mac2unicode = {}
win2unicode = {}
pdf2unicode = {}
for (name,std,mac,win,pdf) in ENCODING:
c = unichr(name2unicode(name))
if std: std2unicode[std] = c
if mac: mac2unicode[mac] = c
if win: win2unicode[win] = c
if pdf: pdf2unicode[pdf] = c
encodings = {
'StandardEncoding': std2unicode,
'MacRomanEncoding': mac2unicode,
'WinAnsiEncoding': win2unicode,
'PDFDocEncoding': pdf2unicode,
}
@classmethod
def get_encoding(klass, name, diff=None):
cid2unicode = klass.encodings.get(name, klass.std2unicode)
if diff:
cid2unicode = cid2unicode.copy()
cid = 0
for x in diff:
if isinstance(x, int):
cid = x
elif isinstance(x, PSLiteral):
try:
cid2unicode[cid] = unichr(name2unicode(x.name))
except KeyError:
pass
cid += 1
return cid2unicode
## CMap -> CMapCDB conversion
##
def dump_cdb(cmap, cdbfile, verbose=1):
"""Writes a CMap object into a cdb file."""
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
if verbose:
print >>sys.stderr, 'Writing: %r...' % cdbfile
for (k,v) in cmap.getall_attrs():
m.add('/'+k, repr(v))
for (code,cid) in cmap.getall_code2cid():
m.add('c'+code, pack('>L',cid))
for (cid,code) in cmap.getall_cid2code():
m.add('i'+pack('>L',cid), code)
m.finish()
return
def convert_cmap(cmapdir, outputdir, force=False):
"""Convert all CMap source files in a directory into cdb files."""
cmapdb = CMapDB(cmapdir)
for fname in os.listdir(cmapdir):
if '.' in fname: continue
cmapname = os.path.basename(fname)
cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
if not force and os.path.exists(cdbname):
print >>sys.stderr, 'Skipping: %r' % cmapname
continue
print >>sys.stderr, 'Reading: %r...' % cmapname
cmap = cmapdb.get_cmap(cmapname)
dump_cdb(cmap, cdbname)
return
def main(argv):
"""Converts CMap files into cdb files.
usage: python -m pdfminer.cmap [-f] [cmap_dir [output_dir]]
"""
import getopt
def usage():
print 'usage: %s [-f] [cmap_dir [output_dir]]' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'f')
except getopt.GetoptError:
return usage()
if args:
cmapdir = args.pop(0)
else:
try:
cmapdir = find_cmap_path()
except IOError:
print >>sys.stderr, 'cannot find CMap directory'
return 1
if args:
outputdir = args.pop(0)
else:
outputdir = cmapdir
force = False
for (k, v) in opts:
if k == '-f': force = True
if not os.path.isdir(cmapdir):
print >>sys.stderr, 'directory does not exist: %r' % cmapdir
return 1
if not os.path.isdir(outputdir):
print >>sys.stderr, 'directory does not exist: %r' % outputdir
return 1
return convert_cmap(cmapdir, outputdir, force=force)
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -29,7 +29,7 @@ class TagExtractor(PDFDevice):
chars = font.decode(obj)
for cid in chars:
try:
char = font.to_unicode(cid)
char = font.to_unichr(cid)
text += char
except PDFUnicodeNotDefined:
pass

58
pdfminer/encodingdb.py Normal file
View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
import re
from psparser import PSLiteral
from glyphlist import charname2unicode
from latin_enc import ENCODING
## name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
    """Map an Adobe glyph name to a Unicode number.

    Names listed in charname2unicode are looked up directly.  For any
    other name the first run of decimal digits it contains is returned
    as an int.  Raises KeyError(name) when the name is neither listed
    nor contains a digit.
    """
    if name not in charname2unicode:
        digits = STRIP_NAME.search(name)
        if digits is None:
            raise KeyError(name)
        return int(digits.group(0))
    return charname2unicode[name]
## EncodingDB
##
class EncodingDB(object):
    """Character-code to unicode-character tables for the built-in encodings.

    The four tables are filled once, at class-definition time, from the
    (name, std, mac, win, pdf) rows of ENCODING: each non-zero code of a
    row is mapped to the unicode character of the row's glyph name.
    """

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(name2unicode(name))
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
        'StandardEncoding': std2unicode,
        'MacRomanEncoding': mac2unicode,
        'WinAnsiEncoding': win2unicode,
        'PDFDocEncoding': pdf2unicode,
    }

    @classmethod
    def get_encoding(klass, name, diff=None):
        """Return the code-to-character table for the encoding `name`.

        Unknown names fall back to the standard encoding table.  When
        `diff` is given, a copy of the table is patched and returned:
        an int item sets the current code, and a PSLiteral item assigns
        its glyph to the current code and advances the code by one.
        Glyph names that name2unicode rejects with KeyError are skipped,
        but the code still advances.
        """
        table = klass.encodings.get(name, klass.std2unicode)
        if not diff:
            return table
        # Patch a copy so the shared class-level table stays untouched.
        table = table.copy()
        cid = 0
        for item in diff:
            if isinstance(item, int):
                cid = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                table[cid] = unichr(name2unicode(item.name))
            except KeyError:
                pass
            cid += 1
        return table

View File

@ -89,7 +89,7 @@ class PDFTextDevice(PDFDevice):
else:
for cid in font.decode(obj):
try:
char = font.to_unicode(cid)
char = font.to_unichr(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)

View File

@ -4,17 +4,27 @@ try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from cmapdb import CMap, CMapDB, CMapParser
from cmapdb import FontMetricsDB, EncodingDB
from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
from encodingdb import EncodingDB
from struct import pack, unpack
from psparser import LIT, STRICT
from psparser import PSLiteral, literal_name
from pdftypes import PDFException, resolve1
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
from fontmetrics import FONT_METRICS
from utils import apply_matrix_norm, nunpack
## FontMetricsDB
##
class FontMetricsDB(object):
@classmethod
def get_metrics(klass, fontname):
return FONT_METRICS[fontname]
## CFFFont
## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification")
@ -249,7 +259,7 @@ class TrueTypeFont(object):
self.tables[name] = (offset, length)
return
def create_cmap(self):
def create_unicode_map(self):
if 'cmap' not in self.tables:
raise TrueTypeFont.CMapNotFound
(base_offset, length) = self.tables['cmap']
@ -302,9 +312,11 @@ class TrueTypeFont(object):
else:
for c in xrange(sc, ec+1):
char2gid[c] = (c + idd) & 0xffff
gid2char = dict( (gid, pack('>H', char))
for (char,gid) in char2gid.iteritems() )
return CMap().update(char2gid, gid2char)
# create unicode map
unicode_map = FileUnicodeMap()
for (char,gid) in char2gid.iteritems():
unicode_map.add_cid2code(gid, char)
return unicode_map
## Fonts
@ -383,20 +395,19 @@ class PDFSimpleFont(PDFFont):
self.encoding = EncodingDB.get_encoding(name, diff)
else:
self.encoding = EncodingDB.get_encoding(literal_name(encoding))
self.ucs2_cmap = None
self.unicode_map = None
if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap()
CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run()
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
PDFFont.__init__(self, descriptor, widths)
return
def to_unicode(self, cid):
if self.ucs2_cmap:
code = self.ucs2_cmap.tocode(cid)
if code:
chars = unpack('>%dH' % (len(code)/2), code)
return ''.join( unichr(c) for c in chars )
def to_unichr(self, cid):
if self.unicode_map:
code = self.unicode_map.get_unicode(cid)
if code is not None:
return unichr(code)
try:
return self.encoding[cid]
except KeyError:
@ -476,9 +487,11 @@ class PDFCIDFont(PDFFont):
raise PDFFontError('Encoding is unspecified')
name = 'unknown'
try:
self.cmap = rsrc.get_cmap(name, strict=STRICT)
self.cmap = CMapDB.get_cmap(name)
except CMapDB.CMapNotFound, e:
raise PDFFontError(e)
if STRICT:
raise PDFFontError(e)
self.cmap = CMap()
try:
descriptor = dict_value(spec['FontDescriptor'])
except KeyError:
@ -490,21 +503,20 @@ class PDFCIDFont(PDFFont):
self.fontfile = stream_value(descriptor.get('FontFile2'))
ttf = TrueTypeFont(self.basefont,
StringIO(self.fontfile.get_data()))
self.ucs2_cmap = None
self.unicode_map = None
if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap()
CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run()
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
elif self.cidcoding == 'Adobe-Identity':
if ttf:
try:
self.ucs2_cmap = ttf.create_cmap()
self.unicode_map = ttf.create_unicode_map()
except TrueTypeFont.CMapNotFound:
pass
else:
try:
self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
strict=STRICT)
self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
except CMapDB.CMapNotFound, e:
raise PDFFontError(e)
@ -558,14 +570,13 @@ class PDFCIDFont(PDFFont):
def char_disp(self, cid):
return self.disps.get(cid, self.default_disp)
def to_unicode(self, cid):
if not self.ucs2_cmap:
def to_unichr(self, cid):
if not self.unicode_map:
raise PDFUnicodeNotDefined(self.cidcoding, cid)
code = self.ucs2_cmap.tocode(cid)
if not code:
raise PDFUnicodeNotDefined(self.cidcoding, cid)
chars = unpack('>%dH' % (len(code)/2), code)
return ''.join( unichr(c) for c in chars )
code = self.unicode_map.get_unicode(cid)
if code is not None:
return unichr(code)
raise PDFUnicodeNotDefined(self.cidcoding, cid)
# main

View File

@ -6,7 +6,7 @@ try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from cmapdb import CMapDB
from cmapdb import CMapDB, CMap
from psparser import PSException, PSTypeError, PSEOF
from psparser import PSKeyword, literal_name, keyword_name
from psparser import PSStackParser
@ -106,9 +106,8 @@ class PDFResourceManager(object):
'''
debug = 0
def __init__(self, cmapdb):
def __init__(self):
self.fonts = {}
self.cmapdb = cmapdb
return
def get_procset(self, procs):
@ -123,7 +122,11 @@ class PDFResourceManager(object):
return
def get_cmap(self, cmapname, strict=False):
    """Return the CMap named `cmapname`.

    The lookup is delegated to CMapDB.get_cmap.  If the CMap is not
    found, CMapDB.CMapNotFound is re-raised when `strict` is true;
    otherwise an empty CMap is returned.
    """
    try:
        return CMapDB.get_cmap(cmapname)
    except CMapDB.CMapNotFound:
        if strict: raise
        # CMap is imported from cmapdb at module level; it is not an
        # attribute of CMapDB, so CMapDB.CMap() would raise AttributeError.
        return CMap()
def get_font(self, objid, spec):
if objid and objid in self.fonts:

View File

@ -19,7 +19,8 @@ PDF parser that can be used for other purposes instead of text analysis.''',
author_email='yusuke at cs dot nyu dot edu',
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
packages=[
'pdfminer'
'pdfminer',
'pdfminer.cmap'
],
scripts=[
'tools/pdf2txt.py',

155
tools/conv_cmap.py Executable file
View File

@ -0,0 +1,155 @@
#!/usr/bin/env python
import sys
import os.path
def process_cid2code(fp, check_codecs=[]):
    """Parse an Adobe cid2code.txt table into CMap lookup tables.

    fp is an iterable of text lines.  Lines starting with '#' are
    comments, the line starting with 'CID' names the CMap of every
    following tab-separated column, and each remaining line holds a
    CID followed by one cell per column: '*' for "no code", otherwise
    a comma-separated list of hex codes where a trailing 'v' marks a
    vertical-only code.

    check_codecs lists codec names used to rank candidate characters
    when a '-UTF8' column offers several codes for one CID.  The list
    is only read, never modified.

    Returns (code2cid, is_vertical, cid2unicode_h, cid2unicode_v):
      code2cid:      {cmapname: nested {byte: ... {byte: cid}} table}
      is_vertical:   {cmapname: True} for every vertical CMap seen
      cid2unicode_h: {cid: unicode code point} for horizontal writing
      cid2unicode_v: {cid: unicode code point} for vertical writing
    """

    def get_canonicals(name):
        # Returns the (horizontal, vertical) CMap names for a column.
        # A column already ending in '-H' has no vertical counterpart.
        if name.endswith('-H'):
            return (name, None)
        elif name == 'H':
            return ('H', 'V')
        else:
            return (name+'-H', name+'-V')

    def get_unicode(codes):
        # determine the "most popular" candidate: the character that
        # can be encoded by the largest number of check_codecs.
        d = {}
        for code in codes:
            char = unicode(code, 'utf-8')
            if char not in d:
                d[char] = 0
            for codec in check_codecs:
                try:
                    char.encode(codec, 'strict')
                    d[char] += 1
                except UnicodeError:
                    pass
        chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
        return ord(chars[0])

    def put(dmap, code, cid, force=False):
        # Walk/create one nested dict per leading byte, then store the
        # cid under the last byte.  Without force an existing, different
        # cid is left alone.
        for b in code[:-1]:
            b = ord(b)
            if b in dmap:
                dmap = dmap[b]
            else:
                d = {}
                dmap[b] = d
                dmap = d
        b = ord(code[-1])
        if force or ((b not in dmap) or dmap[b] == cid):
            dmap[b] = cid
        return

    names = []
    code2cid = {} # {'cmapname': ...}
    is_vertical = {}
    cid2unicode_h = {} # {cid: unicode}
    cid2unicode_v = {} # {cid: unicode}
    for line in fp:
        line = line.strip()
        # ''.split('\t') is [''], which is truthy, so blank lines have
        # to be skipped here; otherwise int('') below raises ValueError.
        if not line: continue
        if line.startswith('#'): continue
        if line.startswith('CID'):
            names = line.split('\t')[1:]
            continue

        f = line.split('\t')
        cid = int(f[0])
        for (x,name) in zip(f[1:], names):
            if x == '*': continue
            (hmapname, vmapname) = get_canonicals(name)
            hmap = code2cid.setdefault(hmapname, {})
            vmap = None
            if vmapname:
                is_vertical[vmapname] = True
                vmap = code2cid.setdefault(vmapname, {})

            hcodes = []
            vcodes = []
            for code in x.split(','):
                vertical = code.endswith('v')
                if vertical:
                    code = code[:-1]
                try:
                    code = code.decode('hex')
                except Exception:
                    # Not decodable as a hex string (e.g. an odd number
                    # of digits): treat it as a single byte value.
                    code = chr(int(code, 16))
                if vertical:
                    vcodes.append(code)
                else:
                    hcodes.append(code)

            if vcodes:
                # Explicit vertical codes: both maps take exactly what
                # the table says, overriding earlier entries.
                assert vmap is not None
                for code in vcodes:
                    put(vmap, code, cid, True)
                for code in hcodes:
                    put(hmap, code, cid, True)
                if name.endswith('-UTF8'):
                    if hcodes:
                        cid2unicode_h[cid] = get_unicode(hcodes)
                    if vcodes:
                        cid2unicode_v[cid] = get_unicode(vcodes)
            else:
                # No vertical variants: the horizontal codes also serve
                # the vertical map, when this column has one.
                for code in hcodes:
                    put(hmap, code, cid)
                    if vmap is not None:
                        put(vmap, code, cid)
                if name.endswith('-UTF8') and hcodes:
                    code = get_unicode(hcodes)
                    if cid not in cid2unicode_h:
                        cid2unicode_h[cid] = code
                    if cid not in cid2unicode_v:
                        cid2unicode_v[cid] = code
    return (code2cid, is_vertical, cid2unicode_h, cid2unicode_v)
# main
def main(argv):
def usage():
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
return 100
def pyname(name):
return name.replace('-','_')+'.py'
args = argv[1:]
if len(args) < 3: return usage()
(outdir, regname, src) = args[:3]
check_codecs = args[3:]
print >>sys.stderr, 'reading %r...' % src
fp = file(src)
(code2cid, is_vertical, cid2unicode_h, cid2unicode_v) = process_cid2code(fp, check_codecs)
fp.close()
for (name, cmap) in code2cid.iteritems():
fname = pyname(name)
print >>sys.stderr, 'writing %r...' % fname
fp = file(os.path.join(outdir, fname), 'w')
print >>fp, '#!/usr/bin/env python'
print >>fp, '#', fname
print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)
print >>fp, 'CODE2CID = %r' % cmap
fp.close()
fname = 'TO_UNICODE_'+pyname(regname)
print >>sys.stderr, 'writing %r...' % fname
fp = file(os.path.join(outdir, fname), 'w')
print >>fp, '#!/usr/bin/env python'
print >>fp, '#', fname
print >>fp, 'CID2UNICODE_H = %r' % cid2unicode_h
print >>fp, 'CID2UNICODE_V = %r' % cid2unicode_v
fp.close()
return 0
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -4,7 +4,7 @@ from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdfminer.cmapdb import CMapDB, find_cmap_path
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
# main
@ -22,8 +22,6 @@ def main(argv):
if not args: return usage()
# debug option
debug = 0
# path option
cmapdir = find_cmap_path()
# input option
password = ''
pagenos = set()
@ -38,7 +36,6 @@ def main(argv):
laparams = LAParams()
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-C': cmapdir = v
elif k == '-P': password = v
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
@ -59,8 +56,7 @@ def main(argv):
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
cmapdb = CMapDB(cmapdir)
rsrc = PDFResourceManager(cmapdb)
rsrc = PDFResourceManager()
if not outtype:
outtype = 'text'
if outfile: