Fix glyphlist bug (caused by my misunderstanding of the spec).

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@237 1aa58f4a-7d42-0410-adbc-911cccaed67c
2010-08-26 15:02:46 +00:00 · 2010-08-26 15:02:46 +00:00 · 4554705881
parent 055b4861af
commit 4554705881
7 changed files with 4320 additions and 4299 deletions
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@@ -163,7 +163,7 @@ class FileUnicodeMap(UnicodeMap):
        assert isinstance(cid, int)
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
-            self.cid2unichr[cid] = unichr(name2unicode(code.name))
+            self.cid2unichr[cid] = name2unicode(code.name)
        elif isinstance(code, str):
            # Interpret as UTF-16BE.
            self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore')
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@@ -2,7 +2,7 @@

 import re
 from psparser import PSLiteral
-from glyphlist import charname2unicode
+from glyphlist import glyphname2unicode
 from latin_enc import ENCODING


@@ -11,11 +11,11 @@ from latin_enc import ENCODING
 STRIP_NAME = re.compile(r'[0-9]+')
 def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers."""
-    if name in charname2unicode:
-        return charname2unicode[name]
+    if name in glyphname2unicode:
+        return glyphname2unicode[name]
    m = STRIP_NAME.search(name)
    if not m: raise KeyError(name)
-    return int(m.group(0))
+    return unichr(int(m.group(0)))


 ##  EncodingDB
@@ -27,7 +27,7 @@ class EncodingDB(object):
    win2unicode = {}
    pdf2unicode = {}
    for (name,std,mac,win,pdf) in ENCODING:
-        c = unichr(name2unicode(name))
+        c = name2unicode(name)
        if std: std2unicode[std] = c
        if mac: mac2unicode[mac] = c
        if win: win2unicode[win] = c
@@ -51,7 +51,7 @@ class EncodingDB(object):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
-                        cid2unicode[cid] = unichr(name2unicode(x.name))
+                        cid2unicode[cid] = name2unicode(x.name)
                    except KeyError:
                        pass
                    cid += 1
--- a/pdfminer/glyphlist.py
+++ b/pdfminer/glyphlist.py
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -212,6 +212,7 @@ class LTChar(LTItem, LTText):
        self.adv = font.char_width(cid) * fontsize * scaling
        try:
            text = font.to_unichr(cid)
+            assert isinstance(text, unicode), text
        except PDFUnicodeNotDefined:
            text = '?'
        (a,b,c,d,e,f) = self.matrix
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@@ -159,7 +159,7 @@ class TagExtractor(PDFDevice):

    def begin_tag(self, tag, props=None):
        s = ''
-        if props:
+        if isinstance(props, dict):
            s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
                         in sorted(props.iteritems()) )
        self.outfp.write('<%s%s>' % (enc(tag.name), s))
--- a/tools/conv_afm.py
+++ b/tools/conv_afm.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
 import sys
 import fileinput
-stdout = sys.stdout
-stderr = sys.stderr

 def main(argv):
    fonts = {}
--- a/tools/conv_glyphlist.py
+++ b/tools/conv_glyphlist.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+import sys
+import fileinput
+
+def main(argv):
+    state = 0
+    for line in fileinput.input():
+        line = line.strip()
+        if not line or line.startswith('#'):
+            if state == 1:
+                state = 2
+                print '}'
+                print
+            print line
+            continue
+        if state == 0:
+            print
+            print 'glyphname2unicode = {'
+            state = 1
+        (name,x) = line.split(';')
+        codes = x.split(' ')
+        print ' %r: u\'%s\',' % (name, ''.join( '\\u%s' % code for code in codes ))
+
+if __name__ == '__main__': sys.exit(main(sys.argv))