latin2ascii.py was moved as a utility

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@215 1aa58f4a-7d42-0410-adbc-911cccaed67c
2010-05-05 05:51:11 +00:00 · 2010-05-05 05:51:11 +00:00 · 8e92ddca30
parent 7f587cafec
commit 8e92ddca30
2 changed files with 46 additions and 5 deletions
--- a/setup.py
+++ b/setup.py
@ -20,11 +20,12 @@ PDF parser that can be used for other purposes instead of text analysis.''',
    url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
    packages=[
    'pdfminer',
-    'pdfminer.cmap'
+    'pdfminer.cmap',
    ],
    scripts=[
    'tools/pdf2txt.py',
-    'tools/dumppdf.py'
+    'tools/dumppdf.py',
+    'tools/latin2ascii.py',
    ],
    keywords=['pdf parser', 'pdf converter', 'text mining'],
    classifiers=[
--- a/pdfminer/latin2ascii.py
+++ b/pdfminer/latin2ascii.py
@ -1,15 +1,20 @@
 #!/usr/bin/env python
+#
+#  latin2ascii.py - converts latin1 characters into ascii.
+#
+
+import sys

 """ Mappings from Latin-1 characters to ASCII.

 This is an in-house mapping table for some Latin-1 characters
 (acutes, umlauts, etc.) to ASCII strings.
-
-This file is *not* used currently.
-
 """

 LATIN2ASCII = {
+  #0x00a0: '',
+  #0x00a7: '',
+  
  # iso-8859-1
  0x00c0: 'A`',
  0x00c1: "A'",
@ -87,4 +92,39 @@ LATIN2ASCII = {
  0xfb05: 'ft',
  0xfb06: 'st',

+  # Symbols
+  #0x2013: '',
+  0x2014: '--',
+  0x2015: '||',
+  0x2018: '`',
+  0x2019: "'",
+  0x201c: '``',
+  0x201d: "''",
+  #0x2022: '',
+  #0x2212: '',
+
 }
+
+def latin2ascii(s):
+    """Return s with each character listed in LATIN2ASCII replaced by its
+    ASCII string; characters not in the table are kept unchanged."""
+    pieces = []
+    for ch in s:
+        pieces.append(LATIN2ASCII.get(ord(ch), ch))
+    return ''.join(pieces)
+
+
+def main(argv):
+    """Command-line entry point: print the given files converted to ASCII.
+
+    argv is the full argument vector (argv[0] is the program name).
+    Each input line is decoded with the codec given by -c (utf-8 when
+    the option is absent), passed through latin2ascii(), and written to
+    stdout encoded as ASCII.
+
+    Returns 100 after printing the usage message (unknown option or no
+    input files); otherwise returns None.
+    """
+    import getopt, fileinput
+    def usage():
+        print 'usage: %s [-c codec] file ...' % argv[0]
+        return 100
+    try:
+        # 'c:' (with the colon) declares that -c takes an argument; a bare
+        # 'c' makes it a flag whose value is always the empty string.
+        (opts, args) = getopt.getopt(argv[1:], 'c:')
+    except getopt.GetoptError:
+        return usage()
+    if not args: return usage()
+    codec = 'utf-8'
+    for (k, v) in opts:
+        if k == '-c': codec = v
+    for line in fileinput.input(args):
+        # 'ignore' drops input bytes that cannot be decoded with the codec.
+        line = latin2ascii(unicode(line, codec, 'ignore'))
+        # 'replace' substitutes characters that are still not ASCII.
+        sys.stdout.write(line.encode('ascii', 'replace'))
+    return
+
+if __name__ == '__main__': sys.exit(main(sys.argv))