From 8e92ddca30bedc83a4a3b120b301f5b77c86e08d Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Wed, 5 May 2010 05:51:11 +0000 Subject: [PATCH] latin2ascii.py was moved as a utility git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@215 1aa58f4a-7d42-0410-adbc-911cccaed67c --- setup.py | 5 ++-- {pdfminer => tools}/latin2ascii.py | 46 ++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 5 deletions(-) rename {pdfminer => tools}/latin2ascii.py (60%) mode change 100644 => 100755 diff --git a/setup.py b/setup.py index 3fd5e6a..baca2fe 100644 --- a/setup.py +++ b/setup.py @@ -20,11 +20,12 @@ PDF parser that can be used for other purposes instead of text analysis.''', url='http://www.unixuser.org/~euske/python/pdfminer/index.html', packages=[ 'pdfminer', - 'pdfminer.cmap' + 'pdfminer.cmap', ], scripts=[ 'tools/pdf2txt.py', - 'tools/dumppdf.py' + 'tools/dumppdf.py', + 'tools/latin2ascii.py', ], keywords=['pdf parser', 'pdf converter', 'text mining'], classifiers=[ diff --git a/pdfminer/latin2ascii.py b/tools/latin2ascii.py old mode 100644 new mode 100755 similarity index 60% rename from pdfminer/latin2ascii.py rename to tools/latin2ascii.py index 1bc196a..36588a6 --- a/pdfminer/latin2ascii.py +++ b/tools/latin2ascii.py @@ -1,15 +1,20 @@ #!/usr/bin/env python +# +# latin2ascii.py - converts latin1 characters into ascii. +# + +import sys """ Mappings from Latin-1 characters to ASCII. This is an in-house mapping table for some Latin-1 characters (acutes, umlauts, etc.) to ASCII strings. - -This file is *not* used currently. 
-
 """
 
 LATIN2ASCII = {
+  #0x00a0: '',
+  #0x00a7: '',
+
   # iso-8859-1
   0x00c0: 'A`',
   0x00c1: "A'",
@@ -87,4 +92,49 @@ LATIN2ASCII = {
   0xfb05: 'ft',
   0xfb06: 'st',
 
+  # Symbols
+  #0x2013: '',
+  0x2014: '--',
+  0x2015: '||',
+  0x2018: '`',
+  0x2019: "'",
+  0x201c: '``',
+  0x201d: "''",
+  #0x2022: '',
+  #0x2212: '',
+
 }
+
+def latin2ascii(s):
+  """Return s with each character that has a LATIN2ASCII entry replaced
+  by its ASCII spelling; characters without an entry pass through."""
+  return ''.join( LATIN2ASCII.get(ord(c),c) for c in s )
+
+
+def main(argv):
+  """Command-line entry point: transliterate the named files to stdout.
+
+  Each input line is decoded with the codec given by -c (default utf-8,
+  undecodable bytes are ignored), run through latin2ascii(), and written
+  as ASCII with any character that is still non-ASCII replaced.
+  Returns 100 after printing the usage message, otherwise None.
+  """
+  import getopt, fileinput
+  def usage():
+    print 'usage: %s [-c codec] file ...' % argv[0]
+    return 100
+  try:
+    # The trailing ':' makes -c consume the codec name as its value.
+    (opts, args) = getopt.getopt(argv[1:], 'c:')
+  except getopt.GetoptError:
+    return usage()
+  if not args: return usage()
+  codec = 'utf-8'
+  for (k, v) in opts:
+    if k == '-c': codec = v
+  for line in fileinput.input(args):
+    line = latin2ascii(unicode(line, codec, 'ignore'))
+    sys.stdout.write(line.encode('ascii', 'replace'))
+  return
+
+if __name__ == '__main__': sys.exit(main(sys.argv))