latin2ascii.py was moved as a utility
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@215 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
7f587cafec
commit
8e92ddca30
5
setup.py
5
setup.py
|
@ -20,11 +20,12 @@ PDF parser that can be used for other purposes instead of text analysis.''',
|
||||||
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
|
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
|
||||||
packages=[
|
packages=[
|
||||||
'pdfminer',
|
'pdfminer',
|
||||||
'pdfminer.cmap'
|
'pdfminer.cmap',
|
||||||
],
|
],
|
||||||
scripts=[
|
scripts=[
|
||||||
'tools/pdf2txt.py',
|
'tools/pdf2txt.py',
|
||||||
'tools/dumppdf.py'
|
'tools/dumppdf.py',
|
||||||
|
'tools/latin2ascii.py',
|
||||||
],
|
],
|
||||||
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
|
|
|
@ -1,15 +1,20 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
#
|
||||||
|
# latin2ascii.py - converts latin1 characters into ascii.
|
||||||
|
#
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
""" Mappings from Latin-1 characters to ASCII.
|
""" Mappings from Latin-1 characters to ASCII.
|
||||||
|
|
||||||
This is an in-house mapping table for some Latin-1 characters
|
This is an in-house mapping table for some Latin-1 characters
|
||||||
(acutes, umlauts, etc.) to ASCII strings.
|
(acutes, umlauts, etc.) to ASCII strings.
|
||||||
|
|
||||||
This file is *not* used currently.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
LATIN2ASCII = {
|
LATIN2ASCII = {
|
||||||
|
#0x00a0: '',
|
||||||
|
#0x00a7: '',
|
||||||
|
|
||||||
# iso-8859-1
|
# iso-8859-1
|
||||||
0x00c0: 'A`',
|
0x00c0: 'A`',
|
||||||
0x00c1: "A'",
|
0x00c1: "A'",
|
||||||
|
@ -87,4 +92,39 @@ LATIN2ASCII = {
|
||||||
0xfb05: 'ft',
|
0xfb05: 'ft',
|
||||||
0xfb06: 'st',
|
0xfb06: 'st',
|
||||||
|
|
||||||
|
# Symbols
|
||||||
|
#0x2013: '',
|
||||||
|
0x2014: '--',
|
||||||
|
0x2015: '||',
|
||||||
|
0x2018: '`',
|
||||||
|
0x2019: "'",
|
||||||
|
0x201c: '``',
|
||||||
|
0x201d: "''",
|
||||||
|
#0x2022: '',
|
||||||
|
#0x2212: '',
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def latin2ascii(s):
    """Return a copy of s with mapped characters replaced by ASCII strings.

    Each character of s is looked up in LATIN2ASCII by its code point;
    characters that have no entry in the table are passed through
    unchanged.
    """
    pieces = []
    for ch in s:
        # Fall back to the character itself when the table has no mapping.
        pieces.append(LATIN2ASCII.get(ord(ch), ch))
    return ''.join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv):
    """Command-line entry point: transliterate files to ASCII on stdout.

    Each line of the named input files is decoded with the selected codec
    (undecodable bytes are ignored), passed through latin2ascii(), and
    written to standard output encoded as ASCII; characters that are
    still outside ASCII after the mapping are replaced by the encoder.

    argv: the full argument vector, argv[0] being the program name.
          Usage: prog [-c codec] file ...
          The codec defaults to 'utf-8'.

    Returns 100 after printing a usage message when the options are
    invalid or no input file is given; otherwise returns None.

    NOTE: this is Python 2 code (it relies on the unicode() builtin).
    """
    import getopt
    import fileinput

    def usage():
        print('usage: %s [-c codec] file ...' % argv[0])
        return 100

    try:
        # The trailing colon makes -c consume the following argument as its
        # value; without it, -c is a bare flag, the codec name is treated as
        # an input file, and codec ends up as the empty string.
        (opts, args) = getopt.getopt(argv[1:], 'c:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    codec = 'utf-8'
    for (k, v) in opts:
        if k == '-c':
            codec = v

    for line in fileinput.input(args):
        line = latin2ascii(unicode(line, codec, 'ignore'))
        sys.stdout.write(line.encode('ascii', 'replace'))
    return
|
||||||
|
|
||||||
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
Loading…
Reference in New Issue