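Fix CMapDB initialization: CMapDB is now instantiated (CMapDB(cmapdir)) and passed to PDFResourceManager instead of being set up through the CMapDB.initialize() class method. More code cleanup (imports consolidated).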
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@148 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
3dd4f1668b
commit
77986b8273
1
TODO
1
TODO
|
@ -4,3 +4,4 @@ TODOs:
|
||||||
- Better API Documentation.
|
- Better API Documentation.
|
||||||
- Robust error handling.
|
- Robust error handling.
|
||||||
- Any special handling for linearized PDFs?
|
- Any special handling for linearized PDFs?
|
||||||
|
- Handle security handler. (I need more samples!)
|
||||||
|
|
|
@ -4,20 +4,14 @@ import re
|
||||||
import os
|
import os
|
||||||
import os.path
|
import os.path
|
||||||
from sys import stderr
|
from sys import stderr
|
||||||
from struct import pack
|
from struct import pack, unpack
|
||||||
from struct import unpack
|
from psparser import PSStackParser
|
||||||
|
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
|
||||||
|
from psparser import PSLiteral, PSKeyword
|
||||||
|
from psparser import literal_name, keyword_name
|
||||||
from fontmetrics import FONT_METRICS
|
from fontmetrics import FONT_METRICS
|
||||||
from latin_enc import ENCODING
|
from latin_enc import ENCODING
|
||||||
from glyphlist import charname2unicode
|
from glyphlist import charname2unicode
|
||||||
from psparser import PSException
|
|
||||||
from psparser import PSSyntaxError
|
|
||||||
from psparser import PSTypeError
|
|
||||||
from psparser import PSEOF
|
|
||||||
from psparser import PSLiteral
|
|
||||||
from psparser import PSKeyword
|
|
||||||
from psparser import literal_name
|
|
||||||
from psparser import keyword_name
|
|
||||||
from psparser import PSStackParser
|
|
||||||
from utils import choplist
|
from utils import choplist
|
||||||
from utils import nunpack
|
from utils import nunpack
|
||||||
try:
|
try:
|
||||||
|
@ -201,36 +195,30 @@ class CMapDB(object):
|
||||||
|
|
||||||
class CMapNotFound(CMapError): pass
|
class CMapNotFound(CMapError): pass
|
||||||
|
|
||||||
CMAP_ALIAS = {
|
CMAP_ALIAS = { }
|
||||||
}
|
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
dirname = None
|
|
||||||
cdbdirname = None
|
|
||||||
cmapdb = {}
|
|
||||||
|
|
||||||
@classmethod
|
def __init__(self, dirname=None, cdbdirname=None):
|
||||||
def initialize(klass, dirname=None, cdbdirname=None):
|
|
||||||
if not dirname:
|
if not dirname:
|
||||||
dirname = find_cmap_path()
|
dirname = find_cmap_path()
|
||||||
klass.dirname = dirname
|
self.dirname = dirname
|
||||||
klass.cdbdirname = cdbdirname or dirname
|
self.cdbdirname = cdbdirname or dirname
|
||||||
|
self.cmapdb = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
@classmethod
|
def get_cmap(self, cmapname, strict=True):
|
||||||
def get_cmap(klass, cmapname, strict=True):
|
cmapname = self.CMAP_ALIAS.get(cmapname, cmapname)
|
||||||
cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
|
if cmapname in self.cmapdb:
|
||||||
if cmapname in klass.cmapdb:
|
cmap = self.cmapdb[cmapname]
|
||||||
cmap = klass.cmapdb[cmapname]
|
|
||||||
else:
|
else:
|
||||||
fname = os.path.join(klass.dirname, cmapname)
|
fname = os.path.join(self.dirname, cmapname)
|
||||||
cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
|
cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb')
|
||||||
if os.path.exists(cdbname):
|
if os.path.exists(cdbname):
|
||||||
if 1 <= klass.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Opening: CDBCMap %r...' % cdbname
|
print >>stderr, 'Opening: CDBCMap %r...' % cdbname
|
||||||
cmap = CDBCMap(cdbname)
|
cmap = CDBCMap(cdbname)
|
||||||
elif os.path.exists(fname):
|
elif os.path.exists(fname):
|
||||||
if 1 <= klass.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Reading: CMap %r...' % fname
|
print >>stderr, 'Reading: CMap %r...' % fname
|
||||||
cmap = CMap()
|
cmap = CMap()
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
|
@ -240,7 +228,7 @@ class CMapDB(object):
|
||||||
cmap = CMap() # just create empty cmap
|
cmap = CMap() # just create empty cmap
|
||||||
else:
|
else:
|
||||||
raise CMapDB.CMapNotFound(cmapname)
|
raise CMapDB.CMapNotFound(cmapname)
|
||||||
klass.cmapdb[cmapname] = cmap
|
self.cmapdb[cmapname] = cmap
|
||||||
return cmap
|
return cmap
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,20 +1,12 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
from pdfdevice import PDFDevice
|
from pdfdevice import PDFDevice, PDFTextDevice
|
||||||
from pdfdevice import PDFTextDevice
|
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
from layout import LayoutContainer
|
from layout import LayoutContainer
|
||||||
from layout import LTPage
|
from layout import LTPage, LTText, LTLine, LTRect
|
||||||
from layout import LTText
|
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
|
||||||
from layout import LTLine
|
|
||||||
from layout import LTRect
|
|
||||||
from layout import LTFigure
|
|
||||||
from layout import LTTextItem
|
|
||||||
from layout import LTTextBox
|
|
||||||
from layout import LTTextLine
|
|
||||||
from utils import apply_matrix_pt
|
|
||||||
from utils import mult_matrix
|
|
||||||
from utils import enc
|
from utils import enc
|
||||||
|
from utils import apply_matrix_pt, mult_matrix
|
||||||
|
|
||||||
|
|
||||||
## TagExtractor
|
## TagExtractor
|
||||||
|
|
|
@ -1,33 +1,19 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
from cmap import CMap
|
|
||||||
from cmap import CMapDB
|
|
||||||
from cmap import CMapParser
|
|
||||||
from cmap import FontMetricsDB
|
|
||||||
from cmap import EncodingDB
|
|
||||||
from struct import pack
|
|
||||||
from struct import unpack
|
|
||||||
from psparser import PSLiteralTable
|
|
||||||
from psparser import PSKeywordTable
|
|
||||||
from psparser import PSLiteral
|
|
||||||
from psparser import literal_name
|
|
||||||
from psparser import keyword_name
|
|
||||||
from psparser import STRICT
|
|
||||||
from pdftypes import PDFException
|
|
||||||
from pdftypes import resolve1
|
|
||||||
from pdftypes import int_value
|
|
||||||
from pdftypes import float_value
|
|
||||||
from pdftypes import num_value
|
|
||||||
from pdftypes import str_value
|
|
||||||
from pdftypes import list_value
|
|
||||||
from pdftypes import dict_value
|
|
||||||
from pdftypes import stream_value
|
|
||||||
from utils import apply_matrix_norm
|
|
||||||
from utils import nunpack
|
|
||||||
try:
|
try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
|
from cmap import CMap, CMapDB, CMapParser
|
||||||
|
from cmap import FontMetricsDB, EncodingDB
|
||||||
|
from struct import pack, unpack
|
||||||
|
from psparser import STRICT
|
||||||
|
from psparser import PSLiteralTable, PSKeywordTable
|
||||||
|
from psparser import PSLiteral, literal_name, keyword_name
|
||||||
|
from pdftypes import PDFException, resolve1
|
||||||
|
from pdftypes import int_value, float_value, num_value
|
||||||
|
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||||
|
from utils import apply_matrix_norm, nunpack
|
||||||
|
|
||||||
|
|
||||||
## CFFFont
|
## CFFFont
|
||||||
|
|
|
@ -7,43 +7,26 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from cmap import CMapDB
|
from cmap import CMapDB
|
||||||
from psparser import PSException
|
from psparser import PSException, PSTypeError, PSEOF
|
||||||
from psparser import PSTypeError
|
from psparser import PSLiteralTable, PSKeywordTable
|
||||||
from psparser import PSEOF
|
from psparser import PSKeyword, literal_name, keyword_name
|
||||||
from psparser import PSLiteralTable
|
|
||||||
from psparser import PSKeywordTable
|
|
||||||
from psparser import literal_name
|
|
||||||
from psparser import keyword_name
|
|
||||||
from psparser import PSStackParser
|
from psparser import PSStackParser
|
||||||
from psparser import PSKeyword
|
|
||||||
from psparser import STRICT
|
from psparser import STRICT
|
||||||
from pdftypes import PDFException
|
from pdftypes import PDFException, PDFStream, PDFObjRef
|
||||||
from pdftypes import PDFStream
|
|
||||||
from pdftypes import PDFObjRef
|
|
||||||
from pdftypes import resolve1
|
from pdftypes import resolve1
|
||||||
from pdftypes import int_value
|
from pdftypes import int_value, float_value, num_value
|
||||||
from pdftypes import float_value
|
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||||
from pdftypes import num_value
|
|
||||||
from pdftypes import str_value
|
|
||||||
from pdftypes import list_value
|
|
||||||
from pdftypes import dict_value
|
|
||||||
from pdftypes import stream_value
|
|
||||||
from pdffont import PDFFontError
|
from pdffont import PDFFontError
|
||||||
from pdffont import PDFType1Font
|
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
|
||||||
from pdffont import PDFTrueTypeFont
|
|
||||||
from pdffont import PDFType3Font
|
|
||||||
from pdffont import PDFCIDFont
|
from pdffont import PDFCIDFont
|
||||||
from pdfparser import PDFDocument
|
from pdfparser import PDFDocument, PDFParser
|
||||||
from pdfparser import PDFParser
|
|
||||||
from pdfparser import PDFPasswordIncorrect
|
from pdfparser import PDFPasswordIncorrect
|
||||||
from pdfcolor import PDFColorSpace
|
from pdfcolor import PDFColorSpace
|
||||||
from pdfcolor import PREDEFINED_COLORSPACE
|
from pdfcolor import PREDEFINED_COLORSPACE
|
||||||
from pdfcolor import LITERAL_DEVICE_GRAY
|
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||||
from pdfcolor import LITERAL_DEVICE_RGB
|
|
||||||
from pdfcolor import LITERAL_DEVICE_CMYK
|
from pdfcolor import LITERAL_DEVICE_CMYK
|
||||||
from utils import choplist
|
from utils import choplist
|
||||||
from utils import mult_matrix
|
from utils import mult_matrix, MATRIX_IDENTITY
|
||||||
from utils import MATRIX_IDENTITY
|
|
||||||
|
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
|
@ -124,8 +107,9 @@ class PDFResourceManager(object):
|
||||||
'''
|
'''
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, cmapdb):
|
||||||
self.fonts = {}
|
self.fonts = {}
|
||||||
|
self.cmapdb = cmapdb
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_procset(self, procs):
|
def get_procset(self, procs):
|
||||||
|
@ -140,7 +124,7 @@ class PDFResourceManager(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_cmap(self, cmapname, strict=False):
|
def get_cmap(self, cmapname, strict=False):
|
||||||
return CMapDB.get_cmap(cmapname, strict=strict)
|
return self.cmapdb.get_cmap(cmapname, strict=strict)
|
||||||
|
|
||||||
def get_font(self, objid, spec):
|
def get_font(self, objid, spec):
|
||||||
if objid and objid in self.fonts:
|
if objid and objid in self.fonts:
|
||||||
|
|
|
@ -4,32 +4,19 @@ import re
|
||||||
import md5
|
import md5
|
||||||
import struct
|
import struct
|
||||||
from sys import stderr
|
from sys import stderr
|
||||||
from utils import choplist
|
|
||||||
from utils import nunpack
|
|
||||||
from utils import decode_text
|
|
||||||
from arcfour import Arcfour
|
|
||||||
from psparser import PSStackParser
|
from psparser import PSStackParser
|
||||||
from psparser import PSSyntaxError
|
from psparser import PSSyntaxError, PSEOF
|
||||||
from psparser import PSEOF
|
from psparser import PSLiteralTable, PSKeywordTable
|
||||||
from psparser import PSLiteralTable
|
from psparser import literal_name, keyword_name
|
||||||
from psparser import PSKeywordTable
|
|
||||||
from psparser import literal_name
|
|
||||||
from psparser import keyword_name
|
|
||||||
from psparser import STRICT
|
from psparser import STRICT
|
||||||
from pdftypes import PDFException
|
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
|
||||||
from pdftypes import PDFTypeError
|
from pdftypes import PDFStream, PDFObjRef
|
||||||
from pdftypes import PDFNotImplementedError
|
from pdftypes import resolve1, decipher_all
|
||||||
from pdftypes import PDFStream
|
from pdftypes import int_value, float_value, num_value
|
||||||
from pdftypes import PDFObjRef
|
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||||
from pdftypes import resolve1
|
from arcfour import Arcfour
|
||||||
from pdftypes import decipher_all
|
from utils import choplist, nunpack
|
||||||
from pdftypes import int_value
|
from utils import decode_text
|
||||||
from pdftypes import float_value
|
|
||||||
from pdftypes import num_value
|
|
||||||
from pdftypes import str_value
|
|
||||||
from pdftypes import list_value
|
|
||||||
from pdftypes import dict_value
|
|
||||||
from pdftypes import stream_value
|
|
||||||
|
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
|
|
|
@ -3,13 +3,9 @@ import sys
|
||||||
import zlib
|
import zlib
|
||||||
from lzw import LZWDecoder
|
from lzw import LZWDecoder
|
||||||
from psparser import PSException
|
from psparser import PSException
|
||||||
from psparser import PSObject
|
from psparser import PSObject, PSLiteral, PSKeyword
|
||||||
from psparser import PSLiteral
|
from psparser import PSLiteralTable, PSKeywordTable
|
||||||
from psparser import PSKeyword
|
from psparser import literal_name, keyword_name
|
||||||
from psparser import PSLiteralTable
|
|
||||||
from psparser import PSKeywordTable
|
|
||||||
from psparser import literal_name
|
|
||||||
from psparser import keyword_name
|
|
||||||
from psparser import STRICT
|
from psparser import STRICT
|
||||||
|
|
||||||
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||||
|
|
|
@ -8,8 +8,7 @@
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
from struct import pack
|
from struct import pack, unpack
|
||||||
from struct import unpack
|
|
||||||
from array import array
|
from array import array
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -7,8 +7,7 @@
|
||||||
## http://www.efgh.com/software/rijndael.htm
|
## http://www.efgh.com/software/rijndael.htm
|
||||||
##
|
##
|
||||||
import sys
|
import sys
|
||||||
from struct import pack
|
from struct import pack, unpack
|
||||||
from struct import unpack
|
|
||||||
|
|
||||||
def KEYLENGTH(keybits): return (keybits)/8
|
def KEYLENGTH(keybits): return (keybits)/8
|
||||||
def RKLENGTH(keybits): return (keybits)/8+28
|
def RKLENGTH(keybits): return (keybits)/8+28
|
||||||
|
|
|
@ -45,7 +45,7 @@ clean:
|
||||||
|
|
||||||
test: htmls texts xmls
|
test: htmls texts xmls
|
||||||
htmls: $(HTMLS)
|
htmls: $(HTMLS)
|
||||||
tests: $(TEXTS)
|
texts: $(TEXTS)
|
||||||
xmls: $(XMLS)
|
xmls: $(XMLS)
|
||||||
|
|
||||||
.SUFFIXES: .pdf .html .xml .txt
|
.SUFFIXES: .pdf .html .xml .txt
|
||||||
|
|
|
@ -58,8 +58,8 @@ def main(argv):
|
||||||
PDFPageInterpreter.debug = debug
|
PDFPageInterpreter.debug = debug
|
||||||
PDFDevice.debug = debug
|
PDFDevice.debug = debug
|
||||||
#
|
#
|
||||||
CMapDB.initialize(cmapdir)
|
cmapdb = CMapDB(cmapdir)
|
||||||
rsrc = PDFResourceManager()
|
rsrc = PDFResourceManager(cmapdb)
|
||||||
if not outtype:
|
if not outtype:
|
||||||
outtype = 'text'
|
outtype = 'text'
|
||||||
if outfile:
|
if outfile:
|
||||||
|
|
Loading…
Reference in New Issue