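Fix CMapDB initialization (CMapDB is now instantiated and passed to PDFResourceManager instead of being initialized at class level). More code cleanup.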

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@148 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-11-03 13:39:34 +00:00
parent 3dd4f1668b
commit 77986b8273
11 changed files with 66 additions and 134 deletions

1
TODO
View File

@ -4,3 +4,4 @@ TODOs:
- Better API Documentation.
- Robust error handling.
- Any special handling for linearized PDFs?
- Handle security handler. (I need more samples!)

View File

@ -4,20 +4,14 @@ import re
import os
import os.path
from sys import stderr
from struct import pack
from struct import unpack
from struct import pack, unpack
from psparser import PSStackParser
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
from psparser import PSLiteral, PSKeyword
from psparser import literal_name, keyword_name
from fontmetrics import FONT_METRICS
from latin_enc import ENCODING
from glyphlist import charname2unicode
from psparser import PSException
from psparser import PSSyntaxError
from psparser import PSTypeError
from psparser import PSEOF
from psparser import PSLiteral
from psparser import PSKeyword
from psparser import literal_name
from psparser import keyword_name
from psparser import PSStackParser
from utils import choplist
from utils import nunpack
try:
@ -201,36 +195,30 @@ class CMapDB(object):
class CMapNotFound(CMapError): pass
CMAP_ALIAS = {
}
CMAP_ALIAS = { }
debug = 0
dirname = None
cdbdirname = None
cmapdb = {}
@classmethod
def initialize(klass, dirname=None, cdbdirname=None):
def __init__(self, dirname=None, cdbdirname=None):
if not dirname:
dirname = find_cmap_path()
klass.dirname = dirname
klass.cdbdirname = cdbdirname or dirname
self.dirname = dirname
self.cdbdirname = cdbdirname or dirname
self.cmapdb = {}
return
@classmethod
def get_cmap(klass, cmapname, strict=True):
cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
if cmapname in klass.cmapdb:
cmap = klass.cmapdb[cmapname]
def get_cmap(self, cmapname, strict=True):
cmapname = self.CMAP_ALIAS.get(cmapname, cmapname)
if cmapname in self.cmapdb:
cmap = self.cmapdb[cmapname]
else:
fname = os.path.join(klass.dirname, cmapname)
cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
fname = os.path.join(self.dirname, cmapname)
cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb')
if os.path.exists(cdbname):
if 1 <= klass.debug:
if 1 <= self.debug:
print >>stderr, 'Opening: CDBCMap %r...' % cdbname
cmap = CDBCMap(cdbname)
elif os.path.exists(fname):
if 1 <= klass.debug:
if 1 <= self.debug:
print >>stderr, 'Reading: CMap %r...' % fname
cmap = CMap()
fp = file(fname, 'rb')
@ -240,7 +228,7 @@ class CMapDB(object):
cmap = CMap() # just create empty cmap
else:
raise CMapDB.CMapNotFound(cmapname)
klass.cmapdb[cmapname] = cmap
self.cmapdb[cmapname] = cmap
return cmap

View File

@ -1,20 +1,12 @@
#!/usr/bin/env python
import sys
from pdfdevice import PDFDevice
from pdfdevice import PDFTextDevice
from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer
from layout import LTPage
from layout import LTText
from layout import LTLine
from layout import LTRect
from layout import LTFigure
from layout import LTTextItem
from layout import LTTextBox
from layout import LTTextLine
from utils import apply_matrix_pt
from utils import mult_matrix
from layout import LTPage, LTText, LTLine, LTRect
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
from utils import enc
from utils import apply_matrix_pt, mult_matrix
## TagExtractor

View File

@ -1,33 +1,19 @@
#!/usr/bin/env python
import sys
from cmap import CMap
from cmap import CMapDB
from cmap import CMapParser
from cmap import FontMetricsDB
from cmap import EncodingDB
from struct import pack
from struct import unpack
from psparser import PSLiteralTable
from psparser import PSKeywordTable
from psparser import PSLiteral
from psparser import literal_name
from psparser import keyword_name
from psparser import STRICT
from pdftypes import PDFException
from pdftypes import resolve1
from pdftypes import int_value
from pdftypes import float_value
from pdftypes import num_value
from pdftypes import str_value
from pdftypes import list_value
from pdftypes import dict_value
from pdftypes import stream_value
from utils import apply_matrix_norm
from utils import nunpack
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from cmap import CMap, CMapDB, CMapParser
from cmap import FontMetricsDB, EncodingDB
from struct import pack, unpack
from psparser import STRICT
from psparser import PSLiteralTable, PSKeywordTable
from psparser import PSLiteral, literal_name, keyword_name
from pdftypes import PDFException, resolve1
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
from utils import apply_matrix_norm, nunpack
## CFFFont

View File

@ -7,43 +7,26 @@ try:
except ImportError:
from StringIO import StringIO
from cmap import CMapDB
from psparser import PSException
from psparser import PSTypeError
from psparser import PSEOF
from psparser import PSLiteralTable
from psparser import PSKeywordTable
from psparser import literal_name
from psparser import keyword_name
from psparser import PSException, PSTypeError, PSEOF
from psparser import PSLiteralTable, PSKeywordTable
from psparser import PSKeyword, literal_name, keyword_name
from psparser import PSStackParser
from psparser import PSKeyword
from psparser import STRICT
from pdftypes import PDFException
from pdftypes import PDFStream
from pdftypes import PDFObjRef
from pdftypes import PDFException, PDFStream, PDFObjRef
from pdftypes import resolve1
from pdftypes import int_value
from pdftypes import float_value
from pdftypes import num_value
from pdftypes import str_value
from pdftypes import list_value
from pdftypes import dict_value
from pdftypes import stream_value
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
from pdffont import PDFFontError
from pdffont import PDFType1Font
from pdffont import PDFTrueTypeFont
from pdffont import PDFType3Font
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
from pdffont import PDFCIDFont
from pdfparser import PDFDocument
from pdfparser import PDFParser
from pdfparser import PDFDocument, PDFParser
from pdfparser import PDFPasswordIncorrect
from pdfcolor import PDFColorSpace
from pdfcolor import PREDEFINED_COLORSPACE
from pdfcolor import LITERAL_DEVICE_GRAY
from pdfcolor import LITERAL_DEVICE_RGB
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from pdfcolor import LITERAL_DEVICE_CMYK
from utils import choplist
from utils import mult_matrix
from utils import MATRIX_IDENTITY
from utils import mult_matrix, MATRIX_IDENTITY
## Exceptions
@ -124,8 +107,9 @@ class PDFResourceManager(object):
'''
debug = 0
def __init__(self):
def __init__(self, cmapdb):
self.fonts = {}
self.cmapdb = cmapdb
return
def get_procset(self, procs):
@ -140,7 +124,7 @@ class PDFResourceManager(object):
return
def get_cmap(self, cmapname, strict=False):
return CMapDB.get_cmap(cmapname, strict=strict)
return self.cmapdb.get_cmap(cmapname, strict=strict)
def get_font(self, objid, spec):
if objid and objid in self.fonts:

View File

@ -4,32 +4,19 @@ import re
import md5
import struct
from sys import stderr
from utils import choplist
from utils import nunpack
from utils import decode_text
from arcfour import Arcfour
from psparser import PSStackParser
from psparser import PSSyntaxError
from psparser import PSEOF
from psparser import PSLiteralTable
from psparser import PSKeywordTable
from psparser import literal_name
from psparser import keyword_name
from psparser import PSSyntaxError, PSEOF
from psparser import PSLiteralTable, PSKeywordTable
from psparser import literal_name, keyword_name
from psparser import STRICT
from pdftypes import PDFException
from pdftypes import PDFTypeError
from pdftypes import PDFNotImplementedError
from pdftypes import PDFStream
from pdftypes import PDFObjRef
from pdftypes import resolve1
from pdftypes import decipher_all
from pdftypes import int_value
from pdftypes import float_value
from pdftypes import num_value
from pdftypes import str_value
from pdftypes import list_value
from pdftypes import dict_value
from pdftypes import stream_value
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
from pdftypes import PDFStream, PDFObjRef
from pdftypes import resolve1, decipher_all
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
from arcfour import Arcfour
from utils import choplist, nunpack
from utils import decode_text
## Exceptions

View File

@ -3,13 +3,9 @@ import sys
import zlib
from lzw import LZWDecoder
from psparser import PSException
from psparser import PSObject
from psparser import PSLiteral
from psparser import PSKeyword
from psparser import PSLiteralTable
from psparser import PSKeywordTable
from psparser import literal_name
from psparser import keyword_name
from psparser import PSObject, PSLiteral, PSKeyword
from psparser import PSLiteralTable, PSKeywordTable
from psparser import literal_name, keyword_name
from psparser import STRICT
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')

View File

@ -8,8 +8,7 @@
import sys
import os
from struct import pack
from struct import unpack
from struct import pack, unpack
from array import array

View File

@ -7,8 +7,7 @@
## http://www.efgh.com/software/rijndael.htm
##
import sys
from struct import pack
from struct import unpack
from struct import pack, unpack
def KEYLENGTH(keybits): return (keybits)/8
def RKLENGTH(keybits): return (keybits)/8+28

View File

@ -45,7 +45,7 @@ clean:
test: htmls texts xmls
htmls: $(HTMLS)
tests: $(TEXTS)
texts: $(TEXTS)
xmls: $(XMLS)
.SUFFIXES: .pdf .html .xml .txt

View File

@ -58,8 +58,8 @@ def main(argv):
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
CMapDB.initialize(cmapdir)
rsrc = PDFResourceManager()
cmapdb = CMapDB(cmapdir)
rsrc = PDFResourceManager(cmapdb)
if not outtype:
outtype = 'text'
if outfile: