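Fix CMapDB initialization: CMapDB is now instantiated (replacing the CMapDB.initialize() classmethod and class-level state) and the instance is passed to PDFResourceManager. More code cleanup (consolidated imports).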

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@148 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-11-03 13:39:34 +00:00
parent 3dd4f1668b
commit 77986b8273
11 changed files with 66 additions and 134 deletions

1
TODO
View File

@ -4,3 +4,4 @@ TODOs:
- Better API Documentation. - Better API Documentation.
- Robust error handling. - Robust error handling.
- Any special handling for linearized PDFs? - Any special handling for linearized PDFs?
- Handle security handler. (I need more samples!)

View File

@ -4,20 +4,14 @@ import re
import os import os
import os.path import os.path
from sys import stderr from sys import stderr
from struct import pack from struct import pack, unpack
from struct import unpack from psparser import PSStackParser
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
from psparser import PSLiteral, PSKeyword
from psparser import literal_name, keyword_name
from fontmetrics import FONT_METRICS from fontmetrics import FONT_METRICS
from latin_enc import ENCODING from latin_enc import ENCODING
from glyphlist import charname2unicode from glyphlist import charname2unicode
from psparser import PSException
from psparser import PSSyntaxError
from psparser import PSTypeError
from psparser import PSEOF
from psparser import PSLiteral
from psparser import PSKeyword
from psparser import literal_name
from psparser import keyword_name
from psparser import PSStackParser
from utils import choplist from utils import choplist
from utils import nunpack from utils import nunpack
try: try:
@ -201,36 +195,30 @@ class CMapDB(object):
class CMapNotFound(CMapError): pass class CMapNotFound(CMapError): pass
CMAP_ALIAS = { CMAP_ALIAS = { }
}
debug = 0 debug = 0
dirname = None
cdbdirname = None
cmapdb = {}
@classmethod def __init__(self, dirname=None, cdbdirname=None):
def initialize(klass, dirname=None, cdbdirname=None):
if not dirname: if not dirname:
dirname = find_cmap_path() dirname = find_cmap_path()
klass.dirname = dirname self.dirname = dirname
klass.cdbdirname = cdbdirname or dirname self.cdbdirname = cdbdirname or dirname
self.cmapdb = {}
return return
@classmethod def get_cmap(self, cmapname, strict=True):
def get_cmap(klass, cmapname, strict=True): cmapname = self.CMAP_ALIAS.get(cmapname, cmapname)
cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname) if cmapname in self.cmapdb:
if cmapname in klass.cmapdb: cmap = self.cmapdb[cmapname]
cmap = klass.cmapdb[cmapname]
else: else:
fname = os.path.join(klass.dirname, cmapname) fname = os.path.join(self.dirname, cmapname)
cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb') cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb')
if os.path.exists(cdbname): if os.path.exists(cdbname):
if 1 <= klass.debug: if 1 <= self.debug:
print >>stderr, 'Opening: CDBCMap %r...' % cdbname print >>stderr, 'Opening: CDBCMap %r...' % cdbname
cmap = CDBCMap(cdbname) cmap = CDBCMap(cdbname)
elif os.path.exists(fname): elif os.path.exists(fname):
if 1 <= klass.debug: if 1 <= self.debug:
print >>stderr, 'Reading: CMap %r...' % fname print >>stderr, 'Reading: CMap %r...' % fname
cmap = CMap() cmap = CMap()
fp = file(fname, 'rb') fp = file(fname, 'rb')
@ -240,7 +228,7 @@ class CMapDB(object):
cmap = CMap() # just create empty cmap cmap = CMap() # just create empty cmap
else: else:
raise CMapDB.CMapNotFound(cmapname) raise CMapDB.CMapNotFound(cmapname)
klass.cmapdb[cmapname] = cmap self.cmapdb[cmapname] = cmap
return cmap return cmap

View File

@ -1,20 +1,12 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from pdfdevice import PDFDevice from pdfdevice import PDFDevice, PDFTextDevice
from pdfdevice import PDFTextDevice
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer from layout import LayoutContainer
from layout import LTPage from layout import LTPage, LTText, LTLine, LTRect
from layout import LTText from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
from layout import LTLine
from layout import LTRect
from layout import LTFigure
from layout import LTTextItem
from layout import LTTextBox
from layout import LTTextLine
from utils import apply_matrix_pt
from utils import mult_matrix
from utils import enc from utils import enc
from utils import apply_matrix_pt, mult_matrix
## TagExtractor ## TagExtractor

View File

@ -1,33 +1,19 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from cmap import CMap
from cmap import CMapDB
from cmap import CMapParser
from cmap import FontMetricsDB
from cmap import EncodingDB
from struct import pack
from struct import unpack
from psparser import PSLiteralTable
from psparser import PSKeywordTable
from psparser import PSLiteral
from psparser import literal_name
from psparser import keyword_name
from psparser import STRICT
from pdftypes import PDFException
from pdftypes import resolve1
from pdftypes import int_value
from pdftypes import float_value
from pdftypes import num_value
from pdftypes import str_value
from pdftypes import list_value
from pdftypes import dict_value
from pdftypes import stream_value
from utils import apply_matrix_norm
from utils import nunpack
try: try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
from cmap import CMap, CMapDB, CMapParser
from cmap import FontMetricsDB, EncodingDB
from struct import pack, unpack
from psparser import STRICT
from psparser import PSLiteralTable, PSKeywordTable
from psparser import PSLiteral, literal_name, keyword_name
from pdftypes import PDFException, resolve1
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
from utils import apply_matrix_norm, nunpack
## CFFFont ## CFFFont

View File

@ -7,43 +7,26 @@ try:
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
from cmap import CMapDB from cmap import CMapDB
from psparser import PSException from psparser import PSException, PSTypeError, PSEOF
from psparser import PSTypeError from psparser import PSLiteralTable, PSKeywordTable
from psparser import PSEOF from psparser import PSKeyword, literal_name, keyword_name
from psparser import PSLiteralTable
from psparser import PSKeywordTable
from psparser import literal_name
from psparser import keyword_name
from psparser import PSStackParser from psparser import PSStackParser
from psparser import PSKeyword
from psparser import STRICT from psparser import STRICT
from pdftypes import PDFException from pdftypes import PDFException, PDFStream, PDFObjRef
from pdftypes import PDFStream
from pdftypes import PDFObjRef
from pdftypes import resolve1 from pdftypes import resolve1
from pdftypes import int_value from pdftypes import int_value, float_value, num_value
from pdftypes import float_value from pdftypes import str_value, list_value, dict_value, stream_value
from pdftypes import num_value
from pdftypes import str_value
from pdftypes import list_value
from pdftypes import dict_value
from pdftypes import stream_value
from pdffont import PDFFontError from pdffont import PDFFontError
from pdffont import PDFType1Font from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
from pdffont import PDFTrueTypeFont
from pdffont import PDFType3Font
from pdffont import PDFCIDFont from pdffont import PDFCIDFont
from pdfparser import PDFDocument from pdfparser import PDFDocument, PDFParser
from pdfparser import PDFParser
from pdfparser import PDFPasswordIncorrect from pdfparser import PDFPasswordIncorrect
from pdfcolor import PDFColorSpace from pdfcolor import PDFColorSpace
from pdfcolor import PREDEFINED_COLORSPACE from pdfcolor import PREDEFINED_COLORSPACE
from pdfcolor import LITERAL_DEVICE_GRAY from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from pdfcolor import LITERAL_DEVICE_RGB
from pdfcolor import LITERAL_DEVICE_CMYK from pdfcolor import LITERAL_DEVICE_CMYK
from utils import choplist from utils import choplist
from utils import mult_matrix from utils import mult_matrix, MATRIX_IDENTITY
from utils import MATRIX_IDENTITY
## Exceptions ## Exceptions
@ -124,8 +107,9 @@ class PDFResourceManager(object):
''' '''
debug = 0 debug = 0
def __init__(self): def __init__(self, cmapdb):
self.fonts = {} self.fonts = {}
self.cmapdb = cmapdb
return return
def get_procset(self, procs): def get_procset(self, procs):
@ -140,7 +124,7 @@ class PDFResourceManager(object):
return return
def get_cmap(self, cmapname, strict=False): def get_cmap(self, cmapname, strict=False):
return CMapDB.get_cmap(cmapname, strict=strict) return self.cmapdb.get_cmap(cmapname, strict=strict)
def get_font(self, objid, spec): def get_font(self, objid, spec):
if objid and objid in self.fonts: if objid and objid in self.fonts:

View File

@ -4,32 +4,19 @@ import re
import md5 import md5
import struct import struct
from sys import stderr from sys import stderr
from utils import choplist
from utils import nunpack
from utils import decode_text
from arcfour import Arcfour
from psparser import PSStackParser from psparser import PSStackParser
from psparser import PSSyntaxError from psparser import PSSyntaxError, PSEOF
from psparser import PSEOF from psparser import PSLiteralTable, PSKeywordTable
from psparser import PSLiteralTable from psparser import literal_name, keyword_name
from psparser import PSKeywordTable
from psparser import literal_name
from psparser import keyword_name
from psparser import STRICT from psparser import STRICT
from pdftypes import PDFException from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
from pdftypes import PDFTypeError from pdftypes import PDFStream, PDFObjRef
from pdftypes import PDFNotImplementedError from pdftypes import resolve1, decipher_all
from pdftypes import PDFStream from pdftypes import int_value, float_value, num_value
from pdftypes import PDFObjRef from pdftypes import str_value, list_value, dict_value, stream_value
from pdftypes import resolve1 from arcfour import Arcfour
from pdftypes import decipher_all from utils import choplist, nunpack
from pdftypes import int_value from utils import decode_text
from pdftypes import float_value
from pdftypes import num_value
from pdftypes import str_value
from pdftypes import list_value
from pdftypes import dict_value
from pdftypes import stream_value
## Exceptions ## Exceptions

View File

@ -3,13 +3,9 @@ import sys
import zlib import zlib
from lzw import LZWDecoder from lzw import LZWDecoder
from psparser import PSException from psparser import PSException
from psparser import PSObject from psparser import PSObject, PSLiteral, PSKeyword
from psparser import PSLiteral from psparser import PSLiteralTable, PSKeywordTable
from psparser import PSKeyword from psparser import literal_name, keyword_name
from psparser import PSLiteralTable
from psparser import PSKeywordTable
from psparser import literal_name
from psparser import keyword_name
from psparser import STRICT from psparser import STRICT
LITERAL_CRYPT = PSLiteralTable.intern('Crypt') LITERAL_CRYPT = PSLiteralTable.intern('Crypt')

View File

@ -8,8 +8,7 @@
import sys import sys
import os import os
from struct import pack from struct import pack, unpack
from struct import unpack
from array import array from array import array

View File

@ -7,8 +7,7 @@
## http://www.efgh.com/software/rijndael.htm ## http://www.efgh.com/software/rijndael.htm
## ##
import sys import sys
from struct import pack from struct import pack, unpack
from struct import unpack
def KEYLENGTH(keybits): return (keybits)/8 def KEYLENGTH(keybits): return (keybits)/8
def RKLENGTH(keybits): return (keybits)/8+28 def RKLENGTH(keybits): return (keybits)/8+28

View File

@ -45,7 +45,7 @@ clean:
test: htmls texts xmls test: htmls texts xmls
htmls: $(HTMLS) htmls: $(HTMLS)
tests: $(TEXTS) texts: $(TEXTS)
xmls: $(XMLS) xmls: $(XMLS)
.SUFFIXES: .pdf .html .xml .txt .SUFFIXES: .pdf .html .xml .txt

View File

@ -58,8 +58,8 @@ def main(argv):
PDFPageInterpreter.debug = debug PDFPageInterpreter.debug = debug
PDFDevice.debug = debug PDFDevice.debug = debug
# #
CMapDB.initialize(cmapdir) cmapdb = CMapDB(cmapdir)
rsrc = PDFResourceManager() rsrc = PDFResourceManager(cmapdb)
if not outtype: if not outtype:
outtype = 'text' outtype = 'text'
if outfile: if outfile: