fix CMapDB initialization stuff. more code cleanup.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@148 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
3dd4f1668b
commit
77986b8273
1
TODO
1
TODO
|
@ -4,3 +4,4 @@ TODOs:
|
|||
- Better API Documentation.
|
||||
- Robust error handling.
|
||||
- Any special handling for linearized PDFs?
|
||||
- Handle security handler. (I need more samples!)
|
||||
|
|
|
@ -4,20 +4,14 @@ import re
|
|||
import os
|
||||
import os.path
|
||||
from sys import stderr
|
||||
from struct import pack
|
||||
from struct import unpack
|
||||
from struct import pack, unpack
|
||||
from psparser import PSStackParser
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
|
||||
from psparser import PSLiteral, PSKeyword
|
||||
from psparser import literal_name, keyword_name
|
||||
from fontmetrics import FONT_METRICS
|
||||
from latin_enc import ENCODING
|
||||
from glyphlist import charname2unicode
|
||||
from psparser import PSException
|
||||
from psparser import PSSyntaxError
|
||||
from psparser import PSTypeError
|
||||
from psparser import PSEOF
|
||||
from psparser import PSLiteral
|
||||
from psparser import PSKeyword
|
||||
from psparser import literal_name
|
||||
from psparser import keyword_name
|
||||
from psparser import PSStackParser
|
||||
from utils import choplist
|
||||
from utils import nunpack
|
||||
try:
|
||||
|
@ -201,36 +195,30 @@ class CMapDB(object):
|
|||
|
||||
class CMapNotFound(CMapError): pass
|
||||
|
||||
CMAP_ALIAS = {
|
||||
}
|
||||
|
||||
CMAP_ALIAS = { }
|
||||
debug = 0
|
||||
dirname = None
|
||||
cdbdirname = None
|
||||
cmapdb = {}
|
||||
|
||||
@classmethod
|
||||
def initialize(klass, dirname=None, cdbdirname=None):
|
||||
def __init__(self, dirname=None, cdbdirname=None):
|
||||
if not dirname:
|
||||
dirname = find_cmap_path()
|
||||
klass.dirname = dirname
|
||||
klass.cdbdirname = cdbdirname or dirname
|
||||
self.dirname = dirname
|
||||
self.cdbdirname = cdbdirname or dirname
|
||||
self.cmapdb = {}
|
||||
return
|
||||
|
||||
@classmethod
|
||||
def get_cmap(klass, cmapname, strict=True):
|
||||
cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
|
||||
if cmapname in klass.cmapdb:
|
||||
cmap = klass.cmapdb[cmapname]
|
||||
def get_cmap(self, cmapname, strict=True):
|
||||
cmapname = self.CMAP_ALIAS.get(cmapname, cmapname)
|
||||
if cmapname in self.cmapdb:
|
||||
cmap = self.cmapdb[cmapname]
|
||||
else:
|
||||
fname = os.path.join(klass.dirname, cmapname)
|
||||
cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
|
||||
fname = os.path.join(self.dirname, cmapname)
|
||||
cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb')
|
||||
if os.path.exists(cdbname):
|
||||
if 1 <= klass.debug:
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Opening: CDBCMap %r...' % cdbname
|
||||
cmap = CDBCMap(cdbname)
|
||||
elif os.path.exists(fname):
|
||||
if 1 <= klass.debug:
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Reading: CMap %r...' % fname
|
||||
cmap = CMap()
|
||||
fp = file(fname, 'rb')
|
||||
|
@ -240,7 +228,7 @@ class CMapDB(object):
|
|||
cmap = CMap() # just create empty cmap
|
||||
else:
|
||||
raise CMapDB.CMapNotFound(cmapname)
|
||||
klass.cmapdb[cmapname] = cmap
|
||||
self.cmapdb[cmapname] = cmap
|
||||
return cmap
|
||||
|
||||
|
||||
|
|
|
@ -1,20 +1,12 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from pdfdevice import PDFDevice
|
||||
from pdfdevice import PDFTextDevice
|
||||
from pdfdevice import PDFDevice, PDFTextDevice
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from layout import LayoutContainer
|
||||
from layout import LTPage
|
||||
from layout import LTText
|
||||
from layout import LTLine
|
||||
from layout import LTRect
|
||||
from layout import LTFigure
|
||||
from layout import LTTextItem
|
||||
from layout import LTTextBox
|
||||
from layout import LTTextLine
|
||||
from utils import apply_matrix_pt
|
||||
from utils import mult_matrix
|
||||
from layout import LTPage, LTText, LTLine, LTRect
|
||||
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
|
||||
from utils import enc
|
||||
from utils import apply_matrix_pt, mult_matrix
|
||||
|
||||
|
||||
## TagExtractor
|
||||
|
|
|
@ -1,33 +1,19 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from cmap import CMap
|
||||
from cmap import CMapDB
|
||||
from cmap import CMapParser
|
||||
from cmap import FontMetricsDB
|
||||
from cmap import EncodingDB
|
||||
from struct import pack
|
||||
from struct import unpack
|
||||
from psparser import PSLiteralTable
|
||||
from psparser import PSKeywordTable
|
||||
from psparser import PSLiteral
|
||||
from psparser import literal_name
|
||||
from psparser import keyword_name
|
||||
from psparser import STRICT
|
||||
from pdftypes import PDFException
|
||||
from pdftypes import resolve1
|
||||
from pdftypes import int_value
|
||||
from pdftypes import float_value
|
||||
from pdftypes import num_value
|
||||
from pdftypes import str_value
|
||||
from pdftypes import list_value
|
||||
from pdftypes import dict_value
|
||||
from pdftypes import stream_value
|
||||
from utils import apply_matrix_norm
|
||||
from utils import nunpack
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from cmap import CMap, CMapDB, CMapParser
|
||||
from cmap import FontMetricsDB, EncodingDB
|
||||
from struct import pack, unpack
|
||||
from psparser import STRICT
|
||||
from psparser import PSLiteralTable, PSKeywordTable
|
||||
from psparser import PSLiteral, literal_name, keyword_name
|
||||
from pdftypes import PDFException, resolve1
|
||||
from pdftypes import int_value, float_value, num_value
|
||||
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||
from utils import apply_matrix_norm, nunpack
|
||||
|
||||
|
||||
## CFFFont
|
||||
|
|
|
@ -7,43 +7,26 @@ try:
|
|||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from cmap import CMapDB
|
||||
from psparser import PSException
|
||||
from psparser import PSTypeError
|
||||
from psparser import PSEOF
|
||||
from psparser import PSLiteralTable
|
||||
from psparser import PSKeywordTable
|
||||
from psparser import literal_name
|
||||
from psparser import keyword_name
|
||||
from psparser import PSException, PSTypeError, PSEOF
|
||||
from psparser import PSLiteralTable, PSKeywordTable
|
||||
from psparser import PSKeyword, literal_name, keyword_name
|
||||
from psparser import PSStackParser
|
||||
from psparser import PSKeyword
|
||||
from psparser import STRICT
|
||||
from pdftypes import PDFException
|
||||
from pdftypes import PDFStream
|
||||
from pdftypes import PDFObjRef
|
||||
from pdftypes import PDFException, PDFStream, PDFObjRef
|
||||
from pdftypes import resolve1
|
||||
from pdftypes import int_value
|
||||
from pdftypes import float_value
|
||||
from pdftypes import num_value
|
||||
from pdftypes import str_value
|
||||
from pdftypes import list_value
|
||||
from pdftypes import dict_value
|
||||
from pdftypes import stream_value
|
||||
from pdftypes import int_value, float_value, num_value
|
||||
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||
from pdffont import PDFFontError
|
||||
from pdffont import PDFType1Font
|
||||
from pdffont import PDFTrueTypeFont
|
||||
from pdffont import PDFType3Font
|
||||
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
|
||||
from pdffont import PDFCIDFont
|
||||
from pdfparser import PDFDocument
|
||||
from pdfparser import PDFParser
|
||||
from pdfparser import PDFDocument, PDFParser
|
||||
from pdfparser import PDFPasswordIncorrect
|
||||
from pdfcolor import PDFColorSpace
|
||||
from pdfcolor import PREDEFINED_COLORSPACE
|
||||
from pdfcolor import LITERAL_DEVICE_GRAY
|
||||
from pdfcolor import LITERAL_DEVICE_RGB
|
||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||
from pdfcolor import LITERAL_DEVICE_CMYK
|
||||
from utils import choplist
|
||||
from utils import mult_matrix
|
||||
from utils import MATRIX_IDENTITY
|
||||
from utils import mult_matrix, MATRIX_IDENTITY
|
||||
|
||||
|
||||
## Exceptions
|
||||
|
@ -124,8 +107,9 @@ class PDFResourceManager(object):
|
|||
'''
|
||||
debug = 0
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, cmapdb):
|
||||
self.fonts = {}
|
||||
self.cmapdb = cmapdb
|
||||
return
|
||||
|
||||
def get_procset(self, procs):
|
||||
|
@ -140,7 +124,7 @@ class PDFResourceManager(object):
|
|||
return
|
||||
|
||||
def get_cmap(self, cmapname, strict=False):
|
||||
return CMapDB.get_cmap(cmapname, strict=strict)
|
||||
return self.cmapdb.get_cmap(cmapname, strict=strict)
|
||||
|
||||
def get_font(self, objid, spec):
|
||||
if objid and objid in self.fonts:
|
||||
|
|
|
@ -4,32 +4,19 @@ import re
|
|||
import md5
|
||||
import struct
|
||||
from sys import stderr
|
||||
from utils import choplist
|
||||
from utils import nunpack
|
||||
from utils import decode_text
|
||||
from arcfour import Arcfour
|
||||
from psparser import PSStackParser
|
||||
from psparser import PSSyntaxError
|
||||
from psparser import PSEOF
|
||||
from psparser import PSLiteralTable
|
||||
from psparser import PSKeywordTable
|
||||
from psparser import literal_name
|
||||
from psparser import keyword_name
|
||||
from psparser import PSSyntaxError, PSEOF
|
||||
from psparser import PSLiteralTable, PSKeywordTable
|
||||
from psparser import literal_name, keyword_name
|
||||
from psparser import STRICT
|
||||
from pdftypes import PDFException
|
||||
from pdftypes import PDFTypeError
|
||||
from pdftypes import PDFNotImplementedError
|
||||
from pdftypes import PDFStream
|
||||
from pdftypes import PDFObjRef
|
||||
from pdftypes import resolve1
|
||||
from pdftypes import decipher_all
|
||||
from pdftypes import int_value
|
||||
from pdftypes import float_value
|
||||
from pdftypes import num_value
|
||||
from pdftypes import str_value
|
||||
from pdftypes import list_value
|
||||
from pdftypes import dict_value
|
||||
from pdftypes import stream_value
|
||||
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
|
||||
from pdftypes import PDFStream, PDFObjRef
|
||||
from pdftypes import resolve1, decipher_all
|
||||
from pdftypes import int_value, float_value, num_value
|
||||
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||
from arcfour import Arcfour
|
||||
from utils import choplist, nunpack
|
||||
from utils import decode_text
|
||||
|
||||
|
||||
## Exceptions
|
||||
|
|
|
@ -3,13 +3,9 @@ import sys
|
|||
import zlib
|
||||
from lzw import LZWDecoder
|
||||
from psparser import PSException
|
||||
from psparser import PSObject
|
||||
from psparser import PSLiteral
|
||||
from psparser import PSKeyword
|
||||
from psparser import PSLiteralTable
|
||||
from psparser import PSKeywordTable
|
||||
from psparser import literal_name
|
||||
from psparser import keyword_name
|
||||
from psparser import PSObject, PSLiteral, PSKeyword
|
||||
from psparser import PSLiteralTable, PSKeywordTable
|
||||
from psparser import literal_name, keyword_name
|
||||
from psparser import STRICT
|
||||
|
||||
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||
|
|
|
@ -8,8 +8,7 @@
|
|||
|
||||
import sys
|
||||
import os
|
||||
from struct import pack
|
||||
from struct import unpack
|
||||
from struct import pack, unpack
|
||||
from array import array
|
||||
|
||||
|
||||
|
|
|
@ -7,8 +7,7 @@
|
|||
## http://www.efgh.com/software/rijndael.htm
|
||||
##
|
||||
import sys
|
||||
from struct import pack
|
||||
from struct import unpack
|
||||
from struct import pack, unpack
|
||||
|
||||
def KEYLENGTH(keybits): return (keybits)/8
|
||||
def RKLENGTH(keybits): return (keybits)/8+28
|
||||
|
|
|
@ -45,7 +45,7 @@ clean:
|
|||
|
||||
test: htmls texts xmls
|
||||
htmls: $(HTMLS)
|
||||
tests: $(TEXTS)
|
||||
texts: $(TEXTS)
|
||||
xmls: $(XMLS)
|
||||
|
||||
.SUFFIXES: .pdf .html .xml .txt
|
||||
|
|
|
@ -58,8 +58,8 @@ def main(argv):
|
|||
PDFPageInterpreter.debug = debug
|
||||
PDFDevice.debug = debug
|
||||
#
|
||||
CMapDB.initialize(cmapdir)
|
||||
rsrc = PDFResourceManager()
|
||||
cmapdb = CMapDB(cmapdir)
|
||||
rsrc = PDFResourceManager(cmapdb)
|
||||
if not outtype:
|
||||
outtype = 'text'
|
||||
if outfile:
|
||||
|
|
Loading…
Reference in New Issue