diff --git a/TODO b/TODO index ecc746c..34cffce 100644 --- a/TODO +++ b/TODO @@ -4,3 +4,4 @@ TODOs: - Better API Documentation. - Robust error handling. - Any special handling for linearized PDFs? + - Handle security handler. (I need more samples!) diff --git a/pdfminer/cmap.py b/pdfminer/cmap.py index b924272..f0ea641 100644 --- a/pdfminer/cmap.py +++ b/pdfminer/cmap.py @@ -4,20 +4,14 @@ import re import os import os.path from sys import stderr -from struct import pack -from struct import unpack +from struct import pack, unpack +from psparser import PSStackParser +from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF +from psparser import PSLiteral, PSKeyword +from psparser import literal_name, keyword_name from fontmetrics import FONT_METRICS from latin_enc import ENCODING from glyphlist import charname2unicode -from psparser import PSException -from psparser import PSSyntaxError -from psparser import PSTypeError -from psparser import PSEOF -from psparser import PSLiteral -from psparser import PSKeyword -from psparser import literal_name -from psparser import keyword_name -from psparser import PSStackParser from utils import choplist from utils import nunpack try: @@ -201,36 +195,30 @@ class CMapDB(object): class CMapNotFound(CMapError): pass - CMAP_ALIAS = { - } - + CMAP_ALIAS = { } debug = 0 - dirname = None - cdbdirname = None - cmapdb = {} - @classmethod - def initialize(klass, dirname=None, cdbdirname=None): + def __init__(self, dirname=None, cdbdirname=None): if not dirname: dirname = find_cmap_path() - klass.dirname = dirname - klass.cdbdirname = cdbdirname or dirname + self.dirname = dirname + self.cdbdirname = cdbdirname or dirname + self.cmapdb = {} return - @classmethod - def get_cmap(klass, cmapname, strict=True): - cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname) - if cmapname in klass.cmapdb: - cmap = klass.cmapdb[cmapname] + def get_cmap(self, cmapname, strict=True): + cmapname = self.CMAP_ALIAS.get(cmapname, cmapname) + if cmapname in self.cmapdb: + cmap = self.cmapdb[cmapname] else: - fname = os.path.join(klass.dirname, cmapname) - cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb') + fname = os.path.join(self.dirname, cmapname) + cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb') if os.path.exists(cdbname): - if 1 <= klass.debug: + if 1 <= self.debug: print >>stderr, 'Opening: CDBCMap %r...' % cdbname cmap = CDBCMap(cdbname) elif os.path.exists(fname): - if 1 <= klass.debug: + if 1 <= self.debug: print >>stderr, 'Reading: CMap %r...' % fname cmap = CMap() fp = file(fname, 'rb') @@ -240,7 +228,7 @@ class CMapDB(object): cmap = CMap() # just create empty cmap else: raise CMapDB.CMapNotFound(cmapname) - klass.cmapdb[cmapname] = cmap + self.cmapdb[cmapname] = cmap return cmap diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 214f9fe..63fdc5a 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,20 +1,12 @@ #!/usr/bin/env python import sys -from pdfdevice import PDFDevice -from pdfdevice import PDFTextDevice +from pdfdevice import PDFDevice, PDFTextDevice from pdffont import PDFUnicodeNotDefined from layout import LayoutContainer -from layout import LTPage -from layout import LTText -from layout import LTLine -from layout import LTRect -from layout import LTFigure -from layout import LTTextItem -from layout import LTTextBox -from layout import LTTextLine -from utils import apply_matrix_pt -from utils import mult_matrix +from layout import LTPage, LTText, LTLine, LTRect +from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine from utils import enc +from utils import apply_matrix_pt, mult_matrix ## TagExtractor diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 4e74b8c..9b7f0f5 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -1,33 +1,19 @@ #!/usr/bin/env python import sys -from cmap import CMap -from cmap import CMapDB -from cmap import CMapParser -from cmap import FontMetricsDB -from cmap import EncodingDB -from struct import pack -from struct import unpack -from psparser import PSLiteralTable -from psparser import PSKeywordTable -from psparser import PSLiteral -from psparser import literal_name -from psparser import keyword_name -from psparser import STRICT -from pdftypes import PDFException -from pdftypes import resolve1 -from pdftypes import int_value -from pdftypes import float_value -from pdftypes import num_value -from pdftypes import str_value -from pdftypes import list_value -from pdftypes import dict_value -from pdftypes import stream_value -from utils import apply_matrix_norm -from utils import nunpack try: from cStringIO import StringIO except ImportError: from StringIO import StringIO +from cmap import CMap, CMapDB, CMapParser +from cmap import FontMetricsDB, EncodingDB +from struct import pack, unpack +from psparser import STRICT +from psparser import PSLiteralTable, PSKeywordTable +from psparser import PSLiteral, literal_name, keyword_name +from pdftypes import PDFException, resolve1 +from pdftypes import int_value, float_value, num_value +from pdftypes import str_value, list_value, dict_value, stream_value +from utils import apply_matrix_norm, nunpack ## CFFFont diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 520df73..e403087 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -7,43 +7,26 @@ try: except ImportError: from StringIO import StringIO from cmap import CMapDB -from psparser import PSException -from psparser import PSTypeError -from psparser import PSEOF -from psparser import PSLiteralTable -from psparser import PSKeywordTable -from psparser import literal_name -from psparser import keyword_name +from psparser import PSException, PSTypeError, PSEOF +from psparser import PSLiteralTable, PSKeywordTable +from psparser import PSKeyword, literal_name, keyword_name from psparser import PSStackParser -from psparser import PSKeyword from psparser import STRICT -from pdftypes import PDFException -from pdftypes import PDFStream -from pdftypes import PDFObjRef +from pdftypes import PDFException, PDFStream, PDFObjRef from pdftypes import resolve1 -from pdftypes import int_value -from pdftypes import float_value -from pdftypes import num_value -from pdftypes import str_value -from pdftypes import list_value -from pdftypes import dict_value -from pdftypes import stream_value +from pdftypes import int_value, float_value, num_value +from pdftypes import str_value, list_value, dict_value, stream_value from pdffont import PDFFontError -from pdffont import PDFType1Font -from pdffont import PDFTrueTypeFont -from pdffont import PDFType3Font +from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font from pdffont import PDFCIDFont -from pdfparser import PDFDocument -from pdfparser import PDFParser +from pdfparser import PDFDocument, PDFParser from pdfparser import PDFPasswordIncorrect from pdfcolor import PDFColorSpace from pdfcolor import PREDEFINED_COLORSPACE -from pdfcolor import LITERAL_DEVICE_GRAY -from pdfcolor import LITERAL_DEVICE_RGB +from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB from pdfcolor import LITERAL_DEVICE_CMYK from utils import choplist -from utils import mult_matrix -from utils import MATRIX_IDENTITY +from utils import mult_matrix, MATRIX_IDENTITY ## Exceptions @@ -124,8 +107,9 @@ class PDFResourceManager(object): ''' debug = 0 - def __init__(self): + def __init__(self, cmapdb): self.fonts = {} + self.cmapdb = cmapdb return def get_procset(self, procs): @@ -140,7 +124,7 @@ class PDFResourceManager(object): return def get_cmap(self, cmapname, strict=False): - return CMapDB.get_cmap(cmapname, strict=strict) + return self.cmapdb.get_cmap(cmapname, strict=strict) def get_font(self, objid, spec): if objid and objid in self.fonts: diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 2e903bf..1dabc91 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -4,32 +4,19 @@ import re import md5 import struct from sys import stderr -from utils import choplist -from utils import nunpack -from utils import decode_text -from arcfour import Arcfour from psparser import PSStackParser -from psparser import PSSyntaxError -from psparser import PSEOF -from psparser import PSLiteralTable -from psparser import PSKeywordTable -from psparser import literal_name -from psparser import keyword_name +from psparser import PSSyntaxError, PSEOF +from psparser import PSLiteralTable, PSKeywordTable +from psparser import literal_name, keyword_name from psparser import STRICT -from pdftypes import PDFException -from pdftypes import PDFTypeError -from pdftypes import PDFNotImplementedError -from pdftypes import PDFStream -from pdftypes import PDFObjRef -from pdftypes import resolve1 -from pdftypes import decipher_all -from pdftypes import int_value -from pdftypes import float_value -from pdftypes import num_value -from pdftypes import str_value -from pdftypes import list_value -from pdftypes import dict_value -from pdftypes import stream_value +from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError +from pdftypes import PDFStream, PDFObjRef +from pdftypes import resolve1, decipher_all +from pdftypes import int_value, float_value, num_value +from pdftypes import str_value, list_value, dict_value, stream_value +from arcfour import Arcfour +from utils import choplist, nunpack +from utils import decode_text ## Exceptions diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 129bcd0..316eb35 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -3,13 +3,9 @@ import sys import zlib from lzw import LZWDecoder from psparser import PSException -from psparser import PSObject -from psparser import PSLiteral -from psparser import PSKeyword -from psparser import PSLiteralTable -from psparser import PSKeywordTable -from psparser import literal_name -from psparser import keyword_name +from psparser import PSObject, PSLiteral, PSKeyword +from psparser import PSLiteralTable, PSKeywordTable +from psparser import literal_name, keyword_name from psparser import STRICT LITERAL_CRYPT = PSLiteralTable.intern('Crypt') diff --git a/pdfminer/pycdb.py b/pdfminer/pycdb.py index 9229887..6663413 100755 --- a/pdfminer/pycdb.py +++ b/pdfminer/pycdb.py @@ -8,8 +8,7 @@ import sys import os -from struct import pack -from struct import unpack +from struct import pack, unpack from array import array diff --git a/pdfminer/rijndael.py b/pdfminer/rijndael.py index b12d354..0d53334 100644 --- a/pdfminer/rijndael.py +++ b/pdfminer/rijndael.py @@ -7,8 +7,7 @@ ## http://www.efgh.com/software/rijndael.htm ## import sys -from struct import pack -from struct import unpack +from struct import pack, unpack def KEYLENGTH(keybits): return (keybits)/8 def RKLENGTH(keybits): return (keybits)/8+28 diff --git a/samples/Makefile b/samples/Makefile index 13d196c..976a5ae 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -45,7 +45,7 @@ clean: test: htmls texts xmls htmls: $(HTMLS) -tests: $(TEXTS) +texts: $(TEXTS) xmls: $(XMLS) .SUFFIXES: .pdf .html .xml .txt diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 95db6f8..4b10119 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -58,8 +58,8 @@ def main(argv): PDFPageInterpreter.debug = debug PDFDevice.debug = debug # - CMapDB.initialize(cmapdir) - rsrc = PDFResourceManager() + cmapdb = CMapDB(cmapdir) + rsrc = PDFResourceManager(cmapdb) if not outtype: outtype = 'text' if outfile: