From c1da8b835c6fe3c3e992a8c426efcadbbd5a0336 Mon Sep 17 00:00:00 2001 From: Matthew Duggan Date: Thu, 7 Nov 2013 16:14:53 +0900 Subject: [PATCH] PEP8: Remove trailing whitespace --- pdfminer/ascii85.py | 8 ++++---- pdfminer/ccitt.py | 24 ++++++++++++------------ pdfminer/cmapdb.py | 14 +++++++------- pdfminer/converter.py | 14 +++++++------- pdfminer/fontmetrics.py | 2 +- pdfminer/image.py | 4 ++-- pdfminer/layout.py | 38 +++++++++++++++++++------------------- pdfminer/pdfdevice.py | 8 ++++---- pdfminer/pdfdocument.py | 4 ++-- pdfminer/pdffont.py | 4 ++-- pdfminer/pdfinterp.py | 4 ++-- pdfminer/pdfpage.py | 2 +- pdfminer/pdfparser.py | 8 ++++---- pdfminer/pdftypes.py | 8 ++++---- pdfminer/psparser.py | 18 +++++++++--------- pdfminer/rijndael.py | 2 +- pdfminer/utils.py | 2 +- setup.py | 4 ++-- tools/conv_cmap.py | 16 ++++++++-------- tools/dumppdf.py | 4 ++-- tools/latin2ascii.py | 2 +- tools/pdf2html.cgi | 2 +- 22 files changed, 96 insertions(+), 96 deletions(-) diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py index a78d2bf..249cdf9 100644 --- a/pdfminer/ascii85.py +++ b/pdfminer/ascii85.py @@ -16,13 +16,13 @@ def ascii85decode(data): letters, using 85 different types of characters (as 256**4 < 85**5). When the length of the original bytes is not a multiple of 4, a special rule is used for round up. - + The Adobe's ASCII85 implementation is slightly different from its original in handling the last characters. - + The sample string is taken from: http://en.wikipedia.org/w/index.php?title=Ascii85 - + >>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q') 'Man is distinguished' >>> ascii85decode('E,9)oF*2M7/c~>') @@ -60,7 +60,7 @@ def asciihexdecode(data): EOD. Any other characters will cause an error. If the filter encounters the EOD marker after reading an odd number of hexadecimal digits, it will behave as if a 0 followed the last digit. - + >>> asciihexdecode('61 62 2e6364 65') 'ab.cde' >>> asciihexdecode('61 62 2e6364 657>') diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py index 5fb0fbe..4d764f5 100644 --- a/pdfminer/ccitt.py +++ b/pdfminer/ccitt.py @@ -308,7 +308,7 @@ class CCITTG4Parser(BitParser): BitParser.add(UNCOMPRESSED, 'T1000' , '0000000010') BitParser.add(UNCOMPRESSED, 'T00000' , '00000000011') BitParser.add(UNCOMPRESSED, 'T10000' , '00000000010') - + class EOFB(Exception): pass class InvalidData(Exception): pass class ByteSkip(Exception): pass @@ -386,7 +386,7 @@ class CCITTG4Parser(BitParser): def _parse_uncompressed(self, bits): if not bits: raise self.InvalidData if bits.startswith('T'): - self._accept = self._parse_mode + self._accept = self._parse_mode self._color = int(bits[1]) self._do_uncompressed(bits[2:]) return self.MODE @@ -418,14 +418,14 @@ class CCITTG4Parser(BitParser): def output_line(self, y, bits): print y, ''.join( str(b) for b in bits ) return - + def _reset_line(self): self._refline = self._curline self._curline = array.array('b', [1]*self.width) self._curpos = -1 self._color = 1 return - + def _flush_line(self): if self.width <= self._curpos: self.output_line(self._y, self._curline) @@ -460,7 +460,7 @@ class CCITTG4Parser(BitParser): self._curpos = x1 self._color = 1-self._color return - + def _do_pass(self): #print '* pass: curpos=%r, color=%r' % (self._curpos, self._color) #print ' refline:', self._get_refline(self._curpos+1) @@ -487,7 +487,7 @@ class CCITTG4Parser(BitParser): self._curline[x] = self._color self._curpos = x1 return - + def _do_horizontal(self, n1, n2): #print '* horizontal(%d,%d): curpos=%r, color=%r' % (n1, n2, self._curpos, self._color) if self._curpos < 0: @@ -503,7 +503,7 @@ class CCITTG4Parser(BitParser): x += 1 self._curpos = x return - + def _do_uncompressed(self, bits): #print '* uncompressed(%r): curpos=%r' % (bits, self._curpos) for c in bits: @@ -672,16 +672,16 @@ class TestCCITTG4Parser(unittest.TestCase): ## CCITTFaxDecoder ## class CCITTFaxDecoder(CCITTG4Parser): - + def __init__(self, width, bytealign=False, reversed=False): CCITTG4Parser.__init__(self, width, bytealign=bytealign) self.reversed = reversed self._buf = '' return - + def close(self): return self._buf - + def output_line(self, y, bits): bytes = array.array('B', [0]*((len(bits)+7)/8)) if self.reversed: @@ -704,8 +704,8 @@ def ccittfaxdecode(data, params): raise ValueError(K) parser.feedbytes(data) return parser.close() - - + + # test def main(argv): import pygame diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index fdd6a53..3525248 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -81,7 +81,7 @@ class CMap(object): else: self.dump(out=out, code2cid=v, code=c) return - + ## IdentityCMap ## @@ -100,8 +100,8 @@ class IdentityCMap(object): return struct.unpack('>%dH' % n, code) else: return () - - + + ## UnicodeMap ## @@ -162,7 +162,7 @@ class FileCMap(CMap): ## FileUnicodeMap ## class FileUnicodeMap(UnicodeMap): - + def __init__(self): UnicodeMap.__init__(self) self.attrs = {} @@ -205,12 +205,12 @@ class PyCMap(CMap): def is_vertical(self): return self._is_vertical - + ## PyUnicodeMap ## class PyUnicodeMap(UnicodeMap): - + def __init__(self, name, module, vertical): if vertical: cid2unichr = module.CID2UNICHR_V @@ -231,7 +231,7 @@ class CMapDB(object): debug = 0 _cmap_cache = {} _umap_cache = {} - + class CMapNotFound(CMapError): pass @classmethod diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 08758a8..a474801 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -119,7 +119,7 @@ class PDFPageAggregator(PDFLayoutAnalyzer): PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) self.result = None return - + def receive_layout(self, ltpage): self.result = ltpage return @@ -137,7 +137,7 @@ class PDFConverter(PDFLayoutAnalyzer): self.outfp = outfp self.codec = codec return - + ## TextConverter ## @@ -179,7 +179,7 @@ class TextConverter(PDFConverter): if self.imagewriter is None: return PDFConverter.render_image(self, name, stream) return - + def paint_path(self, gstate, stroke, fill, evenodd, path): return @@ -197,13 +197,13 @@ class HTMLConverter(PDFConverter): 'curve': 'black', 'page': 'gray', } - + TEXT_COLORS = { 'textbox': 'blue', 'char': 'black', } - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, + def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, scale=1, fontscale=1.0, layoutmode='normal', showpageno=True, pagemargin=50, imagewriter=None, rect_colors={'curve':'black', 'page':'gray'}, @@ -295,7 +295,7 @@ class HTMLConverter(PDFConverter): self._font = self._fontstack.pop() self.write('') return - + def put_text(self, text, fontname, fontsize): font = (fontname, fontsize) if font != self._font: @@ -399,7 +399,7 @@ class XMLConverter(PDFConverter): def write_footer(self): self.outfp.write('\n') return - + def write_text(self, text): self.outfp.write(enc(text, self.codec)) return diff --git a/pdfminer/fontmetrics.py b/pdfminer/fontmetrics.py index def2178..bf0be75 100644 --- a/pdfminer/fontmetrics.py +++ b/pdfminer/fontmetrics.py @@ -8,7 +8,7 @@ written with a proportional font. The following data were extracted from the AFM files: http://www.ctan.org/tex-archive/fonts/adobe/afm/ - + """ ### BEGIN Verbatim copy of the license part diff --git a/pdfminer/image.py b/pdfminer/image.py index 2ac41be..782bcc6 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -70,7 +70,7 @@ class ImageWriter(object): (width, height) = image.srcsize if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE: ext = '.jpg' - elif (image.bits == 1 or + elif (image.bits == 1 or image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)): ext = '.%dx%d.bmp' % (width, height) else: @@ -84,7 +84,7 @@ class ImageWriter(object): from PIL import Image from PIL import ImageChops ifp = cStringIO.StringIO(raw_data) - i = Image.open(ifp) + i = Image.open(ifp) i = ImageChops.invert(i) i = i.convert('RGB') i.save(fp, 'JPEG') diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 9b7e030..0522452 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -94,7 +94,7 @@ class LTComponent(LTItem): def is_empty(self): return self.width <= 0 or self.height <= 0 - + def is_hoverlap(self, obj): assert isinstance(obj, LTComponent) return obj.x0 <= self.x1 and self.x0 <= obj.x1 @@ -247,7 +247,7 @@ class LTChar(LTComponent, LTText): def __repr__(self): return ('<%s %s matrix=%s font=%r adv=%s text=%r>' % - (self.__class__.__name__, bbox2str(self.bbox), + (self.__class__.__name__, bbox2str(self.bbox), matrix2str(self.matrix), self.fontname, self.adv, self.get_text())) @@ -258,7 +258,7 @@ class LTChar(LTComponent, LTText): """Returns True if two characters can coexist in the same line.""" return True - + ## LTContainer ## class LTContainer(LTComponent): @@ -287,7 +287,7 @@ class LTContainer(LTComponent): for obj in self._objs: obj.analyze(laparams) return - + ## LTExpandableContainer ## @@ -315,7 +315,7 @@ class LTTextContainer(LTExpandableContainer, LTText): def get_text(self): return ''.join( obj.get_text() for obj in self if isinstance(obj, LTText) ) - + ## LTTextLine ## @@ -363,7 +363,7 @@ class LTTextLineHorizontal(LTTextLine): abs(obj.height-self.height) < d and (abs(obj.x0-self.x0) < d or abs(obj.x1-self.x1) < d)) ] - + class LTTextLineVertical(LTTextLine): def __init__(self, word_margin): @@ -379,7 +379,7 @@ class LTTextLineVertical(LTTextLine): self._y0 = obj.y0 LTTextLine.add(self, obj) return - + def find_neighbors(self, plane, ratio): d = ratio*self.width objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1)) @@ -387,8 +387,8 @@ class LTTextLineVertical(LTTextLine): if (isinstance(obj, LTTextLineVertical) and abs(obj.width-self.width) < d and (abs(obj.y0-self.y0) < d or - abs(obj.y1-self.y1) < d)) ] - + abs(obj.y1-self.y1) < d)) ] + ## LTTextBox ## @@ -408,7 +408,7 @@ class LTTextBox(LTTextContainer): self.index, bbox2str(self.bbox), self.get_text())) class LTTextBoxHorizontal(LTTextBox): - + def analyze(self, laparams): LTTextBox.analyze(self, laparams) self._objs = csort(self._objs, key=lambda obj: -obj.y1) @@ -438,7 +438,7 @@ class LTTextGroup(LTTextContainer): return class LTTextGroupLRTB(LTTextGroup): - + def analyze(self, laparams): LTTextGroup.analyze(self, laparams) # reorder the objects from top-left to bottom-right. @@ -448,7 +448,7 @@ class LTTextGroupLRTB(LTTextGroup): return class LTTextGroupTBRL(LTTextGroup): - + def analyze(self, laparams): LTTextGroup.analyze(self, laparams) # reorder the objects from top-right to bottom-left. @@ -466,14 +466,14 @@ class LTLayoutContainer(LTContainer): LTContainer.__init__(self, bbox) self.groups = None return - + def get_textlines(self, laparams, objs): obj0 = None line = None for obj1 in objs: if obj0 is not None: k = 0 - if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and + if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and obj0.hdistance(obj1) < max(obj0.width, obj1.width) * laparams.char_margin): # obj0 and obj1 is horizontally aligned: @@ -488,7 +488,7 @@ class LTLayoutContainer(LTContainer): # (char_margin) k |= 1 if (laparams.detect_vertical and - obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and + obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and obj0.vdistance(obj1) < max(obj0.height, obj1.height) * laparams.char_margin): # obj0 and obj1 is vertically aligned: @@ -565,9 +565,9 @@ class LTLayoutContainer(LTContainer): assert boxes def dist(obj1, obj2): """A distance function between two TextBoxes. - + Consider the bounding rectangle for obj1 and obj2. - Return its area less the areas of obj1 and obj2, + Return its area less the areas of obj1 and obj2, shown as 'www' below. This value may be negative. +------+..........+ (x1,y1) | obj1 |wwwwwwwwww: @@ -621,7 +621,7 @@ class LTLayoutContainer(LTContainer): plane.add(group) assert len(plane) == 1 return list(plane) - + def analyze(self, laparams): # textobjs is a list of LTChar objects, i.e. # it has all the individual characters in the page. @@ -668,7 +668,7 @@ class LTFigure(LTLayoutContainer): def analyze(self, laparams): if not laparams.all_texts: return LTLayoutContainer.analyze(self, laparams) - return + return ## LTPage diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index aab901b..be13263 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -74,8 +74,8 @@ class PDFTextDevice(PDFDevice): seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, rise, dxscale) return - - def render_string_horizontal(self, seq, matrix, (x,y), + + def render_string_horizontal(self, seq, matrix, (x,y), font, fontsize, scaling, charspace, wordspace, rise, dxscale): needcharspace = False for obj in seq: @@ -93,7 +93,7 @@ class PDFTextDevice(PDFDevice): needcharspace = True return (x, y) - def render_string_vertical(self, seq, matrix, (x,y), + def render_string_vertical(self, seq, matrix, (x,y), font, fontsize, scaling, charspace, wordspace, rise, dxscale): needcharspace = False for obj in seq: @@ -104,7 +104,7 @@ class PDFTextDevice(PDFDevice): for cid in font.decode(obj): if needcharspace: y += charspace - y += self.render_char(translate_matrix(matrix, (x,y)), + y += self.render_char(translate_matrix(matrix, (x,y)), font, fontsize, scaling, rise, cid) if cid == 32 and wordspace: y += wordspace diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index bce75cd..be6c243 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -260,7 +260,7 @@ class PDFDocument(object): doc = PDFDocument(parser) doc.initialize(password) obj = doc.getobj(objid) - + """ debug = 0 @@ -425,7 +425,7 @@ class PDFDocument(object): raise PDFSyntaxError('Invalid object spec: offset=%r' % pos) (_,obj) = self._parser.nextobject() return obj - + # can raise PDFObjectNotFound def getobj(self, objid): assert objid != 0 diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 7bb1ae8..7435499 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -102,7 +102,7 @@ class Type1FontHeaderParser(PSStackParser): except KeyError: pass return self._cid2unicode - + def do_keyword(self, pos, token): if token is self.KEYWORD_PUT: ((_,key),(_,value)) = self.pop(2) @@ -111,7 +111,7 @@ class Type1FontHeaderParser(PSStackParser): self.add_results((key, literal_name(value))) return - + ## CFFFont ## (Format specified in Adobe Technical Note: #5176 ## "The Compact Font Format Specification") diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index e80b1bd..3c3e629 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -125,7 +125,7 @@ class PDFGraphicState(object): class PDFResourceManager(object): """Repository of shared resources. - + ResourceManager facilitates reuse of shared resources such as fonts and images so that large objects are not allocated multiple times. @@ -725,7 +725,7 @@ class PDFPageInterpreter(object): interpreter = self.dup() bbox = list_value(xobj['BBox']) matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) - # According to PDF reference 1.7 section 4.9.1, XObjects in + # According to PDF reference 1.7 section 4.9.1, XObjects in # earlier PDFs (prior to v1.2) use the page's Resources entry # instead of having their own Resources entry. resources = dict_value(xobj.get('Resources')) or self.resources.copy() diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 5beb7b8..b4ae84d 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -40,7 +40,7 @@ class PDFPage(object): def __init__(self, doc, pageid, attrs): """Initialize a page object. - + doc: a PDFDocument object. pageid: any Python object that can uniquely identify the page. attrs: a dictionary of page attributes. diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index cfcd917..a47123e 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -35,7 +35,7 @@ class PDFParser(PSStackParser): parser.set_document(doc) parser.seek(offset) parser.nextobject() - + """ def __init__(self, fp): @@ -57,10 +57,10 @@ class PDFParser(PSStackParser): KEYWORD_STARTXREF = KWD('startxref') def do_keyword(self, pos, token): """Handles PDF-related keywords.""" - + if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): self.add_results(*self.pop(1)) - + elif token is self.KEYWORD_ENDOBJ: self.add_results(*self.pop(4)) @@ -125,7 +125,7 @@ class PDFParser(PSStackParser): else: # others self.push((pos, token)) - + return diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index a698268..f1d605f 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -68,7 +68,7 @@ def resolve1(x, default=None): def resolve_all(x, default=None): """Recursively resolves the given object and all the internals. - + Make sure there is no indirect reference within the nested object. This procedure might be slow. """ @@ -180,13 +180,13 @@ class PDFStream(PDFObject): def __contains__(self, name): return name in self.attrs - + def __getitem__(self, name): return self.attrs[name] - + def get(self, name, default=None): return self.attrs.get(name, default) - + def get_any(self, names, default=None): for name in names: if name in self.attrs: diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 85765e7..98434bb 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -32,7 +32,7 @@ class PSObject(object): class PSLiteral(PSObject): """A class that represents a PostScript literal. - + Postscript literals are used as identifiers, such as variable names, property names and dictionary keys. Literals are case sensitive and denoted by a preceding @@ -55,11 +55,11 @@ class PSLiteral(PSObject): class PSKeyword(PSObject): """A class that represents a PostScript keyword. - + PostScript keywords are a dozen of predefined words. Commands and directives in PostScript are expressed by keywords. They are also used to denote the content boundaries. - + Note: Do not create an instance of PSKeyword directly. Always use PSKeywordTable.intern(). """ @@ -80,7 +80,7 @@ class PSSymbolTable(object): Interned objects can be checked its identity with "is" operator. """ - + def __init__(self, klass): self.dict = {} self.klass = klass @@ -357,7 +357,7 @@ class PSBaseParser(object): pass self._parse1 = self._parse_main return j - + def _parse_float(self, s, i): m = END_NUMBER.search(s, i) if not m: @@ -493,17 +493,17 @@ class PSStackParser(PSBaseParser): def push(self, *objs): self.curstack.extend(objs) return - + def pop(self, n): objs = self.curstack[-n:] self.curstack[-n:] = [] return objs - + def popall(self): objs = self.curstack self.curstack = [] return objs - + def add_results(self, *objs): if 2 <= self.debug: print >>sys.stderr, 'add_results: %r' % (objs,) @@ -516,7 +516,7 @@ class PSStackParser(PSBaseParser): if 2 <= self.debug: print >>sys.stderr, 'start_type: pos=%r, type=%r' % (pos, type) return - + def end_type(self, type): if self.curtype != type: raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type)) diff --git a/pdfminer/rijndael.py b/pdfminer/rijndael.py index cd41825..39895bf 100644 --- a/pdfminer/rijndael.py +++ b/pdfminer/rijndael.py @@ -1060,7 +1060,7 @@ class RijndaelEncryptor(object): >>> RijndaelEncryptor(key, 128).encrypt(plaintext).encode('hex') 'd8f532538289ef7d06b506a4fd5be9c9' """ - + def __init__(self, key, keybits=256): assert len(key) == KEYLENGTH(keybits) (self.rk, self.nrounds) = rijndaelSetupEncrypt(key, keybits) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 670bca4..d6452cb 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -255,7 +255,7 @@ class Plane(object): for obj in objs: self.add(obj) return - + # add(obj): place an object. def add(self, obj): for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): diff --git a/setup.py b/setup.py index e5e8d7c..d3f73fa 100644 --- a/setup.py +++ b/setup.py @@ -7,9 +7,9 @@ setup( version=__version__, description='PDF parser and analyzer', long_description='''PDFMiner is a tool for extracting information from PDF documents. -Unlike other PDF-related tools, it focuses entirely on getting +Unlike other PDF-related tools, it focuses entirely on getting and analyzing text data. PDFMiner allows to obtain -the exact location of texts in a page, as well as +the exact location of texts in a page, as well as other information such as fonts or lines. It includes a PDF converter that can transform PDF files into other text formats (such as HTML). It has an extensible diff --git a/tools/conv_cmap.py b/tools/conv_cmap.py index a32ace6..4e908bd 100755 --- a/tools/conv_cmap.py +++ b/tools/conv_cmap.py @@ -50,7 +50,7 @@ class CMapConverter(object): assert values[0] == 'CID' encs = values continue - + def put(dmap, code, cid, force=False): for b in code[:-1]: b = ord(b) @@ -64,7 +64,7 @@ class CMapConverter(object): if force or ((b not in dmap) or dmap[b] == cid): dmap[b] = cid return - + def add(unimap, enc, code): try: codec = self.enc2codec[enc] @@ -78,20 +78,20 @@ class CMapConverter(object): except UnicodeError: pass return - + def pick(unimap): chars = unimap.items() chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True) (c,_) = chars[0] return c - + cid = int(values[0]) unimap_h = {} unimap_v = {} for (enc,value) in zip(encs, values): if enc == 'CID': continue if value == '*': continue - + # hcodes, vcodes: encoded bytes for each writing mode. hcodes = [] vcodes = [] @@ -121,7 +121,7 @@ class CMapConverter(object): for code in hcodes: put(hmap, code, cid) put(vmap, code, cid) - + # Determine the "most popular" candidate. if unimap_h: self.cid2unichr_h[cid] = pick(unimap_h) @@ -137,7 +137,7 @@ class CMapConverter(object): ) fp.write(pickle.dumps(data)) return - + def dump_unicodemap(self, fp): data = dict( CID2UNICHR_H=self.cid2unichr_h, @@ -151,7 +151,7 @@ def main(argv): import getopt import gzip import os.path - + def usage(): print 'usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0] return 100 diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 380356e..bf68f30 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -25,7 +25,7 @@ def dumpxml(out, obj, codec=None): if obj is None: out.write('') return - + if isinstance(obj, dict): out.write('\n' % len(obj)) for (k,v) in obj.iteritems(): @@ -179,7 +179,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='', out.write(fileobj.get_data()) out.close() return - + fp = file(fname, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser) diff --git a/tools/latin2ascii.py b/tools/latin2ascii.py index 36588a6..ab6c708 100755 --- a/tools/latin2ascii.py +++ b/tools/latin2ascii.py @@ -14,7 +14,7 @@ This is an in-house mapping table for some Latin-1 characters LATIN2ASCII = { #0x00a0: '', #0x00a7: '', - + # iso-8859-1 0x00c0: 'A`', 0x00c1: "A'", diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi index 3fd445a..aa11f4c 100755 --- a/tools/pdf2html.cgi +++ b/tools/pdf2html.cgi @@ -159,7 +159,7 @@ class WebApp(object): def convert(self): self.form = cgi.FieldStorage(fp=self.infp, environ=self.environ) - if (self.method != 'POST' or + if (self.method != 'POST' or 'c' not in self.form or 'f' not in self.form): self.response_200()