PEP8: Remove trailing whitespace
parent
9ff6aa0463
commit
c1da8b835c
|
@ -16,13 +16,13 @@ def ascii85decode(data):
|
|||
letters, using 85 different types of characters (as 256**4 < 85**5).
|
||||
When the length of the original bytes is not a multiple of 4, a special
|
||||
rule is used for round up.
|
||||
|
||||
|
||||
The Adobe's ASCII85 implementation is slightly different from
|
||||
its original in handling the last characters.
|
||||
|
||||
|
||||
The sample string is taken from:
|
||||
http://en.wikipedia.org/w/index.php?title=Ascii85
|
||||
|
||||
|
||||
>>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q')
|
||||
'Man is distinguished'
|
||||
>>> ascii85decode('E,9)oF*2M7/c~>')
|
||||
|
@ -60,7 +60,7 @@ def asciihexdecode(data):
|
|||
EOD. Any other characters will cause an error. If the filter encounters
|
||||
the EOD marker after reading an odd number of hexadecimal digits, it
|
||||
will behave as if a 0 followed the last digit.
|
||||
|
||||
|
||||
>>> asciihexdecode('61 62 2e6364 65')
|
||||
'ab.cde'
|
||||
>>> asciihexdecode('61 62 2e6364 657>')
|
||||
|
|
|
@ -308,7 +308,7 @@ class CCITTG4Parser(BitParser):
|
|||
BitParser.add(UNCOMPRESSED, 'T1000' , '0000000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T00000' , '00000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T10000' , '00000000010')
|
||||
|
||||
|
||||
class EOFB(Exception): pass
|
||||
class InvalidData(Exception): pass
|
||||
class ByteSkip(Exception): pass
|
||||
|
@ -386,7 +386,7 @@ class CCITTG4Parser(BitParser):
|
|||
def _parse_uncompressed(self, bits):
|
||||
if not bits: raise self.InvalidData
|
||||
if bits.startswith('T'):
|
||||
self._accept = self._parse_mode
|
||||
self._accept = self._parse_mode
|
||||
self._color = int(bits[1])
|
||||
self._do_uncompressed(bits[2:])
|
||||
return self.MODE
|
||||
|
@ -418,14 +418,14 @@ class CCITTG4Parser(BitParser):
|
|||
def output_line(self, y, bits):
|
||||
print y, ''.join( str(b) for b in bits )
|
||||
return
|
||||
|
||||
|
||||
def _reset_line(self):
|
||||
self._refline = self._curline
|
||||
self._curline = array.array('b', [1]*self.width)
|
||||
self._curpos = -1
|
||||
self._color = 1
|
||||
return
|
||||
|
||||
|
||||
def _flush_line(self):
|
||||
if self.width <= self._curpos:
|
||||
self.output_line(self._y, self._curline)
|
||||
|
@ -460,7 +460,7 @@ class CCITTG4Parser(BitParser):
|
|||
self._curpos = x1
|
||||
self._color = 1-self._color
|
||||
return
|
||||
|
||||
|
||||
def _do_pass(self):
|
||||
#print '* pass: curpos=%r, color=%r' % (self._curpos, self._color)
|
||||
#print ' refline:', self._get_refline(self._curpos+1)
|
||||
|
@ -487,7 +487,7 @@ class CCITTG4Parser(BitParser):
|
|||
self._curline[x] = self._color
|
||||
self._curpos = x1
|
||||
return
|
||||
|
||||
|
||||
def _do_horizontal(self, n1, n2):
|
||||
#print '* horizontal(%d,%d): curpos=%r, color=%r' % (n1, n2, self._curpos, self._color)
|
||||
if self._curpos < 0:
|
||||
|
@ -503,7 +503,7 @@ class CCITTG4Parser(BitParser):
|
|||
x += 1
|
||||
self._curpos = x
|
||||
return
|
||||
|
||||
|
||||
def _do_uncompressed(self, bits):
|
||||
#print '* uncompressed(%r): curpos=%r' % (bits, self._curpos)
|
||||
for c in bits:
|
||||
|
@ -672,16 +672,16 @@ class TestCCITTG4Parser(unittest.TestCase):
|
|||
## CCITTFaxDecoder
|
||||
##
|
||||
class CCITTFaxDecoder(CCITTG4Parser):
|
||||
|
||||
|
||||
def __init__(self, width, bytealign=False, reversed=False):
|
||||
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
||||
self.reversed = reversed
|
||||
self._buf = ''
|
||||
return
|
||||
|
||||
|
||||
def close(self):
|
||||
return self._buf
|
||||
|
||||
|
||||
def output_line(self, y, bits):
|
||||
bytes = array.array('B', [0]*((len(bits)+7)/8))
|
||||
if self.reversed:
|
||||
|
@ -704,8 +704,8 @@ def ccittfaxdecode(data, params):
|
|||
raise ValueError(K)
|
||||
parser.feedbytes(data)
|
||||
return parser.close()
|
||||
|
||||
|
||||
|
||||
|
||||
# test
|
||||
def main(argv):
|
||||
import pygame
|
||||
|
|
|
@ -81,7 +81,7 @@ class CMap(object):
|
|||
else:
|
||||
self.dump(out=out, code2cid=v, code=c)
|
||||
return
|
||||
|
||||
|
||||
|
||||
## IdentityCMap
|
||||
##
|
||||
|
@ -100,8 +100,8 @@ class IdentityCMap(object):
|
|||
return struct.unpack('>%dH' % n, code)
|
||||
else:
|
||||
return ()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## UnicodeMap
|
||||
##
|
||||
|
@ -162,7 +162,7 @@ class FileCMap(CMap):
|
|||
## FileUnicodeMap
|
||||
##
|
||||
class FileUnicodeMap(UnicodeMap):
|
||||
|
||||
|
||||
def __init__(self):
|
||||
UnicodeMap.__init__(self)
|
||||
self.attrs = {}
|
||||
|
@ -205,12 +205,12 @@ class PyCMap(CMap):
|
|||
|
||||
def is_vertical(self):
|
||||
return self._is_vertical
|
||||
|
||||
|
||||
|
||||
## PyUnicodeMap
|
||||
##
|
||||
class PyUnicodeMap(UnicodeMap):
|
||||
|
||||
|
||||
def __init__(self, name, module, vertical):
|
||||
if vertical:
|
||||
cid2unichr = module.CID2UNICHR_V
|
||||
|
@ -231,7 +231,7 @@ class CMapDB(object):
|
|||
debug = 0
|
||||
_cmap_cache = {}
|
||||
_umap_cache = {}
|
||||
|
||||
|
||||
class CMapNotFound(CMapError): pass
|
||||
|
||||
@classmethod
|
||||
|
|
|
@ -119,7 +119,7 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
|
|||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
|
||||
self.result = None
|
||||
return
|
||||
|
||||
|
||||
def receive_layout(self, ltpage):
|
||||
self.result = ltpage
|
||||
return
|
||||
|
@ -137,7 +137,7 @@ class PDFConverter(PDFLayoutAnalyzer):
|
|||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
return
|
||||
|
||||
|
||||
|
||||
## TextConverter
|
||||
##
|
||||
|
@ -179,7 +179,7 @@ class TextConverter(PDFConverter):
|
|||
if self.imagewriter is None: return
|
||||
PDFConverter.render_image(self, name, stream)
|
||||
return
|
||||
|
||||
|
||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||
return
|
||||
|
||||
|
@ -197,13 +197,13 @@ class HTMLConverter(PDFConverter):
|
|||
'curve': 'black',
|
||||
'page': 'gray',
|
||||
}
|
||||
|
||||
|
||||
TEXT_COLORS = {
|
||||
'textbox': 'blue',
|
||||
'char': 'black',
|
||||
}
|
||||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
|
||||
pagemargin=50, imagewriter=None,
|
||||
rect_colors={'curve':'black', 'page':'gray'},
|
||||
|
@ -295,7 +295,7 @@ class HTMLConverter(PDFConverter):
|
|||
self._font = self._fontstack.pop()
|
||||
self.write('</div>')
|
||||
return
|
||||
|
||||
|
||||
def put_text(self, text, fontname, fontsize):
|
||||
font = (fontname, fontsize)
|
||||
if font != self._font:
|
||||
|
@ -399,7 +399,7 @@ class XMLConverter(PDFConverter):
|
|||
def write_footer(self):
|
||||
self.outfp.write('</pages>\n')
|
||||
return
|
||||
|
||||
|
||||
def write_text(self, text):
|
||||
self.outfp.write(enc(text, self.codec))
|
||||
return
|
||||
|
|
|
@ -8,7 +8,7 @@ written with a proportional font.
|
|||
The following data were extracted from the AFM files:
|
||||
|
||||
http://www.ctan.org/tex-archive/fonts/adobe/afm/
|
||||
|
||||
|
||||
"""
|
||||
|
||||
### BEGIN Verbatim copy of the license part
|
||||
|
|
|
@ -70,7 +70,7 @@ class ImageWriter(object):
|
|||
(width, height) = image.srcsize
|
||||
if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
|
||||
ext = '.jpg'
|
||||
elif (image.bits == 1 or
|
||||
elif (image.bits == 1 or
|
||||
image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)):
|
||||
ext = '.%dx%d.bmp' % (width, height)
|
||||
else:
|
||||
|
@ -84,7 +84,7 @@ class ImageWriter(object):
|
|||
from PIL import Image
|
||||
from PIL import ImageChops
|
||||
ifp = cStringIO.StringIO(raw_data)
|
||||
i = Image.open(ifp)
|
||||
i = Image.open(ifp)
|
||||
i = ImageChops.invert(i)
|
||||
i = i.convert('RGB')
|
||||
i.save(fp, 'JPEG')
|
||||
|
|
|
@ -94,7 +94,7 @@ class LTComponent(LTItem):
|
|||
|
||||
def is_empty(self):
|
||||
return self.width <= 0 or self.height <= 0
|
||||
|
||||
|
||||
def is_hoverlap(self, obj):
|
||||
assert isinstance(obj, LTComponent)
|
||||
return obj.x0 <= self.x1 and self.x0 <= obj.x1
|
||||
|
@ -247,7 +247,7 @@ class LTChar(LTComponent, LTText):
|
|||
|
||||
def __repr__(self):
|
||||
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
|
||||
(self.__class__.__name__, bbox2str(self.bbox),
|
||||
(self.__class__.__name__, bbox2str(self.bbox),
|
||||
matrix2str(self.matrix), self.fontname, self.adv,
|
||||
self.get_text()))
|
||||
|
||||
|
@ -258,7 +258,7 @@ class LTChar(LTComponent, LTText):
|
|||
"""Returns True if two characters can coexist in the same line."""
|
||||
return True
|
||||
|
||||
|
||||
|
||||
## LTContainer
|
||||
##
|
||||
class LTContainer(LTComponent):
|
||||
|
@ -287,7 +287,7 @@ class LTContainer(LTComponent):
|
|||
for obj in self._objs:
|
||||
obj.analyze(laparams)
|
||||
return
|
||||
|
||||
|
||||
|
||||
## LTExpandableContainer
|
||||
##
|
||||
|
@ -315,7 +315,7 @@ class LTTextContainer(LTExpandableContainer, LTText):
|
|||
|
||||
def get_text(self):
|
||||
return ''.join( obj.get_text() for obj in self if isinstance(obj, LTText) )
|
||||
|
||||
|
||||
|
||||
## LTTextLine
|
||||
##
|
||||
|
@ -363,7 +363,7 @@ class LTTextLineHorizontal(LTTextLine):
|
|||
abs(obj.height-self.height) < d and
|
||||
(abs(obj.x0-self.x0) < d or
|
||||
abs(obj.x1-self.x1) < d)) ]
|
||||
|
||||
|
||||
class LTTextLineVertical(LTTextLine):
|
||||
|
||||
def __init__(self, word_margin):
|
||||
|
@ -379,7 +379,7 @@ class LTTextLineVertical(LTTextLine):
|
|||
self._y0 = obj.y0
|
||||
LTTextLine.add(self, obj)
|
||||
return
|
||||
|
||||
|
||||
def find_neighbors(self, plane, ratio):
|
||||
d = ratio*self.width
|
||||
objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1))
|
||||
|
@ -387,8 +387,8 @@ class LTTextLineVertical(LTTextLine):
|
|||
if (isinstance(obj, LTTextLineVertical) and
|
||||
abs(obj.width-self.width) < d and
|
||||
(abs(obj.y0-self.y0) < d or
|
||||
abs(obj.y1-self.y1) < d)) ]
|
||||
|
||||
abs(obj.y1-self.y1) < d)) ]
|
||||
|
||||
|
||||
## LTTextBox
|
||||
##
|
||||
|
@ -408,7 +408,7 @@ class LTTextBox(LTTextContainer):
|
|||
self.index, bbox2str(self.bbox), self.get_text()))
|
||||
|
||||
class LTTextBoxHorizontal(LTTextBox):
|
||||
|
||||
|
||||
def analyze(self, laparams):
|
||||
LTTextBox.analyze(self, laparams)
|
||||
self._objs = csort(self._objs, key=lambda obj: -obj.y1)
|
||||
|
@ -438,7 +438,7 @@ class LTTextGroup(LTTextContainer):
|
|||
return
|
||||
|
||||
class LTTextGroupLRTB(LTTextGroup):
|
||||
|
||||
|
||||
def analyze(self, laparams):
|
||||
LTTextGroup.analyze(self, laparams)
|
||||
# reorder the objects from top-left to bottom-right.
|
||||
|
@ -448,7 +448,7 @@ class LTTextGroupLRTB(LTTextGroup):
|
|||
return
|
||||
|
||||
class LTTextGroupTBRL(LTTextGroup):
|
||||
|
||||
|
||||
def analyze(self, laparams):
|
||||
LTTextGroup.analyze(self, laparams)
|
||||
# reorder the objects from top-right to bottom-left.
|
||||
|
@ -466,14 +466,14 @@ class LTLayoutContainer(LTContainer):
|
|||
LTContainer.__init__(self, bbox)
|
||||
self.groups = None
|
||||
return
|
||||
|
||||
|
||||
def get_textlines(self, laparams, objs):
|
||||
obj0 = None
|
||||
line = None
|
||||
for obj1 in objs:
|
||||
if obj0 is not None:
|
||||
k = 0
|
||||
if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
|
||||
if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
|
||||
min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and
|
||||
obj0.hdistance(obj1) < max(obj0.width, obj1.width) * laparams.char_margin):
|
||||
# obj0 and obj1 is horizontally aligned:
|
||||
|
@ -488,7 +488,7 @@ class LTLayoutContainer(LTContainer):
|
|||
# (char_margin)
|
||||
k |= 1
|
||||
if (laparams.detect_vertical and
|
||||
obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
|
||||
obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
|
||||
min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and
|
||||
obj0.vdistance(obj1) < max(obj0.height, obj1.height) * laparams.char_margin):
|
||||
# obj0 and obj1 is vertically aligned:
|
||||
|
@ -565,9 +565,9 @@ class LTLayoutContainer(LTContainer):
|
|||
assert boxes
|
||||
def dist(obj1, obj2):
|
||||
"""A distance function between two TextBoxes.
|
||||
|
||||
|
||||
Consider the bounding rectangle for obj1 and obj2.
|
||||
Return its area less the areas of obj1 and obj2,
|
||||
Return its area less the areas of obj1 and obj2,
|
||||
shown as 'www' below. This value may be negative.
|
||||
+------+..........+ (x1,y1)
|
||||
| obj1 |wwwwwwwwww:
|
||||
|
@ -621,7 +621,7 @@ class LTLayoutContainer(LTContainer):
|
|||
plane.add(group)
|
||||
assert len(plane) == 1
|
||||
return list(plane)
|
||||
|
||||
|
||||
def analyze(self, laparams):
|
||||
# textobjs is a list of LTChar objects, i.e.
|
||||
# it has all the individual characters in the page.
|
||||
|
@ -668,7 +668,7 @@ class LTFigure(LTLayoutContainer):
|
|||
def analyze(self, laparams):
|
||||
if not laparams.all_texts: return
|
||||
LTLayoutContainer.analyze(self, laparams)
|
||||
return
|
||||
return
|
||||
|
||||
|
||||
## LTPage
|
||||
|
|
|
@ -74,8 +74,8 @@ class PDFTextDevice(PDFDevice):
|
|||
seq, matrix, textstate.linematrix, font, fontsize,
|
||||
scaling, charspace, wordspace, rise, dxscale)
|
||||
return
|
||||
|
||||
def render_string_horizontal(self, seq, matrix, (x,y),
|
||||
|
||||
def render_string_horizontal(self, seq, matrix, (x,y),
|
||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||
needcharspace = False
|
||||
for obj in seq:
|
||||
|
@ -93,7 +93,7 @@ class PDFTextDevice(PDFDevice):
|
|||
needcharspace = True
|
||||
return (x, y)
|
||||
|
||||
def render_string_vertical(self, seq, matrix, (x,y),
|
||||
def render_string_vertical(self, seq, matrix, (x,y),
|
||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||
needcharspace = False
|
||||
for obj in seq:
|
||||
|
@ -104,7 +104,7 @@ class PDFTextDevice(PDFDevice):
|
|||
for cid in font.decode(obj):
|
||||
if needcharspace:
|
||||
y += charspace
|
||||
y += self.render_char(translate_matrix(matrix, (x,y)),
|
||||
y += self.render_char(translate_matrix(matrix, (x,y)),
|
||||
font, fontsize, scaling, rise, cid)
|
||||
if cid == 32 and wordspace:
|
||||
y += wordspace
|
||||
|
|
|
@ -260,7 +260,7 @@ class PDFDocument(object):
|
|||
doc = PDFDocument(parser)
|
||||
doc.initialize(password)
|
||||
obj = doc.getobj(objid)
|
||||
|
||||
|
||||
"""
|
||||
|
||||
debug = 0
|
||||
|
@ -425,7 +425,7 @@ class PDFDocument(object):
|
|||
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
|
||||
(_,obj) = self._parser.nextobject()
|
||||
return obj
|
||||
|
||||
|
||||
# can raise PDFObjectNotFound
|
||||
def getobj(self, objid):
|
||||
assert objid != 0
|
||||
|
|
|
@ -102,7 +102,7 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
except KeyError:
|
||||
pass
|
||||
return self._cid2unicode
|
||||
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
if token is self.KEYWORD_PUT:
|
||||
((_,key),(_,value)) = self.pop(2)
|
||||
|
@ -111,7 +111,7 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
self.add_results((key, literal_name(value)))
|
||||
return
|
||||
|
||||
|
||||
|
||||
## CFFFont
|
||||
## (Format specified in Adobe Technical Note: #5176
|
||||
## "The Compact Font Format Specification")
|
||||
|
|
|
@ -125,7 +125,7 @@ class PDFGraphicState(object):
|
|||
class PDFResourceManager(object):
|
||||
|
||||
"""Repository of shared resources.
|
||||
|
||||
|
||||
ResourceManager facilitates reuse of shared resources
|
||||
such as fonts and images so that large objects are not
|
||||
allocated multiple times.
|
||||
|
@ -725,7 +725,7 @@ class PDFPageInterpreter(object):
|
|||
interpreter = self.dup()
|
||||
bbox = list_value(xobj['BBox'])
|
||||
matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
|
||||
# According to PDF reference 1.7 section 4.9.1, XObjects in
|
||||
# According to PDF reference 1.7 section 4.9.1, XObjects in
|
||||
# earlier PDFs (prior to v1.2) use the page's Resources entry
|
||||
# instead of having their own Resources entry.
|
||||
resources = dict_value(xobj.get('Resources')) or self.resources.copy()
|
||||
|
|
|
@ -40,7 +40,7 @@ class PDFPage(object):
|
|||
|
||||
def __init__(self, doc, pageid, attrs):
|
||||
"""Initialize a page object.
|
||||
|
||||
|
||||
doc: a PDFDocument object.
|
||||
pageid: any Python object that can uniquely identify the page.
|
||||
attrs: a dictionary of page attributes.
|
||||
|
|
|
@ -35,7 +35,7 @@ class PDFParser(PSStackParser):
|
|||
parser.set_document(doc)
|
||||
parser.seek(offset)
|
||||
parser.nextobject()
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, fp):
|
||||
|
@ -57,10 +57,10 @@ class PDFParser(PSStackParser):
|
|||
KEYWORD_STARTXREF = KWD('startxref')
|
||||
def do_keyword(self, pos, token):
|
||||
"""Handles PDF-related keywords."""
|
||||
|
||||
|
||||
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
|
||||
self.add_results(*self.pop(1))
|
||||
|
||||
|
||||
elif token is self.KEYWORD_ENDOBJ:
|
||||
self.add_results(*self.pop(4))
|
||||
|
||||
|
@ -125,7 +125,7 @@ class PDFParser(PSStackParser):
|
|||
else:
|
||||
# others
|
||||
self.push((pos, token))
|
||||
|
||||
|
||||
return
|
||||
|
||||
|
||||
|
|
|
@ -68,7 +68,7 @@ def resolve1(x, default=None):
|
|||
|
||||
def resolve_all(x, default=None):
|
||||
"""Recursively resolves the given object and all the internals.
|
||||
|
||||
|
||||
Make sure there is no indirect reference within the nested object.
|
||||
This procedure might be slow.
|
||||
"""
|
||||
|
@ -180,13 +180,13 @@ class PDFStream(PDFObject):
|
|||
|
||||
def __contains__(self, name):
|
||||
return name in self.attrs
|
||||
|
||||
|
||||
def __getitem__(self, name):
|
||||
return self.attrs[name]
|
||||
|
||||
|
||||
def get(self, name, default=None):
|
||||
return self.attrs.get(name, default)
|
||||
|
||||
|
||||
def get_any(self, names, default=None):
|
||||
for name in names:
|
||||
if name in self.attrs:
|
||||
|
|
|
@ -32,7 +32,7 @@ class PSObject(object):
|
|||
class PSLiteral(PSObject):
|
||||
|
||||
"""A class that represents a PostScript literal.
|
||||
|
||||
|
||||
Postscript literals are used as identifiers, such as
|
||||
variable names, property names and dictionary keys.
|
||||
Literals are case sensitive and denoted by a preceding
|
||||
|
@ -55,11 +55,11 @@ class PSLiteral(PSObject):
|
|||
class PSKeyword(PSObject):
|
||||
|
||||
"""A class that represents a PostScript keyword.
|
||||
|
||||
|
||||
PostScript keywords are a dozen of predefined words.
|
||||
Commands and directives in PostScript are expressed by keywords.
|
||||
They are also used to denote the content boundaries.
|
||||
|
||||
|
||||
Note: Do not create an instance of PSKeyword directly.
|
||||
Always use PSKeywordTable.intern().
|
||||
"""
|
||||
|
@ -80,7 +80,7 @@ class PSSymbolTable(object):
|
|||
|
||||
Interned objects can be checked its identity with "is" operator.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, klass):
|
||||
self.dict = {}
|
||||
self.klass = klass
|
||||
|
@ -357,7 +357,7 @@ class PSBaseParser(object):
|
|||
pass
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
||||
|
||||
def _parse_float(self, s, i):
|
||||
m = END_NUMBER.search(s, i)
|
||||
if not m:
|
||||
|
@ -493,17 +493,17 @@ class PSStackParser(PSBaseParser):
|
|||
def push(self, *objs):
|
||||
self.curstack.extend(objs)
|
||||
return
|
||||
|
||||
|
||||
def pop(self, n):
|
||||
objs = self.curstack[-n:]
|
||||
self.curstack[-n:] = []
|
||||
return objs
|
||||
|
||||
|
||||
def popall(self):
|
||||
objs = self.curstack
|
||||
self.curstack = []
|
||||
return objs
|
||||
|
||||
|
||||
def add_results(self, *objs):
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'add_results: %r' % (objs,)
|
||||
|
@ -516,7 +516,7 @@ class PSStackParser(PSBaseParser):
|
|||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'start_type: pos=%r, type=%r' % (pos, type)
|
||||
return
|
||||
|
||||
|
||||
def end_type(self, type):
|
||||
if self.curtype != type:
|
||||
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
|
||||
|
|
|
@ -1060,7 +1060,7 @@ class RijndaelEncryptor(object):
|
|||
>>> RijndaelEncryptor(key, 128).encrypt(plaintext).encode('hex')
|
||||
'd8f532538289ef7d06b506a4fd5be9c9'
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, key, keybits=256):
|
||||
assert len(key) == KEYLENGTH(keybits)
|
||||
(self.rk, self.nrounds) = rijndaelSetupEncrypt(key, keybits)
|
||||
|
|
|
@ -255,7 +255,7 @@ class Plane(object):
|
|||
for obj in objs:
|
||||
self.add(obj)
|
||||
return
|
||||
|
||||
|
||||
# add(obj): place an object.
|
||||
def add(self, obj):
|
||||
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
|
||||
|
|
4
setup.py
4
setup.py
|
@ -7,9 +7,9 @@ setup(
|
|||
version=__version__,
|
||||
description='PDF parser and analyzer',
|
||||
long_description='''PDFMiner is a tool for extracting information from PDF documents.
|
||||
Unlike other PDF-related tools, it focuses entirely on getting
|
||||
Unlike other PDF-related tools, it focuses entirely on getting
|
||||
and analyzing text data. PDFMiner allows to obtain
|
||||
the exact location of texts in a page, as well as
|
||||
the exact location of texts in a page, as well as
|
||||
other information such as fonts or lines.
|
||||
It includes a PDF converter that can transform PDF files
|
||||
into other text formats (such as HTML). It has an extensible
|
||||
|
|
|
@ -50,7 +50,7 @@ class CMapConverter(object):
|
|||
assert values[0] == 'CID'
|
||||
encs = values
|
||||
continue
|
||||
|
||||
|
||||
def put(dmap, code, cid, force=False):
|
||||
for b in code[:-1]:
|
||||
b = ord(b)
|
||||
|
@ -64,7 +64,7 @@ class CMapConverter(object):
|
|||
if force or ((b not in dmap) or dmap[b] == cid):
|
||||
dmap[b] = cid
|
||||
return
|
||||
|
||||
|
||||
def add(unimap, enc, code):
|
||||
try:
|
||||
codec = self.enc2codec[enc]
|
||||
|
@ -78,20 +78,20 @@ class CMapConverter(object):
|
|||
except UnicodeError:
|
||||
pass
|
||||
return
|
||||
|
||||
|
||||
def pick(unimap):
|
||||
chars = unimap.items()
|
||||
chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True)
|
||||
(c,_) = chars[0]
|
||||
return c
|
||||
|
||||
|
||||
cid = int(values[0])
|
||||
unimap_h = {}
|
||||
unimap_v = {}
|
||||
for (enc,value) in zip(encs, values):
|
||||
if enc == 'CID': continue
|
||||
if value == '*': continue
|
||||
|
||||
|
||||
# hcodes, vcodes: encoded bytes for each writing mode.
|
||||
hcodes = []
|
||||
vcodes = []
|
||||
|
@ -121,7 +121,7 @@ class CMapConverter(object):
|
|||
for code in hcodes:
|
||||
put(hmap, code, cid)
|
||||
put(vmap, code, cid)
|
||||
|
||||
|
||||
# Determine the "most popular" candidate.
|
||||
if unimap_h:
|
||||
self.cid2unichr_h[cid] = pick(unimap_h)
|
||||
|
@ -137,7 +137,7 @@ class CMapConverter(object):
|
|||
)
|
||||
fp.write(pickle.dumps(data))
|
||||
return
|
||||
|
||||
|
||||
def dump_unicodemap(self, fp):
|
||||
data = dict(
|
||||
CID2UNICHR_H=self.cid2unichr_h,
|
||||
|
@ -151,7 +151,7 @@ def main(argv):
|
|||
import getopt
|
||||
import gzip
|
||||
import os.path
|
||||
|
||||
|
||||
def usage():
|
||||
print 'usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0]
|
||||
return 100
|
||||
|
|
|
@ -25,7 +25,7 @@ def dumpxml(out, obj, codec=None):
|
|||
if obj is None:
|
||||
out.write('<null />')
|
||||
return
|
||||
|
||||
|
||||
if isinstance(obj, dict):
|
||||
out.write('<dict size="%d">\n' % len(obj))
|
||||
for (k,v) in obj.iteritems():
|
||||
|
@ -179,7 +179,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
|
|||
out.write(fileobj.get_data())
|
||||
out.close()
|
||||
return
|
||||
|
||||
|
||||
fp = file(fname, 'rb')
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser)
|
||||
|
|
|
@ -14,7 +14,7 @@ This is an in-house mapping table for some Latin-1 characters
|
|||
LATIN2ASCII = {
|
||||
#0x00a0: '',
|
||||
#0x00a7: '',
|
||||
|
||||
|
||||
# iso-8859-1
|
||||
0x00c0: 'A`',
|
||||
0x00c1: "A'",
|
||||
|
|
|
@ -159,7 +159,7 @@ class WebApp(object):
|
|||
|
||||
def convert(self):
|
||||
self.form = cgi.FieldStorage(fp=self.infp, environ=self.environ)
|
||||
if (self.method != 'POST' or
|
||||
if (self.method != 'POST' or
|
||||
'c' not in self.form or
|
||||
'f' not in self.form):
|
||||
self.response_200()
|
||||
|
|
Loading…
Reference in New Issue