tidy up a bit

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@114 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-06-14 08:54:57 +00:00
parent 290eccbcc7
commit fc453e2061
4 changed files with 240 additions and 220 deletions

View File

@ -19,12 +19,12 @@ install:
clean: clean:
-rm -rf build -rm -rf build
-cd $(PACKAGE) && make clean -cd $(PACKAGE) && $(MAKE) clean
-cd tools && make clean -cd tools && $(MAKE) clean
-cd samples && make clean -cd samples && $(MAKE) clean
test: test:
cd samples && make test cd samples && $(MAKE) test
# Maintenance: # Maintenance:
commit: clean commit: clean

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun May 17 23:10:42 JST 2009 Last Modified: Sat May 23 10:06:04 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -146,7 +146,7 @@ PDFMiner comes with two handy tools:
<p> <p>
<code>pdf2txt.py</code> extracts text contents from a PDF file. <code>pdf2txt.py</code> extracts text contents from a PDF file.
It extracts all the texts that are to be rendered programmatically, It extracts all the texts that are to be rendered programmatically,
i.e. it cannot extract texts drawn as images that require optical character recognition. It cannot recognize texts drawn as images that would require optical character recognition.
It also extracts the corresponding locations, font names, font sizes, writing It also extracts the corresponding locations, font names, font sizes, writing
direction (horizontal or vertical) for each text portion. direction (horizontal or vertical) for each text portion.
You need to provide a password for protected PDF documents when its access is restricted. You need to provide a password for protected PDF documents when its access is restricted.
@ -243,9 +243,9 @@ Note that page numbers start from one.
Because the contents of stream objects can be very large, Because the contents of stream objects can be very large,
they are omitted when none of the options above is specified. they are omitted when none of the options above is specified.
<p> <p>
With <code>-r</code> option, all the stream contents are dumped without decoding. With <code>-r</code> option, the "raw" stream contents are dumped without decompression.
With <code>-b</code> option, the contents are dumped as a binary blob. With <code>-b</code> option, the decompressed contents are dumped as a binary blob.
With <code>-t</code> option, the contents are dumped in a text format, With <code>-t</code> option, the decompressed contents are dumped in a text format,
in a manner similar to <code>repr()</code>. When in a manner similar to <code>repr()</code>. When
<code>-r</code> or <code>-b</code> option is given, <code>-r</code> or <code>-b</code> option is given,
no stream header is displayed for the ease of saving it to a file. no stream header is displayed for the ease of saving it to a file.

View File

@ -303,6 +303,10 @@ class TextConverter(PDFConverter):
self.word_margin = word_margin self.word_margin = word_margin
return return
def write(self, text):
self.outfp.write(text.encode(self.codec, 'ignore'))
return
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTText): if isinstance(item, LTText):

View File

@ -18,16 +18,24 @@ class PSValueError(PSException): pass
## Basic PostScript Types ## Basic PostScript Types
## ##
# PSLiteral ## PSObject
##
## Base class for all PS or PDF-related data types.
##
class PSObject(object): pass class PSObject(object): pass
class PSLiteral(PSObject):
''' ## PSLiteral
PS literals (e.g. "/Name"). ##
Caution: Never create these objects directly. ## PostScript literals are used as identifiers, such as
Use PSLiteralTable.intern() instead. ## variable names, property names and dictionary keys.
''' ## Literals are case sensitive and denoted by a preceding
## slash sign (e.g. "/Name")
##
## Note: Never create an instance of PSLiteral by hand.
## Always use PSLiteralTable.intern().
##
class PSLiteral(PSObject):
def __init__(self, name): def __init__(self, name):
self.name = name self.name = name
@ -36,14 +44,13 @@ class PSLiteral(PSObject):
def __repr__(self): def __repr__(self):
return '/%s' % self.name return '/%s' % self.name
# PSKeyword
class PSKeyword(PSObject):
''' ## PSKeyword
PS keywords (e.g. "showpage"). ##
Caution: Never create these objects directly. ## Note: Never create an instance of PSKeyword by hand.
Use PSKeywordTable.intern() instead. ## Always use PSKeywordTable.intern().
''' ##
class PSKeyword(PSObject):
def __init__(self, name): def __init__(self, name):
self.name = name self.name = name
@ -52,23 +59,27 @@ class PSKeyword(PSObject):
def __repr__(self): def __repr__(self):
return self.name return self.name
# PSSymbolTable
## PSSymbolTable
##
## A dictionary-like object that is used for
## storing PSLiteral/PSKeyword objects so that
## an object that has the same name can never be defined
## twice and it is always assured that the same name is
## referred to as the same PSLiteral/PSKeyword object.
##
class PSSymbolTable(object): class PSSymbolTable(object):
''' def __init__(self, klass):
Symbol table that stores PSLiteral or PSKeyword.
'''
def __init__(self, classe):
self.dic = {} self.dic = {}
self.classe = classe self.klass = klass
return return
def intern(self, name): def intern(self, name):
if name in self.dic: if name in self.dic:
lit = self.dic[name] lit = self.dic[name]
else: else:
lit = self.classe(name) lit = self.klass(name)
self.dic[name] = lit self.dic[name] = lit
return lit return lit
@ -118,7 +129,7 @@ ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
class PSBaseParser(object): class PSBaseParser(object):
''' '''
Most basic PostScript parser that performs only basic tokenization. Most basic PostScript parser that performs only tokenization.
''' '''
BUFSIZ = 4096 BUFSIZ = 4096
@ -163,8 +174,10 @@ class PSBaseParser(object):
self.buf = '' self.buf = ''
self.charpos = 0 self.charpos = 0
# reset the status for nexttoken() # reset the status for nexttoken()
self.parse1 = self.parse_main self._parse1 = self._parse_main
self.tokens = [] self._curtoken = ''
self._curtokenpos = 0
self._tokens = []
return return
def fillbuf(self): def fillbuf(self):
@ -177,192 +190,6 @@ class PSBaseParser(object):
self.charpos = 0 self.charpos = 0
return return
def parse_main(self, s, i):
m = NONSPC.search(s, i)
if not m:
return (self.parse_main, len(s))
j = m.start(0)
c = s[j]
self.tokenstart = self.bufpos+j
if c == '%':
self.token = '%'
return (self.parse_comment, j+1)
if c == '/':
self.token = ''
return (self.parse_literal, j+1)
if c in '-+' or c.isdigit():
self.token = c
return (self.parse_number, j+1)
if c == '.':
self.token = c
return (self.parse_float, j+1)
if c.isalpha():
self.token = c
return (self.parse_keyword, j+1)
if c == '(':
self.token = ''
self.paren = 1
return (self.parse_string, j+1)
if c == '<':
self.token = ''
return (self.parse_wopen, j+1)
if c == '>':
self.token = ''
return (self.parse_wclose, j+1)
self.add_token(KWD(c))
return (self.parse_main, j+1)
def add_token(self, obj):
self.tokens.append((self.tokenstart, obj))
return
def parse_comment(self, s, i):
m = EOL.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_comment, len(s))
j = m.start(0)
self.token += s[i:j]
# We ignore comments.
#self.tokens.append(self.token)
return (self.parse_main, j)
def parse_literal(self, s, i):
m = END_LITERAL.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_literal, len(s))
j = m.start(0)
self.token += s[i:j]
c = s[j]
if c == '#':
self.hex = ''
return (self.parse_literal_hex, j+1)
self.add_token(LIT(self.token))
return (self.parse_main, j)
def parse_literal_hex(self, s, i):
c = s[i]
if HEX.match(c) and len(self.hex) < 2:
self.hex += c
return (self.parse_literal_hex, i+1)
if self.hex:
self.token += chr(int(self.hex, 16))
return (self.parse_literal, i)
def parse_number(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_number, len(s))
j = m.start(0)
self.token += s[i:j]
c = s[j]
if c == '.':
self.token += c
return (self.parse_float, j+1)
try:
self.add_token(int(self.token))
except ValueError:
pass
return (self.parse_main, j)
def parse_float(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_float, len(s))
j = m.start(0)
self.token += s[i:j]
self.add_token(float(self.token))
return (self.parse_main, j)
def parse_keyword(self, s, i):
m = END_KEYWORD.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_keyword, len(s))
j = m.start(0)
self.token += s[i:j]
if self.token == 'true':
token = True
elif self.token == 'false':
token = False
else:
token = KWD(self.token)
self.add_token(token)
return (self.parse_main, j)
def parse_string(self, s, i):
m = END_STRING.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_string, len(s))
j = m.start(0)
self.token += s[i:j]
c = s[j]
if c == '\\':
self.oct = ''
return (self.parse_string_1, j+1)
if c == '(':
self.paren += 1
self.token += c
return (self.parse_string, j+1)
if c == ')':
self.paren -= 1
if self.paren: # WTF, they said balanced parens need no special treatment.
self.token += c
return (self.parse_string, j+1)
self.add_token(self.token)
return (self.parse_main, j+1)
def parse_string_1(self, s, i):
c = s[i]
if OCT_STRING.match(c) and len(self.oct) < 3:
self.oct += c
return (self.parse_string_1, i+1)
if self.oct:
self.token += chr(int(self.oct, 8))
return (self.parse_string, i)
if c in ESC_STRING:
self.token += chr(ESC_STRING[c])
return (self.parse_string, i+1)
def parse_wopen(self, s, i):
c = s[i]
if c.isspace() or HEX.match(c):
return (self.parse_hexstring, i)
if c == '<':
self.add_token(KEYWORD_DICT_BEGIN)
i += 1
return (self.parse_main, i)
def parse_wclose(self, s, i):
c = s[i]
if c == '>':
self.add_token(KEYWORD_DICT_END)
i += 1
return (self.parse_main, i)
def parse_hexstring(self, s, i):
m = END_HEX_STRING.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_hexstring, len(s))
j = m.start(0)
self.token += s[i:j]
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
SPC.sub('', self.token))
self.add_token(token)
return (self.parse_main, j)
def nexttoken(self):
while not self.tokens:
self.fillbuf()
(self.parse1, self.charpos) = self.parse1(self.buf, self.charpos)
token = self.tokens.pop(0)
if 2 <= self.debug:
print >>stderr, 'nexttoken: %r' % (token,)
return token
def nextline(self): def nextline(self):
''' '''
Fetches a next line that ends either with \\r or \\n. Fetches a next line that ends either with \\r or \\n.
@ -417,6 +244,195 @@ class PSBaseParser(object):
buf = '' buf = ''
return return
def _parse_main(self, s, i):
m = NONSPC.search(s, i)
if not m:
return (self._parse_main, len(s))
j = m.start(0)
c = s[j]
self._curtokenpos = self.bufpos+j
if c == '%':
self._curtoken = '%'
return (self._parse_comment, j+1)
elif c == '/':
self._curtoken = ''
return (self._parse_literal, j+1)
elif c in '-+' or c.isdigit():
self._curtoken = c
return (self._parse_number, j+1)
elif c == '.':
self._curtoken = c
return (self._parse_float, j+1)
elif c.isalpha():
self._curtoken = c
return (self._parse_keyword, j+1)
elif c == '(':
self._curtoken = ''
self.paren = 1
return (self._parse_string, j+1)
elif c == '<':
self._curtoken = ''
return (self._parse_wopen, j+1)
elif c == '>':
self._curtoken = ''
return (self._parse_wclose, j+1)
else:
self._add_token(KWD(c))
return (self._parse_main, j+1)
def _add_token(self, obj):
self._tokens.append((self._curtokenpos, obj))
return
def _parse_comment(self, s, i):
m = EOL.search(s, i)
if not m:
self._curtoken += s[i:]
return (self._parse_comment, len(s))
j = m.start(0)
self._curtoken += s[i:j]
# We ignore comments.
#self._tokens.append(self._curtoken)
return (self._parse_main, j)
def _parse_literal(self, s, i):
m = END_LITERAL.search(s, i)
if not m:
self._curtoken += s[i:]
return (self._parse_literal, len(s))
j = m.start(0)
self._curtoken += s[i:j]
c = s[j]
if c == '#':
self.hex = ''
return (self._parse_literal_hex, j+1)
self._add_token(LIT(self._curtoken))
return (self._parse_main, j)
def _parse_literal_hex(self, s, i):
c = s[i]
if HEX.match(c) and len(self.hex) < 2:
self.hex += c
return (self._parse_literal_hex, i+1)
if self.hex:
self._curtoken += chr(int(self.hex, 16))
return (self._parse_literal, i)
def _parse_number(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self._curtoken += s[i:]
return (self._parse_number, len(s))
j = m.start(0)
self._curtoken += s[i:j]
c = s[j]
if c == '.':
self._curtoken += c
return (self._parse_float, j+1)
try:
self._add_token(int(self._curtoken))
except ValueError:
pass
return (self._parse_main, j)
def _parse_float(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self._curtoken += s[i:]
return (self._parse_float, len(s))
j = m.start(0)
self._curtoken += s[i:j]
self._add_token(float(self._curtoken))
return (self._parse_main, j)
def _parse_keyword(self, s, i):
m = END_KEYWORD.search(s, i)
if not m:
self._curtoken += s[i:]
return (self._parse_keyword, len(s))
j = m.start(0)
self._curtoken += s[i:j]
if self._curtoken == 'true':
token = True
elif self._curtoken == 'false':
token = False
else:
token = KWD(self._curtoken)
self._add_token(token)
return (self._parse_main, j)
def _parse_string(self, s, i):
m = END_STRING.search(s, i)
if not m:
self._curtoken += s[i:]
return (self._parse_string, len(s))
j = m.start(0)
self._curtoken += s[i:j]
c = s[j]
if c == '\\':
self.oct = ''
return (self._parse_string_1, j+1)
if c == '(':
self.paren += 1
self._curtoken += c
return (self._parse_string, j+1)
if c == ')':
self.paren -= 1
if self.paren: # WTF, they said balanced parens need no special treatment.
self._curtoken += c
return (self._parse_string, j+1)
self._add_token(self._curtoken)
return (self._parse_main, j+1)
def _parse_string_1(self, s, i):
c = s[i]
if OCT_STRING.match(c) and len(self.oct) < 3:
self.oct += c
return (self._parse_string_1, i+1)
if self.oct:
self._curtoken += chr(int(self.oct, 8))
return (self._parse_string, i)
if c in ESC_STRING:
self._curtoken += chr(ESC_STRING[c])
return (self._parse_string, i+1)
def _parse_wopen(self, s, i):
c = s[i]
if c.isspace() or HEX.match(c):
return (self._parse_hexstring, i)
if c == '<':
self._add_token(KEYWORD_DICT_BEGIN)
i += 1
return (self._parse_main, i)
def _parse_wclose(self, s, i):
c = s[i]
if c == '>':
self._add_token(KEYWORD_DICT_END)
i += 1
return (self._parse_main, i)
def _parse_hexstring(self, s, i):
m = END_HEX_STRING.search(s, i)
if not m:
self._curtoken += s[i:]
return (self._parse_hexstring, len(s))
j = m.start(0)
self._curtoken += s[i:j]
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
SPC.sub('', self._curtoken))
self._add_token(token)
return (self._parse_main, j)
def nexttoken(self):
while not self._tokens:
self.fillbuf()
(self._parse1, self.charpos) = self._parse1(self.buf, self.charpos)
token = self._tokens.pop(0)
if 2 <= self.debug:
print >>stderr, 'nexttoken: %r' % (token,)
return token
## PSStackParser ## PSStackParser
## ##