From fc453e206128b26fcdabeb87e07a642a2aa15363 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 14 Jun 2009 08:54:57 +0000 Subject: [PATCH] tidy up a bit git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@114 1aa58f4a-7d42-0410-adbc-911cccaed67c --- Makefile | 8 +- README.html | 10 +- pdfminer/converter.py | 4 + pdfminer/psparser.py | 438 ++++++++++++++++++++++-------------------- 4 files changed, 240 insertions(+), 220 deletions(-) diff --git a/Makefile b/Makefile index 2b23c19..6010a59 100644 --- a/Makefile +++ b/Makefile @@ -19,12 +19,12 @@ install: clean: -rm -rf build - -cd $(PACKAGE) && make clean - -cd tools && make clean - -cd samples && make clean + -cd $(PACKAGE) && $(MAKE) clean + -cd tools && $(MAKE) clean + -cd samples && $(MAKE) clean test: - cd samples && make test + cd samples && $(MAKE) test # Maintainance: commit: clean diff --git a/README.html b/README.html index 58845be..53ad545 100644 --- a/README.html +++ b/README.html @@ -18,7 +18,7 @@ Python PDF parser and analyzer
-Last Modified: Sun May 17 23:10:42 JST 2009 +Last Modified: Sat May 23 10:06:04 JST 2009
@@ -146,7 +146,7 @@ PDFMiner comes with two handy tools:

pdf2txt.py extracts text contents from a PDF file. It extracts all the texts that are to be rendered programatically, -i.e. it cannot extract texts drawn as images that require optical character recognition. +It cannot recognize texts drawn as images that would require optical character recognition. It also extracts the corresponding locations, font names, font sizes, writing direction (horizontal or vertical) for each text portion. You need to provide a password for protected PDF documents when its access is restricted. @@ -243,9 +243,9 @@ Note that page numbers start from one. Because the contents of stream objects can be very large, they are omitted when none of the options above is specified.

-With -r option, all the stream contents are dumped without decoding. -With -b option, the contents are dumped as a binary blob. -With -t option, the contents are dumped in a text format, +With -r option, the "raw" stream contents are dumped without decompression. +With -b option, the decompressed contents are dumped as a binary blob. +With -t option, the decompressed contents are dumped in a text format, similar to repr() manner. When -r or -b option is given, no stream header is displayed for the ease of saving it to a file. diff --git a/pdfminer/converter.py b/pdfminer/converter.py index b9510cc..ad2a47f 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -303,6 +303,10 @@ class TextConverter(PDFConverter): self.word_margin = word_margin return + def write(self, text): + self.outfp.write(text.encode(self.codec, 'ignore')) + return + def end_page(self, page): def render(item): if isinstance(item, LTText): diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index cf3e43f..4114072 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -18,17 +18,25 @@ class PSValueError(PSException): pass ## Basic PostScript Types ## -# PSLiteral +## PSObject +## +## Base class for all PS or PDF-related data types. +## class PSObject(object): pass + +## PSLiteral +## +## PostScript literals are used as identifiers, such as +## variable names, property names and dictionary keys. +## Literals are case sensitive and denoted by a preceding +## slash sign (e.g. "/Name") +## +## Note: Never create an instance of PSLiteral by hand. +## Always use PSLiteralTable.intern(). +## class PSLiteral(PSObject): - ''' - PS literals (e.g. "/Name"). - Caution: Never create these objects directly. - Use PSLiteralTable.intern() instead. - ''' - def __init__(self, name): self.name = name return @@ -36,15 +44,14 @@ class PSLiteral(PSObject): def __repr__(self): return '/%s' % self.name -# PSKeyword + +## PSKeyword +## +## Note: Never create an instance of PSKeyword by hand. 
+## Always use PSKeywordTable.intern(). +## class PSKeyword(PSObject): - ''' - PS keywords (e.g. "showpage"). - Caution: Never create these objects directly. - Use PSKeywordTable.intern() instead. - ''' - def __init__(self, name): self.name = name return @@ -52,23 +59,27 @@ class PSKeyword(PSObject): def __repr__(self): return self.name -# PSSymbolTable + +## PSSymbolTable +## +## A dictionary-like object that is used for +## storing PSLiteral/PSKeyword objects so that +## an object that has the same name can never be defined +## twice and it is always assured that the same name is +## referred to as the same PSLiteral/PSKeyword object. +## class PSSymbolTable(object): - ''' - Symbol table that stores PSLiteral or PSKeyword. - ''' - - def __init__(self, classe): + def __init__(self, klass): self.dic = {} - self.classe = classe + self.klass = klass return def intern(self, name): if name in self.dic: lit = self.dic[name] else: - lit = self.classe(name) + lit = self.klass(name) self.dic[name] = lit return lit @@ -118,7 +129,7 @@ ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 } class PSBaseParser(object): ''' - Most basic PostScript parser that performs only basic tokenization. + Most basic PostScript parser that performs only tokenization. 
''' BUFSIZ = 4096 @@ -163,8 +174,10 @@ class PSBaseParser(object): self.buf = '' self.charpos = 0 # reset the status for nexttoken() - self.parse1 = self.parse_main - self.tokens = [] + self._parse1 = self._parse_main + self._curtoken = '' + self._curtokenpos = 0 + self._tokens = [] return def fillbuf(self): @@ -177,192 +190,6 @@ class PSBaseParser(object): self.charpos = 0 return - def parse_main(self, s, i): - m = NONSPC.search(s, i) - if not m: - return (self.parse_main, len(s)) - j = m.start(0) - c = s[j] - self.tokenstart = self.bufpos+j - if c == '%': - self.token = '%' - return (self.parse_comment, j+1) - if c == '/': - self.token = '' - return (self.parse_literal, j+1) - if c in '-+' or c.isdigit(): - self.token = c - return (self.parse_number, j+1) - if c == '.': - self.token = c - return (self.parse_float, j+1) - if c.isalpha(): - self.token = c - return (self.parse_keyword, j+1) - if c == '(': - self.token = '' - self.paren = 1 - return (self.parse_string, j+1) - if c == '<': - self.token = '' - return (self.parse_wopen, j+1) - if c == '>': - self.token = '' - return (self.parse_wclose, j+1) - self.add_token(KWD(c)) - return (self.parse_main, j+1) - - def add_token(self, obj): - self.tokens.append((self.tokenstart, obj)) - return - - def parse_comment(self, s, i): - m = EOL.search(s, i) - if not m: - self.token += s[i:] - return (self.parse_comment, len(s)) - j = m.start(0) - self.token += s[i:j] - # We ignore comments. 
- #self.tokens.append(self.token) - return (self.parse_main, j) - - def parse_literal(self, s, i): - m = END_LITERAL.search(s, i) - if not m: - self.token += s[i:] - return (self.parse_literal, len(s)) - j = m.start(0) - self.token += s[i:j] - c = s[j] - if c == '#': - self.hex = '' - return (self.parse_literal_hex, j+1) - self.add_token(LIT(self.token)) - return (self.parse_main, j) - - def parse_literal_hex(self, s, i): - c = s[i] - if HEX.match(c) and len(self.hex) < 2: - self.hex += c - return (self.parse_literal_hex, i+1) - if self.hex: - self.token += chr(int(self.hex, 16)) - return (self.parse_literal, i) - - def parse_number(self, s, i): - m = END_NUMBER.search(s, i) - if not m: - self.token += s[i:] - return (self.parse_number, len(s)) - j = m.start(0) - self.token += s[i:j] - c = s[j] - if c == '.': - self.token += c - return (self.parse_float, j+1) - try: - self.add_token(int(self.token)) - except ValueError: - pass - return (self.parse_main, j) - def parse_float(self, s, i): - m = END_NUMBER.search(s, i) - if not m: - self.token += s[i:] - return (self.parse_float, len(s)) - j = m.start(0) - self.token += s[i:j] - self.add_token(float(self.token)) - return (self.parse_main, j) - - def parse_keyword(self, s, i): - m = END_KEYWORD.search(s, i) - if not m: - self.token += s[i:] - return (self.parse_keyword, len(s)) - j = m.start(0) - self.token += s[i:j] - if self.token == 'true': - token = True - elif self.token == 'false': - token = False - else: - token = KWD(self.token) - self.add_token(token) - return (self.parse_main, j) - - def parse_string(self, s, i): - m = END_STRING.search(s, i) - if not m: - self.token += s[i:] - return (self.parse_string, len(s)) - j = m.start(0) - self.token += s[i:j] - c = s[j] - if c == '\\': - self.oct = '' - return (self.parse_string_1, j+1) - if c == '(': - self.paren += 1 - self.token += c - return (self.parse_string, j+1) - if c == ')': - self.paren -= 1 - if self.paren: # WTF, they said balanced parens need no special 
treatment. - self.token += c - return (self.parse_string, j+1) - self.add_token(self.token) - return (self.parse_main, j+1) - def parse_string_1(self, s, i): - c = s[i] - if OCT_STRING.match(c) and len(self.oct) < 3: - self.oct += c - return (self.parse_string_1, i+1) - if self.oct: - self.token += chr(int(self.oct, 8)) - return (self.parse_string, i) - if c in ESC_STRING: - self.token += chr(ESC_STRING[c]) - return (self.parse_string, i+1) - - def parse_wopen(self, s, i): - c = s[i] - if c.isspace() or HEX.match(c): - return (self.parse_hexstring, i) - if c == '<': - self.add_token(KEYWORD_DICT_BEGIN) - i += 1 - return (self.parse_main, i) - - def parse_wclose(self, s, i): - c = s[i] - if c == '>': - self.add_token(KEYWORD_DICT_END) - i += 1 - return (self.parse_main, i) - - def parse_hexstring(self, s, i): - m = END_HEX_STRING.search(s, i) - if not m: - self.token += s[i:] - return (self.parse_hexstring, len(s)) - j = m.start(0) - self.token += s[i:j] - token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)), - SPC.sub('', self.token)) - self.add_token(token) - return (self.parse_main, j) - - def nexttoken(self): - while not self.tokens: - self.fillbuf() - (self.parse1, self.charpos) = self.parse1(self.buf, self.charpos) - token = self.tokens.pop(0) - if 2 <= self.debug: - print >>stderr, 'nexttoken: %r' % (token,) - return token - def nextline(self): ''' Fetches a next line that ends either with \\r or \\n. 
@@ -417,6 +244,195 @@ class PSBaseParser(object): buf = '' return + def _parse_main(self, s, i): + m = NONSPC.search(s, i) + if not m: + return (self._parse_main, len(s)) + j = m.start(0) + c = s[j] + self._curtokenpos = self.bufpos+j + if c == '%': + self._curtoken = '%' + return (self._parse_comment, j+1) + elif c == '/': + self._curtoken = '' + return (self._parse_literal, j+1) + elif c in '-+' or c.isdigit(): + self._curtoken = c + return (self._parse_number, j+1) + elif c == '.': + self._curtoken = c + return (self._parse_float, j+1) + elif c.isalpha(): + self._curtoken = c + return (self._parse_keyword, j+1) + elif c == '(': + self._curtoken = '' + self.paren = 1 + return (self._parse_string, j+1) + elif c == '<': + self._curtoken = '' + return (self._parse_wopen, j+1) + elif c == '>': + self._curtoken = '' + return (self._parse_wclose, j+1) + else: + self._add_token(KWD(c)) + return (self._parse_main, j+1) + + def _add_token(self, obj): + self._tokens.append((self._curtokenpos, obj)) + return + + def _parse_comment(self, s, i): + m = EOL.search(s, i) + if not m: + self._curtoken += s[i:] + return (self._parse_comment, len(s)) + j = m.start(0) + self._curtoken += s[i:j] + # We ignore comments. 
+ #self._tokens.append(self._curtoken) + return (self._parse_main, j) + + def _parse_literal(self, s, i): + m = END_LITERAL.search(s, i) + if not m: + self._curtoken += s[i:] + return (self._parse_literal, len(s)) + j = m.start(0) + self._curtoken += s[i:j] + c = s[j] + if c == '#': + self.hex = '' + return (self._parse_literal_hex, j+1) + self._add_token(LIT(self._curtoken)) + return (self._parse_main, j) + + def _parse_literal_hex(self, s, i): + c = s[i] + if HEX.match(c) and len(self.hex) < 2: + self.hex += c + return (self._parse_literal_hex, i+1) + if self.hex: + self._curtoken += chr(int(self.hex, 16)) + return (self._parse_literal, i) + + def _parse_number(self, s, i): + m = END_NUMBER.search(s, i) + if not m: + self._curtoken += s[i:] + return (self._parse_number, len(s)) + j = m.start(0) + self._curtoken += s[i:j] + c = s[j] + if c == '.': + self._curtoken += c + return (self._parse_float, j+1) + try: + self._add_token(int(self._curtoken)) + except ValueError: + pass + return (self._parse_main, j) + + def _parse_float(self, s, i): + m = END_NUMBER.search(s, i) + if not m: + self._curtoken += s[i:] + return (self._parse_float, len(s)) + j = m.start(0) + self._curtoken += s[i:j] + self._add_token(float(self._curtoken)) + return (self._parse_main, j) + + def _parse_keyword(self, s, i): + m = END_KEYWORD.search(s, i) + if not m: + self._curtoken += s[i:] + return (self._parse_keyword, len(s)) + j = m.start(0) + self._curtoken += s[i:j] + if self._curtoken == 'true': + token = True + elif self._curtoken == 'false': + token = False + else: + token = KWD(self._curtoken) + self._add_token(token) + return (self._parse_main, j) + + def _parse_string(self, s, i): + m = END_STRING.search(s, i) + if not m: + self._curtoken += s[i:] + return (self._parse_string, len(s)) + j = m.start(0) + self._curtoken += s[i:j] + c = s[j] + if c == '\\': + self.oct = '' + return (self._parse_string_1, j+1) + if c == '(': + self.paren += 1 + self._curtoken += c + return 
(self._parse_string, j+1) + if c == ')': + self.paren -= 1 + if self.paren: # WTF, they said balanced parens need no special treatment. + self._curtoken += c + return (self._parse_string, j+1) + self._add_token(self._curtoken) + return (self._parse_main, j+1) + + def _parse_string_1(self, s, i): + c = s[i] + if OCT_STRING.match(c) and len(self.oct) < 3: + self.oct += c + return (self._parse_string_1, i+1) + if self.oct: + self._curtoken += chr(int(self.oct, 8)) + return (self._parse_string, i) + if c in ESC_STRING: + self._curtoken += chr(ESC_STRING[c]) + return (self._parse_string, i+1) + + def _parse_wopen(self, s, i): + c = s[i] + if c.isspace() or HEX.match(c): + return (self._parse_hexstring, i) + if c == '<': + self._add_token(KEYWORD_DICT_BEGIN) + i += 1 + return (self._parse_main, i) + + def _parse_wclose(self, s, i): + c = s[i] + if c == '>': + self._add_token(KEYWORD_DICT_END) + i += 1 + return (self._parse_main, i) + + def _parse_hexstring(self, s, i): + m = END_HEX_STRING.search(s, i) + if not m: + self._curtoken += s[i:] + return (self._parse_hexstring, len(s)) + j = m.start(0) + self._curtoken += s[i:j] + token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)), + SPC.sub('', self._curtoken)) + self._add_token(token) + return (self._parse_main, j) + + def nexttoken(self): + while not self._tokens: + self.fillbuf() + (self._parse1, self.charpos) = self._parse1(self.buf, self.charpos) + token = self._tokens.pop(0) + if 2 <= self.debug: + print >>stderr, 'nexttoken: %r' % (token,) + return token + ## PSStackParser ##