version 20090517

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@110 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-17 14:02:57 +00:00
parent 5c1cebadbb
commit 5c2a6d9b70
13 changed files with 50 additions and 48 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun May 17 15:39:06 JST 2009 Last Modified: Sun May 17 22:57:53 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -51,8 +51,8 @@ PDF parser that can be used for other purposes instead of text analysis.
<a name="source"></a> <a name="source"></a>
<p> <p>
<strong>Download:</strong><br> <strong>Download:</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz"> <a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz
</a> </a>
(1.8Mbytes) (1.8Mbytes)
@ -126,10 +126,11 @@ For example:
$ <strong>cd /usr/lib/python2.5/site-packages</strong> $ <strong>cd /usr/lib/python2.5/site-packages</strong>
$ <strong>tar jxf CMap.tar.bz2</strong> $ <strong>tar jxf CMap.tar.bz2</strong>
</pre></blockquote> </pre></blockquote>
<li> Do the following: (this is optional but highly recommended)<br> <li> Do the following. (this is optional, but highly recommended)<br>
<blockquote><pre> <blockquote><pre>
$ <strong>python -m pdfminer.cmap /usr/lib/python2.5/site-packages/CMap</strong> $ <strong>python -m pdfminer.cmap</strong>
</pre></blockquote> </pre></blockquote>
This may take several minutes.
</ol> </ol>
<a name="usage"></a> <a name="usage"></a>
@ -260,6 +261,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
<li> 2009/03/30: Text output mode added. <li> 2009/03/30: Text output mode added.
<li> 2009/03/25: Encoding problems fixed. Word splitting option added. <li> 2009/03/25: Encoding problems fixed. Word splitting option added.
<li> 2009/02/28: Robust handling of corrupted PDFs. Thanks to Troy Bollinger. <li> 2009/02/28: Robust handling of corrupted PDFs. Thanks to Troy Bollinger.

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python
__version__ = '20090330' __version__ = '20090517'
if __name__ == '__main__': print __version__ if __name__ == '__main__': print __version__

View File

@ -2,14 +2,17 @@
import sys, re, os, os.path import sys, re, os, os.path
stderr = sys.stderr stderr = sys.stderr
from struct import pack, unpack from struct import pack, unpack
from utils import choplist, nunpack from pdfminer.utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ from pdfminer.fontmetrics import FONT_METRICS
from pdfminer.latin_enc import ENCODING
from pdfminer.glyphlist import charname2unicode
from pdfminer.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \ PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser PSStackParser
try: try:
import cdb import cdb
except ImportError: except ImportError:
import pycdb as cdb import pdfminer.pycdb as cdb
class CMapError(Exception): pass class CMapError(Exception): pass
@ -28,7 +31,6 @@ def find_cmap_path():
STRIP_NAME = re.compile(r'[0-9]+') STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name): def name2unicode(name):
from glyphlist import charname2unicode
if name in charname2unicode: if name in charname2unicode:
return charname2unicode[name] return charname2unicode[name]
m = STRIP_NAME.search(name) m = STRIP_NAME.search(name)
@ -360,19 +362,16 @@ class CMapParser(PSStackParser):
## FontMetricsDB ## FontMetricsDB
## ##
class FontMetricsDB(object): class FontMetricsDB(object):
from fontmetrics import FONT_METRICS
@classmethod @classmethod
def get_metrics(klass, fontname): def get_metrics(klass, fontname):
return klass.FONT_METRICS[fontname] return FONT_METRICS[fontname]
## EncodingDB ## EncodingDB
## ##
class EncodingDB(object): class EncodingDB(object):
from latin_enc import ENCODING
std2unicode = {} std2unicode = {}
mac2unicode = {} mac2unicode = {}
win2unicode = {} win2unicode = {}
@ -447,8 +446,10 @@ def main(argv):
(opts, args) = getopt.getopt(argv[1:], 'C:D:f') (opts, args) = getopt.getopt(argv[1:], 'C:D:f')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: usage() if args:
cmapdir = args.pop(0) cmapdir = args.pop(0)
else:
cmapdir = find_cmap_path()
outputdir = cmapdir outputdir = cmapdir
force = False force = False
for (k, v) in opts: for (k, v) in opts:
@ -456,9 +457,11 @@ def main(argv):
elif k == '-C': cmapdir = v elif k == '-C': cmapdir = v
elif k == '-D': outputdir = v elif k == '-D': outputdir = v
if not os.path.isdir(cmapdir): if not os.path.isdir(cmapdir):
raise ValueError('not directory: %r' % cmapdir) print >>stderr, 'directory does not exist: %r' % cmapdir
return 111
if not os.path.isdir(outputdir): if not os.path.isdir(outputdir):
raise ValueError('not directory: %r' % outputdir) print >>stderr, 'directory does not exist: %r' % outputdir
return 111
return convert_cmap(cmapdir, outputdir, force=force) return convert_cmap(cmapdir, outputdir, force=force)
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -1,9 +1,9 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from pdfdevice import PDFDevice from pdfminer.pdfdevice import PDFDevice
from pdffont import PDFUnicodeNotDefined from pdfminer.pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
from utils import mult_matrix, translate_matrix, apply_matrix_pt, enc from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
## PDFPageAggregator ## PDFPageAggregator

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from utils import apply_matrix_norm from pdfminer.utils import apply_matrix_norm
INF = sys.maxint INF = sys.maxint

View File

@ -2,6 +2,7 @@
import sys import sys
stderr = sys.stderr stderr = sys.stderr
## LZWDecoder ## LZWDecoder
## ##
class LZWDecoder(object): class LZWDecoder(object):

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from psparser import PSLiteralTable from pdfminer.psparser import PSLiteralTable
## PDFColorSpace ## PDFColorSpace

View File

@ -5,13 +5,13 @@ try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
from psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \ from pdfminer.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
literal_name, keyword_name, STRICT literal_name, keyword_name, STRICT
from pdftypes import PDFException, \ from pdfminer.pdftypes import PDFException, \
resolve1, int_value, float_value, num_value, \ resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value str_value, list_value, dict_value, stream_value
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB from pdfminer.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from utils import apply_matrix_norm, nunpack from pdfminer.utils import apply_matrix_norm, nunpack
## CFFFont ## CFFFont

View File

@ -6,18 +6,18 @@ try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
from psparser import PSException, PSTypeError, PSEOF, \ from pdfminer.psparser import PSException, PSTypeError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
PSStackParser, PSKeyword, STRICT PSStackParser, PSKeyword, STRICT
from pdftypes import PDFException, PDFStream, PDFObjRef, \ from pdfminer.pdftypes import PDFException, PDFStream, PDFObjRef, \
resolve1, int_value, float_value, num_value, \ resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value str_value, list_value, dict_value, stream_value
from utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY from pdfminer.utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont from pdfminer.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \ from pdfminer.pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
from cmap import CMapDB from pdfminer.cmap import CMapDB
## Exceptions ## Exceptions

View File

@ -7,12 +7,11 @@
import sys, re import sys, re
import md5, struct import md5, struct
stderr = sys.stderr stderr = sys.stderr
from utils import choplist, nunpack, decode_text from pdfminer.utils import choplist, nunpack, decode_text
from arcfour import Arcfour from pdfminer.arcfour import Arcfour
from psparser import PSStackParser, PSSyntaxError, PSEOF, \ from pdfminer.psparser import PSStackParser, PSSyntaxError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name, STRICT
STRICT from pdfminer.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
PDFStream, PDFObjRef, resolve1, decipher_all, \ PDFStream, PDFObjRef, resolve1, decipher_all, \
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value int_value, float_value, num_value, str_value, list_value, dict_value, stream_value

View File

@ -1,8 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys, zlib import sys, zlib
stderr = sys.stderr from pdfminer.lzw import LZWDecoder
from lzw import LZWDecoder from pdfminer.psparser import PSException, PSObject, \
from psparser import PSException, PSObject, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, STRICT literal_name, keyword_name, STRICT

View File

@ -1,8 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys, re import sys, re
stderr = sys.stderr stderr = sys.stderr
from pdfminer.utils import choplist
from utils import choplist
STRICT = 0 STRICT = 0

View File

@ -1,8 +1,7 @@
# GNUMakefile for test # GNUMakefile for test
PYTHON=python PYTHON=python
CMAPDIR=../CMap PDF2TXT=$(PYTHON) ../tools/pdf2txt.py
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR)
HTMLS= \ HTMLS= \
simple1.html \ simple1.html \