version 20090517
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@110 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
5c1cebadbb
commit
5c2a6d9b70
12
README.html
12
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun May 17 15:39:06 JST 2009
|
Last Modified: Sun May 17 22:57:53 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -51,8 +51,8 @@ PDF parser that can be used for other purpoes instead of text analysis.
|
||||||
<a name="source"></a>
|
<a name="source"></a>
|
||||||
<p>
|
<p>
|
||||||
<strong>Download:</strong><br>
|
<strong>Download:</strong><br>
|
||||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz">
|
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz">
|
||||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz
|
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz
|
||||||
</a>
|
</a>
|
||||||
(1.8Mbytes)
|
(1.8Mbytes)
|
||||||
|
|
||||||
|
@ -126,10 +126,11 @@ For example:
|
||||||
$ <strong>cd /usr/lib/python2.5/site-packages</strong>
|
$ <strong>cd /usr/lib/python2.5/site-packages</strong>
|
||||||
$ <strong>tar jxf CMap.tar.bz2</strong>
|
$ <strong>tar jxf CMap.tar.bz2</strong>
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
<li> Do the follwoing: (this is optional but highly recommended)<br>
|
<li> Do the follwoing. (this is optional, but highly recommended)<br>
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
$ <strong>python -m pdfminer.cmap /usr/lib/python2.5/site-packages/CMap</strong>
|
$ <strong>python -m pdfminer.cmap</strong>
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
|
This may take several minutes.
|
||||||
</ol>
|
</ol>
|
||||||
|
|
||||||
<a name="usage"></a>
|
<a name="usage"></a>
|
||||||
|
@ -260,6 +261,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
|
||||||
<li> 2009/03/30: Text output mode added.
|
<li> 2009/03/30: Text output mode added.
|
||||||
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
|
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
|
||||||
<li> 2009/02/28: Robust handling of corrupted PDFs. Thanks to Troy Bollinger.
|
<li> 2009/02/28: Robust handling of corrupted PDFs. Thanks to Troy Bollinger.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
__version__ = '20090330'
|
__version__ = '20090517'
|
||||||
|
|
||||||
if __name__ == '__main__': print __version__
|
if __name__ == '__main__': print __version__
|
||||||
|
|
|
@ -2,14 +2,17 @@
|
||||||
import sys, re, os, os.path
|
import sys, re, os, os.path
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
from utils import choplist, nunpack
|
from pdfminer.utils import choplist, nunpack
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
from pdfminer.fontmetrics import FONT_METRICS
|
||||||
|
from pdfminer.latin_enc import ENCODING
|
||||||
|
from pdfminer.glyphlist import charname2unicode
|
||||||
|
from pdfminer.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
||||||
PSStackParser
|
PSStackParser
|
||||||
try:
|
try:
|
||||||
import cdb
|
import cdb
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import pycdb as cdb
|
import pdfminer.pycdb as cdb
|
||||||
|
|
||||||
|
|
||||||
class CMapError(Exception): pass
|
class CMapError(Exception): pass
|
||||||
|
@ -28,7 +31,6 @@ def find_cmap_path():
|
||||||
|
|
||||||
STRIP_NAME = re.compile(r'[0-9]+')
|
STRIP_NAME = re.compile(r'[0-9]+')
|
||||||
def name2unicode(name):
|
def name2unicode(name):
|
||||||
from glyphlist import charname2unicode
|
|
||||||
if name in charname2unicode:
|
if name in charname2unicode:
|
||||||
return charname2unicode[name]
|
return charname2unicode[name]
|
||||||
m = STRIP_NAME.search(name)
|
m = STRIP_NAME.search(name)
|
||||||
|
@ -360,19 +362,16 @@ class CMapParser(PSStackParser):
|
||||||
## FontMetricsDB
|
## FontMetricsDB
|
||||||
##
|
##
|
||||||
class FontMetricsDB(object):
|
class FontMetricsDB(object):
|
||||||
from fontmetrics import FONT_METRICS
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_metrics(klass, fontname):
|
def get_metrics(klass, fontname):
|
||||||
return klass.FONT_METRICS[fontname]
|
return FONT_METRICS[fontname]
|
||||||
|
|
||||||
|
|
||||||
## EncodingDB
|
## EncodingDB
|
||||||
##
|
##
|
||||||
class EncodingDB(object):
|
class EncodingDB(object):
|
||||||
|
|
||||||
from latin_enc import ENCODING
|
|
||||||
|
|
||||||
std2unicode = {}
|
std2unicode = {}
|
||||||
mac2unicode = {}
|
mac2unicode = {}
|
||||||
win2unicode = {}
|
win2unicode = {}
|
||||||
|
@ -447,8 +446,10 @@ def main(argv):
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'C:D:f')
|
(opts, args) = getopt.getopt(argv[1:], 'C:D:f')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: usage()
|
if args:
|
||||||
cmapdir = args.pop(0)
|
cmapdir = args.pop(0)
|
||||||
|
else:
|
||||||
|
cmapdir = find_cmap_path()
|
||||||
outputdir = cmapdir
|
outputdir = cmapdir
|
||||||
force = False
|
force = False
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
|
@ -456,9 +457,11 @@ def main(argv):
|
||||||
elif k == '-C': cmapdir = v
|
elif k == '-C': cmapdir = v
|
||||||
elif k == '-D': outputdir = v
|
elif k == '-D': outputdir = v
|
||||||
if not os.path.isdir(cmapdir):
|
if not os.path.isdir(cmapdir):
|
||||||
raise ValueError('not directory: %r' % cmapdir)
|
print >>stderr, 'directory does not exist: %r' % cmapdir
|
||||||
|
return 111
|
||||||
if not os.path.isdir(outputdir):
|
if not os.path.isdir(outputdir):
|
||||||
raise ValueError('not directory: %r' % outputdir)
|
print >>stderr, 'directory does not exist: %r' % outputdir
|
||||||
|
return 111
|
||||||
return convert_cmap(cmapdir, outputdir, force=force)
|
return convert_cmap(cmapdir, outputdir, force=force)
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
from pdfdevice import PDFDevice
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdfminer.pdffont import PDFUnicodeNotDefined
|
||||||
from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
|
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
|
||||||
from utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||||
|
|
||||||
|
|
||||||
## PDFPageAggregator
|
## PDFPageAggregator
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
from utils import apply_matrix_norm
|
from pdfminer.utils import apply_matrix_norm
|
||||||
INF = sys.maxint
|
INF = sys.maxint
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
import sys
|
import sys
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
|
|
||||||
|
|
||||||
## LZWDecoder
|
## LZWDecoder
|
||||||
##
|
##
|
||||||
class LZWDecoder(object):
|
class LZWDecoder(object):
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
from psparser import PSLiteralTable
|
from pdfminer.psparser import PSLiteralTable
|
||||||
|
|
||||||
|
|
||||||
## PDFColorSpace
|
## PDFColorSpace
|
||||||
|
|
|
@ -5,13 +5,13 @@ try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
|
from pdfminer.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
|
||||||
literal_name, keyword_name, STRICT
|
literal_name, keyword_name, STRICT
|
||||||
from pdftypes import PDFException, \
|
from pdfminer.pdftypes import PDFException, \
|
||||||
resolve1, int_value, float_value, num_value, \
|
resolve1, int_value, float_value, num_value, \
|
||||||
str_value, list_value, dict_value, stream_value
|
str_value, list_value, dict_value, stream_value
|
||||||
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
from pdfminer.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||||
from utils import apply_matrix_norm, nunpack
|
from pdfminer.utils import apply_matrix_norm, nunpack
|
||||||
|
|
||||||
|
|
||||||
## CFFFont
|
## CFFFont
|
||||||
|
|
|
@ -6,18 +6,18 @@ try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from psparser import PSException, PSTypeError, PSEOF, \
|
from pdfminer.psparser import PSException, PSTypeError, PSEOF, \
|
||||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
||||||
PSStackParser, PSKeyword, STRICT
|
PSStackParser, PSKeyword, STRICT
|
||||||
from pdftypes import PDFException, PDFStream, PDFObjRef, \
|
from pdfminer.pdftypes import PDFException, PDFStream, PDFObjRef, \
|
||||||
resolve1, int_value, float_value, num_value, \
|
resolve1, int_value, float_value, num_value, \
|
||||||
str_value, list_value, dict_value, stream_value
|
str_value, list_value, dict_value, stream_value
|
||||||
from utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
|
from pdfminer.utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
|
||||||
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
from pdfminer.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||||
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
|
from pdfminer.pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
|
||||||
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||||
from cmap import CMapDB
|
from pdfminer.cmap import CMapDB
|
||||||
|
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
|
|
|
@ -7,12 +7,11 @@
|
||||||
import sys, re
|
import sys, re
|
||||||
import md5, struct
|
import md5, struct
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from utils import choplist, nunpack, decode_text
|
from pdfminer.utils import choplist, nunpack, decode_text
|
||||||
from arcfour import Arcfour
|
from pdfminer.arcfour import Arcfour
|
||||||
from psparser import PSStackParser, PSSyntaxError, PSEOF, \
|
from pdfminer.psparser import PSStackParser, PSSyntaxError, PSEOF, \
|
||||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, STRICT
|
||||||
STRICT
|
from pdfminer.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
|
||||||
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
|
|
||||||
PDFStream, PDFObjRef, resolve1, decipher_all, \
|
PDFStream, PDFObjRef, resolve1, decipher_all, \
|
||||||
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value
|
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys, zlib
|
import sys, zlib
|
||||||
stderr = sys.stderr
|
from pdfminer.lzw import LZWDecoder
|
||||||
from lzw import LZWDecoder
|
from pdfminer.psparser import PSException, PSObject, \
|
||||||
from psparser import PSException, PSObject, \
|
|
||||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||||
literal_name, keyword_name, STRICT
|
literal_name, keyword_name, STRICT
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys, re
|
import sys, re
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
|
from pdfminer.utils import choplist
|
||||||
from utils import choplist
|
|
||||||
|
|
||||||
STRICT = 0
|
STRICT = 0
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
# GNUMakefile for test
|
# GNUMakefile for test
|
||||||
|
|
||||||
PYTHON=python
|
PYTHON=python
|
||||||
CMAPDIR=../CMap
|
PDF2TXT=$(PYTHON) ../tools/pdf2txt.py
|
||||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR)
|
|
||||||
|
|
||||||
HTMLS= \
|
HTMLS= \
|
||||||
simple1.html \
|
simple1.html \
|
||||||
|
|
Loading…
Reference in New Issue