version 20090517
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@110 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
5c1cebadbb
commit
5c2a6d9b70
12
README.html
12
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sun May 17 15:39:06 JST 2009
|
||||
Last Modified: Sun May 17 22:57:53 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -51,8 +51,8 @@ PDF parser that can be used for other purposes instead of text analysis.
|
|||
<a name="source"></a>
|
||||
<p>
|
||||
<strong>Download:</strong><br>
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz
|
||||
</a>
|
||||
(1.8Mbytes)
|
||||
|
||||
|
@ -126,10 +126,11 @@ For example:
|
|||
$ <strong>cd /usr/lib/python2.5/site-packages</strong>
|
||||
$ <strong>tar jxf CMap.tar.bz2</strong>
|
||||
</pre></blockquote>
|
||||
<li> Do the following: (this is optional but highly recommended)<br>
|
||||
<li> Do the following. (this is optional, but highly recommended)<br>
|
||||
<blockquote><pre>
|
||||
$ <strong>python -m pdfminer.cmap /usr/lib/python2.5/site-packages/CMap</strong>
|
||||
$ <strong>python -m pdfminer.cmap</strong>
|
||||
</pre></blockquote>
|
||||
This may take several minutes.
|
||||
</ol>
|
||||
|
||||
<a name="usage"></a>
|
||||
|
@ -260,6 +261,7 @@ no stream header is displayed for the ease of saving it to a file.
|
|||
<hr noshade>
|
||||
<h2>Changes</h2>
|
||||
<ul>
|
||||
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
|
||||
<li> 2009/03/30: Text output mode added.
|
||||
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
|
||||
<li> 2009/02/28: Robust handling of corrupted PDFs. Thanks to Troy Bollinger.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python
|
||||
__version__ = '20090330'
|
||||
__version__ = '20090517'
|
||||
|
||||
if __name__ == '__main__': print __version__
|
||||
|
|
|
@ -2,14 +2,17 @@
|
|||
import sys, re, os, os.path
|
||||
stderr = sys.stderr
|
||||
from struct import pack, unpack
|
||||
from utils import choplist, nunpack
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||
from pdfminer.utils import choplist, nunpack
|
||||
from pdfminer.fontmetrics import FONT_METRICS
|
||||
from pdfminer.latin_enc import ENCODING
|
||||
from pdfminer.glyphlist import charname2unicode
|
||||
from pdfminer.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
||||
PSStackParser
|
||||
try:
|
||||
import cdb
|
||||
except ImportError:
|
||||
import pycdb as cdb
|
||||
import pdfminer.pycdb as cdb
|
||||
|
||||
|
||||
class CMapError(Exception): pass
|
||||
|
@ -28,7 +31,6 @@ def find_cmap_path():
|
|||
|
||||
STRIP_NAME = re.compile(r'[0-9]+')
|
||||
def name2unicode(name):
|
||||
from glyphlist import charname2unicode
|
||||
if name in charname2unicode:
|
||||
return charname2unicode[name]
|
||||
m = STRIP_NAME.search(name)
|
||||
|
@ -360,19 +362,16 @@ class CMapParser(PSStackParser):
|
|||
## FontMetricsDB
|
||||
##
|
||||
class FontMetricsDB(object):
|
||||
from fontmetrics import FONT_METRICS
|
||||
|
||||
@classmethod
|
||||
def get_metrics(klass, fontname):
|
||||
return klass.FONT_METRICS[fontname]
|
||||
return FONT_METRICS[fontname]
|
||||
|
||||
|
||||
## EncodingDB
|
||||
##
|
||||
class EncodingDB(object):
|
||||
|
||||
from latin_enc import ENCODING
|
||||
|
||||
std2unicode = {}
|
||||
mac2unicode = {}
|
||||
win2unicode = {}
|
||||
|
@ -447,8 +446,10 @@ def main(argv):
|
|||
(opts, args) = getopt.getopt(argv[1:], 'C:D:f')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: usage()
|
||||
cmapdir = args.pop(0)
|
||||
if args:
|
||||
cmapdir = args.pop(0)
|
||||
else:
|
||||
cmapdir = find_cmap_path()
|
||||
outputdir = cmapdir
|
||||
force = False
|
||||
for (k, v) in opts:
|
||||
|
@ -456,9 +457,11 @@ def main(argv):
|
|||
elif k == '-C': cmapdir = v
|
||||
elif k == '-D': outputdir = v
|
||||
if not os.path.isdir(cmapdir):
|
||||
raise ValueError('not directory: %r' % cmapdir)
|
||||
print >>stderr, 'directory does not exist: %r' % cmapdir
|
||||
return 111
|
||||
if not os.path.isdir(outputdir):
|
||||
raise ValueError('not directory: %r' % outputdir)
|
||||
print >>stderr, 'directory does not exist: %r' % outputdir
|
||||
return 111
|
||||
return convert_cmap(cmapdir, outputdir, force=force)
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from pdfdevice import PDFDevice
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
|
||||
from utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.pdffont import PDFUnicodeNotDefined
|
||||
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
|
||||
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||
|
||||
|
||||
## PDFPageAggregator
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from utils import apply_matrix_norm
|
||||
from pdfminer.utils import apply_matrix_norm
|
||||
INF = sys.maxint
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
import sys
|
||||
stderr = sys.stderr
|
||||
|
||||
|
||||
## LZWDecoder
|
||||
##
|
||||
class LZWDecoder(object):
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from psparser import PSLiteralTable
|
||||
from pdfminer.psparser import PSLiteralTable
|
||||
|
||||
|
||||
## PDFColorSpace
|
||||
|
|
|
@ -5,13 +5,13 @@ try:
|
|||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
|
||||
from pdfminer.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
|
||||
literal_name, keyword_name, STRICT
|
||||
from pdftypes import PDFException, \
|
||||
from pdfminer.pdftypes import PDFException, \
|
||||
resolve1, int_value, float_value, num_value, \
|
||||
str_value, list_value, dict_value, stream_value
|
||||
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||
from utils import apply_matrix_norm, nunpack
|
||||
from pdfminer.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||
from pdfminer.utils import apply_matrix_norm, nunpack
|
||||
|
||||
|
||||
## CFFFont
|
||||
|
|
|
@ -6,18 +6,18 @@ try:
|
|||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from psparser import PSException, PSTypeError, PSEOF, \
|
||||
from pdfminer.psparser import PSException, PSTypeError, PSEOF, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
||||
PSStackParser, PSKeyword, STRICT
|
||||
from pdftypes import PDFException, PDFStream, PDFObjRef, \
|
||||
from pdfminer.pdftypes import PDFException, PDFStream, PDFObjRef, \
|
||||
resolve1, int_value, float_value, num_value, \
|
||||
str_value, list_value, dict_value, stream_value
|
||||
from utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
|
||||
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
|
||||
from pdfminer.utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
|
||||
from pdfminer.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfminer.pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
|
||||
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||
from cmap import CMapDB
|
||||
from pdfminer.cmap import CMapDB
|
||||
|
||||
|
||||
## Exceptions
|
||||
|
|
|
@ -7,12 +7,11 @@
|
|||
import sys, re
|
||||
import md5, struct
|
||||
stderr = sys.stderr
|
||||
from utils import choplist, nunpack, decode_text
|
||||
from arcfour import Arcfour
|
||||
from psparser import PSStackParser, PSSyntaxError, PSEOF, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
||||
STRICT
|
||||
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
|
||||
from pdfminer.utils import choplist, nunpack, decode_text
|
||||
from pdfminer.arcfour import Arcfour
|
||||
from pdfminer.psparser import PSStackParser, PSSyntaxError, PSEOF, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, STRICT
|
||||
from pdfminer.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
|
||||
PDFStream, PDFObjRef, resolve1, decipher_all, \
|
||||
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value
|
||||
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
import sys, zlib
|
||||
stderr = sys.stderr
|
||||
from lzw import LZWDecoder
|
||||
from psparser import PSException, PSObject, \
|
||||
from pdfminer.lzw import LZWDecoder
|
||||
from pdfminer.psparser import PSException, PSObject, \
|
||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||
literal_name, keyword_name, STRICT
|
||||
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
import sys, re
|
||||
stderr = sys.stderr
|
||||
|
||||
from utils import choplist
|
||||
from pdfminer.utils import choplist
|
||||
|
||||
STRICT = 0
|
||||
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
# GNUMakefile for test
|
||||
|
||||
PYTHON=python
|
||||
CMAPDIR=../CMap
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR)
|
||||
PDF2TXT=$(PYTHON) ../tools/pdf2txt.py
|
||||
|
||||
HTMLS= \
|
||||
simple1.html \
|
||||
|
|
Loading…
Reference in New Issue