version 20090517

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@110 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-17 14:02:57 +00:00
parent 5c1cebadbb
commit 5c2a6d9b70
13 changed files with 50 additions and 48 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sun May 17 15:39:06 JST 2009
Last Modified: Sun May 17 22:57:53 JST 2009
<!-- hhmts end -->
</div>
@ -51,8 +51,8 @@ PDF parser that can be used for other purposes instead of text analysis.
<a name="source"></a>
<p>
<strong>Download:</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090517.tar.gz
</a>
(1.8Mbytes)
@ -126,10 +126,11 @@ For example:
$ <strong>cd /usr/lib/python2.5/site-packages</strong>
$ <strong>tar jxf CMap.tar.bz2</strong>
</pre></blockquote>
<li> Do the following: (this is optional but highly recommended)<br>
<li> Do the following. (this is optional, but highly recommended)<br>
<blockquote><pre>
$ <strong>python -m pdfminer.cmap /usr/lib/python2.5/site-packages/CMap</strong>
$ <strong>python -m pdfminer.cmap</strong>
</pre></blockquote>
This may take several minutes.
</ol>
<a name="usage"></a>
@ -260,6 +261,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
<li> 2009/03/30: Text output mode added.
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
<li> 2009/02/28: Robust handling of corrupted PDFs. Thanks to Troy Bollinger.

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python
__version__ = '20090330'
__version__ = '20090517'
if __name__ == '__main__': print __version__

View File

@ -2,14 +2,17 @@
import sys, re, os, os.path
stderr = sys.stderr
from struct import pack, unpack
from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
from pdfminer.utils import choplist, nunpack
from pdfminer.fontmetrics import FONT_METRICS
from pdfminer.latin_enc import ENCODING
from pdfminer.glyphlist import charname2unicode
from pdfminer.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser
try:
import cdb
except ImportError:
import pycdb as cdb
import pdfminer.pycdb as cdb
class CMapError(Exception): pass
@ -28,7 +31,6 @@ def find_cmap_path():
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
from glyphlist import charname2unicode
if name in charname2unicode:
return charname2unicode[name]
m = STRIP_NAME.search(name)
@ -360,19 +362,16 @@ class CMapParser(PSStackParser):
## FontMetricsDB
##
class FontMetricsDB(object):
from fontmetrics import FONT_METRICS
@classmethod
def get_metrics(klass, fontname):
return klass.FONT_METRICS[fontname]
return FONT_METRICS[fontname]
## EncodingDB
##
class EncodingDB(object):
from latin_enc import ENCODING
std2unicode = {}
mac2unicode = {}
win2unicode = {}
@ -447,8 +446,10 @@ def main(argv):
(opts, args) = getopt.getopt(argv[1:], 'C:D:f')
except getopt.GetoptError:
return usage()
if not args: usage()
if args:
cmapdir = args.pop(0)
else:
cmapdir = find_cmap_path()
outputdir = cmapdir
force = False
for (k, v) in opts:
@ -456,9 +457,11 @@ def main(argv):
elif k == '-C': cmapdir = v
elif k == '-D': outputdir = v
if not os.path.isdir(cmapdir):
raise ValueError('not directory: %r' % cmapdir)
print >>stderr, 'directory does not exist: %r' % cmapdir
return 111
if not os.path.isdir(outputdir):
raise ValueError('not directory: %r' % outputdir)
print >>stderr, 'directory does not exist: %r' % outputdir
return 111
return convert_cmap(cmapdir, outputdir, force=force)
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -1,9 +1,9 @@
#!/usr/bin/env python
import sys
from pdfdevice import PDFDevice
from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
from utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
## PDFPageAggregator

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python
import sys
from utils import apply_matrix_norm
from pdfminer.utils import apply_matrix_norm
INF = sys.maxint

View File

@ -2,6 +2,7 @@
import sys
stderr = sys.stderr
## LZWDecoder
##
class LZWDecoder(object):

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python
import sys
from psparser import PSLiteralTable
from pdfminer.psparser import PSLiteralTable
## PDFColorSpace

View File

@ -5,13 +5,13 @@ try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
from pdfminer.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
literal_name, keyword_name, STRICT
from pdftypes import PDFException, \
from pdfminer.pdftypes import PDFException, \
resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from utils import apply_matrix_norm, nunpack
from pdfminer.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from pdfminer.utils import apply_matrix_norm, nunpack
## CFFFont

View File

@ -6,18 +6,18 @@ try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from psparser import PSException, PSTypeError, PSEOF, \
from pdfminer.psparser import PSException, PSTypeError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
PSStackParser, PSKeyword, STRICT
from pdftypes import PDFException, PDFStream, PDFObjRef, \
from pdfminer.pdftypes import PDFException, PDFStream, PDFObjRef, \
resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
from utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
from pdfminer.utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
from pdfminer.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfminer.pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
from cmap import CMapDB
from pdfminer.cmap import CMapDB
## Exceptions

View File

@ -7,12 +7,11 @@
import sys, re
import md5, struct
stderr = sys.stderr
from utils import choplist, nunpack, decode_text
from arcfour import Arcfour
from psparser import PSStackParser, PSSyntaxError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
STRICT
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
from pdfminer.utils import choplist, nunpack, decode_text
from pdfminer.arcfour import Arcfour
from pdfminer.psparser import PSStackParser, PSSyntaxError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, STRICT
from pdfminer.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
PDFStream, PDFObjRef, resolve1, decipher_all, \
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value

View File

@ -1,8 +1,7 @@
#!/usr/bin/env python
import sys, zlib
stderr = sys.stderr
from lzw import LZWDecoder
from psparser import PSException, PSObject, \
from pdfminer.lzw import LZWDecoder
from pdfminer.psparser import PSException, PSObject, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, STRICT

View File

@ -1,8 +1,7 @@
#!/usr/bin/env python
import sys, re
stderr = sys.stderr
from utils import choplist
from pdfminer.utils import choplist
STRICT = 0

View File

@ -1,8 +1,7 @@
# GNUMakefile for test
PYTHON=python
CMAPDIR=../CMap
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR)
PDF2TXT=$(PYTHON) ../tools/pdf2txt.py
HTMLS= \
simple1.html \