git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@13 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
401c849a37
commit
da778dee6f
|
@ -7,8 +7,14 @@
|
||||||
<h1>PDFMiner</h1>
|
<h1>PDFMiner</h1>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
PDFMiner is a suite of programs that help
|
PDFMiner is a suite of programs that aims to help with
|
||||||
extracting or analyzing text data from PDF documents.
|
extracting or analyzing text data from PDF documents.
|
||||||
|
Unlike other PDF-related tools, it allows you to obtain
|
||||||
|
the exact location of text on a page, as well as
|
||||||
|
other layout information such as font size or font name,
|
||||||
|
which could be useful for analyzing the document.
|
||||||
|
PDFMiner is written purely in Python. It can also be used as a
|
||||||
|
basis for a full-fledged PDF interpreter.
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
<strong>Homepage:</strong><br>
|
<strong>Homepage:</strong><br>
|
||||||
|
@ -60,6 +66,7 @@ $ ./pdf2txt.py -c euc-jp samples/jo.pdf
|
||||||
<hr>
|
<hr>
|
||||||
<h2>Similar Projects</h2>
|
<h2>Similar Projects</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> <a href="http://pybrary.net/pyPdf/">pyPdf</a>
|
||||||
<li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
|
<li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
|
||||||
<li> <a href="http://www.pdfbox.org/">pdfbox</a>
|
<li> <a href="http://www.pdfbox.org/">pdfbox</a>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
13
pdf2txt.py
13
pdf2txt.py
|
@ -27,10 +27,14 @@ class TextConverter(PDFDevice):
|
||||||
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
||||||
(name,x0,y0,x1,y1))
|
(name,x0,y0,x1,y1))
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_block(self):
|
def end_block(self):
|
||||||
self.outfp.write('</block>\n')
|
self.outfp.write('</block>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def handle_undefined_char(self, cidcoding, cid):
|
||||||
|
return
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, size, seq):
|
def render_string(self, textstate, textmatrix, size, seq):
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
spwidth = int(-font.char_width(32) * 0.6) # space width
|
spwidth = int(-font.char_width(32) * 0.6) # space width
|
||||||
|
@ -44,18 +48,19 @@ class TextConverter(PDFDevice):
|
||||||
for cid in chars:
|
for cid in chars:
|
||||||
try:
|
try:
|
||||||
char = font.to_unicode(cid)
|
char = font.to_unicode(cid)
|
||||||
|
buf += char
|
||||||
except PDFUnicodeNotDefined, e:
|
except PDFUnicodeNotDefined, e:
|
||||||
(cidcoding, cid) = e.args
|
(cidcoding, cid) = e.args
|
||||||
char = u'[%s:%d]' % (cidcoding, cid)
|
s = self.handle_undefined_char(cidcoding, cid)
|
||||||
buf += char
|
if s:
|
||||||
|
buf += s
|
||||||
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
|
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
|
||||||
skewed = (b != 0 or c != 0)
|
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
size = -size
|
size = -size
|
||||||
tag = 'vtext'
|
tag = 'vtext'
|
||||||
else:
|
else:
|
||||||
tag = 'htext'
|
tag = 'htext'
|
||||||
if skewed:
|
if (b != 0 or c != 0 or a <= 0 or d <= 0):
|
||||||
tag += ' skewed'
|
tag += ' skewed'
|
||||||
s = buf.encode(self.codec, 'xmlcharrefreplace')
|
s = buf.encode(self.codec, 'xmlcharrefreplace')
|
||||||
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
|
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
|
||||||
|
|
58
pdfinterp.py
58
pdfinterp.py
|
@ -9,8 +9,9 @@ except ImportError:
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||||
PSStackParser, PSLiteral, PSKeyword, \
|
PSStackParser, PSLiteral, PSKeyword, \
|
||||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
||||||
from pdfparser import PDFStream, resolve1, int_value, float_value, num_value, \
|
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
|
||||||
str_value, list_value, dict_value, stream_value, PDFException
|
int_value, float_value, num_value, \
|
||||||
|
str_value, list_value, dict_value, stream_value
|
||||||
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||||
from utils import choplist
|
from utils import choplist
|
||||||
|
|
||||||
|
@ -80,8 +81,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
|
||||||
# PDFFont
|
# PDFFont
|
||||||
class PDFFont:
|
class PDFFont:
|
||||||
|
|
||||||
def __init__(self, fontid, descriptor, widths, default_width=None):
|
def __init__(self, descriptor, widths, default_width=None):
|
||||||
self.fontid = fontid
|
|
||||||
self.descriptor = descriptor
|
self.descriptor = descriptor
|
||||||
self.widths = widths
|
self.widths = widths
|
||||||
self.fontname = descriptor['FontName']
|
self.fontname = descriptor['FontName']
|
||||||
|
@ -91,11 +91,11 @@ class PDFFont:
|
||||||
self.descent = descriptor['Descent']
|
self.descent = descriptor['Descent']
|
||||||
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
||||||
self.leading = descriptor.get('Leading', 0)
|
self.leading = descriptor.get('Leading', 0)
|
||||||
self.bbox = descriptor['FontBBox']
|
self.bbox = list_value(descriptor['FontBBox'])
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFFont: fontid=%r>' % (self.fontid,)
|
return '<PDFFont>'
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self):
|
||||||
return False
|
return False
|
||||||
|
@ -116,7 +116,7 @@ class PDFFont:
|
||||||
# PDFSimpleFont
|
# PDFSimpleFont
|
||||||
class PDFSimpleFont(PDFFont):
|
class PDFSimpleFont(PDFFont):
|
||||||
|
|
||||||
def __init__(self, fontid, descriptor, widths, spec):
|
def __init__(self, descriptor, widths, spec):
|
||||||
# Font encoding is specified either by a name of
|
# Font encoding is specified either by a name of
|
||||||
# built-in encoding or a dictionary that describes
|
# built-in encoding or a dictionary that describes
|
||||||
# the differences.
|
# the differences.
|
||||||
|
@ -135,7 +135,7 @@ class PDFSimpleFont(PDFFont):
|
||||||
strm = stream_value(spec['ToUnicode'])
|
strm = stream_value(spec['ToUnicode'])
|
||||||
self.ucs2_cmap = CMap()
|
self.ucs2_cmap = CMap()
|
||||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
|
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
|
||||||
PDFFont.__init__(self, fontid, descriptor, widths)
|
PDFFont.__init__(self, descriptor, widths)
|
||||||
return
|
return
|
||||||
|
|
||||||
def to_unicode(self, cid):
|
def to_unicode(self, cid):
|
||||||
|
@ -154,7 +154,7 @@ class PDFSimpleFont(PDFFont):
|
||||||
# PDFType1Font
|
# PDFType1Font
|
||||||
class PDFType1Font(PDFSimpleFont):
|
class PDFType1Font(PDFSimpleFont):
|
||||||
|
|
||||||
def __init__(self, fontid, spec):
|
def __init__(self, spec):
|
||||||
if 'BaseFont' not in spec:
|
if 'BaseFont' not in spec:
|
||||||
raise PDFFontError('BaseFont is missing')
|
raise PDFFontError('BaseFont is missing')
|
||||||
self.basefont = literal_name(spec['BaseFont'])
|
self.basefont = literal_name(spec['BaseFont'])
|
||||||
|
@ -169,7 +169,7 @@ class PDFType1Font(PDFSimpleFont):
|
||||||
in enumerate(list_value(spec['Widths'])) )
|
in enumerate(list_value(spec['Widths'])) )
|
||||||
except KeyError, k:
|
except KeyError, k:
|
||||||
raise PDFFontError('%s is missing' % k)
|
raise PDFFontError('%s is missing' % k)
|
||||||
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
return
|
return
|
||||||
|
|
||||||
# PDFTrueTypeFont
|
# PDFTrueTypeFont
|
||||||
|
@ -178,7 +178,7 @@ class PDFTrueTypeFont(PDFType1Font):
|
||||||
|
|
||||||
# PDFType3Font
|
# PDFType3Font
|
||||||
class PDFType3Font(PDFSimpleFont):
|
class PDFType3Font(PDFSimpleFont):
|
||||||
def __init__(self, fontid, spec):
|
def __init__(self, spec):
|
||||||
try:
|
try:
|
||||||
firstchar = int_value(spec['FirstChar'])
|
firstchar = int_value(spec['FirstChar'])
|
||||||
lastchar = int_value(spec['LastChar'])
|
lastchar = int_value(spec['LastChar'])
|
||||||
|
@ -189,9 +189,9 @@ class PDFType3Font(PDFSimpleFont):
|
||||||
if 'FontDescriptor' in spec:
|
if 'FontDescriptor' in spec:
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
else:
|
else:
|
||||||
descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
|
descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
|
||||||
'FontBBox':spec['FontBBox']}
|
'FontBBox':spec['FontBBox']}
|
||||||
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
return
|
return
|
||||||
|
|
||||||
# PDFCIDFont
|
# PDFCIDFont
|
||||||
|
@ -272,7 +272,7 @@ class TrueTypeFont:
|
||||||
|
|
||||||
class PDFCIDFont(PDFFont):
|
class PDFCIDFont(PDFFont):
|
||||||
|
|
||||||
def __init__(self, fontid, spec):
|
def __init__(self, spec):
|
||||||
if 'BaseFont' not in spec:
|
if 'BaseFont' not in spec:
|
||||||
raise PDFFontError('BaseFont is missing')
|
raise PDFFontError('BaseFont is missing')
|
||||||
try:
|
try:
|
||||||
|
@ -335,7 +335,7 @@ class PDFCIDFont(PDFFont):
|
||||||
self.disps = {}
|
self.disps = {}
|
||||||
default_width = spec.get('DW', 1000)
|
default_width = spec.get('DW', 1000)
|
||||||
self.default_disp = 0
|
self.default_disp = 0
|
||||||
PDFFont.__init__(self, fontid, descriptor, widths, default_width)
|
PDFFont.__init__(self, descriptor, widths, default_width)
|
||||||
return
|
return
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self):
|
||||||
|
@ -386,11 +386,10 @@ class PDFResourceManager:
|
||||||
def get_cmap(self, name):
|
def get_cmap(self, name):
|
||||||
return CMapDB.get_cmap(name)
|
return CMapDB.get_cmap(name)
|
||||||
|
|
||||||
def get_font(self, fontid, spec):
|
def get_font(self, objid, spec):
|
||||||
if fontid in self.fonts:
|
if objid and objid in self.fonts:
|
||||||
font = self.fonts[fontid]
|
font = self.fonts[objid]
|
||||||
else:
|
else:
|
||||||
spec = dict_value(spec)
|
|
||||||
assert spec['Type'] == LITERAL_FONT
|
assert spec['Type'] == LITERAL_FONT
|
||||||
# Create a Font object.
|
# Create a Font object.
|
||||||
if 'Subtype' not in spec:
|
if 'Subtype' not in spec:
|
||||||
|
@ -398,16 +397,16 @@ class PDFResourceManager:
|
||||||
subtype = literal_name(spec['Subtype'])
|
subtype = literal_name(spec['Subtype'])
|
||||||
if subtype in ('Type1', 'MMType1'):
|
if subtype in ('Type1', 'MMType1'):
|
||||||
# Type1 Font
|
# Type1 Font
|
||||||
font = PDFType1Font(fontid, spec)
|
font = PDFType1Font(spec)
|
||||||
elif subtype == 'TrueType':
|
elif subtype == 'TrueType':
|
||||||
# TrueType Font
|
# TrueType Font
|
||||||
font = PDFTrueTypeFont(fontid, spec)
|
font = PDFTrueTypeFont(spec)
|
||||||
elif subtype == 'Type3':
|
elif subtype == 'Type3':
|
||||||
# Type3 Font
|
# Type3 Font
|
||||||
font = PDFType3Font(fontid, spec)
|
font = PDFType3Font(spec)
|
||||||
elif subtype in ('CIDFontType0', 'CIDFontType2'):
|
elif subtype in ('CIDFontType0', 'CIDFontType2'):
|
||||||
# CID Font
|
# CID Font
|
||||||
font = PDFCIDFont(fontid, spec)
|
font = PDFCIDFont(spec)
|
||||||
elif subtype == 'Type0':
|
elif subtype == 'Type0':
|
||||||
# Type0 Font
|
# Type0 Font
|
||||||
dfonts = list_value(spec['DescendantFonts'])
|
dfonts = list_value(spec['DescendantFonts'])
|
||||||
|
@ -416,10 +415,11 @@ class PDFResourceManager:
|
||||||
for k in ('Encoding', 'ToUnicode'):
|
for k in ('Encoding', 'ToUnicode'):
|
||||||
if k in spec:
|
if k in spec:
|
||||||
subspec[k] = resolve1(spec[k])
|
subspec[k] = resolve1(spec[k])
|
||||||
font = self.get_font(fontid, subspec)
|
font = self.get_font(None, subspec)
|
||||||
else:
|
else:
|
||||||
raise PDFFontError('Invalid Font: %r' % spec)
|
raise PDFFontError('Invalid Font: %r' % spec)
|
||||||
self.fonts[fontid] = font
|
if objid:
|
||||||
|
self.fonts[objid] = font
|
||||||
return font
|
return font
|
||||||
|
|
||||||
|
|
||||||
|
@ -857,8 +857,12 @@ class PDFPageInterpreter:
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||||
if k == 'Font':
|
if k == 'Font':
|
||||||
for (fontid,fontrsrc) in dict_value(v).iteritems():
|
for (fontid,spec) in dict_value(v).iteritems():
|
||||||
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
|
objid = None
|
||||||
|
if isinstance(spec, PDFObjRef):
|
||||||
|
objid = spec.objid
|
||||||
|
spec = dict_value(spec)
|
||||||
|
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
|
||||||
elif k == 'ColorSpace':
|
elif k == 'ColorSpace':
|
||||||
for (csid,spec) in dict_value(v).iteritems():
|
for (csid,spec) in dict_value(v).iteritems():
|
||||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||||
|
|
Loading…
Reference in New Issue