git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@13 1aa58f4a-7d42-0410-adbc-911cccaed67c

pull/1/head
yusuke.shinyama.dummy 2008-01-09 14:21:24 +00:00
parent 401c849a37
commit da778dee6f
4 changed files with 49 additions and 33 deletions

View File

@ -7,8 +7,14 @@
<h1>PDFMiner</h1> <h1>PDFMiner</h1>
<p> <p>
PDFMiner is a suite of programs that help PDFMiner is a suite of programs that aims to help
extracting or analyzing text data from PDF documents. extracting or analyzing text data from PDF documents.
Unlike other PDF-related tools, it allows you to obtain
the exact location of text on a page, as well as
other layout information such as font size or font name,
which could be useful for analyzing the document.
PDFMiner is written purely in Python. It can also be used as a
basis for a full-fledged PDF interpreter.
<p> <p>
<strong>Homepage:</strong><br> <strong>Homepage:</strong><br>
@ -60,6 +66,7 @@ $ ./pdf2txt.py -c euc-jp samples/jo.pdf
<hr> <hr>
<h2>Similar Projects</h2> <h2>Similar Projects</h2>
<ul> <ul>
<li> <a href="http://pybrary.net/pyPdf/">pyPdf</a>
<li> <a href="http://www.foolabs.com/xpdf/">xpdf</a> <li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
<li> <a href="http://www.pdfbox.org/">pdfbox</a> <li> <a href="http://www.pdfbox.org/">pdfbox</a>
</ul> </ul>

View File

@ -27,10 +27,14 @@ class TextConverter(PDFDevice):
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' % self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
(name,x0,y0,x1,y1)) (name,x0,y0,x1,y1))
return return
def end_block(self): def end_block(self):
self.outfp.write('</block>\n') self.outfp.write('</block>\n')
return return
def handle_undefined_char(self, cidcoding, cid):
return
def render_string(self, textstate, textmatrix, size, seq): def render_string(self, textstate, textmatrix, size, seq):
font = textstate.font font = textstate.font
spwidth = int(-font.char_width(32) * 0.6) # space width spwidth = int(-font.char_width(32) * 0.6) # space width
@ -44,18 +48,19 @@ class TextConverter(PDFDevice):
for cid in chars: for cid in chars:
try: try:
char = font.to_unicode(cid) char = font.to_unicode(cid)
buf += char
except PDFUnicodeNotDefined, e: except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args (cidcoding, cid) = e.args
char = u'[%s:%d]' % (cidcoding, cid) s = self.handle_undefined_char(cidcoding, cid)
buf += char if s:
buf += s
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm) (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
skewed = (b != 0 or c != 0)
if font.is_vertical(): if font.is_vertical():
size = -size size = -size
tag = 'vtext' tag = 'vtext'
else: else:
tag = 'htext' tag = 'htext'
if skewed: if (b != 0 or c != 0 or a <= 0 or d <= 0):
tag += ' skewed' tag += ' skewed'
s = buf.encode(self.codec, 'xmlcharrefreplace') s = buf.encode(self.codec, 'xmlcharrefreplace')
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize)) (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))

View File

@ -9,8 +9,9 @@ except ImportError:
from psparser import PSException, PSSyntaxError, PSTypeError, \ from psparser import PSException, PSSyntaxError, PSTypeError, \
PSStackParser, PSLiteral, PSKeyword, \ PSStackParser, PSLiteral, PSKeyword, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name PSLiteralTable, PSKeywordTable, literal_name, keyword_name
from pdfparser import PDFStream, resolve1, int_value, float_value, num_value, \ from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
str_value, list_value, dict_value, stream_value, PDFException int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from utils import choplist from utils import choplist
@ -80,8 +81,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
# PDFFont # PDFFont
class PDFFont: class PDFFont:
def __init__(self, fontid, descriptor, widths, default_width=None): def __init__(self, descriptor, widths, default_width=None):
self.fontid = fontid
self.descriptor = descriptor self.descriptor = descriptor
self.widths = widths self.widths = widths
self.fontname = descriptor['FontName'] self.fontname = descriptor['FontName']
@ -91,11 +91,11 @@ class PDFFont:
self.descent = descriptor['Descent'] self.descent = descriptor['Descent']
self.default_width = default_width or descriptor.get('MissingWidth', 0) self.default_width = default_width or descriptor.get('MissingWidth', 0)
self.leading = descriptor.get('Leading', 0) self.leading = descriptor.get('Leading', 0)
self.bbox = descriptor['FontBBox'] self.bbox = list_value(descriptor['FontBBox'])
return return
def __repr__(self): def __repr__(self):
return '<PDFFont: fontid=%r>' % (self.fontid,) return '<PDFFont>'
def is_vertical(self): def is_vertical(self):
return False return False
@ -116,7 +116,7 @@ class PDFFont:
# PDFSimpleFont # PDFSimpleFont
class PDFSimpleFont(PDFFont): class PDFSimpleFont(PDFFont):
def __init__(self, fontid, descriptor, widths, spec): def __init__(self, descriptor, widths, spec):
# Font encoding is specified either by a name of # Font encoding is specified either by a name of
# built-in encoding or a dictionary that describes # built-in encoding or a dictionary that describes
# the differences. # the differences.
@ -135,7 +135,7 @@ class PDFSimpleFont(PDFFont):
strm = stream_value(spec['ToUnicode']) strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap() self.ucs2_cmap = CMap()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
PDFFont.__init__(self, fontid, descriptor, widths) PDFFont.__init__(self, descriptor, widths)
return return
def to_unicode(self, cid): def to_unicode(self, cid):
@ -154,7 +154,7 @@ class PDFSimpleFont(PDFFont):
# PDFType1Font # PDFType1Font
class PDFType1Font(PDFSimpleFont): class PDFType1Font(PDFSimpleFont):
def __init__(self, fontid, spec): def __init__(self, spec):
if 'BaseFont' not in spec: if 'BaseFont' not in spec:
raise PDFFontError('BaseFont is missing') raise PDFFontError('BaseFont is missing')
self.basefont = literal_name(spec['BaseFont']) self.basefont = literal_name(spec['BaseFont'])
@ -169,7 +169,7 @@ class PDFType1Font(PDFSimpleFont):
in enumerate(list_value(spec['Widths'])) ) in enumerate(list_value(spec['Widths'])) )
except KeyError, k: except KeyError, k:
raise PDFFontError('%s is missing' % k) raise PDFFontError('%s is missing' % k)
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec) PDFSimpleFont.__init__(self, descriptor, widths, spec)
return return
# PDFTrueTypeFont # PDFTrueTypeFont
@ -178,7 +178,7 @@ class PDFTrueTypeFont(PDFType1Font):
# PDFType3Font # PDFType3Font
class PDFType3Font(PDFSimpleFont): class PDFType3Font(PDFSimpleFont):
def __init__(self, fontid, spec): def __init__(self, spec):
try: try:
firstchar = int_value(spec['FirstChar']) firstchar = int_value(spec['FirstChar'])
lastchar = int_value(spec['LastChar']) lastchar = int_value(spec['LastChar'])
@ -189,9 +189,9 @@ class PDFType3Font(PDFSimpleFont):
if 'FontDescriptor' in spec: if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor']) descriptor = dict_value(spec['FontDescriptor'])
else: else:
descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0, descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']} 'FontBBox':spec['FontBBox']}
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec) PDFSimpleFont.__init__(self, descriptor, widths, spec)
return return
# PDFCIDFont # PDFCIDFont
@ -272,7 +272,7 @@ class TrueTypeFont:
class PDFCIDFont(PDFFont): class PDFCIDFont(PDFFont):
def __init__(self, fontid, spec): def __init__(self, spec):
if 'BaseFont' not in spec: if 'BaseFont' not in spec:
raise PDFFontError('BaseFont is missing') raise PDFFontError('BaseFont is missing')
try: try:
@ -335,7 +335,7 @@ class PDFCIDFont(PDFFont):
self.disps = {} self.disps = {}
default_width = spec.get('DW', 1000) default_width = spec.get('DW', 1000)
self.default_disp = 0 self.default_disp = 0
PDFFont.__init__(self, fontid, descriptor, widths, default_width) PDFFont.__init__(self, descriptor, widths, default_width)
return return
def is_vertical(self): def is_vertical(self):
@ -386,11 +386,10 @@ class PDFResourceManager:
def get_cmap(self, name): def get_cmap(self, name):
return CMapDB.get_cmap(name) return CMapDB.get_cmap(name)
def get_font(self, fontid, spec): def get_font(self, objid, spec):
if fontid in self.fonts: if objid and objid in self.fonts:
font = self.fonts[fontid] font = self.fonts[objid]
else: else:
spec = dict_value(spec)
assert spec['Type'] == LITERAL_FONT assert spec['Type'] == LITERAL_FONT
# Create a Font object. # Create a Font object.
if 'Subtype' not in spec: if 'Subtype' not in spec:
@ -398,16 +397,16 @@ class PDFResourceManager:
subtype = literal_name(spec['Subtype']) subtype = literal_name(spec['Subtype'])
if subtype in ('Type1', 'MMType1'): if subtype in ('Type1', 'MMType1'):
# Type1 Font # Type1 Font
font = PDFType1Font(fontid, spec) font = PDFType1Font(spec)
elif subtype == 'TrueType': elif subtype == 'TrueType':
# TrueType Font # TrueType Font
font = PDFTrueTypeFont(fontid, spec) font = PDFTrueTypeFont(spec)
elif subtype == 'Type3': elif subtype == 'Type3':
# Type3 Font # Type3 Font
font = PDFType3Font(fontid, spec) font = PDFType3Font(spec)
elif subtype in ('CIDFontType0', 'CIDFontType2'): elif subtype in ('CIDFontType0', 'CIDFontType2'):
# CID Font # CID Font
font = PDFCIDFont(fontid, spec) font = PDFCIDFont(spec)
elif subtype == 'Type0': elif subtype == 'Type0':
# Type0 Font # Type0 Font
dfonts = list_value(spec['DescendantFonts']) dfonts = list_value(spec['DescendantFonts'])
@ -416,10 +415,11 @@ class PDFResourceManager:
for k in ('Encoding', 'ToUnicode'): for k in ('Encoding', 'ToUnicode'):
if k in spec: if k in spec:
subspec[k] = resolve1(spec[k]) subspec[k] = resolve1(spec[k])
font = self.get_font(fontid, subspec) font = self.get_font(None, subspec)
else: else:
raise PDFFontError('Invalid Font: %r' % spec) raise PDFFontError('Invalid Font: %r' % spec)
self.fonts[fontid] = font if objid:
self.fonts[objid] = font
return font return font
@ -857,8 +857,12 @@ class PDFPageInterpreter:
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v) print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font': if k == 'Font':
for (fontid,fontrsrc) in dict_value(v).iteritems(): for (fontid,spec) in dict_value(v).iteritems():
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc) objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
elif k == 'ColorSpace': elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems(): for (csid,spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec)) self.csmap[csid] = get_colorspace(resolve1(spec))

View File

@ -14,7 +14,7 @@ def choplist(n, seq):
return return
def nunpack(s, default=0): def nunpack(s, default=0):
'''Unpacks up to 4 bytes.''' '''Unpacks up to 4 bytes big endian.'''
l = len(s) l = len(s)
if not l: if not l:
return default return default