git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@13 1aa58f4a-7d42-0410-adbc-911cccaed67c

pull/1/head
yusuke.shinyama.dummy 2008-01-09 14:21:24 +00:00
parent 401c849a37
commit da778dee6f
4 changed files with 49 additions and 33 deletions

View File

@ -7,8 +7,14 @@
<h1>PDFMiner</h1>
<p>
PDFMiner is a suite of programs that help
PDFMiner is a suite of programs that aims to help
extract or analyze text data from PDF documents.
Unlike other PDF-related tools, it lets you obtain
the exact location of text on a page, as well as
other layout information such as font size or font name,
which could be useful for analyzing the document.
PDFMiner is written purely in Python. It can also be used as a
basis for a full-fledged PDF interpreter.
<p>
<strong>Homepage:</strong><br>
@ -60,6 +66,7 @@ $ ./pdf2txt.py -c euc-jp samples/jo.pdf
<hr>
<h2>Similar Projects</h2>
<ul>
<li> <a href="http://pybrary.net/pyPdf/">pyPdf</a>
<li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
<li> <a href="http://www.pdfbox.org/">pdfbox</a>
</ul>

View File

@ -27,10 +27,14 @@ class TextConverter(PDFDevice):
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
(name,x0,y0,x1,y1))
return
def end_block(self):
self.outfp.write('</block>\n')
return
def handle_undefined_char(self, cidcoding, cid):
return
def render_string(self, textstate, textmatrix, size, seq):
font = textstate.font
spwidth = int(-font.char_width(32) * 0.6) # space width
@ -44,18 +48,19 @@ class TextConverter(PDFDevice):
for cid in chars:
try:
char = font.to_unicode(cid)
buf += char
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = u'[%s:%d]' % (cidcoding, cid)
buf += char
s = self.handle_undefined_char(cidcoding, cid)
if s:
buf += s
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
skewed = (b != 0 or c != 0)
if font.is_vertical():
size = -size
tag = 'vtext'
else:
tag = 'htext'
if skewed:
if (b != 0 or c != 0 or a <= 0 or d <= 0):
tag += ' skewed'
s = buf.encode(self.codec, 'xmlcharrefreplace')
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))

View File

@ -9,8 +9,9 @@ except ImportError:
from psparser import PSException, PSSyntaxError, PSTypeError, \
PSStackParser, PSLiteral, PSKeyword, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
from pdfparser import PDFStream, resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value, PDFException
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from utils import choplist
@ -80,8 +81,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
# PDFFont
class PDFFont:
def __init__(self, fontid, descriptor, widths, default_width=None):
self.fontid = fontid
def __init__(self, descriptor, widths, default_width=None):
self.descriptor = descriptor
self.widths = widths
self.fontname = descriptor['FontName']
@ -91,11 +91,11 @@ class PDFFont:
self.descent = descriptor['Descent']
self.default_width = default_width or descriptor.get('MissingWidth', 0)
self.leading = descriptor.get('Leading', 0)
self.bbox = descriptor['FontBBox']
self.bbox = list_value(descriptor['FontBBox'])
return
def __repr__(self):
return '<PDFFont: fontid=%r>' % (self.fontid,)
return '<PDFFont>'
def is_vertical(self):
return False
@ -116,7 +116,7 @@ class PDFFont:
# PDFSimpleFont
class PDFSimpleFont(PDFFont):
def __init__(self, fontid, descriptor, widths, spec):
def __init__(self, descriptor, widths, spec):
# Font encoding is specified either by a name of
# built-in encoding or a dictionary that describes
# the differences.
@ -135,7 +135,7 @@ class PDFSimpleFont(PDFFont):
strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
PDFFont.__init__(self, fontid, descriptor, widths)
PDFFont.__init__(self, descriptor, widths)
return
def to_unicode(self, cid):
@ -154,7 +154,7 @@ class PDFSimpleFont(PDFFont):
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
def __init__(self, fontid, spec):
def __init__(self, spec):
if 'BaseFont' not in spec:
raise PDFFontError('BaseFont is missing')
self.basefont = literal_name(spec['BaseFont'])
@ -169,7 +169,7 @@ class PDFType1Font(PDFSimpleFont):
in enumerate(list_value(spec['Widths'])) )
except KeyError, k:
raise PDFFontError('%s is missing' % k)
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
PDFSimpleFont.__init__(self, descriptor, widths, spec)
return
# PDFTrueTypeFont
@ -178,7 +178,7 @@ class PDFTrueTypeFont(PDFType1Font):
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
def __init__(self, fontid, spec):
def __init__(self, spec):
try:
firstchar = int_value(spec['FirstChar'])
lastchar = int_value(spec['LastChar'])
@ -189,9 +189,9 @@ class PDFType3Font(PDFSimpleFont):
if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor'])
else:
descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']}
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
PDFSimpleFont.__init__(self, descriptor, widths, spec)
return
# PDFCIDFont
@ -272,7 +272,7 @@ class TrueTypeFont:
class PDFCIDFont(PDFFont):
def __init__(self, fontid, spec):
def __init__(self, spec):
if 'BaseFont' not in spec:
raise PDFFontError('BaseFont is missing')
try:
@ -335,7 +335,7 @@ class PDFCIDFont(PDFFont):
self.disps = {}
default_width = spec.get('DW', 1000)
self.default_disp = 0
PDFFont.__init__(self, fontid, descriptor, widths, default_width)
PDFFont.__init__(self, descriptor, widths, default_width)
return
def is_vertical(self):
@ -386,11 +386,10 @@ class PDFResourceManager:
def get_cmap(self, name):
return CMapDB.get_cmap(name)
def get_font(self, fontid, spec):
if fontid in self.fonts:
font = self.fonts[fontid]
def get_font(self, objid, spec):
if objid and objid in self.fonts:
font = self.fonts[objid]
else:
spec = dict_value(spec)
assert spec['Type'] == LITERAL_FONT
# Create a Font object.
if 'Subtype' not in spec:
@ -398,16 +397,16 @@ class PDFResourceManager:
subtype = literal_name(spec['Subtype'])
if subtype in ('Type1', 'MMType1'):
# Type1 Font
font = PDFType1Font(fontid, spec)
font = PDFType1Font(spec)
elif subtype == 'TrueType':
# TrueType Font
font = PDFTrueTypeFont(fontid, spec)
font = PDFTrueTypeFont(spec)
elif subtype == 'Type3':
# Type3 Font
font = PDFType3Font(fontid, spec)
font = PDFType3Font(spec)
elif subtype in ('CIDFontType0', 'CIDFontType2'):
# CID Font
font = PDFCIDFont(fontid, spec)
font = PDFCIDFont(spec)
elif subtype == 'Type0':
# Type0 Font
dfonts = list_value(spec['DescendantFonts'])
@ -416,10 +415,11 @@ class PDFResourceManager:
for k in ('Encoding', 'ToUnicode'):
if k in spec:
subspec[k] = resolve1(spec[k])
font = self.get_font(fontid, subspec)
font = self.get_font(None, subspec)
else:
raise PDFFontError('Invalid Font: %r' % spec)
self.fonts[fontid] = font
if objid:
self.fonts[objid] = font
return font
@ -857,8 +857,12 @@ class PDFPageInterpreter:
if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font':
for (fontid,fontrsrc) in dict_value(v).iteritems():
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
for (fontid,spec) in dict_value(v).iteritems():
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec))

View File

@ -14,7 +14,7 @@ def choplist(n, seq):
return
def nunpack(s, default=0):
'''Unpacks up to 4 bytes.'''
'''Unpacks up to 4 bytes big endian.'''
l = len(s)
if not l:
return default