diff --git a/README.html b/README.html
index 6f42416..ee635e8 100644
--- a/README.html
+++ b/README.html
@@ -7,8 +7,14 @@
PDFMiner
-PDFMiner is a suite of programs that help
+PDFMiner is a suite of programs that aims to help
extracting or analyzing text data from PDF documents.
+Unlike other PDF-related tools, it allows to obtain
+the exact location of texts in a page, as well as
+other layout information such as font size or font name,
+which could be useful for analyzing the document.
+PDFMiner is written purely in Python. It can be also used as a
+basis for a full-fledged PDF interpreter.
Homepage:
@@ -60,6 +66,7 @@ $ ./pdf2txt.py -c euc-jp samples/jo.pdf
Similar Projects
diff --git a/pdf2txt.py b/pdf2txt.py
index 04f06fd..5bb24a7 100755
--- a/pdf2txt.py
+++ b/pdf2txt.py
@@ -27,10 +27,14 @@ class TextConverter(PDFDevice):
self.outfp.write('\n' %
(name,x0,y0,x1,y1))
return
+
def end_block(self):
self.outfp.write('\n')
return
+ def handle_undefined_char(self, cidcoding, cid):
+ return
+
def render_string(self, textstate, textmatrix, size, seq):
font = textstate.font
spwidth = int(-font.char_width(32) * 0.6) # space width
@@ -44,18 +48,19 @@ class TextConverter(PDFDevice):
for cid in chars:
try:
char = font.to_unicode(cid)
+ buf += char
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
- char = u'[%s:%d]' % (cidcoding, cid)
- buf += char
+ s = self.handle_undefined_char(cidcoding, cid)
+ if s:
+ buf += s
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
- skewed = (b != 0 or c != 0)
if font.is_vertical():
size = -size
tag = 'vtext'
else:
tag = 'htext'
- if skewed:
+ if (b != 0 or c != 0 or a <= 0 or d <= 0):
tag += ' skewed'
s = buf.encode(self.codec, 'xmlcharrefreplace')
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
diff --git a/pdfinterp.py b/pdfinterp.py
index 18e0a08..2e48349 100644
--- a/pdfinterp.py
+++ b/pdfinterp.py
@@ -9,8 +9,9 @@ except ImportError:
from psparser import PSException, PSSyntaxError, PSTypeError, \
PSStackParser, PSLiteral, PSKeyword, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
-from pdfparser import PDFStream, resolve1, int_value, float_value, num_value, \
- str_value, list_value, dict_value, stream_value, PDFException
+from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
+ int_value, float_value, num_value, \
+ str_value, list_value, dict_value, stream_value
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from utils import choplist
@@ -80,8 +81,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
# PDFFont
class PDFFont:
- def __init__(self, fontid, descriptor, widths, default_width=None):
- self.fontid = fontid
+ def __init__(self, descriptor, widths, default_width=None):
self.descriptor = descriptor
self.widths = widths
self.fontname = descriptor['FontName']
@@ -91,11 +91,11 @@ class PDFFont:
self.descent = descriptor['Descent']
self.default_width = default_width or descriptor.get('MissingWidth', 0)
self.leading = descriptor.get('Leading', 0)
- self.bbox = descriptor['FontBBox']
+ self.bbox = list_value(descriptor['FontBBox'])
return
def __repr__(self):
- return '' % (self.fontid,)
+ return ''
def is_vertical(self):
return False
@@ -116,7 +116,7 @@ class PDFFont:
# PDFSimpleFont
class PDFSimpleFont(PDFFont):
- def __init__(self, fontid, descriptor, widths, spec):
+ def __init__(self, descriptor, widths, spec):
# Font encoding is specified either by a name of
# built-in encoding or a dictionary that describes
# the differences.
@@ -135,7 +135,7 @@ class PDFSimpleFont(PDFFont):
strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
- PDFFont.__init__(self, fontid, descriptor, widths)
+ PDFFont.__init__(self, descriptor, widths)
return
def to_unicode(self, cid):
@@ -154,7 +154,7 @@ class PDFSimpleFont(PDFFont):
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
- def __init__(self, fontid, spec):
+ def __init__(self, spec):
if 'BaseFont' not in spec:
raise PDFFontError('BaseFont is missing')
self.basefont = literal_name(spec['BaseFont'])
@@ -169,7 +169,7 @@ class PDFType1Font(PDFSimpleFont):
in enumerate(list_value(spec['Widths'])) )
except KeyError, k:
raise PDFFontError('%s is missing' % k)
- PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+ PDFSimpleFont.__init__(self, descriptor, widths, spec)
return
# PDFTrueTypeFont
@@ -178,7 +178,7 @@ class PDFTrueTypeFont(PDFType1Font):
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
- def __init__(self, fontid, spec):
+ def __init__(self, spec):
try:
firstchar = int_value(spec['FirstChar'])
lastchar = int_value(spec['LastChar'])
@@ -189,9 +189,9 @@ class PDFType3Font(PDFSimpleFont):
if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor'])
else:
- descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
+ descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']}
- PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+ PDFSimpleFont.__init__(self, descriptor, widths, spec)
return
# PDFCIDFont
@@ -272,7 +272,7 @@ class TrueTypeFont:
class PDFCIDFont(PDFFont):
- def __init__(self, fontid, spec):
+ def __init__(self, spec):
if 'BaseFont' not in spec:
raise PDFFontError('BaseFont is missing')
try:
@@ -335,7 +335,7 @@ class PDFCIDFont(PDFFont):
self.disps = {}
default_width = spec.get('DW', 1000)
self.default_disp = 0
- PDFFont.__init__(self, fontid, descriptor, widths, default_width)
+ PDFFont.__init__(self, descriptor, widths, default_width)
return
def is_vertical(self):
@@ -386,11 +386,10 @@ class PDFResourceManager:
def get_cmap(self, name):
return CMapDB.get_cmap(name)
- def get_font(self, fontid, spec):
- if fontid in self.fonts:
- font = self.fonts[fontid]
+ def get_font(self, objid, spec):
+ if objid and objid in self.fonts:
+ font = self.fonts[objid]
else:
- spec = dict_value(spec)
assert spec['Type'] == LITERAL_FONT
# Create a Font object.
if 'Subtype' not in spec:
@@ -398,16 +397,16 @@ class PDFResourceManager:
subtype = literal_name(spec['Subtype'])
if subtype in ('Type1', 'MMType1'):
# Type1 Font
- font = PDFType1Font(fontid, spec)
+ font = PDFType1Font(spec)
elif subtype == 'TrueType':
# TrueType Font
- font = PDFTrueTypeFont(fontid, spec)
+ font = PDFTrueTypeFont(spec)
elif subtype == 'Type3':
# Type3 Font
- font = PDFType3Font(fontid, spec)
+ font = PDFType3Font(spec)
elif subtype in ('CIDFontType0', 'CIDFontType2'):
# CID Font
- font = PDFCIDFont(fontid, spec)
+ font = PDFCIDFont(spec)
elif subtype == 'Type0':
# Type0 Font
dfonts = list_value(spec['DescendantFonts'])
@@ -416,10 +415,11 @@ class PDFResourceManager:
for k in ('Encoding', 'ToUnicode'):
if k in spec:
subspec[k] = resolve1(spec[k])
- font = self.get_font(fontid, subspec)
+ font = self.get_font(None, subspec)
else:
raise PDFFontError('Invalid Font: %r' % spec)
- self.fonts[fontid] = font
+ if objid:
+ self.fonts[objid] = font
return font
@@ -857,8 +857,12 @@ class PDFPageInterpreter:
if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font':
- for (fontid,fontrsrc) in dict_value(v).iteritems():
- self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
+ for (fontid,spec) in dict_value(v).iteritems():
+ objid = None
+ if isinstance(spec, PDFObjRef):
+ objid = spec.objid
+ spec = dict_value(spec)
+ self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec))
diff --git a/utils.py b/utils.py
index 9d9eeef..6364875 100644
--- a/utils.py
+++ b/utils.py
@@ -14,7 +14,7 @@ def choplist(n, seq):
return
def nunpack(s, default=0):
- '''Unpacks up to 4 bytes.'''
+ '''Unpacks up to 4 bytes big endian.'''
l = len(s)
if not l:
return default