git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@13 1aa58f4a-7d42-0410-adbc-911cccaed67c

2008-01-09 14:21:24 +00:00 · 2008-01-09 14:21:24 +00:00 · da778dee6f
parent 401c849a37
commit da778dee6f
4 changed files with 49 additions and 33 deletions
--- a/README.html
+++ b/README.html
@ -7,8 +7,14 @@
 <h1>PDFMiner</h1>
 <p>
-PDFMiner is a suite of programs that help
+PDFMiner is a suite of programs that aims to help
 extract or analyze text data from PDF documents.
 Unlike other PDF-related tools, it allows you to obtain
 the exact location of texts in a page, as well as 
 other layout information such as font size or font name,
 which could be useful for analyzing the document.
 PDFMiner is written purely in Python. It can also be used as a 
 basis for a full-fledged PDF interpreter. 
 <p>
 <strong>Homepage:</strong><br>
@ -60,6 +66,7 @@ $ ./pdf2txt.py -c euc-jp samples/jo.pdf
 <hr>
 <h2>Similar Projects</h2>
 <ul>
 <li> <a href="http://pybrary.net/pyPdf/">pyPdf</a>
 <li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
 <li> <a href="http://www.pdfbox.org/">pdfbox</a>
 </ul>
--- a/pdf2txt.py
+++ b/pdf2txt.py
@ -27,10 +27,14 @@ class TextConverter(PDFDevice):
    self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
                     (name,x0,y0,x1,y1))
    return
  def end_block(self):
    self.outfp.write('</block>\n')
    return
  def handle_undefined_char(self, cidcoding, cid):
    return
  def render_string(self, textstate, textmatrix, size, seq):
    font = textstate.font
    spwidth = int(-font.char_width(32) * 0.6) # space width
@ -44,18 +48,19 @@ class TextConverter(PDFDevice):
        for cid in chars:
          try:
            char = font.to_unicode(cid)
            buf += char
          except PDFUnicodeNotDefined, e:
            (cidcoding, cid) = e.args
-            char = u'[%s:%d]' % (cidcoding, cid)
+            s = self.handle_undefined_char(cidcoding, cid)
-          buf += char
+            if s:
              buf += s
    (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
    skewed = (b != 0 or c != 0)
    if font.is_vertical():
      size = -size
      tag = 'vtext'
    else:
      tag = 'htext'
-    if skewed:
+    if (b != 0 or c != 0 or a <= 0 or d <= 0):
      tag += ' skewed'
    s = buf.encode(self.codec, 'xmlcharrefreplace')
    (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
--- a/pdfinterp.py
+++ b/pdfinterp.py
@ -9,8 +9,9 @@ except ImportError:
 from psparser import PSException, PSSyntaxError, PSTypeError, \
     PSStackParser, PSLiteral, PSKeyword, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
-from pdfparser import PDFStream, resolve1, int_value, float_value, num_value, \
+from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
-     str_value, list_value, dict_value, stream_value, PDFException
+     int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
 from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
 from utils import choplist
@ -80,8 +81,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
 # PDFFont
 class PDFFont:
-  def __init__(self, fontid, descriptor, widths, default_width=None):
+  def __init__(self, descriptor, widths, default_width=None):
    self.fontid = fontid
    self.descriptor = descriptor
    self.widths = widths
    self.fontname = descriptor['FontName']
@ -91,11 +91,11 @@ class PDFFont:
    self.descent = descriptor['Descent']
    self.default_width = default_width or descriptor.get('MissingWidth', 0)
    self.leading = descriptor.get('Leading', 0)
-    self.bbox = descriptor['FontBBox']
+    self.bbox = list_value(descriptor['FontBBox'])
    return
  def __repr__(self):
-    return '<PDFFont: fontid=%r>' % (self.fontid,)
+    return '<PDFFont>'
  def is_vertical(self):
    return False
@ -116,7 +116,7 @@ class PDFFont:
 # PDFSimpleFont
 class PDFSimpleFont(PDFFont):
-  def __init__(self, fontid, descriptor, widths, spec):
+  def __init__(self, descriptor, widths, spec):
    # Font encoding is specified either by a name of
    # built-in encoding or a dictionary that describes
    # the differences.
@ -135,7 +135,7 @@ class PDFSimpleFont(PDFFont):
      strm = stream_value(spec['ToUnicode'])
      self.ucs2_cmap = CMap()
      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
-    PDFFont.__init__(self, fontid, descriptor, widths)
+    PDFFont.__init__(self, descriptor, widths)
    return
  def to_unicode(self, cid):
@ -154,7 +154,7 @@ class PDFSimpleFont(PDFFont):
 # PDFType1Font
 class PDFType1Font(PDFSimpleFont):
-  def __init__(self, fontid, spec):
+  def __init__(self, spec):
    if 'BaseFont' not in spec:
      raise PDFFontError('BaseFont is missing')
    self.basefont = literal_name(spec['BaseFont'])
@ -169,7 +169,7 @@ class PDFType1Font(PDFSimpleFont):
                       in enumerate(list_value(spec['Widths'])) )
      except KeyError, k:
        raise PDFFontError('%s is missing' % k)
-    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return
 # PDFTrueTypeFont
@ -178,7 +178,7 @@ class PDFTrueTypeFont(PDFType1Font):
 # PDFType3Font
 class PDFType3Font(PDFSimpleFont):
-  def __init__(self, fontid, spec):
+  def __init__(self, spec):
    try:
      firstchar = int_value(spec['FirstChar'])
      lastchar = int_value(spec['LastChar'])
@ -189,9 +189,9 @@ class PDFType3Font(PDFSimpleFont):
    if 'FontDescriptor' in spec:
      descriptor = dict_value(spec['FontDescriptor'])
    else:
-      descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
+      descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
                    'FontBBox':spec['FontBBox']}
-    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return
 # PDFCIDFont
@ -272,7 +272,7 @@ class TrueTypeFont:
 class PDFCIDFont(PDFFont):
-  def __init__(self, fontid, spec):
+  def __init__(self, spec):
    if 'BaseFont' not in spec:
      raise PDFFontError('BaseFont is missing')
    try:
@ -335,7 +335,7 @@ class PDFCIDFont(PDFFont):
      self.disps = {}
      default_width = spec.get('DW', 1000)
      self.default_disp = 0
-    PDFFont.__init__(self, fontid, descriptor, widths, default_width)
+    PDFFont.__init__(self, descriptor, widths, default_width)
    return
  def is_vertical(self):
@ -386,11 +386,10 @@ class PDFResourceManager:
  def get_cmap(self, name):
    return CMapDB.get_cmap(name)
-  def get_font(self, fontid, spec):
+  def get_font(self, objid, spec):
-    if fontid in self.fonts:
+    if objid and objid in self.fonts:
-      font = self.fonts[fontid]
+      font = self.fonts[objid]
    else:
      spec = dict_value(spec)
      assert spec['Type'] == LITERAL_FONT
      # Create a Font object.
      if 'Subtype' not in spec:
@ -398,16 +397,16 @@ class PDFResourceManager:
      subtype = literal_name(spec['Subtype'])
      if subtype in ('Type1', 'MMType1'):
        # Type1 Font
-        font = PDFType1Font(fontid, spec)
+        font = PDFType1Font(spec)
      elif subtype == 'TrueType':
        # TrueType Font
-        font = PDFTrueTypeFont(fontid, spec)
+        font = PDFTrueTypeFont(spec)
      elif subtype == 'Type3':
        # Type3 Font
-        font = PDFType3Font(fontid, spec)
+        font = PDFType3Font(spec)
      elif subtype in ('CIDFontType0', 'CIDFontType2'):
        # CID Font
-        font = PDFCIDFont(fontid, spec)
+        font = PDFCIDFont(spec)
      elif subtype == 'Type0':
        # Type0 Font
        dfonts = list_value(spec['DescendantFonts'])
@ -416,10 +415,11 @@ class PDFResourceManager:
        for k in ('Encoding', 'ToUnicode'):
          if k in spec:
            subspec[k] = resolve1(spec[k])
-        font = self.get_font(fontid, subspec)
+        font = self.get_font(None, subspec)
      else:
        raise PDFFontError('Invalid Font: %r' % spec)
-      self.fonts[fontid] = font
+      if objid:
        self.fonts[objid] = font
    return font
@ -857,8 +857,12 @@ class PDFPageInterpreter:
        if 1 <= self.debug:
          print >>stderr, 'Resource: %r: %r' % (k,v)
        if k == 'Font':
-          for (fontid,fontrsrc) in dict_value(v).iteritems():
+          for (fontid,spec) in dict_value(v).iteritems():
-            self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
+            objid = None
            if isinstance(spec, PDFObjRef):
              objid = spec.objid
            spec = dict_value(spec)
            self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
        elif k == 'ColorSpace':
          for (csid,spec) in dict_value(v).iteritems():
            self.csmap[csid] = get_colorspace(resolve1(spec))
--- a/utils.py
+++ b/utils.py
@ -14,7 +14,7 @@ def choplist(n, seq):
  return
 def nunpack(s, default=0):
-  '''Unpacks up to 4 bytes.'''
+  '''Unpacks up to 4 bytes big endian.'''
  l = len(s)
  if not l:
    return default