git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@13 1aa58f4a-7d42-0410-adbc-911cccaed67c

2008-01-09 14:21:24 +00:00 · 2008-01-09 14:21:24 +00:00 · da778dee6f
parent 401c849a37
commit da778dee6f
4 changed files with 49 additions and 33 deletions
--- a/README.html
+++ b/README.html
@ -7,8 +7,14 @@
 <h1>PDFMiner</h1>

 <p>
-PDFMiner is a suite of programs that help
+PDFMiner is a suite of programs that aims to help with
 extracting or analyzing text data from PDF documents.
+Unlike other PDF-related tools, it allows you to obtain
+the exact location of text in a page, as well as 
+other layout information such as font size or font name,
+which could be useful for analyzing the document.
+PDFMiner is written purely in Python. It can also be used as a 
+basis for a full-fledged PDF interpreter. 

 <p>
 <strong>Homepage:</strong><br>
@ -60,6 +66,7 @@ $ ./pdf2txt.py -c euc-jp samples/jo.pdf
 <hr>
 <h2>Similar Projects</h2>
 <ul>
+<li> <a href="http://pybrary.net/pyPdf/">pyPdf</a>
 <li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
 <li> <a href="http://www.pdfbox.org/">pdfbox</a>
 </ul>
--- a/pdf2txt.py
+++ b/pdf2txt.py
@ -27,10 +27,14 @@ class TextConverter(PDFDevice):
    self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
                     (name,x0,y0,x1,y1))
    return
+  
  def end_block(self):
    self.outfp.write('</block>\n')
    return

+  def handle_undefined_char(self, cidcoding, cid):
+    return
+
  def render_string(self, textstate, textmatrix, size, seq):
    font = textstate.font
    spwidth = int(-font.char_width(32) * 0.6) # space width
@ -44,18 +48,19 @@ class TextConverter(PDFDevice):
        for cid in chars:
          try:
            char = font.to_unicode(cid)
+            buf += char
          except PDFUnicodeNotDefined, e:
            (cidcoding, cid) = e.args
-            char = u'[%s:%d]' % (cidcoding, cid)
-          buf += char
+            s = self.handle_undefined_char(cidcoding, cid)
+            if s:
+              buf += s
    (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
-    skewed = (b != 0 or c != 0)
    if font.is_vertical():
      size = -size
      tag = 'vtext'
    else:
      tag = 'htext'
-    if skewed:
+    if (b != 0 or c != 0 or a <= 0 or d <= 0):
      tag += ' skewed'
    s = buf.encode(self.codec, 'xmlcharrefreplace')
    (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
--- a/pdfinterp.py
+++ b/pdfinterp.py
@ -9,8 +9,9 @@ except ImportError:
 from psparser import PSException, PSSyntaxError, PSTypeError, \
     PSStackParser, PSLiteral, PSKeyword, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
-from pdfparser import PDFStream, resolve1, int_value, float_value, num_value, \
-     str_value, list_value, dict_value, stream_value, PDFException
+from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
+     int_value, float_value, num_value, \
+     str_value, list_value, dict_value, stream_value
 from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
 from utils import choplist

@ -80,8 +81,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
 # PDFFont
 class PDFFont:
  
-  def __init__(self, fontid, descriptor, widths, default_width=None):
-    self.fontid = fontid
+  def __init__(self, descriptor, widths, default_width=None):
    self.descriptor = descriptor
    self.widths = widths
    self.fontname = descriptor['FontName']
@ -91,11 +91,11 @@ class PDFFont:
    self.descent = descriptor['Descent']
    self.default_width = default_width or descriptor.get('MissingWidth', 0)
    self.leading = descriptor.get('Leading', 0)
-    self.bbox = descriptor['FontBBox']
+    self.bbox = list_value(descriptor['FontBBox'])
    return

  def __repr__(self):
-    return '<PDFFont: fontid=%r>' % (self.fontid,)
+    return '<PDFFont>'

  def is_vertical(self):
    return False
@ -116,7 +116,7 @@ class PDFFont:
 # PDFSimpleFont
 class PDFSimpleFont(PDFFont):
  
-  def __init__(self, fontid, descriptor, widths, spec):
+  def __init__(self, descriptor, widths, spec):
    # Font encoding is specified either by a name of
    # built-in encoding or a dictionary that describes
    # the differences.
@ -135,7 +135,7 @@ class PDFSimpleFont(PDFFont):
      strm = stream_value(spec['ToUnicode'])
      self.ucs2_cmap = CMap()
      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
-    PDFFont.__init__(self, fontid, descriptor, widths)
+    PDFFont.__init__(self, descriptor, widths)
    return

  def to_unicode(self, cid):
@ -154,7 +154,7 @@ class PDFSimpleFont(PDFFont):
 # PDFType1Font
 class PDFType1Font(PDFSimpleFont):
  
-  def __init__(self, fontid, spec):
+  def __init__(self, spec):
    if 'BaseFont' not in spec:
      raise PDFFontError('BaseFont is missing')
    self.basefont = literal_name(spec['BaseFont'])
@ -169,7 +169,7 @@ class PDFType1Font(PDFSimpleFont):
                       in enumerate(list_value(spec['Widths'])) )
      except KeyError, k:
        raise PDFFontError('%s is missing' % k)
-    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return

 # PDFTrueTypeFont
@ -178,7 +178,7 @@ class PDFTrueTypeFont(PDFType1Font):

 # PDFType3Font
 class PDFType3Font(PDFSimpleFont):
-  def __init__(self, fontid, spec):
+  def __init__(self, spec):
    try:
      firstchar = int_value(spec['FirstChar'])
      lastchar = int_value(spec['LastChar'])
@ -189,9 +189,9 @@ class PDFType3Font(PDFSimpleFont):
    if 'FontDescriptor' in spec:
      descriptor = dict_value(spec['FontDescriptor'])
    else:
-      descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
+      descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
                    'FontBBox':spec['FontBBox']}
-    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return

 # PDFCIDFont
@ -272,7 +272,7 @@ class TrueTypeFont:

 class PDFCIDFont(PDFFont):
  
-  def __init__(self, fontid, spec):
+  def __init__(self, spec):
    if 'BaseFont' not in spec:
      raise PDFFontError('BaseFont is missing')
    try:
@ -335,7 +335,7 @@ class PDFCIDFont(PDFFont):
      self.disps = {}
      default_width = spec.get('DW', 1000)
      self.default_disp = 0
-    PDFFont.__init__(self, fontid, descriptor, widths, default_width)
+    PDFFont.__init__(self, descriptor, widths, default_width)
    return

  def is_vertical(self):
@ -386,11 +386,10 @@ class PDFResourceManager:
  def get_cmap(self, name):
    return CMapDB.get_cmap(name)

-  def get_font(self, fontid, spec):
-    if fontid in self.fonts:
-      font = self.fonts[fontid]
+  def get_font(self, objid, spec):
+    if objid and objid in self.fonts:
+      font = self.fonts[objid]
    else:
-      spec = dict_value(spec)
      assert spec['Type'] == LITERAL_FONT
      # Create a Font object.
      if 'Subtype' not in spec:
@ -398,16 +397,16 @@ class PDFResourceManager:
      subtype = literal_name(spec['Subtype'])
      if subtype in ('Type1', 'MMType1'):
        # Type1 Font
-        font = PDFType1Font(fontid, spec)
+        font = PDFType1Font(spec)
      elif subtype == 'TrueType':
        # TrueType Font
-        font = PDFTrueTypeFont(fontid, spec)
+        font = PDFTrueTypeFont(spec)
      elif subtype == 'Type3':
        # Type3 Font
-        font = PDFType3Font(fontid, spec)
+        font = PDFType3Font(spec)
      elif subtype in ('CIDFontType0', 'CIDFontType2'):
        # CID Font
-        font = PDFCIDFont(fontid, spec)
+        font = PDFCIDFont(spec)
      elif subtype == 'Type0':
        # Type0 Font
        dfonts = list_value(spec['DescendantFonts'])
@ -416,10 +415,11 @@ class PDFResourceManager:
        for k in ('Encoding', 'ToUnicode'):
          if k in spec:
            subspec[k] = resolve1(spec[k])
-        font = self.get_font(fontid, subspec)
+        font = self.get_font(None, subspec)
      else:
        raise PDFFontError('Invalid Font: %r' % spec)
-      self.fonts[fontid] = font
+      if objid:
+        self.fonts[objid] = font
    return font


@ -857,8 +857,12 @@ class PDFPageInterpreter:
        if 1 <= self.debug:
          print >>stderr, 'Resource: %r: %r' % (k,v)
        if k == 'Font':
-          for (fontid,fontrsrc) in dict_value(v).iteritems():
-            self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
+          for (fontid,spec) in dict_value(v).iteritems():
+            objid = None
+            if isinstance(spec, PDFObjRef):
+              objid = spec.objid
+            spec = dict_value(spec)
+            self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
        elif k == 'ColorSpace':
          for (csid,spec) in dict_value(v).iteritems():
            self.csmap[csid] = get_colorspace(resolve1(spec))
--- a/utils.py
+++ b/utils.py
@ -14,7 +14,7 @@ def choplist(n, seq):
  return

 def nunpack(s, default=0):
-  '''Unpacks up to 4 bytes.'''
+  '''Unpacks up to 4 bytes big endian.'''
  l = len(s)
  if not l:
    return default