diff --git a/Makefile b/Makefile index 2583f3b..ae574c4 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,6 @@ PACKAGE=pdfminer PREFIX=/usr/local -SVN=svn PYTHON=python RM=rm -f CP=cp -f @@ -22,8 +21,6 @@ clean: distclean: clean test_clean cmap_clean -commit: distclean - $(SVN) commit pack: distclean $(PYTHON) setup.py sdist register: distclean diff --git a/docs/index.html b/docs/index.html index 227732d..08b4a85 100644 --- a/docs/index.html +++ b/docs/index.html @@ -19,7 +19,7 @@ Python PDF parser and analyzer
-Last Modified: Sat Apr 24 04:30:10 UTC 2010 +Last Modified: Mon May 10 23:02:20 UTC 2010
@@ -64,7 +64,8 @@ PDF parser that can be used for other purposes instead of text analysis.
  • Reconstruct the original layout by grouping text chunks.

    -On the performance, PDFMiner is about 20 times slower than +On the performance side, +PDFMiner is about 20 times slower than other C/C++-based software such as XPdf. diff --git a/docs/usage.html b/docs/usage.html index b1b5677..6d1cdf5 100644 --- a/docs/usage.html +++ b/docs/usage.html @@ -56,7 +56,7 @@ for page in doc.get_pages():

    -In PDFMiner, there are several objects involved in parsing a PDF file, +In PDFMiner, there are several Python classes involved in parsing a PDF file, as shown in Figure 1.

    @@ -68,7 +68,12 @@ as shown in Figure 1.

    Accessing Layout Objects

    -PDFMiner performs a basic layout analysis. +PDF documents are more like graphics than text documents. +They present no logical structure such as sentences or paragraphs (in most cases). +PDFMiner tries to reconstruct the original structure by performing +basic layout analysis. +

    +

     from pdfminer.layout import LAParams
    diff --git a/pdfminer/converter.py b/pdfminer/converter.py
    index 6264750..a6df07c 100644
    --- a/pdfminer/converter.py
    +++ b/pdfminer/converter.py
    @@ -91,8 +91,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
                 self.cur_item.add(LTPolygon(gstate.linewidth, pts))
             return
     
    -    def render_char(self, matrix, font, fontsize, scaling, cid):
    -        item = LTChar(matrix, font, fontsize, scaling, cid)
    +    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
    +        item = LTChar(matrix, font, fontsize, scaling, rise, cid)
             self.cur_item.add(item)
             return item.adv
     
    diff --git a/pdfminer/layout.py b/pdfminer/layout.py
    index 6c33978..79e8f9d 100644
    --- a/pdfminer/layout.py
    +++ b/pdfminer/layout.py
    @@ -204,7 +204,7 @@ class LTChar(LTItem, LTText):
     
         debug = 0
     
    -    def __init__(self, matrix, font, fontsize, scaling, cid):
    +    def __init__(self, matrix, font, fontsize, scaling, rise, cid):
             self.matrix = matrix
             self.font = font
             self.fontsize = fontsize
    @@ -214,6 +214,8 @@ class LTChar(LTItem, LTText):
                 text = font.to_unichr(cid)
             except PDFUnicodeNotDefined:
                 text = '?'
    +        (a,b,c,d,e,f) = self.matrix
    +        self.upright = (0 < a*d*scaling and b*c <= 0)
             LTText.__init__(self, text)
             # compute the boundary rectangle.
             if self.vertical:
    @@ -224,7 +226,7 @@ class LTChar(LTItem, LTText):
                 (dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
                 (_,_,_,_,tx,ty) = self.matrix
                 tx -= dx/2
    -            ty += displacement
    +            ty += displacement + rise
                 bbox = (tx, ty+dy, tx+dx, ty)
             else:
                 # horizontal
    @@ -233,7 +235,7 @@ class LTChar(LTItem, LTText):
                 (_,descent) = apply_matrix_norm(self.matrix, (0, descent))
                 (dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
                 (_,_,_,_,tx,ty) = self.matrix
    -            ty += descent
    +            ty += descent + rise
                 bbox = (tx, ty, tx+dx, ty+dy)
             LTItem.__init__(self, bbox)
             return
    @@ -253,8 +255,7 @@ class LTChar(LTItem, LTText):
             return self.vertical
     
         def is_upright(self):
    -        (a,b,c,d,e,f) = self.matrix
    -        return 0 < a*d and b*c <= 0
    +        return self.upright
     
         
     ##  LTContainer
    diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py
    index a0fd371..2e3c347 100644
    --- a/pdfminer/pdfdevice.py
    +++ b/pdfminer/pdfdevice.py
    @@ -66,19 +66,22 @@ class PDFTextDevice(PDFDevice):
             scaling = textstate.scaling * .01
             charspace = textstate.charspace * scaling
             wordspace = textstate.wordspace * scaling
    +        rise = textstate.rise
             if font.is_multibyte():
                 wordspace = 0
             dxscale = .001 * fontsize * scaling
             if font.is_vertical():
                 textstate.linematrix = self.render_string_vertical(
    -                seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
    +                seq, matrix, textstate.linematrix, font, fontsize,
    +                scaling, charspace, wordspace, rise, dxscale)
             else:
                 textstate.linematrix = self.render_string_horizontal(
    -                seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
    +                seq, matrix, textstate.linematrix, font, fontsize,
    +                scaling, charspace, wordspace, rise, dxscale)
             return
         
         def render_string_horizontal(self, seq, matrix, (x,y), 
    -                                 font, fontsize, scaling, charspace, wordspace, dxscale):
    +                                 font, fontsize, scaling, charspace, wordspace, rise, dxscale):
             needcharspace = False
             for obj in seq:
                 if isinstance(obj, int) or isinstance(obj, float):
    @@ -89,14 +92,14 @@ class PDFTextDevice(PDFDevice):
                         if needcharspace:
                             x += charspace
                         x += self.render_char(translate_matrix(matrix, (x,y)),
    -                                          font, fontsize, scaling, cid)
    +                                          font, fontsize, scaling, rise, cid)
                         if cid == 32 and wordspace:
                             x += wordspace
                         needcharspace = True
             return (x, y)
     
         def render_string_vertical(self, seq, matrix, (x,y), 
    -                               font, fontsize, scaling, charspace, wordspace, dxscale):
    +                               font, fontsize, scaling, charspace, wordspace, rise, dxscale):
             needcharspace = False
             for obj in seq:
                 if isinstance(obj, int) or isinstance(obj, float):
    @@ -107,13 +110,13 @@ class PDFTextDevice(PDFDevice):
                         if needcharspace:
                             y += charspace
                         y += self.render_char(translate_matrix(matrix, (x,y)), 
    -                                          font, fontsize, scaling, cid)
    +                                          font, fontsize, scaling, rise, cid)
                         if cid == 32 and wordspace:
                             y += wordspace
                         needcharspace = True
             return (x, y)
     
    -    def render_char(self, matrix, font, fontsize, scaling, cid):
    +    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
             return 0