Page rotation bug fixed.

Various minor fixes.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@176 1aa58f4a-7d42-0410-adbc-911cccaed67c
2010-01-31 02:09:28 +00:00 · 2010-01-31 02:09:28 +00:00 · 0f8fe3f19e
parent 7969feeae1
commit 0f8fe3f19e
9 changed files with 55 additions and 60 deletions
--- a/10
+++ b/10
@ -33,11 +33,6 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
 publish:
 	$(CP) docs/*.html $(WEBDIR)
 test:
 	cd samples && $(MAKE) test
 test_clean:
 	-cd samples && $(MAKE) clean
 CONV_CMAP=$(PYTHON) tools/conv_cmap.py
 CMAPSRC=cmaprsrc
 CMAPDST=pdfminer/cmap
@ -53,3 +48,8 @@ $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
 	$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
 $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
 	$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
 test: cmap
 	cd samples && $(MAKE) all
 test_clean:
 	-cd samples && $(MAKE) clean
--- a/docs/index.html
+++ b/docs/index.html
@ -19,7 +19,7 @@ Python PDF parser and analyzer
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sat Jan 30 16:32:50 JST 2010
+Last Modified: Sun Jan 31 10:38:26 JST 2010
 <!-- hhmts end -->
 </div>
@ -127,9 +127,8 @@ W o r l d
 <a name="cmap"></a>
 <h3>For East Asian languages</h3>
 In order to handle East Asian languages (Chinese or Japanese, etc.),
-you need to install an additional data called <code>CMap</code>,
+an additional data called <code>CMap</code> is required.
-which is originally distributed by Adobe. CMap is now included
+CMap files are not installed by default.
 in the pdfminer package, but not installed by default.
 <p>
 Here is the additional step you need:
 <blockquote><pre>
@ -347,7 +346,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
-<li> 2010/01/30: JPEG image extraction supported.
+<li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed. 
 <li> 2010/01/04: Python 2.6 warning removal. More doctest conversion.
 <li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
 <li> 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -6,8 +6,8 @@ from pdftypes import LITERALS_DCT_DECODE
 from layout import LayoutContainer
 from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
 from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine
 from utils import enc
 from utils import apply_matrix_pt, mult_matrix
 from utils import enc, strbbox
 ##  TagExtractor
@ -38,10 +38,8 @@ class TagExtractor(PDFDevice):
        return
    def begin_page(self, page, ctm):
        (x0, y0, x1, y1) = page.mediabox
        bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
-                         (self.pageno, bbox, page.rotate))
+                         (self.pageno, strbbox(page.mediabox), page.rotate))
        return
    def end_page(self, page):
@ -177,9 +175,9 @@ class PDFConverter(PDFPageAggregator):
 ##
 class XMLConverter(PDFConverter):
-    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None):
+    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
-        self.imgdir = imgdir
+        self.outdir = outdir
        self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
        self.outfp.write('<pages>\n')
        return
@ -190,7 +188,7 @@ class XMLConverter(PDFConverter):
        else:
            return None
        name = image.name+ext
-        path = os.path.join(self.imgdir, name)
+        path = os.path.join(self.outdir, name)
        fp = file(path, 'wb')
        fp.write(image.data)
        fp.close()
@ -200,42 +198,42 @@ class XMLConverter(PDFConverter):
        def render(item):
            if isinstance(item, LTPage):
                self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
-                                 (item.id, item.get_bbox(), item.rotate))
+                                 (item.id, strbbox(item.bbox), item.rotate))
                for child in item:
                    render(child)
                self.outfp.write('</page>\n')
            elif isinstance(item, LTLine) and item.direction:
-                self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, item.get_bbox()))
+                self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox)))
            elif isinstance(item, LTRect):
-                self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, item.get_bbox()))
+                self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox)))
            elif isinstance(item, LTPolygon):
-                self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, item.get_bbox(), item.get_pts()))
+                self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
            elif isinstance(item, LTFigure):
-                self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
+                self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
                for child in item:
                    render(child)
                self.outfp.write('</figure>\n')
            elif isinstance(item, LTTextLine):
-                self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
+                self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox))
                for child in item:
                    render(child)
                self.outfp.write('</textline>\n')
            elif isinstance(item, LTTextBox):
-                self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
+                self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
                for child in item:
                    render(child)
                self.outfp.write('</textbox>\n')
            elif isinstance(item, LTTextItem):
                self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
                                 (enc(item.font.fontname), item.is_vertical(),
-                                  item.get_bbox(), item.fontsize))
+                                  strbbox(item.bbox), item.fontsize))
                self.write(item.text)
                self.outfp.write('</text>\n')
            elif isinstance(item, LTText):
                self.outfp.write('<text>%s</text>\n' % item.text)
            elif isinstance(item, LTImage):
                x = ''
-                if self.imgdir:
+                if self.outdir:
                    name = self.write_image(item)
                    if name:
                        x = 'name="%s" ' % enc(name)
@ -257,11 +255,11 @@ class XMLConverter(PDFConverter):
 class HTMLConverter(PDFConverter):
    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
-                 scale=1, showpageno=True, pagepad=50, imgdir=None):
+                 scale=1, showpageno=True, pagepad=50, outdir=None):
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.pagepad = pagepad
-        self.imgdir = imgdir
+        self.outdir = outdir
        self.scale = scale
        self.outfp.write('<html><head>\n')
        self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
@ -282,16 +280,15 @@ class HTMLConverter(PDFConverter):
        else:
            return
        name = image.name+ext
-        path = os.path.join(self.imgdir, name)
+        path = os.path.join(self.outdir, name)
        fp = file(path, 'wb')
        fp.write(image.data)
        fp.close()
        (x0,y0,x1,y1) = image.dstbbox
        self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
                         'width="%d" height="%d" />\n' %
                         (enc(name),
-                          x0*self.scale, (self.yoffset-y1)*self.scale,
+                          image.x0*self.scale, (self.yoffset-image.y1)*self.scale,
-                          (x1-x0)*self.scale, (y1-y0)*self.scale))
+                          image.width*self.scale, image.height*self.scale))
        return
    def end_page(self, page):
@ -332,7 +329,7 @@ class HTMLConverter(PDFConverter):
                for child in item:
                    render(child)
            elif isinstance(item, LTImage):
-                if self.imgdir:
+                if self.outdir:
                    self.write_image(item)
            return
        page = PDFConverter.end_page(self, page)
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -1,9 +1,8 @@
 #!/usr/bin/env python
 import sys
 from sys import maxint as INF
-from utils import apply_matrix_norm
+from utils import apply_matrix_norm, apply_matrix_pt
-from utils import apply_matrix_pt
+from utils import bsearch, strbbox
 from utils import bsearch
@ -137,7 +136,7 @@ class LayoutItem(object):
        return
    def __repr__(self):
-        return ('<item bbox=%s>' % (self.get_bbox()))
+        return ('<item bbox=%s>' % strbbox(self.bbox))
    def set_bbox(self, (x0,y0,x1,y1)):
        if x1 < x0: (x0,x1) = (x1,x0)
@ -148,11 +147,9 @@ class LayoutItem(object):
        self.y1 = y1
        self.width = x1-x0
        self.height = y1-y0
        self.bbox = (x0, y0, x1, y1)
        return
    def get_bbox(self):
        return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
    def is_hoverlap(self, obj):
        assert isinstance(obj, LayoutItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
@ -206,7 +203,7 @@ class LayoutContainer(LayoutItem):
        return
    def __repr__(self):
-        return ('<group %s>' % (self.get_bbox()))
+        return ('<group %s>' % strbbox(self.bbox))
    def __iter__(self):
        return iter(self.objs)
@ -285,13 +282,13 @@ class LTRect(LTPolygon):
 ##  LTImage
 ##
-class LTImage(object):
+class LTImage(LayoutItem):
-    def __init__(self, name, type, srcsize, dstbbox, data):
+    def __init__(self, name, type, srcsize, bbox, data):
        LayoutItem.__init__(self, bbox)
        self.name = name
        self.type = type
        self.srcsize = srcsize
        self.dstbbox = dstbbox
        self.data = data
        return
@ -370,7 +367,7 @@ class LTTextItem(LayoutItem, LTText):
        if self.debug:
            return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
                    ('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
-                     self.font, self.fontsize, self.get_bbox(),
+                     self.font, self.fontsize, strbbox(self.bbox),
                     '(%.1f, %.1f)' % self.adv,
                     self.text))
        else:
@ -400,7 +397,7 @@ class LTFigure(LayoutContainer):
        return
    def __repr__(self):
-        return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
+        return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, strbbox(self.bbox), self.matrix))
 ##  LTTextLine
@ -414,7 +411,7 @@ class LTTextLine(LayoutContainer):
        return
    def __repr__(self):
-        return ('<textline %s(%s)>' % (self.get_bbox(), self.direction))
+        return ('<textline %s(%s)>' % (strbbox(self.bbox), self.direction))
    def get_margin(self):
        return min(self.width, self.height)
@ -464,7 +461,7 @@ class LTTextBox(LayoutContainer):
        return
    def __repr__(self):
-        return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
+        return ('<textbox %s(%s) %r...>' % (strbbox(self.bbox), self.direction, self.get_text()[:20]))
    def get_text(self):
        return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
@ -520,7 +517,7 @@ class LTPage(LayoutContainer):
        return
    def __repr__(self):
-        return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
+        return ('<page id=%r bbox=%s rotate=%r>' % (self.id, strbbox(self.bbox), self.rotate))
    def analyze_layout(self, laparams):
        textobjs = []
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@ -707,7 +707,7 @@ class PDFPageInterpreter(object):
        elif page.rotate == 180:
            ctm = (-1,0,0,-1, x1,y1)
        elif page.rotate == 270:
-            ctm = (0,1,-1,0, x0,-y1)
+            ctm = (0,1,-1,0, y1,-x0)
        else:
            ctm = (1,0,0,1, -x0,-y0)
        self.device.begin_page(page, ctm)
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@ -247,7 +247,7 @@ class PDFPage(object):
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
-        self.rotate = self.attrs.get('Rotate', 0)
+        self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@ -136,6 +136,9 @@ def enc(x, codec='ascii'):
    x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
    return x.encode(codec, 'xmlcharrefreplace')
 def strbbox((x0,y0,x1,y1)):
    return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
 ##  ObjIdRange
 ##
--- a/samples/Makefile
+++ b/samples/Makefile
@ -1,7 +1,7 @@
 # GNUMakefile for test
 PYTHON=python
-PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py
+PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1
 HTMLS= \
 	simple1.html \
@ -36,14 +36,13 @@ XMLS= \
 	naacl06-shinyama.xml \
 	nlp2004slides.xml
-all:
+all: htmls texts xmls
 clean:
 	-rm $(HTMLS)
 	-rm $(TEXTS)
 	-rm $(XMLS)
 test: htmls texts xmls
 htmls: $(HTMLS)
 texts: $(TEXTS)
 xmls: $(XMLS)
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -13,10 +13,10 @@ def main(argv):
    def usage():
        print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
               '[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
-               '[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0])
+               '[-t text|html|xml|tag] [-O output_dir] [-o output] file ...' % argv[0])
        return 100
    try:
-        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:')
+        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:O:o:C:D:m:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
@ -29,7 +29,7 @@ def main(argv):
    # output option
    outfile = None
    outtype = None
-    imgdir = None
+    outdir = None
    codec = 'utf-8'
    pageno = 1
    scale = 1
@ -43,7 +43,7 @@ def main(argv):
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-o': outfile = v
-        elif k == '-I': imgdir = v
+        elif k == '-O': outdir = v
        elif k == '-s': scale = float(v)
        elif k == '-n': laparams = None
        elif k == '-D': laparams.direction = v
@ -75,9 +75,9 @@ def main(argv):
    if outtype == 'text':
        device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
    elif outtype == 'xml':
-        device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir)
+        device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
-        device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir)
+        device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
    elif outtype == 'tag':
        device = TagExtractor(rsrc, outfp, codec=codec)
    else: