From 0f8fe3f19e893fbafcc8637c9f2bed323eeeec2f Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 31 Jan 2010 02:09:28 +0000 Subject: [PATCH] Page rotation bug fixed. Various minor fixes. git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@176 1aa58f4a-7d42-0410-adbc-911cccaed67c --- Makefile | 10 +++++----- docs/index.html | 9 ++++----- pdfminer/converter.py | 43 ++++++++++++++++++++----------------------- pdfminer/layout.py | 29 +++++++++++++---------------- pdfminer/pdfinterp.py | 2 +- pdfminer/pdfparser.py | 2 +- pdfminer/utils.py | 3 +++ samples/Makefile | 5 ++--- tools/pdf2txt.py | 12 ++++++------ 9 files changed, 55 insertions(+), 60 deletions(-) diff --git a/Makefile b/Makefile index 60ededb..c972828 100644 --- a/Makefile +++ b/Makefile @@ -33,11 +33,6 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE) publish: $(CP) docs/*.html $(WEBDIR) -test: - cd samples && $(MAKE) test -test_clean: - -cd samples && $(MAKE) clean - CONV_CMAP=$(PYTHON) tools/conv_cmap.py CMAPSRC=cmaprsrc CMAPDST=pdfminer/cmap @@ -53,3 +48,8 @@ $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py: $(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py: $(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr + +test: cmap + cd samples && $(MAKE) all +test_clean: + -cd samples && $(MAKE) clean diff --git a/docs/index.html b/docs/index.html index 6c0abd7..aa24ddf 100644 --- a/docs/index.html +++ b/docs/index.html @@ -19,7 +19,7 @@ Python PDF parser and analyzer
-Last Modified: Sat Jan 30 16:32:50 JST 2010 +Last Modified: Sun Jan 31 10:38:26 JST 2010
@@ -127,9 +127,8 @@ W o r l d

For East Asian languages

In order to handle East Asian languages (Chinese or Japanese, etc.), -you need to install an additional data called CMap, -which is originally distributed by Adobe. CMap is now included -in the pdfminer package, but not installed by default. +additional data called CMap is required. +CMap files are not installed by default.

Here is the additional step you need:

@@ -347,7 +346,7 @@ no stream header is displayed for the ease of saving it to a file.
 

Changes

    -
  • 2010/01/30: JPEG image extraction supported. +
  • 2010/01/31: JPEG image extraction supported. Page rotation bug fixed.
  • 2010/01/04: Python 2.6 warning removal. More doctest conversion.
  • 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
  • 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger. diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 4c3fcbf..3d6093f 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -6,8 +6,8 @@ from pdftypes import LITERALS_DCT_DECODE from layout import LayoutContainer from layout import LTPage, LTText, LTLine, LTRect, LTPolygon from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine -from utils import enc from utils import apply_matrix_pt, mult_matrix +from utils import enc, strbbox ## TagExtractor @@ -38,10 +38,8 @@ class TagExtractor(PDFDevice): return def begin_page(self, page, ctm): - (x0, y0, x1, y1) = page.mediabox - bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1) self.outfp.write('' % - (self.pageno, bbox, page.rotate)) + (self.pageno, strbbox(page.mediabox), page.rotate)) return def end_page(self, page): @@ -177,9 +175,9 @@ class PDFConverter(PDFPageAggregator): ## class XMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None): + def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None): PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) - self.imgdir = imgdir + self.outdir = outdir self.outfp.write('\n' % codec) self.outfp.write('\n') return @@ -190,7 +188,7 @@ class XMLConverter(PDFConverter): else: return None name = image.name+ext - path = os.path.join(self.imgdir, name) + path = os.path.join(self.outdir, name) fp = file(path, 'wb') fp.write(image.data) fp.close() @@ -200,42 +198,42 @@ class XMLConverter(PDFConverter): def render(item): if isinstance(item, LTPage): self.outfp.write('\n' % - (item.id, item.get_bbox(), item.rotate)) + (item.id, strbbox(item.bbox), item.rotate)) for child in item: render(child) self.outfp.write('\n') elif isinstance(item, LTLine) and item.direction: - self.outfp.write('\n' % (item.linewidth, item.direction, item.get_bbox())) + self.outfp.write('\n' 
% (item.linewidth, item.direction, strbbox(item.bbox))) elif isinstance(item, LTRect): - self.outfp.write('\n' % (item.linewidth, item.get_bbox())) + self.outfp.write('\n' % (item.linewidth, strbbox(item.bbox))) elif isinstance(item, LTPolygon): - self.outfp.write('\n' % (item.linewidth, item.get_bbox(), item.get_pts())) + self.outfp.write('\n' % (item.linewidth, strbbox(item.bbox), item.get_pts())) elif isinstance(item, LTFigure): - self.outfp.write('
    \n' % (item.id, item.get_bbox())) + self.outfp.write('
    \n' % (item.id, strbbox(item.bbox))) for child in item: render(child) self.outfp.write('
    \n') elif isinstance(item, LTTextLine): - self.outfp.write('\n' % (item.get_bbox())) + self.outfp.write('\n' % strbbox(item.bbox)) for child in item: render(child) self.outfp.write('\n') elif isinstance(item, LTTextBox): - self.outfp.write('\n' % (item.id, item.get_bbox())) + self.outfp.write('\n' % (item.id, strbbox(item.bbox))) for child in item: render(child) self.outfp.write('\n') elif isinstance(item, LTTextItem): self.outfp.write('' % (enc(item.font.fontname), item.is_vertical(), - item.get_bbox(), item.fontsize)) + strbbox(item.bbox), item.fontsize)) self.write(item.text) self.outfp.write('\n') elif isinstance(item, LTText): self.outfp.write('%s\n' % item.text) elif isinstance(item, LTImage): x = '' - if self.imgdir: + if self.outdir: name = self.write_image(item) if name: x = 'name="%s" ' % enc(name) @@ -257,11 +255,11 @@ class XMLConverter(PDFConverter): class HTMLConverter(PDFConverter): def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, - scale=1, showpageno=True, pagepad=50, imgdir=None): + scale=1, showpageno=True, pagepad=50, outdir=None): PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) self.showpageno = showpageno self.pagepad = pagepad - self.imgdir = imgdir + self.outdir = outdir self.scale = scale self.outfp.write('\n') self.outfp.write('\n' % @@ -282,16 +280,15 @@ class HTMLConverter(PDFConverter): else: return name = image.name+ext - path = os.path.join(self.imgdir, name) + path = os.path.join(self.outdir, name) fp = file(path, 'wb') fp.write(image.data) fp.close() - (x0,y0,x1,y1) = image.dstbbox self.outfp.write('\n' % (enc(name), - x0*self.scale, (self.yoffset-y1)*self.scale, - (x1-x0)*self.scale, (y1-y0)*self.scale)) + image.x0*self.scale, (self.yoffset-image.y1)*self.scale, + image.width*self.scale, image.height*self.scale)) return def end_page(self, page): @@ -332,7 +329,7 @@ class HTMLConverter(PDFConverter): for child in item: render(child) elif isinstance(item, LTImage): - 
if self.imgdir: + if self.outdir: self.write_image(item) return page = PDFConverter.end_page(self, page) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 02d1cb8..91a4a71 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,9 +1,8 @@ #!/usr/bin/env python import sys from sys import maxint as INF -from utils import apply_matrix_norm -from utils import apply_matrix_pt -from utils import bsearch +from utils import apply_matrix_norm, apply_matrix_pt +from utils import bsearch, strbbox @@ -137,7 +136,7 @@ class LayoutItem(object): return def __repr__(self): - return ('' % (self.get_bbox())) + return ('' % strbbox(self.bbox)) def set_bbox(self, (x0,y0,x1,y1)): if x1 < x0: (x0,x1) = (x1,x0) @@ -148,11 +147,9 @@ class LayoutItem(object): self.y1 = y1 self.width = x1-x0 self.height = y1-y0 + self.bbox = (x0, y0, x1, y1) return - def get_bbox(self): - return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1) - def is_hoverlap(self, obj): assert isinstance(obj, LayoutItem) if self.x1 <= obj.x0 or obj.x1 <= self.x0: @@ -206,7 +203,7 @@ class LayoutContainer(LayoutItem): return def __repr__(self): - return ('' % (self.get_bbox())) + return ('' % strbbox(self.bbox)) def __iter__(self): return iter(self.objs) @@ -285,13 +282,13 @@ class LTRect(LTPolygon): ## LTImage ## -class LTImage(object): +class LTImage(LayoutItem): - def __init__(self, name, type, srcsize, dstbbox, data): + def __init__(self, name, type, srcsize, bbox, data): + LayoutItem.__init__(self, bbox) self.name = name self.type = type self.srcsize = srcsize - self.dstbbox = dstbbox self.data = data return @@ -370,7 +367,7 @@ class LTTextItem(LayoutItem, LTText): if self.debug: return ('' % ('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix, - self.font, self.fontsize, self.get_bbox(), + self.font, self.fontsize, strbbox(self.bbox), '(%.1f, %.1f)' % self.adv, self.text)) else: @@ -400,7 +397,7 @@ class LTFigure(LayoutContainer): return def __repr__(self): - return ('
    ' % (self.id, self.get_bbox(), self.matrix)) + return ('
    ' % (self.id, strbbox(self.bbox), self.matrix)) ## LTTextLine @@ -414,7 +411,7 @@ class LTTextLine(LayoutContainer): return def __repr__(self): - return ('' % (self.get_bbox(), self.direction)) + return ('' % (strbbox(self.bbox), self.direction)) def get_margin(self): return min(self.width, self.height) @@ -464,7 +461,7 @@ class LTTextBox(LayoutContainer): return def __repr__(self): - return ('' % (self.get_bbox(), self.direction, self.get_text()[:20])) + return ('' % (strbbox(self.bbox), self.direction, self.get_text()[:20])) def get_text(self): return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) ) @@ -520,7 +517,7 @@ class LTPage(LayoutContainer): return def __repr__(self): - return ('' % (self.id, self.get_bbox(), self.rotate)) + return ('' % (self.id, strbbox(self.bbox), self.rotate)) def analyze_layout(self, laparams): textobjs = [] diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index f6647fe..e6bdf16 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -707,7 +707,7 @@ class PDFPageInterpreter(object): elif page.rotate == 180: ctm = (-1,0,0,-1, x1,y1) elif page.rotate == 270: - ctm = (0,1,-1,0, x0,-y1) + ctm = (0,1,-1,0, y1,-x0) else: ctm = (1,0,0,1, -x0,-y0) self.device.begin_page(page, ctm) diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 730cf45..7012df7 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -247,7 +247,7 @@ class PDFPage(object): self.cropbox = resolve1(self.attrs['CropBox']) else: self.cropbox = self.mediabox - self.rotate = self.attrs.get('Rotate', 0) + self.rotate = (self.attrs.get('Rotate', 0)+360) % 360 self.annots = self.attrs.get('Annots') self.beads = self.attrs.get('B') if 'Contents' in self.attrs: diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 5a9a918..06e9785 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -136,6 +136,9 @@ def enc(x, codec='ascii'): x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"') 
return x.encode(codec, 'xmlcharrefreplace') +def strbbox((x0,y0,x1,y1)): + return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1) + ## ObjIdRange ## diff --git a/samples/Makefile b/samples/Makefile index 976a5ae..fe7f4dc 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,7 +1,7 @@ # GNUMakefile for test PYTHON=python -PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py +PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 HTMLS= \ simple1.html \ @@ -36,14 +36,13 @@ XMLS= \ naacl06-shinyama.xml \ nlp2004slides.xml -all: +all: htmls texts xmls clean: -rm $(HTMLS) -rm $(TEXTS) -rm $(XMLS) -test: htmls texts xmls htmls: $(HTMLS) texts: $(TEXTS) xmls: $(XMLS) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 7e80f09..935bb24 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -13,10 +13,10 @@ def main(argv): def usage(): print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] ' '[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] ' - '[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0]) + '[-t text|html|xml|tag] [-O output_dir] [-o output] file ...' 
% argv[0]) return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:') + (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:O:o:C:D:m:') except getopt.GetoptError: return usage() if not args: return usage() @@ -29,7 +29,7 @@ def main(argv): # output option outfile = None outtype = None - imgdir = None + outdir = None codec = 'utf-8' pageno = 1 scale = 1 @@ -43,7 +43,7 @@ def main(argv): elif k == '-t': outtype = v elif k == '-c': codec = v elif k == '-o': outfile = v - elif k == '-I': imgdir = v + elif k == '-O': outdir = v elif k == '-s': scale = float(v) elif k == '-n': laparams = None elif k == '-D': laparams.direction = v @@ -75,9 +75,9 @@ def main(argv): if outtype == 'text': device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams) elif outtype == 'xml': - device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir) + device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir) elif outtype == 'html': - device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir) + device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir) elif outtype == 'tag': device = TagExtractor(rsrc, outfp, codec=codec) else: