Page rotation bug fixed.

Various minor fixes.


git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@176 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-01-31 02:09:28 +00:00
parent 7969feeae1
commit 0f8fe3f19e
9 changed files with 55 additions and 60 deletions

View File

@ -33,11 +33,6 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish:
$(CP) docs/*.html $(WEBDIR)
test:
cd samples && $(MAKE) test
test_clean:
-cd samples && $(MAKE) clean
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc
CMAPDST=pdfminer/cmap
@ -53,3 +48,8 @@ $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
test: cmap
cd samples && $(MAKE) all
test_clean:
-cd samples && $(MAKE) clean

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sat Jan 30 16:32:50 JST 2010
Last Modified: Sun Jan 31 10:38:26 JST 2010
<!-- hhmts end -->
</div>
@ -127,9 +127,8 @@ W o r l d
<a name="cmap"></a>
<h3>For East Asian languages</h3>
In order to handle East Asian languages (Chinese or Japanese, etc.),
you need to install an additional data called <code>CMap</code>,
which is originally distributed by Adobe. CMap is now included
in the pdfminer package, but not installed by default.
an additional data called <code>CMap</code> is required.
CMap files are not installed by default.
<p>
Here is the additional step you need:
<blockquote><pre>
@ -347,7 +346,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2010/01/30: JPEG image extraction supported.
<li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed.
<li> 2010/01/04: Python 2.6 warning removal. More doctest conversion.
<li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
<li> 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.

View File

@ -6,8 +6,8 @@ from pdftypes import LITERALS_DCT_DECODE
from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine
from utils import enc
from utils import apply_matrix_pt, mult_matrix
from utils import enc, strbbox
## TagExtractor
@ -38,10 +38,8 @@ class TagExtractor(PDFDevice):
return
def begin_page(self, page, ctm):
(x0, y0, x1, y1) = page.mediabox
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
(self.pageno, bbox, page.rotate))
(self.pageno, strbbox(page.mediabox), page.rotate))
return
def end_page(self, page):
@ -177,9 +175,9 @@ class PDFConverter(PDFPageAggregator):
##
class XMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.imgdir = imgdir
self.outdir = outdir
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
self.outfp.write('<pages>\n')
return
@ -190,7 +188,7 @@ class XMLConverter(PDFConverter):
else:
return None
name = image.name+ext
path = os.path.join(self.imgdir, name)
path = os.path.join(self.outdir, name)
fp = file(path, 'wb')
fp.write(image.data)
fp.close()
@ -200,42 +198,42 @@ class XMLConverter(PDFConverter):
def render(item):
if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, item.get_bbox(), item.rotate))
(item.id, strbbox(item.bbox), item.rotate))
for child in item:
render(child)
self.outfp.write('</page>\n')
elif isinstance(item, LTLine) and item.direction:
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, item.get_bbox()))
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox)))
elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, item.get_bbox()))
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox)))
elif isinstance(item, LTPolygon):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, item.get_bbox(), item.get_pts()))
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
for child in item:
render(child)
self.outfp.write('</figure>\n')
elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox))
for child in item:
render(child)
self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
for child in item:
render(child)
self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize))
strbbox(item.bbox), item.fontsize))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.text)
elif isinstance(item, LTImage):
x = ''
if self.imgdir:
if self.outdir:
name = self.write_image(item)
if name:
x = 'name="%s" ' % enc(name)
@ -257,11 +255,11 @@ class XMLConverter(PDFConverter):
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, showpageno=True, pagepad=50, imgdir=None):
scale=1, showpageno=True, pagepad=50, outdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno
self.pagepad = pagepad
self.imgdir = imgdir
self.outdir = outdir
self.scale = scale
self.outfp.write('<html><head>\n')
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
@ -282,16 +280,15 @@ class HTMLConverter(PDFConverter):
else:
return
name = image.name+ext
path = os.path.join(self.imgdir, name)
path = os.path.join(self.outdir, name)
fp = file(path, 'wb')
fp.write(image.data)
fp.close()
(x0,y0,x1,y1) = image.dstbbox
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
'width="%d" height="%d" />\n' %
(enc(name),
x0*self.scale, (self.yoffset-y1)*self.scale,
(x1-x0)*self.scale, (y1-y0)*self.scale))
image.x0*self.scale, (self.yoffset-image.y1)*self.scale,
image.width*self.scale, image.height*self.scale))
return
def end_page(self, page):
@ -332,7 +329,7 @@ class HTMLConverter(PDFConverter):
for child in item:
render(child)
elif isinstance(item, LTImage):
if self.imgdir:
if self.outdir:
self.write_image(item)
return
page = PDFConverter.end_page(self, page)

View File

@ -1,9 +1,8 @@
#!/usr/bin/env python
import sys
from sys import maxint as INF
from utils import apply_matrix_norm
from utils import apply_matrix_pt
from utils import bsearch
from utils import apply_matrix_norm, apply_matrix_pt
from utils import bsearch, strbbox
@ -137,7 +136,7 @@ class LayoutItem(object):
return
def __repr__(self):
return ('<item bbox=%s>' % (self.get_bbox()))
return ('<item bbox=%s>' % strbbox(self.bbox))
def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0)
@ -148,11 +147,9 @@ class LayoutItem(object):
self.y1 = y1
self.width = x1-x0
self.height = y1-y0
self.bbox = (x0, y0, x1, y1)
return
def get_bbox(self):
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
def is_hoverlap(self, obj):
assert isinstance(obj, LayoutItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
@ -206,7 +203,7 @@ class LayoutContainer(LayoutItem):
return
def __repr__(self):
return ('<group %s>' % (self.get_bbox()))
return ('<group %s>' % strbbox(self.bbox))
def __iter__(self):
return iter(self.objs)
@ -285,13 +282,13 @@ class LTRect(LTPolygon):
## LTImage
##
class LTImage(object):
class LTImage(LayoutItem):
def __init__(self, name, type, srcsize, dstbbox, data):
def __init__(self, name, type, srcsize, bbox, data):
LayoutItem.__init__(self, bbox)
self.name = name
self.type = type
self.srcsize = srcsize
self.dstbbox = dstbbox
self.data = data
return
@ -370,7 +367,7 @@ class LTTextItem(LayoutItem, LTText):
if self.debug:
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
self.font, self.fontsize, self.get_bbox(),
self.font, self.fontsize, strbbox(self.bbox),
'(%.1f, %.1f)' % self.adv,
self.text))
else:
@ -400,7 +397,7 @@ class LTFigure(LayoutContainer):
return
def __repr__(self):
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, strbbox(self.bbox), self.matrix))
## LTTextLine
@ -414,7 +411,7 @@ class LTTextLine(LayoutContainer):
return
def __repr__(self):
return ('<textline %s(%s)>' % (self.get_bbox(), self.direction))
return ('<textline %s(%s)>' % (strbbox(self.bbox), self.direction))
def get_margin(self):
return min(self.width, self.height)
@ -464,7 +461,7 @@ class LTTextBox(LayoutContainer):
return
def __repr__(self):
return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
return ('<textbox %s(%s) %r...>' % (strbbox(self.bbox), self.direction, self.get_text()[:20]))
def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
@ -520,7 +517,7 @@ class LTPage(LayoutContainer):
return
def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, strbbox(self.bbox), self.rotate))
def analyze_layout(self, laparams):
textobjs = []

View File

@ -707,7 +707,7 @@ class PDFPageInterpreter(object):
elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1)
elif page.rotate == 270:
ctm = (0,1,-1,0, x0,-y1)
ctm = (0,1,-1,0, y1,-x0)
else:
ctm = (1,0,0,1, -x0,-y0)
self.device.begin_page(page, ctm)

View File

@ -247,7 +247,7 @@ class PDFPage(object):
self.cropbox = resolve1(self.attrs['CropBox'])
else:
self.cropbox = self.mediabox
self.rotate = self.attrs.get('Rotate', 0)
self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B')
if 'Contents' in self.attrs:

View File

@ -136,6 +136,9 @@ def enc(x, codec='ascii'):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
return x.encode(codec, 'xmlcharrefreplace')
def strbbox((x0,y0,x1,y1)):
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
## ObjIdRange
##

View File

@ -1,7 +1,7 @@
# GNUMakefile for test
PYTHON=python
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1
HTMLS= \
simple1.html \
@ -36,14 +36,13 @@ XMLS= \
naacl06-shinyama.xml \
nlp2004slides.xml
all:
all: htmls texts xmls
clean:
-rm $(HTMLS)
-rm $(TEXTS)
-rm $(XMLS)
test: htmls texts xmls
htmls: $(HTMLS)
texts: $(TEXTS)
xmls: $(XMLS)

View File

@ -13,10 +13,10 @@ def main(argv):
def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0])
'[-t text|html|xml|tag] [-O output_dir] [-o output] file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:')
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:O:o:C:D:m:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@ -29,7 +29,7 @@ def main(argv):
# output option
outfile = None
outtype = None
imgdir = None
outdir = None
codec = 'utf-8'
pageno = 1
scale = 1
@ -43,7 +43,7 @@ def main(argv):
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-o': outfile = v
elif k == '-I': imgdir = v
elif k == '-O': outdir = v
elif k == '-s': scale = float(v)
elif k == '-n': laparams = None
elif k == '-D': laparams.direction = v
@ -75,9 +75,9 @@ def main(argv):
if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'xml':
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir)
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir)
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec)
else: