Page rotation bug fixed.

Various minor fixes.


git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@176 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-01-31 02:09:28 +00:00
parent 7969feeae1
commit 0f8fe3f19e
9 changed files with 55 additions and 60 deletions

View File

@ -33,11 +33,6 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish: publish:
$(CP) docs/*.html $(WEBDIR) $(CP) docs/*.html $(WEBDIR)
test:
cd samples && $(MAKE) test
test_clean:
-cd samples && $(MAKE) clean
CONV_CMAP=$(PYTHON) tools/conv_cmap.py CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc CMAPSRC=cmaprsrc
CMAPDST=pdfminer/cmap CMAPDST=pdfminer/cmap
@ -53,3 +48,8 @@ $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp $(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py: $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr $(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
test: cmap
cd samples && $(MAKE) all
test_clean:
-cd samples && $(MAKE) clean

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sat Jan 30 16:32:50 JST 2010 Last Modified: Sun Jan 31 10:38:26 JST 2010
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -127,9 +127,8 @@ W o r l d
<a name="cmap"></a> <a name="cmap"></a>
<h3>For East Asian languages</h3> <h3>For East Asian languages</h3>
In order to handle East Asian languages (Chinese or Japanese, etc.), In order to handle East Asian languages (Chinese or Japanese, etc.),
you need to install an additional data called <code>CMap</code>, an additional data called <code>CMap</code> is required.
which is originally distributed by Adobe. CMap is now included CMap files are not installed by default.
in the pdfminer package, but not installed by default.
<p> <p>
Here is the additional step you need: Here is the additional step you need:
<blockquote><pre> <blockquote><pre>
@ -347,7 +346,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2010/01/30: JPEG image extraction supported. <li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed.
<li> 2010/01/04: Python 2.6 warning removal. More doctest conversion. <li> 2010/01/04: Python 2.6 warning removal. More doctest conversion.
<li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert. <li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
<li> 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger. <li> 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.

View File

@ -6,8 +6,8 @@ from pdftypes import LITERALS_DCT_DECODE
from layout import LayoutContainer from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine
from utils import enc
from utils import apply_matrix_pt, mult_matrix from utils import apply_matrix_pt, mult_matrix
from utils import enc, strbbox
## TagExtractor ## TagExtractor
@ -38,10 +38,8 @@ class TagExtractor(PDFDevice):
return return
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
(x0, y0, x1, y1) = page.mediabox
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
(self.pageno, bbox, page.rotate)) (self.pageno, strbbox(page.mediabox), page.rotate))
return return
def end_page(self, page): def end_page(self, page):
@ -177,9 +175,9 @@ class PDFConverter(PDFPageAggregator):
## ##
class XMLConverter(PDFConverter): class XMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None): def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.imgdir = imgdir self.outdir = outdir
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec) self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
self.outfp.write('<pages>\n') self.outfp.write('<pages>\n')
return return
@ -190,7 +188,7 @@ class XMLConverter(PDFConverter):
else: else:
return None return None
name = image.name+ext name = image.name+ext
path = os.path.join(self.imgdir, name) path = os.path.join(self.outdir, name)
fp = file(path, 'wb') fp = file(path, 'wb')
fp.write(image.data) fp.write(image.data)
fp.close() fp.close()
@ -200,42 +198,42 @@ class XMLConverter(PDFConverter):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, item.get_bbox(), item.rotate)) (item.id, strbbox(item.bbox), item.rotate))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</page>\n') self.outfp.write('</page>\n')
elif isinstance(item, LTLine) and item.direction: elif isinstance(item, LTLine) and item.direction:
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, item.get_bbox())) self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox)))
elif isinstance(item, LTRect): elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, item.get_bbox())) self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox)))
elif isinstance(item, LTPolygon): elif isinstance(item, LTPolygon):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, item.get_bbox(), item.get_pts())) self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</figure>\n') self.outfp.write('</figure>\n')
elif isinstance(item, LTTextLine): elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox())) self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</textline>\n') self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</textbox>\n') self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextItem): elif isinstance(item, LTTextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' % self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname), item.is_vertical(), (enc(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize)) strbbox(item.bbox), item.fontsize))
self.write(item.text) self.write(item.text)
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.text) self.outfp.write('<text>%s</text>\n' % item.text)
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
x = '' x = ''
if self.imgdir: if self.outdir:
name = self.write_image(item) name = self.write_image(item)
if name: if name:
x = 'name="%s" ' % enc(name) x = 'name="%s" ' % enc(name)
@ -257,11 +255,11 @@ class XMLConverter(PDFConverter):
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, showpageno=True, pagepad=50, imgdir=None): scale=1, showpageno=True, pagepad=50, outdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno self.showpageno = showpageno
self.pagepad = pagepad self.pagepad = pagepad
self.imgdir = imgdir self.outdir = outdir
self.scale = scale self.scale = scale
self.outfp.write('<html><head>\n') self.outfp.write('<html><head>\n')
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
@ -282,16 +280,15 @@ class HTMLConverter(PDFConverter):
else: else:
return return
name = image.name+ext name = image.name+ext
path = os.path.join(self.imgdir, name) path = os.path.join(self.outdir, name)
fp = file(path, 'wb') fp = file(path, 'wb')
fp.write(image.data) fp.write(image.data)
fp.close() fp.close()
(x0,y0,x1,y1) = image.dstbbox
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" ' self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
'width="%d" height="%d" />\n' % 'width="%d" height="%d" />\n' %
(enc(name), (enc(name),
x0*self.scale, (self.yoffset-y1)*self.scale, image.x0*self.scale, (self.yoffset-image.y1)*self.scale,
(x1-x0)*self.scale, (y1-y0)*self.scale)) image.width*self.scale, image.height*self.scale))
return return
def end_page(self, page): def end_page(self, page):
@ -332,7 +329,7 @@ class HTMLConverter(PDFConverter):
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
if self.imgdir: if self.outdir:
self.write_image(item) self.write_image(item)
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)

View File

@ -1,9 +1,8 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from sys import maxint as INF from sys import maxint as INF
from utils import apply_matrix_norm from utils import apply_matrix_norm, apply_matrix_pt
from utils import apply_matrix_pt from utils import bsearch, strbbox
from utils import bsearch
@ -137,7 +136,7 @@ class LayoutItem(object):
return return
def __repr__(self): def __repr__(self):
return ('<item bbox=%s>' % (self.get_bbox())) return ('<item bbox=%s>' % strbbox(self.bbox))
def set_bbox(self, (x0,y0,x1,y1)): def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0) if x1 < x0: (x0,x1) = (x1,x0)
@ -148,11 +147,9 @@ class LayoutItem(object):
self.y1 = y1 self.y1 = y1
self.width = x1-x0 self.width = x1-x0
self.height = y1-y0 self.height = y1-y0
self.bbox = (x0, y0, x1, y1)
return return
def get_bbox(self):
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
def is_hoverlap(self, obj): def is_hoverlap(self, obj):
assert isinstance(obj, LayoutItem) assert isinstance(obj, LayoutItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0: if self.x1 <= obj.x0 or obj.x1 <= self.x0:
@ -206,7 +203,7 @@ class LayoutContainer(LayoutItem):
return return
def __repr__(self): def __repr__(self):
return ('<group %s>' % (self.get_bbox())) return ('<group %s>' % strbbox(self.bbox))
def __iter__(self): def __iter__(self):
return iter(self.objs) return iter(self.objs)
@ -285,13 +282,13 @@ class LTRect(LTPolygon):
## LTImage ## LTImage
## ##
class LTImage(object): class LTImage(LayoutItem):
def __init__(self, name, type, srcsize, dstbbox, data): def __init__(self, name, type, srcsize, bbox, data):
LayoutItem.__init__(self, bbox)
self.name = name self.name = name
self.type = type self.type = type
self.srcsize = srcsize self.srcsize = srcsize
self.dstbbox = dstbbox
self.data = data self.data = data
return return
@ -370,7 +367,7 @@ class LTTextItem(LayoutItem, LTText):
if self.debug: if self.debug:
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' % return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix, ('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
self.font, self.fontsize, self.get_bbox(), self.font, self.fontsize, strbbox(self.bbox),
'(%.1f, %.1f)' % self.adv, '(%.1f, %.1f)' % self.adv,
self.text)) self.text))
else: else:
@ -400,7 +397,7 @@ class LTFigure(LayoutContainer):
return return
def __repr__(self): def __repr__(self):
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix)) return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, strbbox(self.bbox), self.matrix))
## LTTextLine ## LTTextLine
@ -414,7 +411,7 @@ class LTTextLine(LayoutContainer):
return return
def __repr__(self): def __repr__(self):
return ('<textline %s(%s)>' % (self.get_bbox(), self.direction)) return ('<textline %s(%s)>' % (strbbox(self.bbox), self.direction))
def get_margin(self): def get_margin(self):
return min(self.width, self.height) return min(self.width, self.height)
@ -464,7 +461,7 @@ class LTTextBox(LayoutContainer):
return return
def __repr__(self): def __repr__(self):
return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20])) return ('<textbox %s(%s) %r...>' % (strbbox(self.bbox), self.direction, self.get_text()[:20]))
def get_text(self): def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) ) return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
@ -520,7 +517,7 @@ class LTPage(LayoutContainer):
return return
def __repr__(self): def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate)) return ('<page id=%r bbox=%s rotate=%r>' % (self.id, strbbox(self.bbox), self.rotate))
def analyze_layout(self, laparams): def analyze_layout(self, laparams):
textobjs = [] textobjs = []

View File

@ -707,7 +707,7 @@ class PDFPageInterpreter(object):
elif page.rotate == 180: elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1) ctm = (-1,0,0,-1, x1,y1)
elif page.rotate == 270: elif page.rotate == 270:
ctm = (0,1,-1,0, x0,-y1) ctm = (0,1,-1,0, y1,-x0)
else: else:
ctm = (1,0,0,1, -x0,-y0) ctm = (1,0,0,1, -x0,-y0)
self.device.begin_page(page, ctm) self.device.begin_page(page, ctm)

View File

@ -247,7 +247,7 @@ class PDFPage(object):
self.cropbox = resolve1(self.attrs['CropBox']) self.cropbox = resolve1(self.attrs['CropBox'])
else: else:
self.cropbox = self.mediabox self.cropbox = self.mediabox
self.rotate = self.attrs.get('Rotate', 0) self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
self.annots = self.attrs.get('Annots') self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B') self.beads = self.attrs.get('B')
if 'Contents' in self.attrs: if 'Contents' in self.attrs:

View File

@ -136,6 +136,9 @@ def enc(x, codec='ascii'):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
return x.encode(codec, 'xmlcharrefreplace') return x.encode(codec, 'xmlcharrefreplace')
def strbbox((x0,y0,x1,y1)):
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
## ObjIdRange ## ObjIdRange
## ##

View File

@ -1,7 +1,7 @@
# GNUMakefile for test # GNUMakefile for test
PYTHON=python PYTHON=python
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1
HTMLS= \ HTMLS= \
simple1.html \ simple1.html \
@ -36,14 +36,13 @@ XMLS= \
naacl06-shinyama.xml \ naacl06-shinyama.xml \
nlp2004slides.xml nlp2004slides.xml
all: all: htmls texts xmls
clean: clean:
-rm $(HTMLS) -rm $(HTMLS)
-rm $(TEXTS) -rm $(TEXTS)
-rm $(XMLS) -rm $(XMLS)
test: htmls texts xmls
htmls: $(HTMLS) htmls: $(HTMLS)
texts: $(TEXTS) texts: $(TEXTS)
xmls: $(XMLS) xmls: $(XMLS)

View File

@ -13,10 +13,10 @@ def main(argv):
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] ' print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] ' '[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0]) '[-t text|html|xml|tag] [-O output_dir] [-o output] file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:O:o:C:D:m:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@ -29,7 +29,7 @@ def main(argv):
# output option # output option
outfile = None outfile = None
outtype = None outtype = None
imgdir = None outdir = None
codec = 'utf-8' codec = 'utf-8'
pageno = 1 pageno = 1
scale = 1 scale = 1
@ -43,7 +43,7 @@ def main(argv):
elif k == '-t': outtype = v elif k == '-t': outtype = v
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-o': outfile = v elif k == '-o': outfile = v
elif k == '-I': imgdir = v elif k == '-O': outdir = v
elif k == '-s': scale = float(v) elif k == '-s': scale = float(v)
elif k == '-n': laparams = None elif k == '-n': laparams = None
elif k == '-D': laparams.direction = v elif k == '-D': laparams.direction = v
@ -75,9 +75,9 @@ def main(argv):
if outtype == 'text': if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams) device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'xml': elif outtype == 'xml':
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir) device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir) device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec) device = TagExtractor(rsrc, outfp, codec=codec)
else: else: