Page rotation bug fixed.
Various minor fixes. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@176 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
7969feeae1
commit
0f8fe3f19e
10
Makefile
10
Makefile
|
@ -33,11 +33,6 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
|
|||
publish:
|
||||
$(CP) docs/*.html $(WEBDIR)
|
||||
|
||||
test:
|
||||
cd samples && $(MAKE) test
|
||||
test_clean:
|
||||
-cd samples && $(MAKE) clean
|
||||
|
||||
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
|
||||
CMAPSRC=cmaprsrc
|
||||
CMAPDST=pdfminer/cmap
|
||||
|
@ -53,3 +48,8 @@ $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
|
|||
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
|
||||
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
|
||||
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
||||
|
||||
test: cmap
|
||||
cd samples && $(MAKE) all
|
||||
test_clean:
|
||||
-cd samples && $(MAKE) clean
|
||||
|
|
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sat Jan 30 16:32:50 JST 2010
|
||||
Last Modified: Sun Jan 31 10:38:26 JST 2010
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -127,9 +127,8 @@ W o r l d
|
|||
<a name="cmap"></a>
|
||||
<h3>For East Asian languages</h3>
|
||||
In order to handle East Asian languages (Chinese or Japanese, etc.),
|
||||
you need to install an additional data called <code>CMap</code>,
|
||||
which is originally distributed by Adobe. CMap is now included
|
||||
in the pdfminer package, but not installed by default.
|
||||
an additional data called <code>CMap</code> is required.
|
||||
CMap files are not installed by default.
|
||||
<p>
|
||||
Here is the additional step you need:
|
||||
<blockquote><pre>
|
||||
|
@ -347,7 +346,7 @@ no stream header is displayed for the ease of saving it to a file.
|
|||
<hr noshade>
|
||||
<h2>Changes</h2>
|
||||
<ul>
|
||||
<li> 2010/01/30: JPEG image extraction supported.
|
||||
<li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed.
|
||||
<li> 2010/01/04: Python 2.6 warning removal. More doctest conversion.
|
||||
<li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
|
||||
<li> 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.
|
||||
|
|
|
@ -6,8 +6,8 @@ from pdftypes import LITERALS_DCT_DECODE
|
|||
from layout import LayoutContainer
|
||||
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
|
||||
from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine
|
||||
from utils import enc
|
||||
from utils import apply_matrix_pt, mult_matrix
|
||||
from utils import enc, strbbox
|
||||
|
||||
|
||||
## TagExtractor
|
||||
|
@ -38,10 +38,8 @@ class TagExtractor(PDFDevice):
|
|||
return
|
||||
|
||||
def begin_page(self, page, ctm):
|
||||
(x0, y0, x1, y1) = page.mediabox
|
||||
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
||||
(self.pageno, bbox, page.rotate))
|
||||
(self.pageno, strbbox(page.mediabox), page.rotate))
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
|
@ -177,9 +175,9 @@ class PDFConverter(PDFPageAggregator):
|
|||
##
|
||||
class XMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None):
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.imgdir = imgdir
|
||||
self.outdir = outdir
|
||||
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
|
||||
self.outfp.write('<pages>\n')
|
||||
return
|
||||
|
@ -190,7 +188,7 @@ class XMLConverter(PDFConverter):
|
|||
else:
|
||||
return None
|
||||
name = image.name+ext
|
||||
path = os.path.join(self.imgdir, name)
|
||||
path = os.path.join(self.outdir, name)
|
||||
fp = file(path, 'wb')
|
||||
fp.write(image.data)
|
||||
fp.close()
|
||||
|
@ -200,42 +198,42 @@ class XMLConverter(PDFConverter):
|
|||
def render(item):
|
||||
if isinstance(item, LTPage):
|
||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||
(item.id, item.get_bbox(), item.rotate))
|
||||
(item.id, strbbox(item.bbox), item.rotate))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</page>\n')
|
||||
elif isinstance(item, LTLine) and item.direction:
|
||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, item.get_bbox()))
|
||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox)))
|
||||
elif isinstance(item, LTRect):
|
||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, item.get_bbox()))
|
||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox)))
|
||||
elif isinstance(item, LTPolygon):
|
||||
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, item.get_bbox(), item.get_pts()))
|
||||
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
|
||||
elif isinstance(item, LTFigure):
|
||||
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</figure>\n')
|
||||
elif isinstance(item, LTTextLine):
|
||||
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
|
||||
self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</textline>\n')
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</textbox>\n')
|
||||
elif isinstance(item, LTTextItem):
|
||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||
(enc(item.font.fontname), item.is_vertical(),
|
||||
item.get_bbox(), item.fontsize))
|
||||
strbbox(item.bbox), item.fontsize))
|
||||
self.write(item.text)
|
||||
self.outfp.write('</text>\n')
|
||||
elif isinstance(item, LTText):
|
||||
self.outfp.write('<text>%s</text>\n' % item.text)
|
||||
elif isinstance(item, LTImage):
|
||||
x = ''
|
||||
if self.imgdir:
|
||||
if self.outdir:
|
||||
name = self.write_image(item)
|
||||
if name:
|
||||
x = 'name="%s" ' % enc(name)
|
||||
|
@ -257,11 +255,11 @@ class XMLConverter(PDFConverter):
|
|||
class HTMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
scale=1, showpageno=True, pagepad=50, imgdir=None):
|
||||
scale=1, showpageno=True, pagepad=50, outdir=None):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.showpageno = showpageno
|
||||
self.pagepad = pagepad
|
||||
self.imgdir = imgdir
|
||||
self.outdir = outdir
|
||||
self.scale = scale
|
||||
self.outfp.write('<html><head>\n')
|
||||
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
|
||||
|
@ -282,16 +280,15 @@ class HTMLConverter(PDFConverter):
|
|||
else:
|
||||
return
|
||||
name = image.name+ext
|
||||
path = os.path.join(self.imgdir, name)
|
||||
path = os.path.join(self.outdir, name)
|
||||
fp = file(path, 'wb')
|
||||
fp.write(image.data)
|
||||
fp.close()
|
||||
(x0,y0,x1,y1) = image.dstbbox
|
||||
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
|
||||
'width="%d" height="%d" />\n' %
|
||||
(enc(name),
|
||||
x0*self.scale, (self.yoffset-y1)*self.scale,
|
||||
(x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||
image.x0*self.scale, (self.yoffset-image.y1)*self.scale,
|
||||
image.width*self.scale, image.height*self.scale))
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
|
@ -332,7 +329,7 @@ class HTMLConverter(PDFConverter):
|
|||
for child in item:
|
||||
render(child)
|
||||
elif isinstance(item, LTImage):
|
||||
if self.imgdir:
|
||||
if self.outdir:
|
||||
self.write_image(item)
|
||||
return
|
||||
page = PDFConverter.end_page(self, page)
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from sys import maxint as INF
|
||||
from utils import apply_matrix_norm
|
||||
from utils import apply_matrix_pt
|
||||
from utils import bsearch
|
||||
from utils import apply_matrix_norm, apply_matrix_pt
|
||||
from utils import bsearch, strbbox
|
||||
|
||||
|
||||
|
||||
|
@ -137,7 +136,7 @@ class LayoutItem(object):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<item bbox=%s>' % (self.get_bbox()))
|
||||
return ('<item bbox=%s>' % strbbox(self.bbox))
|
||||
|
||||
def set_bbox(self, (x0,y0,x1,y1)):
|
||||
if x1 < x0: (x0,x1) = (x1,x0)
|
||||
|
@ -148,11 +147,9 @@ class LayoutItem(object):
|
|||
self.y1 = y1
|
||||
self.width = x1-x0
|
||||
self.height = y1-y0
|
||||
self.bbox = (x0, y0, x1, y1)
|
||||
return
|
||||
|
||||
def get_bbox(self):
|
||||
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
|
||||
|
||||
def is_hoverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
|
||||
|
@ -206,7 +203,7 @@ class LayoutContainer(LayoutItem):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<group %s>' % (self.get_bbox()))
|
||||
return ('<group %s>' % strbbox(self.bbox))
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.objs)
|
||||
|
@ -285,13 +282,13 @@ class LTRect(LTPolygon):
|
|||
|
||||
## LTImage
|
||||
##
|
||||
class LTImage(object):
|
||||
class LTImage(LayoutItem):
|
||||
|
||||
def __init__(self, name, type, srcsize, dstbbox, data):
|
||||
def __init__(self, name, type, srcsize, bbox, data):
|
||||
LayoutItem.__init__(self, bbox)
|
||||
self.name = name
|
||||
self.type = type
|
||||
self.srcsize = srcsize
|
||||
self.dstbbox = dstbbox
|
||||
self.data = data
|
||||
return
|
||||
|
||||
|
@ -370,7 +367,7 @@ class LTTextItem(LayoutItem, LTText):
|
|||
if self.debug:
|
||||
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
||||
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
|
||||
self.font, self.fontsize, self.get_bbox(),
|
||||
self.font, self.fontsize, strbbox(self.bbox),
|
||||
'(%.1f, %.1f)' % self.adv,
|
||||
self.text))
|
||||
else:
|
||||
|
@ -400,7 +397,7 @@ class LTFigure(LayoutContainer):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
|
||||
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, strbbox(self.bbox), self.matrix))
|
||||
|
||||
|
||||
## LTTextLine
|
||||
|
@ -414,7 +411,7 @@ class LTTextLine(LayoutContainer):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<textline %s(%s)>' % (self.get_bbox(), self.direction))
|
||||
return ('<textline %s(%s)>' % (strbbox(self.bbox), self.direction))
|
||||
|
||||
def get_margin(self):
|
||||
return min(self.width, self.height)
|
||||
|
@ -464,7 +461,7 @@ class LTTextBox(LayoutContainer):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
|
||||
return ('<textbox %s(%s) %r...>' % (strbbox(self.bbox), self.direction, self.get_text()[:20]))
|
||||
|
||||
def get_text(self):
|
||||
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
||||
|
@ -520,7 +517,7 @@ class LTPage(LayoutContainer):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
|
||||
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, strbbox(self.bbox), self.rotate))
|
||||
|
||||
def analyze_layout(self, laparams):
|
||||
textobjs = []
|
||||
|
|
|
@ -707,7 +707,7 @@ class PDFPageInterpreter(object):
|
|||
elif page.rotate == 180:
|
||||
ctm = (-1,0,0,-1, x1,y1)
|
||||
elif page.rotate == 270:
|
||||
ctm = (0,1,-1,0, x0,-y1)
|
||||
ctm = (0,1,-1,0, y1,-x0)
|
||||
else:
|
||||
ctm = (1,0,0,1, -x0,-y0)
|
||||
self.device.begin_page(page, ctm)
|
||||
|
|
|
@ -247,7 +247,7 @@ class PDFPage(object):
|
|||
self.cropbox = resolve1(self.attrs['CropBox'])
|
||||
else:
|
||||
self.cropbox = self.mediabox
|
||||
self.rotate = self.attrs.get('Rotate', 0)
|
||||
self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
|
||||
self.annots = self.attrs.get('Annots')
|
||||
self.beads = self.attrs.get('B')
|
||||
if 'Contents' in self.attrs:
|
||||
|
|
|
@ -136,6 +136,9 @@ def enc(x, codec='ascii'):
|
|||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
||||
return x.encode(codec, 'xmlcharrefreplace')
|
||||
|
||||
def strbbox((x0,y0,x1,y1)):
|
||||
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
||||
|
||||
|
||||
## ObjIdRange
|
||||
##
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# GNUMakefile for test
|
||||
|
||||
PYTHON=python
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1
|
||||
|
||||
HTMLS= \
|
||||
simple1.html \
|
||||
|
@ -36,14 +36,13 @@ XMLS= \
|
|||
naacl06-shinyama.xml \
|
||||
nlp2004slides.xml
|
||||
|
||||
all:
|
||||
all: htmls texts xmls
|
||||
|
||||
clean:
|
||||
-rm $(HTMLS)
|
||||
-rm $(TEXTS)
|
||||
-rm $(XMLS)
|
||||
|
||||
test: htmls texts xmls
|
||||
htmls: $(HTMLS)
|
||||
texts: $(TEXTS)
|
||||
xmls: $(XMLS)
|
||||
|
|
|
@ -13,10 +13,10 @@ def main(argv):
|
|||
def usage():
|
||||
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
||||
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
|
||||
'[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0])
|
||||
'[-t text|html|xml|tag] [-O output_dir] [-o output] file ...' % argv[0])
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:')
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:O:o:C:D:m:')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
|
@ -29,7 +29,7 @@ def main(argv):
|
|||
# output option
|
||||
outfile = None
|
||||
outtype = None
|
||||
imgdir = None
|
||||
outdir = None
|
||||
codec = 'utf-8'
|
||||
pageno = 1
|
||||
scale = 1
|
||||
|
@ -43,7 +43,7 @@ def main(argv):
|
|||
elif k == '-t': outtype = v
|
||||
elif k == '-c': codec = v
|
||||
elif k == '-o': outfile = v
|
||||
elif k == '-I': imgdir = v
|
||||
elif k == '-O': outdir = v
|
||||
elif k == '-s': scale = float(v)
|
||||
elif k == '-n': laparams = None
|
||||
elif k == '-D': laparams.direction = v
|
||||
|
@ -75,9 +75,9 @@ def main(argv):
|
|||
if outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||
elif outtype == 'xml':
|
||||
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir)
|
||||
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir)
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue