diff --git a/Makefile b/Makefile
index 60ededb..c972828 100644
--- a/Makefile
+++ b/Makefile
@@ -33,11 +33,6 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish:
$(CP) docs/*.html $(WEBDIR)
-test:
- cd samples && $(MAKE) test
-test_clean:
- -cd samples && $(MAKE) clean
-
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc
CMAPDST=pdfminer/cmap
@@ -53,3 +48,8 @@ $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
+
+test: cmap
+ cd samples && $(MAKE) all
+test_clean:
+ -cd samples && $(MAKE) clean
diff --git a/docs/index.html b/docs/index.html
index 6c0abd7..aa24ddf 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -19,7 +19,7 @@ Python PDF parser and analyzer
-Last Modified: Sat Jan 30 16:32:50 JST 2010
+Last Modified: Sun Jan 31 10:38:26 JST 2010
@@ -127,9 +127,8 @@ W o r l d
For East Asian languages
In order to handle East Asian languages (Chinese or Japanese, etc.),
-you need to install an additional data called CMap
,
-which is originally distributed by Adobe. CMap is now included
-in the pdfminer package, but not installed by default.
+additional data called CMap
is required.
+CMap files are not installed by default.
Here is the additional step you need:
@@ -347,7 +346,7 @@ no stream header is displayed for the ease of saving it to a file.
Changes
- 2010/01/30: JPEG image extraction supported.
+ 2010/01/31: JPEG image extraction supported. Page rotation bug fixed.
2010/01/04: Python 2.6 warning removal. More doctest conversion.
2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 4c3fcbf..3d6093f 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -6,8 +6,8 @@ from pdftypes import LITERALS_DCT_DECODE
from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine
-from utils import enc
from utils import apply_matrix_pt, mult_matrix
+from utils import enc, strbbox
## TagExtractor
@@ -38,10 +38,8 @@ class TagExtractor(PDFDevice):
return
def begin_page(self, page, ctm):
- (x0, y0, x1, y1) = page.mediabox
- bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
self.outfp.write('' %
- (self.pageno, bbox, page.rotate))
+ (self.pageno, strbbox(page.mediabox), page.rotate))
return
def end_page(self, page):
@@ -177,9 +175,9 @@ class PDFConverter(PDFPageAggregator):
##
class XMLConverter(PDFConverter):
- def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None):
+ def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
- self.imgdir = imgdir
+ self.outdir = outdir
self.outfp.write('\n' % codec)
self.outfp.write('\n')
return
@@ -190,7 +188,7 @@ class XMLConverter(PDFConverter):
else:
return None
name = image.name+ext
- path = os.path.join(self.imgdir, name)
+ path = os.path.join(self.outdir, name)
fp = file(path, 'wb')
fp.write(image.data)
fp.close()
@@ -200,42 +198,42 @@ class XMLConverter(PDFConverter):
def render(item):
if isinstance(item, LTPage):
self.outfp.write('\n' %
- (item.id, item.get_bbox(), item.rotate))
+ (item.id, strbbox(item.bbox), item.rotate))
for child in item:
render(child)
self.outfp.write(' \n')
elif isinstance(item, LTLine) and item.direction:
- self.outfp.write(' \n' % (item.linewidth, item.direction, item.get_bbox()))
+ self.outfp.write(' \n' % (item.linewidth, item.direction, strbbox(item.bbox)))
elif isinstance(item, LTRect):
- self.outfp.write(' \n' % (item.linewidth, item.get_bbox()))
+ self.outfp.write(' \n' % (item.linewidth, strbbox(item.bbox)))
elif isinstance(item, LTPolygon):
- self.outfp.write(' \n' % (item.linewidth, item.get_bbox(), item.get_pts()))
+ self.outfp.write(' \n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure):
- self.outfp.write('\n' % (item.id, item.get_bbox()))
+ self.outfp.write('\n' % (item.id, strbbox(item.bbox)))
for child in item:
render(child)
self.outfp.write(' \n')
elif isinstance(item, LTTextLine):
- self.outfp.write('\n' % (item.get_bbox()))
+ self.outfp.write('\n' % strbbox(item.bbox))
for child in item:
render(child)
self.outfp.write(' \n')
elif isinstance(item, LTTextBox):
- self.outfp.write('\n' % (item.id, item.get_bbox()))
+ self.outfp.write('\n' % (item.id, strbbox(item.bbox)))
for child in item:
render(child)
self.outfp.write(' \n')
elif isinstance(item, LTTextItem):
self.outfp.write('' %
(enc(item.font.fontname), item.is_vertical(),
- item.get_bbox(), item.fontsize))
+ strbbox(item.bbox), item.fontsize))
self.write(item.text)
self.outfp.write(' \n')
elif isinstance(item, LTText):
self.outfp.write('%s \n' % item.text)
elif isinstance(item, LTImage):
x = ''
- if self.imgdir:
+ if self.outdir:
name = self.write_image(item)
if name:
x = 'name="%s" ' % enc(name)
@@ -257,11 +255,11 @@ class XMLConverter(PDFConverter):
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
- scale=1, showpageno=True, pagepad=50, imgdir=None):
+ scale=1, showpageno=True, pagepad=50, outdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno
self.pagepad = pagepad
- self.imgdir = imgdir
+ self.outdir = outdir
self.scale = scale
self.outfp.write('\n')
self.outfp.write(' \n' %
@@ -282,16 +280,15 @@ class HTMLConverter(PDFConverter):
else:
return
name = image.name+ext
- path = os.path.join(self.imgdir, name)
+ path = os.path.join(self.outdir, name)
fp = file(path, 'wb')
fp.write(image.data)
fp.close()
- (x0,y0,x1,y1) = image.dstbbox
self.outfp.write(' \n' %
(enc(name),
- x0*self.scale, (self.yoffset-y1)*self.scale,
- (x1-x0)*self.scale, (y1-y0)*self.scale))
+ image.x0*self.scale, (self.yoffset-image.y1)*self.scale,
+ image.width*self.scale, image.height*self.scale))
return
def end_page(self, page):
@@ -332,7 +329,7 @@ class HTMLConverter(PDFConverter):
for child in item:
render(child)
elif isinstance(item, LTImage):
- if self.imgdir:
+ if self.outdir:
self.write_image(item)
return
page = PDFConverter.end_page(self, page)
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index 02d1cb8..91a4a71 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -1,9 +1,8 @@
#!/usr/bin/env python
import sys
from sys import maxint as INF
-from utils import apply_matrix_norm
-from utils import apply_matrix_pt
-from utils import bsearch
+from utils import apply_matrix_norm, apply_matrix_pt
+from utils import bsearch, strbbox
@@ -137,7 +136,7 @@ class LayoutItem(object):
return
def __repr__(self):
- return ('- ' % (self.get_bbox()))
+ return ('
- ' % strbbox(self.bbox))
def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0)
@@ -148,11 +147,9 @@ class LayoutItem(object):
self.y1 = y1
self.width = x1-x0
self.height = y1-y0
+ self.bbox = (x0, y0, x1, y1)
return
- def get_bbox(self):
- return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
-
def is_hoverlap(self, obj):
assert isinstance(obj, LayoutItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
@@ -206,7 +203,7 @@ class LayoutContainer(LayoutItem):
return
def __repr__(self):
- return ('
' % (self.get_bbox()))
+ return ('' % strbbox(self.bbox))
def __iter__(self):
return iter(self.objs)
@@ -285,13 +282,13 @@ class LTRect(LTPolygon):
## LTImage
##
-class LTImage(object):
+class LTImage(LayoutItem):
- def __init__(self, name, type, srcsize, dstbbox, data):
+ def __init__(self, name, type, srcsize, bbox, data):
+ LayoutItem.__init__(self, bbox)
self.name = name
self.type = type
self.srcsize = srcsize
- self.dstbbox = dstbbox
self.data = data
return
@@ -370,7 +367,7 @@ class LTTextItem(LayoutItem, LTText):
if self.debug:
return ('' %
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
- self.font, self.fontsize, self.get_bbox(),
+ self.font, self.fontsize, strbbox(self.bbox),
'(%.1f, %.1f)' % self.adv,
self.text))
else:
@@ -400,7 +397,7 @@ class LTFigure(LayoutContainer):
return
def __repr__(self):
- return ('' % (self.id, self.get_bbox(), self.matrix))
+ return ('' % (self.id, strbbox(self.bbox), self.matrix))
## LTTextLine
@@ -414,7 +411,7 @@ class LTTextLine(LayoutContainer):
return
def __repr__(self):
- return ('' % (self.get_bbox(), self.direction))
+ return ('' % (strbbox(self.bbox), self.direction))
def get_margin(self):
return min(self.width, self.height)
@@ -464,7 +461,7 @@ class LTTextBox(LayoutContainer):
return
def __repr__(self):
- return ('' % (self.get_bbox(), self.direction, self.get_text()[:20]))
+ return ('' % (strbbox(self.bbox), self.direction, self.get_text()[:20]))
def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
@@ -520,7 +517,7 @@ class LTPage(LayoutContainer):
return
def __repr__(self):
- return ('' % (self.id, self.get_bbox(), self.rotate))
+ return ('' % (self.id, strbbox(self.bbox), self.rotate))
def analyze_layout(self, laparams):
textobjs = []
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index f6647fe..e6bdf16 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -707,7 +707,7 @@ class PDFPageInterpreter(object):
elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1)
elif page.rotate == 270:
- ctm = (0,1,-1,0, x0,-y1)
+ ctm = (0,1,-1,0, y1,-x0)
else:
ctm = (1,0,0,1, -x0,-y0)
self.device.begin_page(page, ctm)
diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py
index 730cf45..7012df7 100644
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@@ -247,7 +247,7 @@ class PDFPage(object):
self.cropbox = resolve1(self.attrs['CropBox'])
else:
self.cropbox = self.mediabox
- self.rotate = self.attrs.get('Rotate', 0)
+ self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B')
if 'Contents' in self.attrs:
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index 5a9a918..06e9785 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -136,6 +136,9 @@ def enc(x, codec='ascii'):
x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"')
return x.encode(codec, 'xmlcharrefreplace')
+def strbbox((x0,y0,x1,y1)):
+ return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
+
## ObjIdRange
##
diff --git a/samples/Makefile b/samples/Makefile
index 976a5ae..fe7f4dc 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,7 +1,7 @@
# GNUMakefile for test
PYTHON=python
-PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py
+PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1
HTMLS= \
simple1.html \
@@ -36,14 +36,13 @@ XMLS= \
naacl06-shinyama.xml \
nlp2004slides.xml
-all:
+all: htmls texts xmls
clean:
-rm $(HTMLS)
-rm $(TEXTS)
-rm $(XMLS)
-test: htmls texts xmls
htmls: $(HTMLS)
texts: $(TEXTS)
xmls: $(XMLS)
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index 7e80f09..935bb24 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -13,10 +13,10 @@ def main(argv):
def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
- '[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0])
+ '[-t text|html|xml|tag] [-O output_dir] [-o output] file ...' % argv[0])
return 100
try:
- (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:')
+ (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:O:o:C:D:m:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@@ -29,7 +29,7 @@ def main(argv):
# output option
outfile = None
outtype = None
- imgdir = None
+ outdir = None
codec = 'utf-8'
pageno = 1
scale = 1
@@ -43,7 +43,7 @@ def main(argv):
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-o': outfile = v
- elif k == '-I': imgdir = v
+ elif k == '-O': outdir = v
elif k == '-s': scale = float(v)
elif k == '-n': laparams = None
elif k == '-D': laparams.direction = v
@@ -75,9 +75,9 @@ def main(argv):
if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'xml':
- device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir)
+ device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
elif outtype == 'html':
- device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir)
+ device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec)
else: