From 83d2086f1983ed14eee640f5d3d6d0a6a114e23c Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 29 Aug 2010 06:39:31 +0000 Subject: [PATCH] fix minor layout issue git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@239 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/layout.py | 46 ++-- pdfminer/pdffont.py | 95 ++++--- samples/jo.html.ref | 340 ++++++++++++------------- samples/nonfree/f1040nr.html.ref | 418 +++++++++++++++---------------- samples/nonfree/kampo.html.ref | 44 ++-- 5 files changed, 490 insertions(+), 453 deletions(-) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index c6d93df..e0f2feb 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import sys from sys import maxint as INF -from utils import apply_matrix_norm, apply_matrix_pt +from utils import apply_matrix_pt from utils import bsearch, bbox2str, matrix2str from pdffont import PDFUnicodeNotDefined @@ -208,36 +208,38 @@ class LTChar(LTItem, LTText): self.matrix = matrix self.font = font self.fontsize = fontsize - self.vertical = font.is_vertical() self.adv = font.char_width(cid) * fontsize * scaling try: text = font.to_unichr(cid) assert isinstance(text, unicode), text except PDFUnicodeNotDefined: text = '?' - (a,b,c,d,e,f) = self.matrix - self.upright = (0 < a*d*scaling and b*c <= 0) LTText.__init__(self, text) # compute the boundary rectangle. 
- if self.vertical: + if self.font.is_vertical(): # vertical - size = font.get_size() * fontsize - displacement = (1000 - font.char_disp(cid)) * fontsize * .001 - (_,displacement) = apply_matrix_norm(self.matrix, (0, displacement)) - (dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv)) - (_,_,_,_,tx,ty) = self.matrix - tx -= dx/2 - ty += displacement + rise - bbox = (tx, ty+dy, tx+dx, ty) + width = font.get_width() * fontsize + (vx,vy) = font.char_disp(cid) + if vx is None: + vx = width/2 + else: + vx = vx * fontsize * .001 + vy = (1000 - vy) * fontsize * .001 + tx = -vx + ty = vy + rise + bll = (tx, ty+self.adv) + bur = (tx+width, ty) else: # horizontal - size = font.get_size() * fontsize + height = font.get_height() * fontsize descent = font.get_descent() * fontsize - (_,descent) = apply_matrix_norm(self.matrix, (0, descent)) - (dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size)) - (_,_,_,_,tx,ty) = self.matrix - ty += descent + rise - bbox = (tx, ty, tx+dx, ty+dy) + ty = descent + rise + bll = (0, ty) + bur = (self.adv, ty+height) + (a,b,c,d,e,f) = self.matrix + self.upright = (0 < a*d*scaling and b*c <= 0) + bbox = (apply_matrix_pt(self.matrix, bll) + + apply_matrix_pt(self.matrix, bur)) LTItem.__init__(self, bbox) return @@ -253,7 +255,7 @@ class LTChar(LTItem, LTText): return max(self.width, self.height) def is_vertical(self): - return self.vertical + return self.font.is_vertical() def is_upright(self): return self.upright @@ -692,12 +694,12 @@ class LTAnalyzer(LTContainer): class LTFigure(LTAnalyzer): def __init__(self, name, bbox, matrix): + self.name = name + self.matrix = matrix (x,y,w,h) = bbox bbox = get_bounds( apply_matrix_pt(matrix, (p,q)) for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) LTAnalyzer.__init__(self, bbox) - self.name = name - self.matrix = matrix return def __repr__(self): diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 10d72db..f872bfd 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -13,7 +13,52
@@ from pdftypes import PDFException, resolve1 from pdftypes import int_value, float_value, num_value from pdftypes import str_value, list_value, dict_value, stream_value from fontmetrics import FONT_METRICS -from utils import apply_matrix_norm, nunpack +from utils import apply_matrix_norm, nunpack, choplist + + +def get_widths(seq): + widths = {} + r = [] + for v in seq: + if isinstance(v, list): + if r: + char1 = r[-1] + for (i,w) in enumerate(v): + widths[char1+i] = w + r = [] + elif isinstance(v, (int, float)): # widths may be real numbers, not only ints + r.append(v) + if len(r) == 3: + (char1,char2,w) = r + for i in xrange(char1, char2+1): + widths[i] = w + r = [] + return widths +#assert get_widths([1]) == {} +#assert get_widths([1,2,3]) == {1:3, 2:3} +#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8} + +def get_widths2(seq): + widths = {} + r = [] + for v in seq: + if isinstance(v, list): + if r: + char1 = r[-1] + for (i,(w,vx,vy)) in enumerate(choplist(3,v)): + widths[char1+i] = (w,(vx,vy)) + r = [] + elif isinstance(v, (int, float)): # metrics may be real numbers, not only ints + r.append(v) + if len(r) == 5: + (char1,char2,w,vx,vy) = r + for i in xrange(char1, char2+1): + widths[i] = (w,(vx,vy)) + r = [] + return widths +#assert get_widths2([1]) == {} +#assert get_widths2([1,2,3,4,5]) == {1:(3,(4,5)), 2:(3,(4,5))} +#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2,(3,4)), 6:(7,(8,9))} ## FontMetricsDB @@ -345,9 +390,6 @@ class PDFFont(object): self.default_width = default_width or descriptor.get('MissingWidth', 0) self.leading = num_value(descriptor.get('Leading', 0)) self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0))) - self.size = self.bbox[3]-self.bbox[1] - if self.size == 0: - self.size = self.ascent - self.descent self.hscale = self.vscale = .001 return @@ -367,8 +409,17 @@ class PDFFont(object): return self.ascent * self.vscale def get_descent(self): return self.descent * self.vscale - def get_size(self): - return self.size * self.vscale + + def get_width(self): + w = self.bbox[2]-self.bbox[0] + if w == 0: + w = 
-self.default_width + return w * self.hscale + def get_height(self): + h = self.bbox[3]-self.bbox[1] + if h == 0: + h = self.ascent - self.descent + return h * self.vscale def char_width(self, cid): return self.widths.get(cid, self.default_width) * self.hscale @@ -522,38 +573,21 @@ class PDFCIDFont(PDFFont): except CMapDB.CMapNotFound, e: pass - def get_width(seq): - dic = {} - char1 = char2 = None - for v in seq: - if char1 is None: - char1 = v - elif char2 is None and isinstance(v, int): - char2 = v - else: - if char2 is None: - for (i,w) in enumerate(v): - dic[char1+i] = w - else: - for i in xrange(char1, char2+1): - dic[i] = v - char1 = char2 = None - return dic self.vertical = self.cmap.is_vertical() if self.vertical: # writing mode: vertical - dic = get_width(list_value(spec.get('W2', []))) - widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() ) - self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() ) - (d,w) = spec.get('DW2', [880, -1000]) + widths = get_widths2(list_value(spec.get('W2', []))) + self.disps = dict( (cid,(vx,vy)) for (cid,(_,(vx,vy))) in widths.iteritems() ) + (vy,w) = spec.get('DW2', [880, -1000]) + self.default_disp = (None,vy) + widths = dict( (cid,w) for (cid,(w,_)) in widths.iteritems() ) default_width = w - self.default_disp = d else: # writing mode: horizontal - widths = get_width(list_value(spec.get('W', []))) self.disps = {} - default_width = spec.get('DW', 1000) self.default_disp = 0 + widths = get_widths(list_value(spec.get('W', []))) + default_width = spec.get('DW', 1000) PDFFont.__init__(self, descriptor, widths, default_width=default_width) return @@ -570,6 +604,7 @@ class PDFCIDFont(PDFFont): return self.cmap.decode(bytes) def char_disp(self, cid): + "Returns an integer for horizontal fonts, a tuple for vertical fonts." 
return self.disps.get(cid, self.default_disp) def to_unichr(self, cid): diff --git a/samples/jo.html.ref b/samples/jo.html.ref index e0bf27f..18fe213 100644 --- a/samples/jo.html.ref +++ b/samples/jo.html.ref @@ -10,38 +10,38 @@   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -193,47 +193,47 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +     @@ -494,49 +494,49 @@ -  -  -  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +  +  +  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -744,60 +744,60 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/samples/nonfree/f1040nr.html.ref b/samples/nonfree/f1040nr.html.ref index 18aeeaf..72032c3 100644 --- a/samples/nonfree/f1040nr.html.ref +++ b/samples/nonfree/f1040nr.html.ref @@ -3885,28 +3885,28 @@ -P -l -e -a -s -e - -p -r -i -n -t - -o -r - -t -y -p -e -. - +P +l +e +a +s +e + +p +r +i +n +t + +o +r + +t +y +p +e +. + @@ -3937,76 +3937,76 @@ -A -t -t -a -c -h - -F -o -r -m -s - -W -- -2 - -h -e -r -e -. -A -l -s -o - -a -t -t -a -c -h - -F -o -r -m -( -s -) - -1 -0 -9 -9 -- -R - -i -f - -t -a -x - -w -a -s - -w -i -t -h -h -e -l -d -. - +A +t +t +a +c +h + +F +o +r +m +s + +W +- +2 + +h +e +r +e +. +A +l +s +o + +a +t +t +a +c +h + +F +o +r +m +( +s +) + +1 +0 +9 +9 +- +R + +i +f + +t +a +x + +w +a +s + +w +i +t +h +h +e +l +d +. + @@ -4049,60 +4049,60 @@ -I -n -c -o -m -e - -E -f -f -e -c -t -i -v -e -l -y - -C -o -n -n -e -c -t -e -d - -W -i -t -h - -U -. -S -. 
- -T -r -a -d -e -/ -B -u -s -i -n -e -s -s - +I +n +c +o +m +e + +E +f +f +e +c +t +i +v +e +l +y + +C +o +n +n +e +c +t +e +d + +W +i +t +h + +U +. +S +. + +T +r +a +d +e +/ +B +u +s +i +n +e +s +s + @@ -4113,70 +4113,70 @@ -E -n -c -l -o -s -e -, - -b -u -t - -d -o - -n -o -t - -a -t -t -a -c -h -, - -a -n -y - -p -a -y -m -e -n -t -. - +E +n +c +l +o +s +e +, + +b +u +t + +d +o + +n +o +t + +a +t +t +a +c +h +, + +a +n +y + +p +a +y +m +e +n +t +. + -A -d -j -u -s -t -e -d - -G -r -o -s -s - -I -n -c -o -m -e - +A +d +j +u +s +t +e +d + +G +r +o +s +s + +I +n +c +o +m +e + diff --git a/samples/nonfree/kampo.html.ref b/samples/nonfree/kampo.html.ref index 19b9532..70de825 100644 --- a/samples/nonfree/kampo.html.ref +++ b/samples/nonfree/kampo.html.ref @@ -285,7 +285,7 @@ - + @@ -316,7 +316,7 @@ - + @@ -326,7 +326,7 @@ - + @@ -335,7 +335,7 @@ - + @@ -350,7 +350,7 @@ - + @@ -364,7 +364,7 @@ - + @@ -932,7 +932,7 @@ - + @@ -1155,7 +1155,7 @@ - + @@ -1513,7 +1513,7 @@ - + @@ -1530,7 +1530,7 @@ - + @@ -1544,7 +1544,7 @@ - + @@ -1840,7 +1840,7 @@ - + @@ -1866,8 +1866,8 @@ - - + + @@ -1893,7 +1893,7 @@ - + @@ -1914,8 +1914,8 @@ - - + + @@ -1941,7 +1941,7 @@ - + @@ -1967,8 +1967,8 @@ - - + + @@ -2289,7 +2289,7 @@ - + @@ -2301,7 +2301,7 @@ - +