diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 6b609a5..b4d8a76 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -233,7 +233,7 @@ class HTMLConverter(PDFConverter): return page = PDFConverter.end_page(self, page) render(page) - if page.layout: + if self.debug and page.layout: def show_layout(item): if isinstance(item, LTTextGroup): self.write_rect('red', 1, item.x0, item.y1, item.width, item.height) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index bdae9f7..bed7152 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -237,7 +237,7 @@ class LTAnon(LTText): ## class LTChar(LayoutItem, LTText): - debug = 1 + debug = 0 def __init__(self, matrix, font, fontsize, scaling, cid): self.matrix = matrix @@ -418,7 +418,7 @@ class LTTextGroupLRTB(LTTextGroup): def __init__(self, objs): LTTextGroup.__init__(self, objs) # reorder the objects from top-left to bottom-right. - self.objs = csort(self.objs, key=lambda obj: obj.x0-obj.y1) + self.objs = csort(self.objs, key=lambda obj: obj.x0+obj.x1-(obj.y0+obj.y1)) return class LTTextGroupTBRL(LTTextGroup): @@ -426,7 +426,7 @@ class LTTextGroupTBRL(LTTextGroup): def __init__(self, objs): LTTextGroup.__init__(self, objs) # reorder the objects from top-right to bottom-left. - self.objs = csort(self.objs, key=lambda obj: -obj.x1-obj.y1) + self.objs = csort(self.objs, key=lambda obj: -(obj.x0+obj.x1)-(obj.y0+obj.y1)) return @@ -509,17 +509,19 @@ def guess_wmode(objs): ## group_lines ## -def group_lines(groupfunc, objs, *args): +def group_lines(groupfunc, objs, findfunc, debug=0): """Group LTTextLine objects to form a LTTextBox.""" plane = Plane(objs) groups = {} for obj in objs: - neighbors = obj.find_neighbors(plane, *args) + neighbors = findfunc(obj, plane) assert obj in neighbors, obj members = neighbors[:] for obj1 in neighbors: if obj1 in groups: members.extend(groups.pop(obj1)) + if debug: + print >>sys.stderr, 'group:', members group = groupfunc(list(uniq(members))) for obj in members: groups[obj] = group @@ -536,7 +538,7 @@ def group_lines(groupfunc, objs, *args): ## group_boxes ## -def group_boxes(groupfunc, objs, distfunc): +def group_boxes(groupfunc, objs, distfunc, debug=0): assert objs while 2 <= len(objs): mindist = INF @@ -552,6 +554,8 @@ def group_boxes(groupfunc, objs, distfunc): (obj1, obj2) = minpair objs.remove(obj1) objs.remove(obj2) + if debug: + print >>sys.stderr, 'group:', obj1, obj2 objs.append(groupfunc([obj1, obj2])) assert len(objs) == 1 return objs.pop() @@ -635,8 +639,9 @@ class LTPage(LayoutContainer): prev = cur if line: lines.append(LTTextLineHorizontal(line, laparams.word_margin)) - return group_lines(LTTextBoxHorizontal, lines, laparams.line_margin) - + return group_lines(LTTextBoxHorizontal, lines, + lambda obj, plane: obj.find_neighbors(plane, laparams.line_margin)) + def build_textbox_vertical(self, objs, laparams): """Identify vertical text regions in the page.""" def aligned(obj1, obj2): @@ -666,18 +671,19 @@ class LTPage(LayoutContainer): prev = cur if line: lines.append(LTTextLineVertical(line, laparams.word_margin)) - return group_lines(LTTextBoxVertical, lines, laparams.line_margin) + return group_lines(LTTextBoxVertical, lines, + lambda obj, plane: obj.find_neighbors(plane, laparams.line_margin)) def group_textbox_lr_tb(self, boxes, laparams): def dist(obj1, obj2): - return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * + return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - - obj1.width*obj1.height - obj2.width*obj2.height) + (obj1.width*obj1.height + obj2.width*obj2.height)) return group_boxes(LTTextGroupLRTB, boxes, dist) def group_textbox_tb_rl(self, boxes, laparams): def dist(obj1, obj2): return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - - obj1.width*obj1.height - obj2.width*obj2.height) + (obj1.width*obj1.height + obj2.width*obj2.height)) return group_boxes(LTTextGroupTBRL, boxes, dist) diff --git a/samples/Makefile b/samples/Makefile index 611d721..1d5fde3 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -4,7 +4,7 @@ RM=rm -f #CMP=cmp CMP=: PYTHON=python -PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -Dauto -p1 +PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -Dx -p1 HTMLS= \ simple1.html \ diff --git a/samples/dmca.html.ref b/samples/dmca.html.ref index 6944f21..dca8ecc 100644 --- a/samples/dmca.html.ref +++ b/samples/dmca.html.ref @@ -1832,52 +1832,6 @@ l f . - -C -o -p -y -r -i -g -h -t - -O -f -f -i -c -e - -S -u -m -m -a -r -y - -D -e -c -e -m -b -e -r - -1 -9 -9 -8 - -P -a -g -e - -1 P u @@ -1932,22 +1886,53 @@ ) . 1 + +C +o +p +y +r +i +g +h +t + +O +f +f +i +c +e + +S +u +m +m +a +r +y + +D +e +c +e +m +b +e +r + +1 +9 +9 +8 + +P +a +g +e + +1 - - - - - - - - - - - - - - -
Page: 1
diff --git a/samples/dmca.txt.ref b/samples/dmca.txt.ref index c1f0d76..df91f96 100644 --- a/samples/dmca.txt.ref +++ b/samples/dmca.txt.ref @@ -47,13 +47,13 @@ merely an overview of the law’s provisions; for purposes of length and significant amount of detail has been omitted. A complete understanding of any provision of the DMCA requires reference to the text of the legislation itself. +Pub. L. No. 105-304, 112 Stat. 2860 (Oct. 28, 1998). +1 + Copyright Office Summary December 1998 Page 1 -Pub. L. No. 105-304, 112 Stat. 2860 (Oct. 28, 1998). -1 - \ No newline at end of file diff --git a/samples/dmca.xml.ref b/samples/dmca.xml.ref index ebaa43a..01af9be 100644 --- a/samples/dmca.xml.ref +++ b/samples/dmca.xml.ref @@ -2050,68 +2050,7 @@ - - -C -o -p -y -r -i -g -h -t - -O -f -f -i -c -e - -S -u -m -m -a -r -y - - - - - - -D -e -c -e -m -b -e -r - -1 -9 -9 -8 - - - - - - -P -a -g -e - -1 - - - - - + P u @@ -2174,6 +2113,67 @@ + + +C +o +p +y +r +i +g +h +t + +O +f +f +i +c +e + +S +u +m +m +a +r +y + + + + + + +D +e +c +e +m +b +e +r + +1 +9 +9 +8 + + + + + + +P +a +g +e + +1 + + + +
@@ -2214,14 +2214,14 @@ + - - + + - + - diff --git a/samples/f1040nr.html.ref b/samples/f1040nr.html.ref index 69f9b18..18aeeaf 100644 --- a/samples/f1040nr.html.ref +++ b/samples/f1040nr.html.ref @@ -854,6 +854,33 @@ + +( +1 +) + +F +i +r +s +t + +n +a +m +e + + +L +a +s +t + +n +a +m +e + F i @@ -1461,33 +1488,6 @@ - -( -1 -) - -F -i -r -s -t - -n -a -m -e - - -L -a -s -t - -n -a -m -e - ( 3 @@ -1948,6 +1948,31 @@ ' + +d + + + + + + +9 + +b + +8 + +9 + +a +b + +1 +0 +a + +b + T o @@ -3765,37 +3790,12 @@ D - -8 - -9 - -a -b - -1 -0 -a - -b - - -9 - -b - - - - - -d - 1 @@ -4225,44 +4225,5 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Page: 1
diff --git a/samples/f1040nr.txt.ref b/samples/f1040nr.txt.ref index 5ec9dc4..b2e7a8b 100644 --- a/samples/f1040nr.txt.ref +++ b/samples/f1040nr.txt.ref @@ -54,6 +54,12 @@ If same as above, write “Same.” +(1) First name + + +Last name + + Filing Status and Exemptions for Individuals (see page 8) Filing status. Check only one box (1–6 below). @@ -89,12 +95,6 @@ Dependents: (see page 9) -(1) First name - - -Last name - - (3) Dependent’s relationship to you @@ -164,6 +164,21 @@ on lines above ' +d + + + + + +9 b + +8 +9 a +b +10a + +b + Total number of exemptions claimed Wages, salaries, tips, etc. Attach Form(s) W-2 @@ -276,27 +291,12 @@ Cat. No. 11364D -8 -9 a -b -10a - -b - -9 b - - - - - -d - 1 0a 1 1 diff --git a/samples/f1040nr.xml.ref b/samples/f1040nr.xml.ref index 1a50a58..d3b107e 100644 --- a/samples/f1040nr.xml.ref +++ b/samples/f1040nr.xml.ref @@ -1161,7 +1161,55 @@ - + + +( +1 +) + + +F +i +r +s +t + + +n +a +m +e + + + + + + + + + + + +L +a +s +t + + +n +a +m +e + + + + + + + + + + F i @@ -1943,7 +1991,7 @@ - + % @@ -1955,68 +2003,20 @@ - + - + - - -( -1 -) - - -F -i -r -s -t - - -n -a -m -e - - - - - - - - - - - -L -a -s -t - - -n -a -m -e - - - - - - - - - ( @@ -2762,7 +2762,79 @@ - + + +d + + + + + + + + + + + + + + + + + + + + + +9 + + +b + + + + + + +8 + + + + + +9 + + +a + + + + +b + + + + + +1 +0 +a + + + + + + + + + +b + + + + + + T o @@ -5276,99 +5348,27 @@ - - -8 - - - - - -9 - - -a - - - - -b - - - - - -1 -0 -a - - - - - - - - - -b - - - - - - - -9 - - -b - - - - - + - + - + - - - - - - - - - - - - - - - - -d - - - - - 1 @@ -5923,19 +5923,19 @@ - - - - - - - - - - - - + + + + + + + + + + + + @@ -5969,29 +5969,29 @@ + + + + + + - - - - + + - - + - + - + - - - diff --git a/samples/i1040nr.html.ref b/samples/i1040nr.html.ref index 295195f..d0f1e0d 100644 --- a/samples/i1040nr.html.ref +++ b/samples/i1040nr.html.ref @@ -469,6 +469,107 @@ i c e + +r +e +t +i +r +e +m +e +n +t + +p +l +a +n + +a +n +d + +y +o +u +r + +2 +0 +0 +8 + +m +o +d +i +f +i +e +d +A +G +I + +i +s + +l +e +s +s + +t +h +a +n + +$ +6 +3 +, +0 +0 +0 + +( +$ +1 +0 +5 +, +0 +0 +0 +) + +i +f + +a +q +u +a +l +i +f +y +i +n +g + +w +i +d +o +w +( +e +r +) +) +. u s @@ -5360,107 +5461,6 @@ i l l - -r -e -t -i -r -e -m -e -n -t - -p -l -a -n - -a -n -d - -y -o -u -r - -2 -0 -0 -8 - -m -o -d -i -f -i -e -d -A -G -I - -i -s - -l -e -s -s - -t -h -a -n - -$ -6 -3 -, -0 -0 -0 - -( -$ -1 -0 -5 -, -0 -0 -0 -) - -i -f - -a -q -u -a -l -i -f -y -i -n -g - -w -i -d -o -w -( -e -r -) -) -. C a @@ -5494,13 +5494,5 @@ - - - - - - - -
Page: 1
diff --git a/samples/i1040nr.txt.ref b/samples/i1040nr.txt.ref index d55000a..a943a19 100644 --- a/samples/i1040nr.txt.ref +++ b/samples/i1040nr.txt.ref @@ -21,6 +21,10 @@ U.S. Nonresident Alien Income Tax Return Department of the Treasury Internal Revenue Service +retirement plan and your 2008 modified +AGI is less than $63,000 ($105,000) if a +qualifying widow(er)). + use a different address this year. See Section references are to the Internal Where To File on page 4. @@ -177,10 +181,6 @@ purchased after 2007). deduction if you were covered by a the return for an estate or trust, you will -retirement plan and your 2008 modified -AGI is less than $63,000 ($105,000) if a -qualifying widow(er)). - Cat. No. 11368V \ No newline at end of file diff --git a/samples/i1040nr.xml.ref b/samples/i1040nr.xml.ref index 0b8a427..19cce4b 100644 --- a/samples/i1040nr.xml.ref +++ b/samples/i1040nr.xml.ref @@ -548,7 +548,121 @@
- + + +r +e +t +i +r +e +m +e +n +t + +p +l +a +n + +a +n +d + +y +o +u +r + +2 +0 +0 +8 + +m +o +d +i +f +i +e +d + + + + +A +G +I + +i +s + +l +e +s +s + +t +h +a +n + +$ +6 +3 +, +0 +0 +0 + +( +$ +1 +0 +5 +, +0 +0 +0 +) + +i +f + +a + + + + +q +u +a +l +i +f +y +i +n +g + +w +i +d +o +w +( +e +r +) +) +. + + + + + u s @@ -6090,120 +6204,6 @@ - - -r -e -t -i -r -e -m -e -n -t - -p -l -a -n - -a -n -d - -y -o -u -r - -2 -0 -0 -8 - -m -o -d -i -f -i -e -d - - - - -A -G -I - -i -s - -l -e -s -s - -t -h -a -n - -$ -6 -3 -, -0 -0 -0 - -( -$ -1 -0 -5 -, -0 -0 -0 -) - -i -f - -a - - - - -q -u -a -l -i -f -y -i -n -g - -w -i -d -o -w -( -e -r -) -) -. - - - - C @@ -6262,8 +6262,8 @@ - - + + diff --git a/samples/jo.html.ref b/samples/jo.html.ref index 8b858b0..e0bf27f 100644 --- a/samples/jo.html.ref +++ b/samples/jo.html.ref @@ -906,9 +906,5 @@ - - - -
Page: 1
diff --git a/samples/kampo.html.ref b/samples/kampo.html.ref index d2ff0a1..19b9532 100644 --- a/samples/kampo.html.ref +++ b/samples/kampo.html.ref @@ -811,6 +811,8 @@ + + @@ -909,8 +911,6 @@ - - @@ -2666,28 +2666,5 @@ - - - - - - - - - - - - - - - - - - - - - - -
Page: 1
diff --git a/samples/kampo.txt.ref b/samples/kampo.txt.ref index 5b07dd2..613c62b 100644 --- a/samples/kampo.txt.ref +++ b/samples/kampo.txt.ref @@ -70,6 +70,8 @@ 二 普通自転車(法第六十三条の三に規定す 号において同じ。)は、横断歩道において直 +」 + 一 歩行者は、道路の横断を始めてはならず、 横断を終わるか、又は横断をやめて引き返 二 横断歩道を進行しようとする普通自転車 @@ -77,8 +79,6 @@ 一 歩行者は、道路を横断してはならないこ 二 横断歩道を進行しようとする普通自転車 -」 - に改め、同条第四項の表の人の形の記号を有 」 diff --git a/samples/kampo.xml.ref b/samples/kampo.xml.ref index 39712fe..13feac0 100644 --- a/samples/kampo.xml.ref +++ b/samples/kampo.xml.ref @@ -1045,7 +1045,14 @@
- + + + + + + + + @@ -1119,7 +1126,7 @@ - + @@ -1169,13 +1176,6 @@ - - - - - - - @@ -3245,11 +3245,11 @@ + - - + + - diff --git a/samples/naacl06-shinyama.html.ref b/samples/naacl06-shinyama.html.ref index 493758b..c0c438c 100644 --- a/samples/naacl06-shinyama.html.ref +++ b/samples/naacl06-shinyama.html.ref @@ -179,6 +179,15 @@ e d u + +A +b +s +t +r +a +c +t W e @@ -616,15 +625,6 @@ t s . - -A -b -s -t -r -a -c -t 1 B @@ -2893,13 +2893,5 @@ l e . - - - - - - - -
Page: 1
diff --git a/samples/naacl06-shinyama.txt.ref b/samples/naacl06-shinyama.txt.ref index f46da3c..bb8ebaa 100644 --- a/samples/naacl06-shinyama.txt.ref +++ b/samples/naacl06-shinyama.txt.ref @@ -9,6 +9,8 @@ New York University New York, NY, 10003 {yusuke,sekine}@cs.nyu.edu +Abstract + We are trying to extend the boundary of Information Extraction (IE) systems. Ex- isting IE systems require a lot of time and @@ -23,8 +25,6 @@ presents them as tables. We present a pre- liminary system that obtains reasonably good results. -Abstract - 1 Background Every day, a large number of news articles are cre- diff --git a/samples/naacl06-shinyama.xml.ref b/samples/naacl06-shinyama.xml.ref index 8185992..60168b5 100644 --- a/samples/naacl06-shinyama.xml.ref +++ b/samples/naacl06-shinyama.xml.ref @@ -225,7 +225,21 @@
- + + +A +b +s +t +r +a +c +t + + + + + W e @@ -783,20 +797,6 @@ - - -A -b -s -t -r -a -c -t - - - - 1 @@ -3726,8 +3726,8 @@ - - + + diff --git a/samples/nlp2004slides.html.ref b/samples/nlp2004slides.html.ref index 08c7e12..4860cce 100644 --- a/samples/nlp2004slides.html.ref +++ b/samples/nlp2004slides.html.ref @@ -84,6 +84,5 @@ -
Page: 1
diff --git a/samples/simple1.html.ref b/samples/simple1.html.ref index e816e8d..32f00ca 100644 --- a/samples/simple1.html.ref +++ b/samples/simple1.html.ref @@ -55,12 +55,5 @@ r l d - - - - - - -
Page: 1