diff --git a/Makefile b/Makefile
index c972828..d0f2f35 100644
--- a/Makefile
+++ b/Makefile
@@ -50,6 +50,6 @@ $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
test: cmap
- cd samples && $(MAKE) all
+ cd samples && $(MAKE) test
test_clean:
-cd samples && $(MAKE) clean
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 8334ab0..6b609a5 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -5,7 +5,7 @@ from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE
from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
-from layout import LTFigure, LTImage, LTChar, LTTextBox, LTTextFlow, LTTextLine
+from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup
from utils import apply_matrix_pt, mult_matrix
from utils import enc, bbox2str
@@ -32,9 +32,7 @@ class PDFPageAggregator(PDFTextDevice):
def end_page(self, _):
assert not self.stack
assert isinstance(self.cur_item, LTPage)
- self.cur_item.fixate()
- if self.laparams:
- self.cur_item.analyze_layout(self.laparams)
+ self.cur_item.fixate(self.laparams)
self.pageno += 1
return self.cur_item
@@ -143,7 +141,7 @@ class TextConverter(PDFConverter):
self.write('\n')
page = PDFConverter.end_page(self, page)
if self.showpageno:
- self.write('Page %d\n' % page.id)
+ self.write('Page %s\n' % page.pageid)
render(page)
self.write('\f')
return
@@ -170,7 +168,16 @@ class HTMLConverter(PDFConverter):
def write_rect(self, color, width, x, y, w, h):
self.outfp.write('\n' %
- (color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
+ (color, width,
+ x*self.scale, (self.yoffset-y)*self.scale,
+ w*self.scale, h*self.scale))
+ return
+
+ def write_text(self, text, x, y, size):
+ self.outfp.write('' %
+ (x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
+ self.write(text)
+ self.outfp.write('\n')
return
def write_image(self, image):
@@ -194,37 +201,30 @@ class HTMLConverter(PDFConverter):
def render(item):
if isinstance(item, LTPage):
self.yoffset += item.y1
- self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+ self.write_rect('gray', 1, item.x0, item.y1, item.width, item.height)
if self.showpageno:
self.outfp.write('
' %
((self.yoffset-item.y1)*self.scale))
- self.outfp.write('
Page %s \n' % (page.id, page.id))
+ self.outfp.write('Page %s\n' % (page.pageid, page.pageid))
for child in item:
render(child)
elif isinstance(item, LTChar):
- self.outfp.write('' %
- (item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
- item.get_size()*self.scale))
- self.write(item.text)
- self.outfp.write('\n')
+ self.write_text(item.text, item.x0, item.y1, item.get_size())
if self.debug:
- self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+ self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTPolygon):
- self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+ self.write_rect('black', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTTextLine):
for child in item:
render(child)
elif isinstance(item, LTTextBox):
- self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
- for child in item:
- render(child)
- elif isinstance(item, LTTextFlow):
+ self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
if self.debug:
- self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+ self.write_text(str(item.index+1), item.x0, item.y1, 20)
elif isinstance(item, LTFigure):
- self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+ self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
elif isinstance(item, LTImage):
@@ -233,6 +233,14 @@ class HTMLConverter(PDFConverter):
return
page = PDFConverter.end_page(self, page)
render(page)
+ if page.layout:
+ def show_layout(item):
+ if isinstance(item, LTTextGroup):
+ self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
+ for child in item:
+ show_layout(child)
+ return
+ show_layout(page.layout)
self.yoffset += self.pagepad
return
@@ -270,13 +278,13 @@ class XMLConverter(PDFConverter):
def render(item):
if isinstance(item, LTPage):
self.outfp.write('\n' %
- (item.id, bbox2str(item.bbox), item.rotate))
+ (item.pageid, bbox2str(item.bbox), item.rotate))
for child in item:
render(child)
self.outfp.write('\n')
- elif isinstance(item, LTLine) and item.direction:
- self.outfp.write('\n' %
- (item.linewidth, item.direction, bbox2str(item.bbox)))
+ elif isinstance(item, LTLine):
+ self.outfp.write('\n' %
+ (item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTRect):
self.outfp.write('\n' %
(item.linewidth, bbox2str(item.bbox)))
@@ -284,8 +292,8 @@ class XMLConverter(PDFConverter):
self.outfp.write('\n' %
(item.linewidth, bbox2str(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure):
- self.outfp.write('