diff --git a/README.html b/README.html
index 035c865..2f0cd0b 100644
--- a/README.html
+++ b/README.html
@@ -17,7 +17,7 @@ Python PDF parser and analyzer
html
: HTML format. (Default)
+text
: TEXT format.
sgml
: SGML format.
tag
: "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
diff --git a/pdflib/cluster.py b/pdflib/cluster.py
index 2b49704..4ea3e9d 100644
--- a/pdflib/cluster.py
+++ b/pdflib/cluster.py
@@ -59,9 +59,9 @@ class Plane(object):
return objs
-## Clusters
+## ClusterSet
##
-class Clusters(object):
+class ClusterSet(object):
def __init__(self):
self.clusters = {}
@@ -86,11 +86,12 @@ class Clusters(object):
def cluster_pageobjs(objs, ratio):
+ idx = dict( (obj,i) for (i,obj) in enumerate(objs) )
plane = Plane()
for obj in objs:
plane.add(obj.bbox, obj)
plane.finish()
- clusters = Clusters()
+ cset = ClusterSet()
for obj in objs:
(bx0,by0,bx1,by1) = obj.bbox
margin = abs(obj.fontsize * ratio)
@@ -100,17 +101,26 @@ def cluster_pageobjs(objs, ratio):
y1 = max(by0,by1)
found = plane.find((x0-margin, y0-margin, x1+margin, y1+margin))
if len(found) == 1:
- clusters.add(found.pop())
+ cset.add(found.pop())
else:
- clusters.merge(found)
+ cset.merge(found)
+ clusters = sorted(cset.finish(), key=lambda objs: idx[objs[0]])
r = []
- for objs in clusters.finish():
+ for objs in clusters:
+ objs = sorted(objs, key=lambda obj: idx[obj])
+ h = v = 0
(bx0,by0,bx1,by1) = objs[0].bbox
+ (lx0,ly0,_,_) = objs[0].bbox
for obj in objs[1:]:
(x0,y0,x1,y1) = obj.bbox
+ if len(obj.text) == 1 and abs(lx0-x0) < abs(ly0-y0):
+ v += 1
+ else:
+ h += 1
+ (lx0,ly0) = (x0,y0)
bx0 = min(bx0, x0)
bx1 = max(bx1, x1)
by0 = min(by0, y0)
by1 = max(by1, y1)
- r.append(((bx0,by0,bx1,by1), objs))
+ r.append(((bx0,by0,bx1,by1), h < v, objs))
return r
diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py
index dd8cb81..ad72ef1 100755
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@@ -2,7 +2,7 @@
import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
+from pdfdevice import PDFDevice, PageItem, FigureItem, TextItem, PDFPageAggregator
from pdffont import PDFUnicodeNotDefined
from cmap import CMapDB
@@ -15,6 +15,15 @@ def encprops(props, codec):
if not props: return ''
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
+def get_textobjs(item, r=None):  # Collect TextItems reachable from item into the list r and return it.
+    if r is None: r = []  # fresh accumulator on the outermost call; recursive calls pass r through
+    if isinstance(item, TextItem):
+        r.append(item)
+    elif isinstance(item, PageItem):
+        for child in item.objs:  # descend into the container's children
+            get_textobjs(child, r)  # appends into the shared list r; return value not needed here
+    return r
+
## PDFConverter
class PDFConverter(PDFPageAggregator):
@@ -73,7 +82,8 @@ class HTMLConverter(PDFConverter):
page = PDFConverter.end_page(self, page)
def f(item):
if isinstance(item, FigureItem):
- pass
+ for child in item.objs:
+ f(child)
elif isinstance(item, TextItem):
if item.direction == 2:
wmode = 'tb-rl'
@@ -95,8 +105,8 @@ class HTMLConverter(PDFConverter):
for child in page.objs:
f(child)
if self.cluster_margin:
- textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
- for ((x0,y0,x1,y1),objs) in cluster_pageobjs(textobjs, self.cluster_margin):
+ clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
+ for ((x0,y0,x1,y1),_,objs) in clusters:
self.outfp.write('\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
@@ -114,7 +124,7 @@ class HTMLConverter(PDFConverter):
##
class TextConverter(PDFConverter):
- def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.2, splitwords=False, hyphenation=True):
+ def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.5, splitwords=False, hyphenation=True):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
self.pagenum = pagenum
self.cluster_margin = cluster_margin
@@ -125,15 +135,18 @@ class TextConverter(PDFConverter):
from cluster import cluster_pageobjs
page = PDFConverter.end_page(self, page)
if self.cluster_margin:
- textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
- idx = dict( (obj,i) for (i,obj) in enumerate(textobjs) )
+ textobjs = get_textobjs(page)
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
- clusters.sort(key=lambda (_,objs): idx[objs[0]])
- for (_,objs) in clusters:
- for item in sorted(objs, key=lambda obj:idx[obj]):
- text = item.text
- self.outfp.write(text.encode(self.codec, 'replace'))
- self.outfp.write('\n')
+ for (_,vertical,objs) in clusters:
+ for (i,item) in enumerate(objs):
+ (x0,y0,x1,y1) = item.bbox
+ if (i and
+ ((not vertical and (y1 < ly0 or ly1 < y0)) or
+ (vertical and (x1 < lx0 or lx1 < x0)))):
+ self.outfp.write('\n')
+ (lx0,ly0,lx1,ly1) = (x0,y0,x1,y1)
+ self.outfp.write(item.text.encode(self.codec, 'replace'))
+ self.outfp.write('\n\n')
else:
for item in page.objs:
if isinstance(item, TextItem):
@@ -243,7 +256,7 @@ def main(argv):
codec = 'ascii'
pagenos = set()
maxpages = 0
- outtype = 'text'
+ outtype = 'html'
password = ''
splitwords = False
outfp = sys.stdout