diff --git a/TODO b/TODO
index 40aba69..ecc746c 100644
--- a/TODO
+++ b/TODO
@@ -1,5 +1,4 @@
TODOs:
- - sgml->xml
- PEP-8 conformance.
- Better text extraction / layout analysis.
- Better API Documentation.
diff --git a/docs/index.html b/docs/index.html
index e7d1a23..47639bd 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -19,7 +19,7 @@ Python PDF parser and analyzer
-Last Modified: Sat Oct 31 11:08:31 JST 2009
+Last Modified: Sat Oct 31 12:03:49 JST 2009
@@ -209,7 +209,7 @@ By default, it extracts texts from all the pages.
-
text
: TEXT format. (Default)
-
html
: HTML format. Not recommended for extraction purpose because the markup is very messy.
- -
sgml
: SGML format. Provides the most information available.
+ -
xml
: XML format. Provides the most information available.
-
tag
: "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
Tags used here are defined in the PDF specification (See §10.7 "Tagged PDF").
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 48799aa..ae22b47 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -156,10 +156,16 @@ class PDFConverter(PDFPageAggregator):
return
-## SGMLConverter
+## XMLConverter
##
-class SGMLConverter(PDFConverter):
+class XMLConverter(PDFConverter):
+ def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
+ PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
+ self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
+ self.outfp.write('<pages>\n')
+ return
+
def end_page(self, page):
def render(item):
if isinstance(item, LTPage):
@@ -202,6 +208,10 @@ class SGMLConverter(PDFConverter):
render(page)
return
+ def close(self):
+ self.outfp.write('</pages>\n')
+ return
+
## HTMLConverter
##
diff --git a/samples/Makefile b/samples/Makefile
index 08fc43b..13d196c 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -25,30 +25,33 @@ TEXTS= \
naacl06-shinyama.txt \
nlp2004slides.txt
-SGMLS= \
- simple1.sgml \
- simple2.sgml \
- dmca.sgml \
- f1040nr.sgml \
- i1040nr.sgml \
- jo.sgml \
- kampo.sgml \
- naacl06-shinyama.sgml \
- nlp2004slides.sgml
+XMLS= \
+ simple1.xml \
+ simple2.xml \
+ dmca.xml \
+ f1040nr.xml \
+ i1040nr.xml \
+ jo.xml \
+ kampo.xml \
+ naacl06-shinyama.xml \
+ nlp2004slides.xml
all:
clean:
-rm $(HTMLS)
-rm $(TEXTS)
- -rm $(SGMLS)
+ -rm $(XMLS)
-test: $(HTMLS) $(TEXTS) $(SGMLS)
+test: htmls texts xmls
+htmls: $(HTMLS)
+texts: $(TEXTS)
+xmls: $(XMLS)
-.SUFFIXES: .pdf .html .sgml .txt
+.SUFFIXES: .pdf .html .xml .txt
.pdf.html:
$(PDF2TXT) -t html $< > $@
-.pdf.sgml:
- $(PDF2TXT) -t sgml $< > $@
+.pdf.xml:
+ $(PDF2TXT) -t xml $< > $@
.pdf.txt:
$(PDF2TXT) -t text $< > $@
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index d212c36..95db6f8 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -3,7 +3,7 @@ import sys
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice
-from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
+from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdfminer.cmap import CMapDB, find_cmap_path
from pdfminer.layout import LAParams
@@ -13,7 +13,7 @@ def main(argv):
def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
- '[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
+ '[-t text|html|xml|tag] [-o output] file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
@@ -65,8 +65,8 @@ def main(argv):
if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html'
- elif outfile.endswith('.sgml'):
- outtype = 'sgml'
+ elif outfile.endswith('.xml'):
+ outtype = 'xml'
elif outfile.endswith('.tag'):
outtype = 'tag'
if outfile:
@@ -75,8 +75,8 @@ def main(argv):
outfp = sys.stdout
if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
- elif outtype == 'sgml':
- device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
+ elif outtype == 'xml':
+ device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
elif outtype == 'tag':