sgml to xml

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@146 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-10-31 03:04:56 +00:00
parent 23b8058ad4
commit 78f7866554
5 changed files with 38 additions and 26 deletions

1
TODO
View File

@ -1,5 +1,4 @@
TODOs: TODOs:
- sgml->xml
- PEP-8 conformance. - PEP-8 conformance.
- Better text extraction / layout analysis. - Better text extraction / layout analysis.
- Better API Documentation. - Better API Documentation.

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sat Oct 31 11:08:31 JST 2009 Last Modified: Sat Oct 31 12:03:49 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -209,7 +209,7 @@ By default, it extracts texts from all the pages.
<ul> <ul>
<li> <code>text</code> : TEXT format. (Default) <li> <code>text</code> : TEXT format. (Default)
<li> <code>html</code> : HTML format. Not recommended for extraction purpose because the markup is very messy. <li> <code>html</code> : HTML format. Not recommended for extraction purpose because the markup is very messy.
<li> <code>sgml</code> : SGML format. Provides the most information available. <li> <code>xml</code> : XML format. Provides the most information available.
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations. HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>"). Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").

View File

@ -156,10 +156,16 @@ class PDFConverter(PDFPageAggregator):
return return
## SGMLConverter ## XMLConverter
## ##
class SGMLConverter(PDFConverter): class XMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
self.outfp.write('<pages>\n')
return
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
@ -202,6 +208,10 @@ class SGMLConverter(PDFConverter):
render(page) render(page)
return return
def close(self):
self.outfp.write('</pages>\n')
return
## HTMLConverter ## HTMLConverter
## ##

View File

@ -25,30 +25,33 @@ TEXTS= \
naacl06-shinyama.txt \ naacl06-shinyama.txt \
nlp2004slides.txt nlp2004slides.txt
SGMLS= \ XMLS= \
simple1.sgml \ simple1.xml \
simple2.sgml \ simple2.xml \
dmca.sgml \ dmca.xml \
f1040nr.sgml \ f1040nr.xml \
i1040nr.sgml \ i1040nr.xml \
jo.sgml \ jo.xml \
kampo.sgml \ kampo.xml \
naacl06-shinyama.sgml \ naacl06-shinyama.xml \
nlp2004slides.sgml nlp2004slides.xml
all: all:
clean: clean:
-rm $(HTMLS) -rm $(HTMLS)
-rm $(TEXTS) -rm $(TEXTS)
-rm $(SGMLS) -rm $(XMLS)
test: $(HTMLS) $(TEXTS) $(SGMLS) test: htmls texts xmls
htmls: $(HTMLS)
texts: $(TEXTS)
xmls: $(XMLS)
.SUFFIXES: .pdf .html .sgml .txt .SUFFIXES: .pdf .html .xml .txt
.pdf.html: .pdf.html:
$(PDF2TXT) -t html $< > $@ $(PDF2TXT) -t html $< > $@
.pdf.sgml: .pdf.xml:
$(PDF2TXT) -t sgml $< > $@ $(PDF2TXT) -t xml $< > $@
.pdf.txt: .pdf.txt:
$(PDF2TXT) -t text $< > $@ $(PDF2TXT) -t text $< > $@

View File

@ -3,7 +3,7 @@ import sys
from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdfminer.cmap import CMapDB, find_cmap_path from pdfminer.cmap import CMapDB, find_cmap_path
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
@ -13,7 +13,7 @@ def main(argv):
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] ' print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] ' '[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0]) '[-t text|html|xml|tag] [-o output] file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
@ -65,8 +65,8 @@ def main(argv):
if outfile: if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'): if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html' outtype = 'html'
elif outfile.endswith('.sgml'): elif outfile.endswith('.xml'):
outtype = 'sgml' outtype = 'xml'
elif outfile.endswith('.tag'): elif outfile.endswith('.tag'):
outtype = 'tag' outtype = 'tag'
if outfile: if outfile:
@ -75,8 +75,8 @@ def main(argv):
outfp = sys.stdout outfp = sys.stdout
if outtype == 'text': if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams) device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'sgml': elif outtype == 'xml':
device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams) device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams) device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
elif outtype == 'tag': elif outtype == 'tag':