sgml to xml
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@146 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
23b8058ad4
commit
78f7866554
1
TODO
1
TODO
|
@ -1,5 +1,4 @@
|
||||||
TODOs:
|
TODOs:
|
||||||
- sgml->xml
|
|
||||||
- PEP-8 conformance.
|
- PEP-8 conformance.
|
||||||
- Better text extraction / layout analysis.
|
- Better text extraction / layout analysis.
|
||||||
- Better API Documentation.
|
- Better API Documentation.
|
||||||
|
|
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sat Oct 31 11:08:31 JST 2009
|
Last Modified: Sat Oct 31 12:03:49 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -209,7 +209,7 @@ By default, it extracts texts from all the pages.
|
||||||
<ul>
|
<ul>
|
||||||
<li> <code>text</code> : TEXT format. (Default)
|
<li> <code>text</code> : TEXT format. (Default)
|
||||||
<li> <code>html</code> : HTML format. Not recommended for extraction purpose because the markup is very messy.
|
<li> <code>html</code> : HTML format. Not recommended for extraction purpose because the markup is very messy.
|
||||||
<li> <code>sgml</code> : SGML format. Provides the most information available.
|
<li> <code>xml</code> : XML format. Provides the most information available.
|
||||||
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
|
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
|
||||||
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
|
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
|
||||||
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
||||||
|
|
|
@ -156,10 +156,16 @@ class PDFConverter(PDFPageAggregator):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## SGMLConverter
|
## XMLConverter
|
||||||
##
|
##
|
||||||
class SGMLConverter(PDFConverter):
|
class XMLConverter(PDFConverter):
|
||||||
|
|
||||||
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
|
||||||
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
|
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
|
||||||
|
self.outfp.write('<pages>\n')
|
||||||
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
|
@ -202,6 +208,10 @@ class SGMLConverter(PDFConverter):
|
||||||
render(page)
|
render(page)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.outfp.write('</pages>\n')
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## HTMLConverter
|
## HTMLConverter
|
||||||
##
|
##
|
||||||
|
|
|
@ -25,30 +25,33 @@ TEXTS= \
|
||||||
naacl06-shinyama.txt \
|
naacl06-shinyama.txt \
|
||||||
nlp2004slides.txt
|
nlp2004slides.txt
|
||||||
|
|
||||||
SGMLS= \
|
XMLS= \
|
||||||
simple1.sgml \
|
simple1.xml \
|
||||||
simple2.sgml \
|
simple2.xml \
|
||||||
dmca.sgml \
|
dmca.xml \
|
||||||
f1040nr.sgml \
|
f1040nr.xml \
|
||||||
i1040nr.sgml \
|
i1040nr.xml \
|
||||||
jo.sgml \
|
jo.xml \
|
||||||
kampo.sgml \
|
kampo.xml \
|
||||||
naacl06-shinyama.sgml \
|
naacl06-shinyama.xml \
|
||||||
nlp2004slides.sgml
|
nlp2004slides.xml
|
||||||
|
|
||||||
all:
|
all:
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-rm $(HTMLS)
|
-rm $(HTMLS)
|
||||||
-rm $(TEXTS)
|
-rm $(TEXTS)
|
||||||
-rm $(SGMLS)
|
-rm $(XMLS)
|
||||||
|
|
||||||
test: $(HTMLS) $(TEXTS) $(SGMLS)
|
test: htmls texts xmls
|
||||||
|
htmls: $(HTMLS)
|
||||||
|
tests: $(TEXTS)
|
||||||
|
xmls: $(XMLS)
|
||||||
|
|
||||||
.SUFFIXES: .pdf .html .sgml .txt
|
.SUFFIXES: .pdf .html .xml .txt
|
||||||
.pdf.html:
|
.pdf.html:
|
||||||
$(PDF2TXT) -t html $< > $@
|
$(PDF2TXT) -t html $< > $@
|
||||||
.pdf.sgml:
|
.pdf.xml:
|
||||||
$(PDF2TXT) -t sgml $< > $@
|
$(PDF2TXT) -t xml $< > $@
|
||||||
.pdf.txt:
|
.pdf.txt:
|
||||||
$(PDF2TXT) -t text $< > $@
|
$(PDF2TXT) -t text $< > $@
|
||||||
|
|
|
@ -3,7 +3,7 @@ import sys
|
||||||
from pdfminer.pdfparser import PDFDocument, PDFParser
|
from pdfminer.pdfparser import PDFDocument, PDFParser
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
||||||
from pdfminer.pdfdevice import PDFDevice
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
|
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
|
||||||
from pdfminer.cmap import CMapDB, find_cmap_path
|
from pdfminer.cmap import CMapDB, find_cmap_path
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ def main(argv):
|
||||||
def usage():
|
def usage():
|
||||||
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
||||||
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
|
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
|
||||||
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
|
'[-t text|html|xml|tag] [-o output] file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
|
||||||
|
@ -65,8 +65,8 @@ def main(argv):
|
||||||
if outfile:
|
if outfile:
|
||||||
if outfile.endswith('.htm') or outfile.endswith('.html'):
|
if outfile.endswith('.htm') or outfile.endswith('.html'):
|
||||||
outtype = 'html'
|
outtype = 'html'
|
||||||
elif outfile.endswith('.sgml'):
|
elif outfile.endswith('.xml'):
|
||||||
outtype = 'sgml'
|
outtype = 'xml'
|
||||||
elif outfile.endswith('.tag'):
|
elif outfile.endswith('.tag'):
|
||||||
outtype = 'tag'
|
outtype = 'tag'
|
||||||
if outfile:
|
if outfile:
|
||||||
|
@ -75,8 +75,8 @@ def main(argv):
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
if outtype == 'text':
|
if outtype == 'text':
|
||||||
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
elif outtype == 'sgml':
|
elif outtype == 'xml':
|
||||||
device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
|
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
|
|
Loading…
Reference in New Issue