some usage document added
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@214 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
eb535d4106
commit
7f587cafec
|
@ -0,0 +1,185 @@
|
||||||
|
%TGIF 4.1.45-QPL
|
||||||
|
state(0,37,100.000,0,0,0,16,1,9,1,1,1,0,0,2,1,1,'Helvetica-Bold',1,69120,0,0,1,10,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0).
|
||||||
|
%
|
||||||
|
% @(#)$Header$
|
||||||
|
% %W%
|
||||||
|
%
|
||||||
|
unit("1 pixel/pixel").
|
||||||
|
color_info(19,65535,0,[
|
||||||
|
"magenta", 65535, 0, 65535, 65535, 0, 65535, 1,
|
||||||
|
"red", 65535, 0, 0, 65535, 0, 0, 1,
|
||||||
|
"green", 0, 65535, 0, 0, 65535, 0, 1,
|
||||||
|
"blue", 0, 0, 65535, 0, 0, 65535, 1,
|
||||||
|
"yellow", 65535, 65535, 0, 65535, 65535, 0, 1,
|
||||||
|
"pink", 65535, 49344, 52171, 65535, 49344, 52171, 1,
|
||||||
|
"cyan", 0, 65535, 65535, 0, 65535, 65535, 1,
|
||||||
|
"CadetBlue", 24415, 40606, 41120, 24415, 40606, 41120, 1,
|
||||||
|
"white", 65535, 65535, 65535, 65535, 65535, 65535, 1,
|
||||||
|
"black", 0, 0, 0, 0, 0, 0, 1,
|
||||||
|
"DarkSlateGray", 12079, 20303, 20303, 12079, 20303, 20303, 1,
|
||||||
|
"#00000000c000", 0, 0, 49344, 0, 0, 49152, 1,
|
||||||
|
"#820782070000", 33410, 33410, 0, 33287, 33287, 0, 1,
|
||||||
|
"#3cf3fbee34d2", 15420, 64507, 13364, 15603, 64494, 13522, 1,
|
||||||
|
"#3cf3fbed34d3", 15420, 64507, 13364, 15603, 64493, 13523, 1,
|
||||||
|
"#ffffa6990000", 65535, 42662, 0, 65535, 42649, 0, 1,
|
||||||
|
"#ffff0000fffe", 65535, 0, 65535, 65535, 0, 65534, 1,
|
||||||
|
"#fffe0000fffe", 65535, 0, 65535, 65534, 0, 65534, 1,
|
||||||
|
"#fffe00000000", 65535, 0, 0, 65534, 0, 0, 1
|
||||||
|
]).
|
||||||
|
script_frac("0.6").
|
||||||
|
fg_bg_colors('black','white').
|
||||||
|
dont_reencode("FFDingbests:ZapfDingbats").
|
||||||
|
objshadow_info('#c0c0c0',2,2).
|
||||||
|
page(1,"",1,'').
|
||||||
|
oval('black','',350,380,450,430,2,2,1,88,0,0,0,0,0,'2',0,[
|
||||||
|
]).
|
||||||
|
poly('black','',2,[
|
||||||
|
270,270,350,230],1,2,1,54,0,0,0,0,0,0,0,'2',0,0,
|
||||||
|
"0","",[
|
||||||
|
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
|
||||||
|
]).
|
||||||
|
poly('black','',2,[
|
||||||
|
270,280,350,320],1,2,1,55,0,0,0,0,0,0,0,'2',0,0,
|
||||||
|
"0","",[
|
||||||
|
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
|
||||||
|
]).
|
||||||
|
box('black','',350,100,450,150,2,2,1,2,0,0,0,0,0,'2',0,[
|
||||||
|
]).
|
||||||
|
text('black',400,118,1,1,1,84,15,3,12,3,0,0,0,0,2,84,15,0,0,"",0,0,0,0,130,'',[
|
||||||
|
minilines(84,15,0,0,1,0,0,[
|
||||||
|
mini_line(84,12,3,0,0,0,[
|
||||||
|
str_block(0,84,12,3,0,0,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,84,12,3,0,0,0,0,0,0,0,
|
||||||
|
"PDFDocument")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
box('black','',150,100,250,150,2,2,1,13,0,0,0,0,0,'2',0,[
|
||||||
|
]).
|
||||||
|
text('black',200,118,1,1,1,63,15,14,12,3,0,0,0,0,2,63,15,0,0,"",0,0,0,0,130,'',[
|
||||||
|
minilines(63,15,0,0,1,0,0,[
|
||||||
|
mini_line(63,12,3,0,0,0,[
|
||||||
|
str_block(0,63,12,3,0,0,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,63,12,3,0,0,0,0,0,0,0,
|
||||||
|
"PDFParser")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
box('black','',350,200,450,250,2,2,1,20,0,0,0,0,0,'2',0,[
|
||||||
|
]).
|
||||||
|
text('black',400,218,1,1,1,88,15,21,12,3,0,0,0,0,2,88,15,0,0,"",0,0,0,0,230,'',[
|
||||||
|
minilines(88,15,0,0,1,0,0,[
|
||||||
|
mini_line(88,12,3,0,0,0,[
|
||||||
|
str_block(0,88,12,3,0,0,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,88,12,3,0,0,0,0,0,0,0,
|
||||||
|
"PDFInterpreter")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
box('black','',350,300,450,350,2,2,1,23,0,0,0,0,0,'2',0,[
|
||||||
|
]).
|
||||||
|
text('black',400,318,1,1,1,65,15,24,12,3,0,0,0,0,2,65,15,0,0,"",0,0,0,0,330,'',[
|
||||||
|
minilines(65,15,0,0,1,0,0,[
|
||||||
|
mini_line(65,12,3,0,0,0,[
|
||||||
|
str_block(0,65,12,3,0,-1,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,65,12,3,0,-1,0,0,0,0,0,
|
||||||
|
"PDFDevice")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
box('black','',180,250,280,300,2,2,1,29,0,0,0,0,0,'2',0,[
|
||||||
|
]).
|
||||||
|
text('black',230,268,1,1,1,131,15,30,12,3,2,0,0,0,2,131,15,0,0,"",0,0,0,0,280,'',[
|
||||||
|
minilines(131,15,0,0,1,0,0,[
|
||||||
|
mini_line(131,12,3,0,0,0,[
|
||||||
|
str_block(0,131,12,3,0,0,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,131,12,3,0,0,0,0,0,0,0,
|
||||||
|
"PDFResourceManager")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
poly('black','',2,[
|
||||||
|
250,140,350,140],1,2,1,45,0,0,0,0,0,0,0,'2',0,0,
|
||||||
|
"0","",[
|
||||||
|
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
|
||||||
|
]).
|
||||||
|
poly('black','',2,[
|
||||||
|
350,110,250,110],1,2,1,46,0,0,0,0,0,0,0,'2',0,0,
|
||||||
|
"0","",[
|
||||||
|
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
|
||||||
|
]).
|
||||||
|
poly('black','',2,[
|
||||||
|
400,150,400,200],1,2,1,47,0,0,0,0,0,0,0,'2',0,0,
|
||||||
|
"0","",[
|
||||||
|
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
|
||||||
|
]).
|
||||||
|
poly('black','',2,[
|
||||||
|
400,250,400,300],1,2,1,56,0,0,0,0,0,0,0,'2',0,0,
|
||||||
|
"0","",[
|
||||||
|
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
|
||||||
|
]).
|
||||||
|
poly('black','',2,[
|
||||||
|
400,350,400,380],0,2,1,65,0,0,0,0,0,0,0,'2',0,0,
|
||||||
|
"0","",[
|
||||||
|
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
|
||||||
|
]).
|
||||||
|
text('black',400,388,3,1,1,44,41,71,12,3,0,-2,0,0,2,44,41,0,0,"",0,0,0,0,400,'',[
|
||||||
|
minilines(44,41,0,0,1,-2,0,[
|
||||||
|
mini_line(44,12,3,0,0,0,[
|
||||||
|
str_block(0,44,12,3,0,-1,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,44,12,3,0,-1,0,0,0,0,0,
|
||||||
|
"Display")])
|
||||||
|
]),
|
||||||
|
mini_line(20,12,3,0,0,0,[
|
||||||
|
str_block(0,20,12,3,0,-1,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,20,12,3,0,-1,0,0,0,0,0,
|
||||||
|
"File")])
|
||||||
|
]),
|
||||||
|
mini_line(23,12,3,0,0,0,[
|
||||||
|
str_block(0,23,12,3,0,-1,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,23,12,3,0,-1,0,0,0,0,0,
|
||||||
|
"etc.")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
text('black',300,88,1,1,1,92,15,79,12,3,0,0,0,0,2,92,15,0,0,"",0,0,0,0,100,'',[
|
||||||
|
minilines(92,15,0,0,1,0,0,[
|
||||||
|
mini_line(92,12,3,0,0,0,[
|
||||||
|
str_block(0,92,12,3,0,-1,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,92,12,3,0,-1,0,0,0,0,0,
|
||||||
|
"request objects")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
text('black',300,148,1,1,1,78,15,84,12,3,0,0,0,0,2,78,15,0,0,"",0,0,0,0,160,'',[
|
||||||
|
minilines(78,15,0,0,1,0,0,[
|
||||||
|
mini_line(78,12,3,0,0,0,[
|
||||||
|
str_block(0,78,12,3,0,-1,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,78,12,3,0,-1,0,0,0,0,0,
|
||||||
|
"store objects")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
oval('black','',20,100,120,150,2,2,1,106,0,0,0,0,0,'2',0,[
|
||||||
|
]).
|
||||||
|
text('black',70,118,1,1,1,46,15,107,12,3,0,0,0,0,2,46,15,0,0,"",0,0,0,0,130,'',[
|
||||||
|
minilines(46,15,0,0,1,0,0,[
|
||||||
|
mini_line(46,12,3,0,0,0,[
|
||||||
|
str_block(0,46,12,3,0,-1,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,46,12,3,0,-1,0,0,0,0,0,
|
||||||
|
"PDF file")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
poly('black','',2,[
|
||||||
|
120,120,150,120],0,2,1,114,0,2,0,0,0,0,0,'2',0,0,
|
||||||
|
"0","",[
|
||||||
|
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
|
||||||
|
]).
|
||||||
|
text('black',400,158,1,1,1,68,15,115,12,3,2,0,0,0,2,68,15,0,0,"",0,0,0,0,170,'',[
|
||||||
|
minilines(68,15,0,0,1,0,0,[
|
||||||
|
mini_line(68,12,3,0,0,0,[
|
||||||
|
str_block(0,68,12,3,0,0,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,68,12,3,0,0,0,0,0,0,0,
|
||||||
|
"page object")])
|
||||||
|
])
|
||||||
|
])]).
|
||||||
|
text('black',400,258,1,1,1,115,15,119,12,3,2,0,0,0,2,115,15,0,0,"",0,0,0,0,270,'',[
|
||||||
|
minilines(115,15,0,0,1,0,0,[
|
||||||
|
mini_line(115,12,3,0,0,0,[
|
||||||
|
str_block(0,115,12,3,0,-1,0,0,0,[
|
||||||
|
str_seg('black','Helvetica-Bold',1,69120,115,12,3,0,-1,0,0,0,0,0,
|
||||||
|
"rendering sequence")])
|
||||||
|
])
|
||||||
|
])]).
|
Binary file not shown.
After Width: | Height: | Size: 2.0 KiB |
|
@ -0,0 +1,114 @@
|
||||||
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||||
|
<title>PDFMiner Development Guide</title>
|
||||||
|
<style type="text/css"><!--
|
||||||
|
blockquote { background: #eeeeee; }
|
||||||
|
.comment { color: darkgreen; }
|
||||||
|
--></style>
|
||||||
|
</head><body>
|
||||||
|
|
||||||
|
<h1>PDFMiner Development Guide</h1>
|
||||||
|
<p>
|
||||||
|
This document describes how to use PDFMiner as a library
|
||||||
|
from other applications.
|
||||||
|
<ul>
|
||||||
|
<li> <a href="#basic">Basic Usage</a>
|
||||||
|
<li> <a href="#layout">Accessing Layout Objects</a>
|
||||||
|
<li> <a href="#toc">TOC Extraction</a>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<a name="basic"></a>
|
||||||
|
<hr noshade>
|
||||||
|
<h2>Basic Usage</h2>
|
||||||
|
<p>
|
||||||
|
A typical way to parse a PDF file is the following:
|
||||||
|
<blockquote><pre>
|
||||||
|
<span class="comment"># Open a PDF file.</span>
|
||||||
|
fp = open('mypdf.pdf', 'rb')
|
||||||
|
<span class="comment"># Create a PDF parser object associated with the file object.</span>
|
||||||
|
parser = PDFParser(fp)
|
||||||
|
<span class="comment"># Create a PDF document object that stores the document structure.</span>
|
||||||
|
doc = PDFDocument()
|
||||||
|
<span class="comment"># Connect the parser and document objects.</span>
|
||||||
|
parser.set_document(doc)
|
||||||
|
doc.set_parser(parser)
|
||||||
|
<span class="comment"># Supply the document password for initialization.</span>
|
||||||
|
<span class="comment"># (If no password is set, give an empty string.)</span>
|
||||||
|
doc.initialize(password)
|
||||||
|
<span class="comment"># Check if the document allows text extraction. If not, abort.</span>
|
||||||
|
if not doc.is_extractable:
|
||||||
|
raise PDFTextExtractionNotAllowed
|
||||||
|
<span class="comment"># Create a PDF resource manager object that stores shared resources.</span>
|
||||||
|
rsrcmgr = PDFResourceManager()
|
||||||
|
<span class="comment"># Create a PDF device object.</span>
|
||||||
|
device = PDFDevice(rsrcmgr)
|
||||||
|
<span class="comment"># Create a PDF interpreter object.</span>
|
||||||
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
|
<span class="comment"># Process each page contained in the document.</span>
|
||||||
|
for page in doc.get_pages():
|
||||||
|
interpreter.process_page(page)
|
||||||
|
</pre></blockquote>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
In PDFMiner, there are several objects involved in parsing a PDF file.
|
||||||
|
Figure 1 shows the relationships between these objects.
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<img src="objrel.png"><br>
|
||||||
|
<small>Figure 1. Relationships between objects</small>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<a name="layout"></a>
|
||||||
|
<hr noshade>
|
||||||
|
<h2>Accessing Layout Objects</h2>
|
||||||
|
<p>
|
||||||
|
PDFMiner performs a basic layout analysis.
|
||||||
|
|
||||||
|
<blockquote><pre>
|
||||||
|
<span class="comment"># Set parameters for analysis.</span>
|
||||||
|
laparams = LAParams()
|
||||||
|
<span class="comment"># Create a PDF page aggregator object.</span>
|
||||||
|
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
||||||
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
|
for page in doc.get_pages():
|
||||||
|
interpreter.process_page(page)
|
||||||
|
<span class="comment"># Receive the top-level layout object.</span>
|
||||||
|
ltpage = device.get_result()
|
||||||
|
</pre></blockquote>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li> <code>LTPage</code>
|
||||||
|
<li> <code>LTTextBox</code>
|
||||||
|
<li> <code>LTTextLine</code>
|
||||||
|
<li> <code>LTChar</code>
|
||||||
|
<li> <code>LTText</code>
|
||||||
|
<li> <code>LTFigure</code>
|
||||||
|
<li> <code>LTImage</code>
|
||||||
|
<li> <code>LTRect</code>
|
||||||
|
<li> <code>LTPolygon</code>
|
||||||
|
<li> <code>LTLine</code>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<a name="toc"></a>
|
||||||
|
<hr noshade>
|
||||||
|
<h2>TOC Extraction</h2>
|
||||||
|
|
||||||
|
<blockquote><pre>
|
||||||
|
fp = open('mypdf.pdf', 'rb')
|
||||||
|
parser = PDFParser(fp)
|
||||||
|
doc = PDFDocument()
|
||||||
|
parser.set_document(doc)
|
||||||
|
doc.set_parser(parser)
|
||||||
|
doc.initialize(password)
|
||||||
|
<span class="comment"># Get the outlines of the document.</span>
|
||||||
|
outlines = doc.get_outlines()
|
||||||
|
for (level,title,dest,a,se) in outlines:
|
||||||
|
print (level, title)
|
||||||
|
</pre></blockquote>
|
||||||
|
|
||||||
|
|
||||||
|
<hr noshade>
|
||||||
|
<address>Yusuke Shinyama</address>
|
||||||
|
</body>
|
|
@ -809,14 +809,22 @@ class PDFPageInterpreter(object):
|
||||||
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
||||||
|
|
||||||
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password=''):
|
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password=''):
|
||||||
doc = PDFDocument()
|
# Create a PDF parser object associated with the file object.
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
|
# Create a PDF document object that stores the document structure.
|
||||||
|
doc = PDFDocument()
|
||||||
|
# Connect the parser and document objects.
|
||||||
parser.set_document(doc)
|
parser.set_document(doc)
|
||||||
doc.set_parser(parser)
|
doc.set_parser(parser)
|
||||||
|
# Supply the document password for initialization.
|
||||||
|
# (If no password is set, give an empty string.)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
|
# Check if the document allows text extraction. If not, abort.
|
||||||
if not doc.is_extractable:
|
if not doc.is_extractable:
|
||||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||||
|
# Create a PDF interpreter object.
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
|
# Process each page contained in the document.
|
||||||
for (pageno,page) in enumerate(doc.get_pages()):
|
for (pageno,page) in enumerate(doc.get_pages()):
|
||||||
if pagenos and (pageno not in pagenos): continue
|
if pagenos and (pageno not in pagenos): continue
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
|
|
Loading…
Reference in New Issue