update usage document

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@219 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-05-29 11:51:24 +00:00
parent f9c9357547
commit a1da7dbca3
3 changed files with 468 additions and 22 deletions

389
docs/layout.obj Normal file
View File

@ -0,0 +1,389 @@
%TGIF 4.1.45-QPL
state(0,37,100.000,0,0,0,16,1,9,1,1,0,0,0,0,1,1,'Helvetica-Bold',1,69120,0,0,1,5,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0).
%
% @(#)$Header$
% %W%
%
unit("1 pixel/pixel").
color_info(19,65535,0,[
"magenta", 65535, 0, 65535, 65535, 0, 65535, 1,
"red", 65535, 0, 0, 65535, 0, 0, 1,
"green", 0, 65535, 0, 0, 65535, 0, 1,
"blue", 0, 0, 65535, 0, 0, 65535, 1,
"yellow", 65535, 65535, 0, 65535, 65535, 0, 1,
"pink", 65535, 49344, 52171, 65535, 49344, 52171, 1,
"cyan", 0, 65535, 65535, 0, 65535, 65535, 1,
"CadetBlue", 24415, 40606, 41120, 24415, 40606, 41120, 1,
"white", 65535, 65535, 65535, 65535, 65535, 65535, 1,
"black", 0, 0, 0, 0, 0, 0, 1,
"DarkSlateGray", 12079, 20303, 20303, 12079, 20303, 20303, 1,
"#00000000c000", 0, 0, 49344, 0, 0, 49152, 1,
"#820782070000", 33410, 33410, 0, 33287, 33287, 0, 1,
"#3cf3fbee34d2", 15420, 64507, 13364, 15603, 64494, 13522, 1,
"#3cf3fbed34d3", 15420, 64507, 13364, 15603, 64493, 13523, 1,
"#ffffa6990000", 65535, 42662, 0, 65535, 42649, 0, 1,
"#ffff0000fffe", 65535, 0, 65535, 65535, 0, 65534, 1,
"#fffe0000fffe", 65535, 0, 65535, 65534, 0, 65534, 1,
"#fffe00000000", 65535, 0, 0, 65534, 0, 0, 1
]).
script_frac("0.6").
fg_bg_colors('black','white').
dont_reencode("FFDingbests:ZapfDingbats").
objshadow_info('#c0c0c0',2,2).
page(1,"",1,'').
box('black','',50,45,300,355,2,2,1,0,0,0,0,0,0,'2',0,[
]).
box('black','',75,75,195,225,2,1,1,10,8,0,0,0,0,'1',0,[
]).
box('black','',85,105,185,125,2,1,1,18,8,0,0,0,0,'1',0,[
]).
box('black','',85,105,105,125,2,1,1,19,0,0,0,0,0,'1',0,[
]).
box('black','',105,105,125,125,2,1,1,20,0,0,0,0,0,'1',0,[
]).
text('black',95,108,1,1,1,9,15,21,12,3,0,0,0,0,2,9,15,0,0,"",0,0,0,0,120,'',[
minilines(9,15,0,0,1,0,0,[
mini_line(9,12,3,0,0,0,[
str_block(0,9,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica',0,69120,9,12,3,0,-1,0,0,0,0,0,
"A")])
])
])]).
text('black',115,108,1,1,1,8,15,28,12,3,0,0,0,0,2,8,15,0,0,"",0,0,0,0,120,'',[
minilines(8,15,0,0,1,0,0,[
mini_line(8,12,3,0,0,0,[
str_block(0,8,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica',0,69120,8,12,3,0,-1,0,0,0,0,0,
"B")])
])
])]).
box('black','',125,105,145,125,0,1,1,32,0,0,0,0,0,'1',0,[
]).
text('black',135,108,1,1,1,9,15,36,12,3,0,0,0,0,2,9,15,0,0,"",0,0,0,0,120,'',[
minilines(9,15,0,0,1,0,0,[
mini_line(9,12,3,0,0,0,[
str_block(0,9,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica',0,69120,9,12,3,0,-1,0,0,0,0,0,
"C")])
])
])]).
poly('black','',2,[
215,140,215,220],0,3,1,51,0,0,0,0,0,0,0,'3',0,0,
"0","",[
0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[
]).
box('black','',175,265,270,325,0,3,1,65,0,0,0,0,0,'3',0,[
]).
box('black','',185,270,260,320,0,1,1,69,8,0,0,0,0,'1',0,[
]).
poly('black','',6,[
195,295,215,290,235,310,245,285,225,300,195,295],0,2,1,74,0,0,0,0,0,0,0,'2',0,0,
"00","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
box('black','',85,275,140,315,1,2,0,87,0,0,0,0,0,'2',0,[
]).
text('black',85,23,1,1,1,44,15,93,12,3,0,0,0,0,2,44,15,0,0,"",0,0,0,0,35,'',[
minilines(44,15,0,0,1,0,0,[
mini_line(44,12,3,0,0,0,[
str_block(0,44,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,44,12,3,0,-1,0,0,0,0,0,
"LTPage")])
])
])]).
text('black',255,133,1,1,1,39,15,100,12,3,0,0,0,0,2,39,15,0,0,"",0,0,0,0,145,'',[
minilines(39,15,0,0,1,0,0,[
mini_line(39,12,3,0,0,0,[
str_block(0,39,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,39,12,3,0,-1,0,0,0,0,0,
"LTLine")])
])
])]).
text('black',125,83,1,1,1,42,15,104,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,95,'',[
minilines(42,15,0,0,1,0,0,[
mini_line(42,12,3,0,0,0,[
str_block(0,42,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0,
"LTChar")])
])
])]).
text('black',245,53,1,1,1,65,15,108,12,3,0,0,0,0,2,65,15,0,0,"",0,0,0,0,65,'',[
minilines(65,15,0,0,1,0,0,[
mini_line(65,12,3,0,0,0,[
str_block(0,65,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,65,12,3,0,-1,0,0,0,0,0,
"LTTextBox")])
])
])]).
text('black',245,88,1,1,1,66,15,110,12,3,0,0,0,0,2,66,15,0,0,"",0,0,0,0,100,'',[
minilines(66,15,0,0,1,0,0,[
mini_line(66,12,3,0,0,0,[
str_block(0,66,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,66,12,3,0,-1,0,0,0,0,0,
"LTTextLine")])
])
])]).
text('black',255,243,1,1,1,51,15,112,12,3,0,0,0,0,2,51,15,0,0,"",0,0,0,0,255,'',[
minilines(51,15,0,0,1,0,0,[
mini_line(51,12,3,0,0,0,[
str_block(0,51,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,51,12,3,0,-1,0,0,0,0,0,
"LTFigure")])
])
])]).
text('black',140,243,1,1,1,51,15,114,12,3,0,0,0,0,2,51,15,0,0,"",0,0,0,0,255,'',[
minilines(51,15,0,0,1,0,0,[
mini_line(51,12,3,0,0,0,[
str_block(0,51,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,51,12,3,0,-1,0,0,0,0,0,
"LTImage")])
])
])]).
text('black',240,223,1,1,1,43,15,116,12,3,0,0,0,0,2,43,15,0,0,"",0,0,0,0,235,'',[
minilines(43,15,0,0,1,0,0,[
mini_line(43,12,3,0,0,0,[
str_block(0,43,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,43,12,3,0,0,0,0,0,0,0,
"LTRect")])
])
])]).
text('black',190,333,1,1,1,62,15,118,12,3,0,0,0,0,2,62,15,0,0,"",0,0,0,0,345,'',[
minilines(62,15,0,0,1,0,0,[
mini_line(62,12,3,0,0,0,[
str_block(0,62,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,62,12,3,0,-1,0,0,0,0,0,
"LTPolygon")])
])
])]).
text('black',170,138,1,1,1,42,15,121,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,150,'',[
minilines(42,15,0,0,1,0,0,[
mini_line(42,12,3,0,0,0,[
str_block(0,42,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0,
"LTText")])
])
])]).
box('black','',145,105,165,125,0,1,1,125,8,0,0,0,0,'1',0,[
]).
poly('black','',2,[
105,95,95,110],0,1,1,135,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
165,140,155,115],0,1,1,138,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
215,65,190,80],0,1,1,139,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
215,100,180,115],0,1,1,140,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
235,140,215,150],0,1,1,141,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
220,235,205,265],0,1,1,146,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
235,255,225,275],0,1,1,147,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
195,330,220,300],0,1,1,148,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
125,255,110,280],0,1,1,149,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
text('black',610,33,1,1,1,44,15,151,12,3,0,0,0,0,2,44,15,0,0,"",0,0,0,0,45,'',[
minilines(44,15,0,0,1,0,0,[
mini_line(44,12,3,0,0,0,[
str_block(0,44,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,44,12,3,0,-1,0,0,0,0,0,
"LTPage")])
])
])]).
text('black',460,108,1,1,1,65,15,152,12,3,0,0,0,0,2,65,15,0,0,"",0,0,0,0,120,'',[
minilines(65,15,0,0,1,0,0,[
mini_line(65,12,3,0,0,0,[
str_block(0,65,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,65,12,3,0,-1,0,0,0,0,0,
"LTTextBox")])
])
])]).
text('black',410,178,1,1,1,66,15,154,12,3,0,0,0,0,2,66,15,0,0,"",0,0,0,0,190,'',[
minilines(66,15,0,0,1,0,0,[
mini_line(66,12,3,0,0,0,[
str_block(0,66,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,66,12,3,0,-1,0,0,0,0,0,
"LTTextLine")])
])
])]).
text('black',360,248,1,1,1,42,15,157,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,260,'',[
minilines(42,15,0,0,1,0,0,[
mini_line(42,12,3,0,0,0,[
str_block(0,42,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0,
"LTChar")])
])
])]).
text('black',420,248,1,1,1,42,15,159,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,260,'',[
minilines(42,15,0,0,1,0,0,[
mini_line(42,12,3,0,0,0,[
str_block(0,42,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0,
"LTChar")])
])
])]).
text('black',480,248,1,1,1,42,15,161,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,260,'',[
minilines(42,15,0,0,1,0,0,[
mini_line(42,12,3,0,0,0,[
str_block(0,42,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0,
"LTText")])
])
])]).
text('black',460,178,1,1,1,12,15,170,12,3,0,0,0,0,2,12,15,0,0,"",0,0,0,0,190,'',[
minilines(12,15,0,0,1,0,0,[
mini_line(12,12,3,0,0,0,[
str_block(0,12,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,12,12,3,0,-1,0,0,0,0,0,
"...")])
])
])]).
text('black',520,248,1,1,1,12,15,172,12,3,0,0,0,0,2,12,15,0,0,"",0,0,0,0,260,'',[
minilines(12,15,0,0,1,0,0,[
mini_line(12,12,3,0,0,0,[
str_block(0,12,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,12,12,3,0,-1,0,0,0,0,0,
"...")])
])
])]).
text('black',560,108,1,1,1,51,15,174,12,3,0,0,0,0,2,51,15,0,0,"",0,0,0,0,120,'',[
minilines(51,15,0,0,1,0,0,[
mini_line(51,12,3,0,0,0,[
str_block(0,51,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,51,12,3,0,-1,0,0,0,0,0,
"LTFigure")])
])
])]).
text('black',635,108,1,1,1,39,15,178,12,3,0,0,0,0,2,39,15,0,0,"",0,0,0,0,120,'',[
minilines(39,15,0,0,1,0,0,[
mini_line(39,12,3,0,0,0,[
str_block(0,39,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,39,12,3,0,-1,0,0,0,0,0,
"LTLine")])
])
])]).
text('black',700,108,1,1,1,43,15,180,12,3,0,0,0,0,2,43,15,0,0,"",0,0,0,0,120,'',[
minilines(43,15,0,0,1,0,0,[
mini_line(43,12,3,0,0,0,[
str_block(0,43,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,43,12,3,0,0,0,0,0,0,0,
"LTRect")])
])
])]).
text('black',580,178,1,1,1,62,15,182,12,3,0,0,0,0,2,62,15,0,0,"",0,0,0,0,190,'',[
minilines(62,15,0,0,1,0,0,[
mini_line(62,12,3,0,0,0,[
str_block(0,62,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,62,12,3,0,-1,0,0,0,0,0,
"LTPolygon")])
])
])]).
text('black',775,108,1,1,1,51,15,186,12,3,0,0,0,0,2,51,15,0,0,"",0,0,0,0,120,'',[
minilines(51,15,0,0,1,0,0,[
mini_line(51,12,3,0,0,0,[
str_block(0,51,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,51,12,3,0,-1,0,0,0,0,0,
"LTImage")])
])
])]).
poly('black','',2,[
475,105,590,50],0,1,1,190,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
560,110,595,50],0,1,1,191,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
635,105,600,50],0,1,1,192,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
610,50,700,100],0,1,1,193,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
765,100,630,50],0,1,1,194,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
460,125,425,175],0,1,1,196,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
560,125,570,175],0,1,1,197,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
415,195,370,245],0,1,1,198,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
415,195,420,245],0,1,1,199,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
415,195,475,245],0,1,1,200,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
470,125,485,175],0,1,1,206,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
420,195,510,220],0,1,1,207,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
poly('black','',2,[
565,125,635,175],0,1,1,208,0,0,0,0,0,0,0,'1',0,0,
"0","",[
0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[
]).
text('black',635,178,1,1,1,12,15,215,12,3,0,0,0,0,2,12,15,0,0,"",0,0,0,0,190,'',[
minilines(12,15,0,0,1,0,0,[
mini_line(12,12,3,0,0,0,[
str_block(0,12,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,12,12,3,0,-1,0,0,0,0,0,
"...")])
])
])]).

BIN
docs/layout.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.5 KiB

View File

@ -2,16 +2,18 @@
<html> <html>
<head> <head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<title>PDFMiner Development Guide</title> <title>Programming with PDFMiner</title>
<style type="text/css"><!-- <style type="text/css"><!--
blockquote { background: #eeeeee; } blockquote { background: #eeeeee; }
.comment { color: darkgreen; } .comment { color: darkgreen; }
--></style> --></style>
</head><body> </head><body>
<h1>PDFMiner Development Guide</h1>
<p> <p>
This document describes how to use PDFMiner as a library <a href="index.html">[Back to PDFMiner]</a>
<h1>Programming with PDFMiner</h1>
<p>
This document explains how to use PDFMiner as a library
from other applications. from other applications.
<ul> <ul>
<li> <a href="#basic">Basic Usage</a> <li> <a href="#basic">Basic Usage</a>
@ -59,7 +61,7 @@ for page in doc.get_pages():
In PDFMiner, there are several Python classes involved in parsing a PDF file, In PDFMiner, there are several Python classes involved in parsing a PDF file,
as shown in Figure 1. as shown in Figure 1.
<div> <div align=center>
<img src="objrel.png"><br> <img src="objrel.png"><br>
<small>Figure 1. Relationships between PDFMiner objects</small> <small>Figure 1. Relationships between PDFMiner objects</small>
</div> </div>
@ -70,11 +72,10 @@ as shown in Figure 1.
<p> <p>
PDF documents are more like graphics, rather than text documents. PDF documents are more like graphics, rather than text documents.
It presents no logical structure such as sentences or paragraphs (for most cases). It presents no logical structure such as sentences or paragraphs (for most cases).
PDFMiner tries to reconstruct the original structure by performing PDFMiner attempts to reconstruct some of these structures by performing
basic layout analysis. basic layout analysis.
<p> <p>
Here is a typical way to do it:
<blockquote><pre> <blockquote><pre>
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
@ -86,40 +87,96 @@ device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in doc.get_pages(): for page in doc.get_pages():
interpreter.process_page(page) interpreter.process_page(page)
<span class="comment"># receive the top-level layout object.</span> <span class="comment"># receive the LTPage object for the page.</span>
ltpage = device.get_result() layout = device.get_result()
</pre></blockquote> </pre></blockquote>
<ul> The layout analyzer gives a "<code>LTPage</code>" object for each page
<li> <code>LTPage</code> in the PDF document. The object contains child objects within the page,
<li> <code>LTTextBox</code> forming a tree-like structure. Figure 2 shows the relationship between
<li> <code>LTTextLine</code> these objects.
<li> <code>LTChar</code>
<li> <code>LTText</code> <div align=center>
<li> <code>LTFigure</code> <img src="layout.png"><br>
<li> <code>LTImage</code> <small>Figure 2. Layout objects and their tree structure</small>
<li> <code>LTRect</code> </div>
<li> <code>LTPolygon</code>
<li> <code>LTLine</code> <dl>
</ul> <dt> <code>LTPage</code>
<dd> Represents an entire page. May contain child objects like
<code>LTTextBox</code>, <code>LTFigure</code>, <code>LTImage</code>, <code>LTRect</code>,
<code>LTPolygon</code> and <code>LTLine</code>.
<dt> <code>LTTextBox</code>
<dd> Represents a group of text chunks that can be contained in a rectangular area.
Note that this box is created by geometric analysis and does not necessarily
represent a logical boundary of the text.
It contains a list of <code>LTTextLine</code> objects.
<dt> <code>LTTextLine</code>
<dd> Contains a list of <code>LTChar</code> objects that represent
a single text line. The characters are aligned either horizontally
or vertically, depending on the text's writing mode.
<dt> <code>LTChar</code>
<dt> <code>LTText</code>
<dd> These objects represent an actual letter in the text as a Unicode string.
Note that, while a <code>LTChar</code> object has actual boundaries,
<code>LTText</code> objects do not, as these are "virtual" characters,
inserted by a layout analyzer according to the relationship between two characters
(e.g. a space).
<dt> <code>LTFigure</code>
<dd> Represents an area used by PDF Form objects. PDF Forms can be used to
present figures or pictures by embedding yet another PDF document within a page.
Note that <code>LTFigure</code> objects can appear recursively.
<dt> <code>LTImage</code>
<dd> Represents an image object. Embedded images can be
in JPEG or other formats, but currently PDFMiner does not
pay much attention to graphical objects.
<dt> <code>LTLine</code>
<dd> Represents a single straight line shown in a page.
Could be used for separating texts or figures.
<dt> <code>LTRect</code>
<dd> Represents a rectangle shown in a page.
Could be used for framing other pictures or figures.
<dt> <code>LTPolygon</code>
<dd> Represents a polygon in a page.
</dl>
<a name="toc"> <a name="toc">
<hr noshade> <hr noshade>
<h2>TOC Extraction</h2> <h2>TOC Extraction</h2>
<p>
PDFMiner provides functions to access the document's table of contents
("Outlines").
<blockquote><pre> <blockquote><pre>
from pdfminer.pdfparser import PDFParser, PDFDocument
fp = open('mypdf.pdf', 'rb') fp = open('mypdf.pdf', 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument() doc = PDFDocument()
parser.set_document(doc) parser.set_document(doc)
doc.set_parser(parser) doc.set_parser(parser)
doc.initialize(password) doc.initialize(password)
<span class="comment"># Get the outlines of the document.</span> <span class="comment"># Get the outlines of the document.</span>
outlines = doc.get_outlines() outlines = doc.get_outlines()
for (level,title,dest,a,se) in outlines: for (level,title,dest,a,se) in outlines:
print (level, title) print (level, title)
</pre></blockquote> </pre></blockquote>
<p>
In some PDF documents, destinations are referred to as page numbers.
In other PDF documents, destinations are referred to as page numbers plus
the location within the page. Since PDF does not provide a way to
point to graphical objects in a page, normally these in-page destinations
are specified by physical coordinates.
<hr noshade> <hr noshade>
<address>Yusuke Shinyama</address> <address>Yusuke Shinyama</address>