From 7f587cafec57a48c5cd33f3fc56f8f927d3ace9c Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 24 Apr 2010 13:31:31 +0000 Subject: [PATCH] some usage document added git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@214 1aa58f4a-7d42-0410-adbc-911cccaed67c --- docs/objrel.obj | 185 ++++++++++++++++++++++++++++++++++++++++++ docs/objrel.png | Bin 0 -> 2006 bytes docs/usage.html | 114 ++++++++++++++++++++++++++ pdfminer/pdfinterp.py | 10 ++- 4 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 docs/objrel.obj create mode 100644 docs/objrel.png create mode 100644 docs/usage.html diff --git a/docs/objrel.obj b/docs/objrel.obj new file mode 100644 index 0000000..bcfb0a7 --- /dev/null +++ b/docs/objrel.obj @@ -0,0 +1,185 @@ +%TGIF 4.1.45-QPL +state(0,37,100.000,0,0,0,16,1,9,1,1,1,0,0,2,1,1,'Helvetica-Bold',1,69120,0,0,1,10,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0). +% +% @(#)$Header$ +% %W% +% +unit("1 pixel/pixel"). +color_info(19,65535,0,[ + "magenta", 65535, 0, 65535, 65535, 0, 65535, 1, + "red", 65535, 0, 0, 65535, 0, 0, 1, + "green", 0, 65535, 0, 0, 65535, 0, 1, + "blue", 0, 0, 65535, 0, 0, 65535, 1, + "yellow", 65535, 65535, 0, 65535, 65535, 0, 1, + "pink", 65535, 49344, 52171, 65535, 49344, 52171, 1, + "cyan", 0, 65535, 65535, 0, 65535, 65535, 1, + "CadetBlue", 24415, 40606, 41120, 24415, 40606, 41120, 1, + "white", 65535, 65535, 65535, 65535, 65535, 65535, 1, + "black", 0, 0, 0, 0, 0, 0, 1, + "DarkSlateGray", 12079, 20303, 20303, 12079, 20303, 20303, 1, + "#00000000c000", 0, 0, 49344, 0, 0, 49152, 1, + "#820782070000", 33410, 33410, 0, 33287, 33287, 0, 1, + "#3cf3fbee34d2", 15420, 64507, 13364, 15603, 64494, 13522, 1, + "#3cf3fbed34d3", 15420, 64507, 13364, 15603, 64493, 13523, 1, + "#ffffa6990000", 65535, 42662, 0, 65535, 42649, 0, 1, + "#ffff0000fffe", 65535, 0, 65535, 65535, 0, 65534, 1, + "#fffe0000fffe", 65535, 0, 65535, 65534, 0, 65534, 1, + "#fffe00000000", 65535, 0, 0, 65534, 0, 0, 1 +]). 
+script_frac("0.6"). +fg_bg_colors('black','white'). +dont_reencode("FFDingbests:ZapfDingbats"). +objshadow_info('#c0c0c0',2,2). +page(1,"",1,''). +oval('black','',350,380,450,430,2,2,1,88,0,0,0,0,0,'2',0,[ +]). +poly('black','',2,[ + 270,270,350,230],1,2,1,54,0,0,0,0,0,0,0,'2',0,0, + "0","",[ + 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ +]). +poly('black','',2,[ + 270,280,350,320],1,2,1,55,0,0,0,0,0,0,0,'2',0,0, + "0","",[ + 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ +]). +box('black','',350,100,450,150,2,2,1,2,0,0,0,0,0,'2',0,[ +]). +text('black',400,118,1,1,1,84,15,3,12,3,0,0,0,0,2,84,15,0,0,"",0,0,0,0,130,'',[ +minilines(84,15,0,0,1,0,0,[ +mini_line(84,12,3,0,0,0,[ +str_block(0,84,12,3,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,84,12,3,0,0,0,0,0,0,0, + "PDFDocument")]) +]) +])]). +box('black','',150,100,250,150,2,2,1,13,0,0,0,0,0,'2',0,[ +]). +text('black',200,118,1,1,1,63,15,14,12,3,0,0,0,0,2,63,15,0,0,"",0,0,0,0,130,'',[ +minilines(63,15,0,0,1,0,0,[ +mini_line(63,12,3,0,0,0,[ +str_block(0,63,12,3,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,63,12,3,0,0,0,0,0,0,0, + "PDFParser")]) +]) +])]). +box('black','',350,200,450,250,2,2,1,20,0,0,0,0,0,'2',0,[ +]). +text('black',400,218,1,1,1,88,15,21,12,3,0,0,0,0,2,88,15,0,0,"",0,0,0,0,230,'',[ +minilines(88,15,0,0,1,0,0,[ +mini_line(88,12,3,0,0,0,[ +str_block(0,88,12,3,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,88,12,3,0,0,0,0,0,0,0, + "PDFInterpreter")]) +]) +])]). +box('black','',350,300,450,350,2,2,1,23,0,0,0,0,0,'2',0,[ +]). +text('black',400,318,1,1,1,65,15,24,12,3,0,0,0,0,2,65,15,0,0,"",0,0,0,0,330,'',[ +minilines(65,15,0,0,1,0,0,[ +mini_line(65,12,3,0,0,0,[ +str_block(0,65,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,65,12,3,0,-1,0,0,0,0,0, + "PDFDevice")]) +]) +])]). +box('black','',180,250,280,300,2,2,1,29,0,0,0,0,0,'2',0,[ +]). 
+text('black',230,268,1,1,1,131,15,30,12,3,2,0,0,0,2,131,15,0,0,"",0,0,0,0,280,'',[ +minilines(131,15,0,0,1,0,0,[ +mini_line(131,12,3,0,0,0,[ +str_block(0,131,12,3,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,131,12,3,0,0,0,0,0,0,0, + "PDFResourceManager")]) +]) +])]). +poly('black','',2,[ + 250,140,350,140],1,2,1,45,0,0,0,0,0,0,0,'2',0,0, + "0","",[ + 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ +]). +poly('black','',2,[ + 350,110,250,110],1,2,1,46,0,0,0,0,0,0,0,'2',0,0, + "0","",[ + 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ +]). +poly('black','',2,[ + 400,150,400,200],1,2,1,47,0,0,0,0,0,0,0,'2',0,0, + "0","",[ + 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ +]). +poly('black','',2,[ + 400,250,400,300],1,2,1,56,0,0,0,0,0,0,0,'2',0,0, + "0","",[ + 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ +]). +poly('black','',2,[ + 400,350,400,380],0,2,1,65,0,0,0,0,0,0,0,'2',0,0, + "0","",[ + 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ +]). +text('black',400,388,3,1,1,44,41,71,12,3,0,-2,0,0,2,44,41,0,0,"",0,0,0,0,400,'',[ +minilines(44,41,0,0,1,-2,0,[ +mini_line(44,12,3,0,0,0,[ +str_block(0,44,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,44,12,3,0,-1,0,0,0,0,0, + "Display")]) +]), +mini_line(20,12,3,0,0,0,[ +str_block(0,20,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,20,12,3,0,-1,0,0,0,0,0, + "File")]) +]), +mini_line(23,12,3,0,0,0,[ +str_block(0,23,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,23,12,3,0,-1,0,0,0,0,0, + "etc.")]) +]) +])]). +text('black',300,88,1,1,1,92,15,79,12,3,0,0,0,0,2,92,15,0,0,"",0,0,0,0,100,'',[ +minilines(92,15,0,0,1,0,0,[ +mini_line(92,12,3,0,0,0,[ +str_block(0,92,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,92,12,3,0,-1,0,0,0,0,0, + "request objects")]) +]) +])]). 
+text('black',300,148,1,1,1,78,15,84,12,3,0,0,0,0,2,78,15,0,0,"",0,0,0,0,160,'',[ +minilines(78,15,0,0,1,0,0,[ +mini_line(78,12,3,0,0,0,[ +str_block(0,78,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,78,12,3,0,-1,0,0,0,0,0, + "store objects")]) +]) +])]). +oval('black','',20,100,120,150,2,2,1,106,0,0,0,0,0,'2',0,[ +]). +text('black',70,118,1,1,1,46,15,107,12,3,0,0,0,0,2,46,15,0,0,"",0,0,0,0,130,'',[ +minilines(46,15,0,0,1,0,0,[ +mini_line(46,12,3,0,0,0,[ +str_block(0,46,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,46,12,3,0,-1,0,0,0,0,0, + "PDF file")]) +]) +])]). +poly('black','',2,[ + 120,120,150,120],0,2,1,114,0,2,0,0,0,0,0,'2',0,0, + "0","",[ + 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ +]). +text('black',400,158,1,1,1,68,15,115,12,3,2,0,0,0,2,68,15,0,0,"",0,0,0,0,170,'',[ +minilines(68,15,0,0,1,0,0,[ +mini_line(68,12,3,0,0,0,[ +str_block(0,68,12,3,0,0,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,68,12,3,0,0,0,0,0,0,0, + "page object")]) +]) +])]). +text('black',400,258,1,1,1,115,15,119,12,3,2,0,0,0,2,115,15,0,0,"",0,0,0,0,270,'',[ +minilines(115,15,0,0,1,0,0,[ +mini_line(115,12,3,0,0,0,[ +str_block(0,115,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,115,12,3,0,-1,0,0,0,0,0, + "rendering sequence")]) +]) +])]). 
diff --git a/docs/objrel.png b/docs/objrel.png new file mode 100644 index 0000000000000000000000000000000000000000..528228cedfd7aeafd47945d2635049ff0333a21f GIT binary patch literal 2006 zcmV;{2Pyc8P)0~QDNsfBo@7Aym zA0)oZ<-_SD`F?yIC1&H&UhIDbE!${1-%!wo!cO%YefdSN=zOb!U`t%p6v=wBu1FFH zUW!HS*s_Fm#Y%&qakto5EL+0bf^`mq`Jy(IoE9zX5(dT9w~9#nDBjS=KdygLk1@%p z3WeoudRDiqzKSvIIazrcwCN>uQZrOBg0a|x*qt$KGRHGup})d~W}F^$Is(?FL$}r8 zovCn=!myWg3oRK6Ta{5E355sa(g6?-)8mpEt{+w+V9%=@$f0^O3bwC{$xuCe(UUxmoVB+M5*J#<{QK6;n<_AUj~zAkYZZaYJ_W zI)^~E+zMUi2zZEvHRr;AyOKPU?a|EAw1@YTcN0^fnSK4B;%1F8>}#iXje)O9`SKKK zDLG3HK;53pGO(O~m0D6WAxNx~E&aFUqRK!bR9Lvxm?w!TP!S45c~YX#X}p)x(=5DS zzMp8ElXS3;(ut&GbfO3)uf@L8;^PV2E$4UVuJ+|%i(y?8N_PTJ zeSO?5IZ!8In6o5d%{kPU9_+2yggL9b$^7n8TMa>T+kCy%At#&V;GJ`6BkxrM0@3dd z^!pJ#0-(nn^e7iOt}^*U&`8eY4??q5)ZD>NI?sa;JzGW1k4%0q)MC_(9NEXHH2{+j zomKp*1HaBeudL8(xyaRHB(&-E_13F_h3+H)SIE&YBGCi147=}@z4tvIwBfM*CbcvW zHy5)i5bAnkzJL5wBLoP~^@iS8o@#w>+~$Q>x>k4KT;JOv@xq?2H67URdxIf&9~+wP z_AwQx_~cpbrrPwZuBd?uzFoNy{t+BDYi^MbHvM^iI8L3mRyDPgEfoqs6pp3tsw z!tM@{x&0Ss}*?5S|&@shX2iUBcWs?v;c!HVUd(m_$N1X3qoJB`pn1yD@v7g>De4 zyijKH*0_sIktQxUkbOmdXa){|8>;>yKTP_I+;HG8@vYJ}`G&IKrGxcz_@zwi$QCkpJiNpcN zuUAAYHZ&!_qA5b5#sR%6ZtaW3IZct@%_)xywKxFQ#A4rpzxS1lUX%GJR*Q(MnnwHh zP$=Bg$Ei@2gp@{{vr4l1O=mSC$>Uh?G}ziO>0cE%$!G@atSD#*WkbFJ(vL@Ph8fAo zwpDR$>++qr)loCPCdaQgLFdGYapC8;$HE#(@WNIpyRrEL5vA^nq>?X?GF={eBotke6FA9l;Mk59_NrgKBD5q{XR=vjULdaJ8H`~IoQ4gG20 zFY-ej3)}u8XzsXWX!@%FQ1(^v!;k){1JK6W;lhL8z0AxD}f@~+FW5~gUuG`7axnx09cB7Z6I= zeFuK0gWg-AcXR0dBYww<-Xo)T<>)v7KJtK$b)cgw=y(o3f`pD)?XiO-CN2kt|0MW4 zn16IH?MC#Vuye7~M77xI?sQXHHHm>8O%qG-K6d9jI?2P~z-e)w|QKZ zfx`j)?N$N~>#}PqP&kacFYeP)AlOTfUgJ>MVqQVe`^O73EChkqPp#3|7X)5ECr4vn z5O~GrAkb+iA-f+wIc-DWp>0c}PpyVB2&Qd2m(@;6GR*=YeNZwdtST9qHV%OFdCc5* zdQGg*2Oc5N`N}QpClneGNMjtim4dQt({TU*>1C9hJ}$d?0HmW$!a1v_ZtDOzdaA5x z|7FwXPXW;TuNL5Vjt4Cq93?n3=it!YuvmR$i + + + +PDFMiner Development Guide + + + +

PDFMiner Development Guide

+

+This document describes how to use PDFMiner as a library +from other applications. +

+ + +
+

Basic Usage

+

+A typical way to parse a PDF file is the following: +

+# Open a PDF file.
+fp = open('mypdf.pdf', 'rb')
+# Create a PDF parser object associated with the file object.
+parser = PDFParser(fp)
+# Create a PDF document object that stores the document structure.
+doc = PDFDocument()
+# Connect the parser and document objects.
+parser.set_document(doc)
+doc.set_parser(parser)
+# Supply the document password for initialization.
+# (If no password is set, give an empty string.)
+doc.initialize(password)
+# Check if the document allows text extraction. If not, abort.
+if not doc.is_extractable:
+    raise PDFTextExtractionNotAllowed
+# Create a PDF resource manager object that stores shared resources.
+rsrcmgr = PDFResourceManager()
+# Create a PDF device object.
+device = PDFDevice(rsrcmgr)
+# Create a PDF interpreter object.
+interpreter = PDFPageInterpreter(rsrcmgr, device)
+# Process each page contained in the document.
+for page in doc.get_pages():
+    interpreter.process_page(page)
+
+ +

+In PDFMiner, there are several objects involved in parsing a PDF file. +Figure 1 shows the relationships between these objects. + +

+
+Figure 1. Relationships between objects +
+ +
+
+

Accessing Layout Objects

+

+PDFMiner performs a basic layout analysis. + +

+# Set parameters for analysis.
+laparams = LAParams()
+# Create a PDF page aggregator object.
+device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+interpreter = PDFPageInterpreter(rsrcmgr, device)
+for page in doc.get_pages():
+    interpreter.process_page(page)
+    # Receive the top-level layout object.
+    ltpage = device.get_result()
+
+ +
    +
  • LTPage +
  • LTTextBox +
  • LTTextLine +
  • LTChar +
  • LTText +
  • LTFigure +
  • LTImage +
  • LTRect +
  • LTPolygon +
  • LTLine +
+ +
+
+

TOC Extraction

+ +
+fp = open('mypdf.pdf', 'rb')
+parser = PDFParser(fp)
+doc = PDFDocument()
+parser.set_document(doc)
+doc.set_parser(parser)
+doc.initialize(password)
+# Get the outlines of the document.
+outlines = doc.get_outlines()
+for (level,title,dest,a,se) in outlines:
+    print (level, title)
+
+ + +
+
Yusuke Shinyama
+ diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 2b02a8e..a78338a 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -809,14 +809,22 @@ class PDFPageInterpreter(object): class PDFTextExtractionNotAllowed(PDFInterpreterError): pass def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password=''): - doc = PDFDocument() + # Create a PDF parser object associated with the file object. parser = PDFParser(fp) + # Create a PDF document object that stores the document structure. + doc = PDFDocument() + # Connect the parser and document objects. parser.set_document(doc) doc.set_parser(parser) + # Supply the document password for initialization. + # (If no password is set, give an empty string.) doc.initialize(password) + # Check if the document allows text extraction. If not, abort. if not doc.is_extractable: raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) + # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) + # Process each page contained in the document. for (pageno,page) in enumerate(doc.get_pages()): if pagenos and (pageno not in pagenos): continue interpreter.process_page(page)