From 0e0acfc3ffbf5225f1b4927719262c7ac5ca9894 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 17 Oct 2010 05:14:40 +0000 Subject: [PATCH] documentation improved git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@247 1aa58f4a-7d42-0410-adbc-911cccaed67c --- docs/objrel.obj | 28 ++++++++++--------- docs/objrel.png | Bin 2006 -> 2038 bytes docs/programming.html | 61 ++++++++++++++++++++++++++++-------------- 3 files changed, 56 insertions(+), 33 deletions(-) diff --git a/docs/objrel.obj b/docs/objrel.obj index bcfb0a7..12a0f56 100644 --- a/docs/objrel.obj +++ b/docs/objrel.obj @@ -1,4 +1,4 @@ -%TGIF 4.1.45-QPL +%TGIF 4.2.2 state(0,37,100.000,0,0,0,16,1,9,1,1,1,0,0,2,1,1,'Helvetica-Bold',1,69120,0,0,1,10,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0). % % @(#)$Header$ @@ -30,6 +30,8 @@ script_frac("0.6"). fg_bg_colors('black','white'). dont_reencode("FFDingbests:ZapfDingbats"). objshadow_info('#c0c0c0',2,2). +rotate_pivot(0,0,0,0). +spline_tightness(1). page(1,"",1,''). oval('black','',350,380,450,430,2,2,1,88,0,0,0,0,0,'2',0,[ ]). @@ -167,19 +169,19 @@ poly('black','',2,[ "0","",[ 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ ]). -text('black',400,158,1,1,1,68,15,115,12,3,2,0,0,0,2,68,15,0,0,"",0,0,0,0,170,'',[ -minilines(68,15,0,0,1,0,0,[ -mini_line(68,12,3,0,0,0,[ -str_block(0,68,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,68,12,3,0,0,0,0,0,0,0, - "page object")]) +text('black',400,158,1,1,1,84,15,115,12,3,2,0,0,0,2,84,15,0,0,"",0,0,0,0,170,'',[ +minilines(84,15,0,0,1,0,0,[ +mini_line(84,12,3,0,0,0,[ +str_block(0,84,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,84,12,3,0,-1,0,0,0,0,0, + "page contents")]) ]) ])]). 
-text('black',400,258,1,1,1,115,15,119,12,3,2,0,0,0,2,115,15,0,0,"",0,0,0,0,270,'',[ -minilines(115,15,0,0,1,0,0,[ -mini_line(115,12,3,0,0,0,[ -str_block(0,115,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,115,12,3,0,-1,0,0,0,0,0, - "rendering sequence")]) +text('black',400,258,1,1,1,129,15,119,12,3,2,0,0,0,2,129,15,0,0,"",0,0,0,0,270,'',[ +minilines(129,15,0,0,1,0,0,[ +mini_line(129,12,3,0,0,0,[ +str_block(0,129,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,129,12,3,0,-1,0,0,0,0,0, + "rendering instructions")]) ]) ])]). diff --git a/docs/objrel.png b/docs/objrel.png index 528228cedfd7aeafd47945d2635049ff0333a21f..3b9f5b6275f8f3f4f49762002bcbbd3be93db1d2 100644 GIT binary patch literal 2038 zcmV8%Z1h@U)6S2&A0%a6;*xo~*Cl?u!A*KD1DUaQzpQ zOUU=>#8wwl8-_q5scGmfF41zV~g`um~;)brOAb6u> zjWf%VH*`w^LGy98@-%PB>vPr>45qV2Pjy!Wfp_8iN3Kzy=PhyYyu*$uiE)N4047R46FNabDERKh6tG!2K z;V6Y+JLwi`8Wh$FULgsENBzS3XZ-}54RZcR(sm8*ar2nJF0WD!bi0|D!PpepT}FSEI}YYU=|_J4Acoje)PJ6 zK)zfHpXa;bDHax;6aU>>@=~+CnWbv?W|EKNV_<7#_esIcntj+YF6<%)-{#W!G0@Ue zmKK22o?YQ!KK*unp}ZV|#5&)hzprM?93*mqhdY%7iI0K0oFV#RjzOpK_B@q%IFp}= zSFVT@?4x?F>dTbqa&FLK$7%8V5j@DJ4`y%QX~7mPr$q;T^uuMFhx0BY5HK z<3Y}Wl!Rf*Qss5$t-c<>_L@z&qk51`A1u_%A=uh$?d~+l`F1{d<=oiFYt@86^xc8J zAJHQKddxwOa*^XImp=uK>m2mT3cZ$#Ts=lYo1NdmdNr_~9VOrjIT}VJ_69A}?hMMq`(6mza9IB^ztB~- zpRSaFkkU$`bGBp<0)$tzLOc4B(HV@}qVSd)B?r!S20J8C*rrC+ft}7^Fcj`%#h`8< z=Yuu(9o6Q1ogLM+aVu)<-qef^$G|xL}M*l>J1EfeI#ez4jpn-KW#Wil~6SWJgVFP z0)>PtfgpdSLqku(quc~e-~|46;b8#@hY_9!gSA2C!QgPu0ENRn+=IbC;Sjjc{TJZ4 z#ry+K7lWnC+ADW)T?E4Gj9JCOpK``Zc|m!eQ1+p)LP+f6JoyNNrRz;+TR(qB z$QkUdRS{9ggWA-8o-5@NBLXFE?{VsAv?>`rSUCmb%hpoUH5Qb%cQ5+`yMPbZ!AUl$>8xVd#dsHL%soeIIDVa|K0dN<0w@Uz#&&VsHd-d_}#>hC{n zy)Q2(?}aU|&`z8zW~Vo##SERvQ1VUge(1flDN(XnqbR?m+pnCwCUje|w^|~~bi;vf zXw!k}sW7Y(WuL)a4_=8*Ivq+DvkgO%GnGcBLX`1+90%(}Pc&#z-fDc_YLwOSNA>Ph z9OlNr+GdM=s=!HRHBi-wgr=M~wegUr*GXIy)^aPG+b@ZtHn);`IzuGk7EQIe`GzR@+Uv~RO6Hx$SKb57<+cOO#)d9u zisG=DXMJ4E%iD(T^idqvEOu0X+udo>=RYo$g`q!n{Y7y|v9Rthg656~hAn>;0BXJ} 
zaroIkbpZN! zcQQH@9DsZ$L6GlcbPPGT&~-ccMOVnd&L97GC;_K7%xY6F7krIDua`?8S!9UBO@}HXVx<;IPYnx)z7SlE%JzaZ3p7O}Q`L z(^3%Fmb}*~D6H|X5IFe53vTQS0EbVlxv?(*96l%K#=ZdX)|Uf7r=7Uw{>qEB4S=V% zt$M#JV^1&wpk&*r72`aoxmj_@nn}qo^kvO$TpY6JF;hR#qEc`lcmzP_Uuu<}GiX8} zi*cmZGWx3RjsxP5ojld)s}|sR&H!3C z=p{I8&BCF(VX->M7A@l#3soF!Z6Kj#Oo8Ez2Q~Hrx^gTJx^ES9t=@X@xSxmJk6i1p z5nzx&BLMd}IBd+}9tVewa_)2P{Vlcx9m)F;IFhqDs{Vns3!FRHiKMh@gJ&z&4yA*( zfirT+g}@n^tpTbGLe~InwLvQsvFd0`cD?x&vCcuStk7#Y{OZvQ%@hz2y+Y>F-$ON8 USqtW(82|tP07*qoM6N<$f`cLBtN;K2 literal 2006 zcmV;{2Pyc8P)0~QDNsfBo@7Aym zA0)oZ<-_SD`F?yIC1&H&UhIDbE!${1-%!wo!cO%YefdSN=zOb!U`t%p6v=wBu1FFH zUW!HS*s_Fm#Y%&qakto5EL+0bf^`mq`Jy(IoE9zX5(dT9w~9#nDBjS=KdygLk1@%p z3WeoudRDiqzKSvIIazrcwCN>uQZrOBg0a|x*qt$KGRHGup})d~W}F^$Is(?FL$}r8 zovCn=!myWg3oRK6Ta{5E355sa(g6?-)8mpEt{+w+V9%=@$f0^O3bwC{$xuCe(UUxmoVB+M5*J#<{QK6;n<_AUj~zAkYZZaYJ_W zI)^~E+zMUi2zZEvHRr;AyOKPU?a|EAw1@YTcN0^fnSK4B;%1F8>}#iXje)O9`SKKK zDLG3HK;53pGO(O~m0D6WAxNx~E&aFUqRK!bR9Lvxm?w!TP!S45c~YX#X}p)x(=5DS zzMp8ElXS3;(ut&GbfO3)uf@L8;^PV2E$4UVuJ+|%i(y?8N_PTJ zeSO?5IZ!8In6o5d%{kPU9_+2yggL9b$^7n8TMa>T+kCy%At#&V;GJ`6BkxrM0@3dd z^!pJ#0-(nn^e7iOt}^*U&`8eY4??q5)ZD>NI?sa;JzGW1k4%0q)MC_(9NEXHH2{+j zomKp*1HaBeudL8(xyaRHB(&-E_13F_h3+H)SIE&YBGCi147=}@z4tvIwBfM*CbcvW zHy5)i5bAnkzJL5wBLoP~^@iS8o@#w>+~$Q>x>k4KT;JOv@xq?2H67URdxIf&9~+wP z_AwQx_~cpbrrPwZuBd?uzFoNy{t+BDYi^MbHvM^iI8L3mRyDPgEfoqs6pp3tsw z!tM@{x&0Ss}*?5S|&@shX2iUBcWs?v;c!HVUd(m_$N1X3qoJB`pn1yD@v7g>De4 zyijKH*0_sIktQxUkbOmdXa){|8>;>yKTP_I+;HG8@vYJ}`G&IKrGxcz_@zwi$QCkpJiNpcN zuUAAYHZ&!_qA5b5#sR%6ZtaW3IZct@%_)xywKxFQ#A4rpzxS1lUX%GJR*Q(MnnwHh zP$=Bg$Ei@2gp@{{vr4l1O=mSC$>Uh?G}ziO>0cE%$!G@atSD#*WkbFJ(vL@Ph8fAo zwpDR$>++qr)loCPCdaQgLFdGYapC8;$HE#(@WNIpyRrEL5vA^nq>?X?GF={eBotke6FA9l;Mk59_NrgKBD5q{XR=vjULdaJ8H`~IoQ4gG20 zFY-ej3)}u8XzsXWX!@%FQ1(^v!;k){1JK6W;lhL8z0AxD}f@~+FW5~gUuG`7axnx09cB7Z6I= zeFuK0gWg-AcXR0dBYww<-Xo)T<>)v7KJtK$b)cgw=y(o3f`pD)?XiO-CN2kt|0MW4 zn16IH?MC#Vuye7~M77xI?sQXHHHm>8O%qG-K6d9jI?2P~z-e)w|QKZ 
zfx`j)?N$N~>#}PqP&kacFYeP)AlOTfUgJ>MVqQVe`^O73EChkqPp#3|7X)5ECr4vn z5O~GrAkb+iA-f+wIc-DWp>0c}PpyVB2&Qd2m(@;6GR*=YeNZwdtST9qHV%OFdCc5* zdQGg*2Oc5N`N}QpClneGNMjtim4dQt({TU*>1C9hJ}$d?0HmW$!a1v_ZtDOzdaA5x z|7FwXPXW;TuNL5Vjt4Cq93?n3=it!YuvmR$i +
  • Overview
  • Basic Usage
  • Layout Analysis
  • TOC Extraction + +
    +

    Overview

    +

    +PDF is evil. +Because a PDF file is normally big and has a complex structure, +parsing a PDF as a whole is time-and-memory +consuming. Furthermore, not every part is needed for most PDF +processing. Therefore, PDFMiner takes a strategy of lazy parsing, +which is to parse the stuff only when it's necessary. To parse PDF +files, you need at least two classes: PDFParser +and PDFDocument. These objects work together. +PDFParser fetches (or parses) data from a PDF, +and PDFDocument stores it. You'll also need +PDFPageInterpreter to process the page contents +and PDFDevice to translate it to whatever you need. + +

    +PDF documents are more like a graphics format than a text +format. The contents of a PDF are just a bunch of procedures that +tell how to render the stuff on a display or paper. In most +cases, they present no logical structure such as sentences or +paragraphs. So PDFMiner attempts to reconstruct some of them by +performing layout analysis. Ugly, I know. Again, PDF is evil. + +

    +Figure 1 shows the relationship between these classes: + +

    +
    +Figure 1. Relationships between PDFMiner classes +
    +

    Basic Usage

    @@ -57,25 +91,11 @@ for page in doc.get_pages(): interpreter.process_page(page) -

    -In PDFMiner, there are several Python classes involved in parsing a PDF file, -as shown in Figure 1. - -

    -
    -Figure 1. Relationships between PDFMiner objects -
    -

    Accessing Layout Objects

    -PDF documents are more like graphics, rather than text documents. -In most cases, it presents no logical structure such as sentences or paragraphs. -PDFMiner attempts to reconstruct some of them by performing -basic layout analysis. -

    -Here is a typical way to do it: +Here is a typical way to use the layout analysis function:

     from pdfminer.layout import LAParams
     from pdfminer.converter import PDFPageAggregator
    @@ -172,11 +192,12 @@ for (level,title,dest,a,se) in outlines:
     

    -In some PDF documents, destinations are referred to as page numbers. -In other PDF documents, destinations are referred to as page numbers plus -the location within the page. Since PDF does not provide a way to -point to graphical objects in a page, normally these in-page destinations -are specified by physical coordinates. +Some PDF documents use page numbers as destinations, while others +use page numbers and the physical location within the page. Since +PDF does not have a logical structure, and it does not provide a +way to refer to any in-page object from the outside, there's no +way to tell exactly which part of text these destinations are +referring to.


    Yusuke Shinyama