diff --git a/docs/objrel.obj b/docs/objrel.obj index bcfb0a7..12a0f56 100644 --- a/docs/objrel.obj +++ b/docs/objrel.obj @@ -1,4 +1,4 @@ -%TGIF 4.1.45-QPL +%TGIF 4.2.2 state(0,37,100.000,0,0,0,16,1,9,1,1,1,0,0,2,1,1,'Helvetica-Bold',1,69120,0,0,1,10,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0). % % @(#)$Header$ @@ -30,6 +30,8 @@ script_frac("0.6"). fg_bg_colors('black','white'). dont_reencode("FFDingbests:ZapfDingbats"). objshadow_info('#c0c0c0',2,2). +rotate_pivot(0,0,0,0). +spline_tightness(1). page(1,"",1,''). oval('black','',350,380,450,430,2,2,1,88,0,0,0,0,0,'2',0,[ ]). @@ -167,19 +169,19 @@ poly('black','',2,[ "0","",[ 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ ]). -text('black',400,158,1,1,1,68,15,115,12,3,2,0,0,0,2,68,15,0,0,"",0,0,0,0,170,'',[ -minilines(68,15,0,0,1,0,0,[ -mini_line(68,12,3,0,0,0,[ -str_block(0,68,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,68,12,3,0,0,0,0,0,0,0, - "page object")]) +text('black',400,158,1,1,1,84,15,115,12,3,2,0,0,0,2,84,15,0,0,"",0,0,0,0,170,'',[ +minilines(84,15,0,0,1,0,0,[ +mini_line(84,12,3,0,0,0,[ +str_block(0,84,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,84,12,3,0,-1,0,0,0,0,0, + "page contents")]) ]) ])]). -text('black',400,258,1,1,1,115,15,119,12,3,2,0,0,0,2,115,15,0,0,"",0,0,0,0,270,'',[ -minilines(115,15,0,0,1,0,0,[ -mini_line(115,12,3,0,0,0,[ -str_block(0,115,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,115,12,3,0,-1,0,0,0,0,0, - "rendering sequence")]) +text('black',400,258,1,1,1,129,15,119,12,3,2,0,0,0,2,129,15,0,0,"",0,0,0,0,270,'',[ +minilines(129,15,0,0,1,0,0,[ +mini_line(129,12,3,0,0,0,[ +str_block(0,129,12,3,0,-1,0,0,0,[ +str_seg('black','Helvetica-Bold',1,69120,129,12,3,0,-1,0,0,0,0,0, + "rendering instructions")]) ]) ])]). 
diff --git a/docs/objrel.png b/docs/objrel.png index 528228c..3b9f5b6 100644 Binary files a/docs/objrel.png and b/docs/objrel.png differ diff --git a/docs/programming.html b/docs/programming.html index 193051b..efd3ffc 100644 --- a/docs/programming.html +++ b/docs/programming.html @@ -16,11 +16,45 @@ blockquote { background: #eeeeee; } This document explains how to use PDFMiner as a library from other applications. + +
+

Overview

+

+PDF is evil. +Because a PDF file is normally big and has a complex structure, +parsing a PDF as a whole is time- and +memory-consuming. Furthermore, not every part is needed for most PDF +processing. Therefore, PDFMiner takes a strategy of lazy parsing, +which is to parse the stuff only when it's necessary. To parse PDF +files, you need at least two classes: PDFParser +and PDFDocument. These objects work together. +PDFParser fetches (or parses) data from a PDF, +and PDFDocument stores it. You'll also need +PDFPageInterpreter to process the page contents +and PDFDevice to translate it to whatever you need. + +

+PDF documents are more like a graphics format than a text +format. The contents of a PDF are just a bunch of procedures that +tell how to render the stuff on a display or paper. In most +cases, they present no logical structure such as sentences or +paragraphs. So PDFMiner attempts to reconstruct some of them by +performing layout analysis. Ugly, I know. Again, PDF is evil. + +

+Figure 1 shows the relationship between these classes: + +

+
+Figure 1. Relationships between PDFMiner classes +
+

Basic Usage

@@ -57,25 +91,11 @@ for page in doc.get_pages(): interpreter.process_page(page) -

-In PDFMiner, there are several Python classes involved in parsing a PDF file, -as shown in Figure 1. - -

-
-Figure 1. Relationships between PDFMiner objects -
-

Accessing Layout Objects

-PDF documents are more like graphics, rather than text documents. -In most cases, it presents no logical structure such as sentences or paragraphs. -PDFMiner attempts to reconstruct some of them by performing -basic layout analysis. -

-Here is a typical way to do it: +Here is a typical way to use the layout analysis function:

 from pdfminer.layout import LAParams
 from pdfminer.converter import PDFPageAggregator
@@ -172,11 +192,12 @@ for (level,title,dest,a,se) in outlines:
 

-In some PDF documents, destinations are referred to as page numbers. -In other PDF documents, destinations are referred to as page numbers plus -the location within the page. Since PDF does not provide a way to -point to graphical objects in a page, normally these in-page destinations -are specified by physical coordinates. +Some PDF documents use page numbers as destinations, while others +use page numbers and the physical location within the page. Since +PDF does not have a logical structure, and it does not provide a +way to refer to any in-page object from the outside, there's no +way to tell exactly which part of text these destinations are +referring to.


Yusuke Shinyama