diff --git a/docs/programming.html b/docs/programming.html index 6daaeae..47425d3 100644 --- a/docs/programming.html +++ b/docs/programming.html @@ -9,7 +9,7 @@
-Last Modified: Mon Nov 11 10:18:06 UTC 2013 +Last Modified: Wed Nov 13 05:50:56 UTC 2013
@@ -23,9 +23,9 @@ from other applications.

Overview

@@ -75,8 +75,12 @@ Figure 1 shows the relationship between the classes in PDFMiner.

A typical way to parse a PDF file is the following:

-from pdfminer.pdfparser import PDFParser, PDFDocument
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.pdfdevice import PDFDevice
 
 # Open a PDF file.
@@ -84,15 +88,12 @@ fp = open('mypdf.pdf', 'rb')
 # Create a PDF parser object associated with the file object.
 parser = PDFParser(fp)
 # Create a PDF document object that stores the document structure.
-doc = PDFDocument()
-# Connect the parser and document objects.
-parser.set_document(doc)
-doc.set_parser(parser)
+document = PDFDocument(parser)
 # Supply the password for initialization.
 # (If no password is set, give an empty string.)
-doc.initialize(password)
+document.initialize(password)
 # Check if the document allows text extraction. If not, abort.
-if not doc.is_extractable:
+if not document.is_extractable:
     raise PDFTextExtractionNotAllowed
 # Create a PDF resource manager object that stores shared resources.
 rsrcmgr = PDFResourceManager()
@@ -101,11 +102,11 @@ device = PDFDevice(rsrcmgr)
 # Create a PDF interpreter object.
 interpreter = PDFPageInterpreter(rsrcmgr, device)
 # Process each page contained in the document.
-for page in doc.get_pages():
+for page in PDFPage.create_pages(document):
     interpreter.process_page(page)
 
-

Accessing Layout Objects

+

Performing Layout Analysis

Here is a typical way to use the layout analysis function:

@@ -117,15 +118,15 @@ laparams = LAParams()
 # Create a PDF page aggregator object.
 device = PDFPageAggregator(rsrcmgr, laparams=laparams)
 interpreter = PDFPageInterpreter(rsrcmgr, device)
-for page in doc.get_pages():
+for page in PDFPage.create_pages(document):
     interpreter.process_page(page)
     # receive the LTPage object for the page.
     layout = device.get_result()
 
-The layout analyzer gives a "LTPage" object for each page -in the PDF document. The object contains child objects within the page, -forming a tree-like structure. Figure 2 shows the relationship between +A layout analyzer returns an LTPage object for each page +in the PDF document. This object contains child objects within the page, +forming a tree structure. Figure 2 shows the relationship between these objects.
@@ -179,29 +180,29 @@ Could be used for separating text or figures. Could be used for framing another pictures or figures.
LTCurve -
Represents a generic bezier curve. +
Represents a generic Bezier curve.

Also, check out a more complete example by Denis Papathanasiou. -

TOC Extraction

+

Obtaining Table of Contents

PDFMiner provides functions to access the document's table of contents ("Outlines").

-from pdfminer.pdfparser import PDFParser, PDFDocument
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
 
+# Open a PDF document.
 fp = open('mypdf.pdf', 'rb')
 parser = PDFParser(fp)
-doc = PDFDocument()
-parser.set_document(doc)
-doc.set_parser(parser)
-doc.initialize(password)
+document = PDFDocument(parser)
+document.initialize(password)
 
 # Get the outlines of the document.
-outlines = doc.get_outlines()
+outlines = document.get_outlines()
 for (level,title,dest,a,se) in outlines:
     print (level, title)
 
@@ -209,12 +210,12 @@ for (level,title,dest,a,se) in outlines:

Some PDF documents use page numbers as destinations, while others use page numbers and the physical location within the page. Since -PDF does not have a logical strucutre, and it does not provide a +PDF does not have a logical structure, and it does not provide a way to refer to any in-page object from the outside, there's no way to tell exactly which part of text these destinations are -refering to. +referring to. -

Parser Extension

+

Extending Functionality

You can extend PDFPageInterpreter and PDFDevice class diff --git a/docs/style.css b/docs/style.css index a4caeae..612e308 100644 --- a/docs/style.css +++ b/docs/style.css @@ -1,3 +1,4 @@ blockquote { background: #eeeeee; } h1 { border-bottom: solid black 2px; } h2 { border-bottom: solid black 1px; } +.comment { color: darkgreen; }