diff --git a/docs/programming.html b/docs/programming.html index 6daaeae..47425d3 100644 --- a/docs/programming.html +++ b/docs/programming.html @@ -9,7 +9,7 @@
A typical way to parse a PDF file is the following:
--from pdfminer.pdfparser import PDFParser, PDFDocument -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfpage import PDFTextExtractionNotAllowed +from pdfminer.pdfinterp import PDFResourceManager +from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfdevice import PDFDevice # Open a PDF file. @@ -84,15 +88,12 @@ fp = open('mypdf.pdf', 'rb') # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. -doc = PDFDocument() -# Connect the parser and document objects. -parser.set_document(doc) -doc.set_parser(parser) +document = PDFDocument(parser) # Supply the password for initialization. # (If no password is set, give an empty string.) -doc.initialize(password) +document.initialize(password) # Check if the document allows text extraction. If not, abort. -if not doc.is_extractable: +if not document.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() @@ -101,11 +102,11 @@ device = PDFDevice(rsrcmgr) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. -for page in doc.get_pages(): +for page in PDFPage.create_pages(document): interpreter.process_page(page)
Here is a typical way to use the layout analysis function:
-The layout analyzer gives a "@@ -117,15 +118,15 @@ laparams = LAParams() # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) -for page in doc.get_pages(): +for page in PDFPage.create_pages(document): interpreter.process_page(page) # receive the LTPage object for the page. layout = device.get_result()
LTPage
" object for each page
-in the PDF document. The object contains child objects within the page,
-forming a tree-like structure. Figure 2 shows the relationship between
+A layout analyzer returns an LTPage
object for each page
+in the PDF document. This object contains child objects within the page,
+forming a tree structure. Figure 2 shows the relationship between
these objects.
LTCurve
-Also, check out a more complete example by Denis Papathanasiou. -
PDFMiner provides functions to access the document's table of contents ("Outlines").
@@ -209,12 +210,12 @@ for (level,title,dest,a,se) in outlines:-from pdfminer.pdfparser import PDFParser, PDFDocument +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +# Open a PDF document. fp = open('mypdf.pdf', 'rb') parser = PDFParser(fp) -doc = PDFDocument() -parser.set_document(doc) -doc.set_parser(parser) -doc.initialize(password) +document = PDFDocument(parser) +document.initialize(password) # Get the outlines of the document. -outlines = doc.get_outlines() +outlines = document.get_outlines() for (level,title,dest,a,se) in outlines: print (level, title)
Some PDF documents use page numbers as destinations, while others use page numbers and the physical location within the page. Since -PDF does not have a logical strucutre, and it does not provide a +PDF does not have a logical structure, and it does not provide a way to refer to any in-page object from the outside, there's no way to tell exactly which part of text these destinations are -refering to. +referring to. -
You can extend PDFPageInterpreter
and PDFDevice
class
diff --git a/docs/style.css b/docs/style.css
index a4caeae..612e308 100644
--- a/docs/style.css
+++ b/docs/style.css
@@ -1,3 +1,4 @@
blockquote { background: #eeeeee; }
h1 { border-bottom: solid black 2px; }
h2 { border-bottom: solid black 1px; }
+.comment { color: darkgreen; }