From bc034c8e59d90e9b2cb8697a1d2fe9a33401761e Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Thu, 7 Nov 2019 21:12:34 +0100 Subject: [PATCH] Create sphinx documentation for Read the Docs (#329) Fixes #171 Fixes #199 Fixes #118 Fixes #178 Added: tests for building documentation and example code in documentation Added: docstrings for common used functions and classes Removed: old documentation --- .travis.yml | 2 +- CHANGELOG.md | 3 + README.md | 68 +-- docs/.gitignore | 1 + docs/Makefile | 20 + docs/cid.obj | 225 --------- docs/cid.png | Bin 2689 -> 0 bytes docs/index.html | 427 ------------------ docs/layout.obj | 391 ---------------- docs/make.bat | 35 ++ docs/objrel.obj | 187 -------- docs/objrel.png | Bin 2038 -> 0 bytes docs/programming.html | 223 --------- docs/requirements.txt | 1 + docs/source/_static/layout_analysis.html | 28 ++ .../_static/layout_analysis_group_boxes.html | 23 + .../_static/layout_analysis_group_lines.html | 45 ++ .../_static/layout_analysis_output.png} | Bin docs/source/api/commandline.rst | 25 + docs/source/api/composable.rst | 20 + docs/source/api/highlevel.rst | 21 + docs/source/api/index.rst | 9 + docs/source/conf.py | 61 +++ docs/source/index.rst | 72 +++ docs/source/topics/converting_pdf_to_text.rst | 132 ++++++ docs/source/topics/index.rst | 7 + docs/source/tutorials/commandline.rst | 41 ++ docs/source/tutorials/composable.rst | 33 ++ docs/source/tutorials/highlevel.rst | 67 +++ docs/source/tutorials/index.rst | 9 + docs/style.css | 4 - pdfminer/converter.py | 3 + pdfminer/high_level.py | 54 ++- pdfminer/layout.py | 141 +++--- pdfminer/pdfdevice.py | 6 +- pdfminer/pdfinterp.py | 3 +- setup.py | 5 +- tools/dumppdf.py | 32 +- tools/pdf2txt.py | 94 ++-- tox.ini | 11 +- 40 files changed, 879 insertions(+), 1650 deletions(-) create mode 100644 docs/.gitignore create mode 100644 docs/Makefile delete mode 100644 docs/cid.obj delete mode 100644 docs/cid.png delete mode 100644 docs/index.html delete mode 100644 docs/layout.obj create mode 100644 docs/make.bat delete mode 100644 docs/objrel.obj delete mode 100644 docs/objrel.png delete mode 100644 docs/programming.html create mode 100644 docs/requirements.txt create mode 100644 docs/source/_static/layout_analysis.html create mode 100644 docs/source/_static/layout_analysis_group_boxes.html create mode 100644 docs/source/_static/layout_analysis_group_lines.html rename docs/{layout.png => source/_static/layout_analysis_output.png} (100%) create mode 100644 docs/source/api/commandline.rst create mode 100644 docs/source/api/composable.rst create mode 100644 docs/source/api/highlevel.rst create mode 100644 docs/source/api/index.rst create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst create mode 100644 docs/source/topics/converting_pdf_to_text.rst create mode 100644 docs/source/topics/index.rst create mode 100644 docs/source/tutorials/commandline.rst create mode 100644 docs/source/tutorials/composable.rst create mode 100644 docs/source/tutorials/highlevel.rst create mode 100644 docs/source/tutorials/index.rst delete mode 100644 docs/style.css diff --git a/.travis.yml b/.travis.yml index 7fdc03c..319aa54 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,4 +9,4 @@ python: install: - pip install tox-travis script: - - tox + - tox -r diff --git a/CHANGELOG.md b/CHANGELOG.md index db72a80..859d9c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Added - Simple wrapper to easily extract text from a PDF file [#330](https://github.com/pdfminer/pdfminer.six/pull/330) - Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46)) +- Sphinx documentation that is published on + [Read the Docs](https://pdfminersix.readthedocs.io/) + ([#329](https://github.com/pdfminer/pdfminer.six/pull/329)) ### Fixed - Unhandled AssertionError when dumping pdf containing reference to object id 0 diff --git a/README.md b/README.md index fae5fb0..6ea2015 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,22 @@ -PDFMiner.six +pdfminer.six ============ -PDFMiner.six is a fork of PDFMiner using six for Python 2+3 compatibility +[![Build Status](https://travis-ci.org/pdfminer/pdfminer.six.svg?branch=master)](https://travis-ci.org/pdfminer/pdfminer.six) +[![PyPI version](https://img.shields.io/pypi/v/pdfminer.six.svg)](https://pypi.python.org/pypi/pdfminer.six/) +[![gitter](https://badges.gitter.im/pdfminer-six/Lobby.svg)](https://gitter.im/pdfminer-six/Lobby?utm_source=badge&utm_medium) -[![Build Status](https://travis-ci.org/pdfminer/pdfminer.six.svg?branch=master)](https://travis-ci.org/pdfminer/pdfminer.six) [![PyPI version](https://img.shields.io/pypi/v/pdfminer.six.svg)](https://pypi.python.org/pypi/pdfminer.six/) - -PDFMiner is a tool for extracting information from PDF documents. +Pdfminer.six is an community maintained fork of the original PDFMiner. It is a +tool for extracting information from PDF documents. Unlike other PDF-related tools, it focuses entirely on getting -and analyzing text data. PDFMiner allows one to obtain +and analyzing text data. Pdfminer.six allows one to obtain the exact location of text in a page, as well as other information such as fonts or lines. It includes a PDF converter that can transform PDF files into other text formats (such as HTML). It has an extensible PDF parser that can be used for other purposes than text analysis. - * Webpage: https://github.com/pdfminer/ - * Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/ +Check out the full documentation on +[Read the Docs](https://pdfminersix.readthedocs.io). Features @@ -33,53 +34,20 @@ Features * Automatic layout analysis. -How to Install --------------- +How to use +---------- - * Install Python 2.7 or newer. - * Install + * Install Python 2.7 or newer. Note that Python 2 support is dropped at + January, 2020. `pip install pdfminer.six` - * Run the following test: + * Use command-line interface to extract text from pdf: - `pdf2txt.py samples/simple1.pdf` - - -Command Line Tools ------------------- - -PDFMiner comes with two handy tools: -pdf2txt.py and dumppdf.py. - -**pdf2txt.py** - -pdf2txt.py extracts text contents from a PDF file. -It extracts all the text that are to be rendered programmatically, -i.e. text represented as ASCII or Unicode strings. -It cannot recognize text drawn as images that would require optical character recognition. -It also extracts the corresponding locations, font names, font sizes, writing -direction (horizontal or vertical) for each text portion. -You need to provide a password for protected PDF documents when its access is restricted. -You cannot extract any text from a PDF document which does not have extraction permission. - -(For details, refer to /docs/index.html.) - -**dumppdf.py** - -dumppdf.py dumps the internal contents of a PDF file in pseudo-XML format. 
-This program is primarily for debugging purposes, -but it's also possible to extract some meaningful contents (e.g. images). - -(For details, refer to /docs/index.html.) - - -TODO ----- - - * PEP-8 and PEP-257 conformance. - * Better documentation. - * Performance improvements. + `python pdf2txt.py samples/simple1.pdf` + +* Check out more examples and documentation on +[Read the Docs](https://pdfminersix.readthedocs.io). Contributing diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..d163863 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +build/ \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/cid.obj b/docs/cid.obj deleted file mode 100644 index 540ef60..0000000 --- a/docs/cid.obj +++ /dev/null @@ -1,225 +0,0 @@ -%TGIF 4.1.45-QPL -state(0,37,100.000,0,0,0,16,1,9,1,1,2,0,1,0,1,1,'NewCenturySchlbk-Bold',1,103680,0,0,1,10,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0). -% -% @(#)$Header$ -% %W% -% -unit("1 pixel/pixel"). -color_info(19,65535,0,[ - "magenta", 65535, 0, 65535, 65535, 0, 65535, 1, - "red", 65535, 0, 0, 65535, 0, 0, 1, - "green", 0, 65535, 0, 0, 65535, 0, 1, - "blue", 0, 0, 65535, 0, 0, 65535, 1, - "yellow", 65535, 65535, 0, 65535, 65535, 0, 1, - "pink", 65535, 49344, 52171, 65535, 49344, 52171, 1, - "cyan", 0, 65535, 65535, 0, 65535, 65535, 1, - "CadetBlue", 24415, 40606, 41120, 24415, 40606, 41120, 1, - "white", 65535, 65535, 65535, 65535, 65535, 65535, 1, - "black", 0, 0, 0, 0, 0, 0, 1, - "DarkSlateGray", 12079, 20303, 20303, 12079, 20303, 20303, 1, - "#00000000c000", 0, 0, 49344, 0, 0, 49152, 1, - "#820782070000", 33410, 33410, 0, 33287, 33287, 0, 1, - "#3cf3fbee34d2", 15420, 64507, 13364, 15603, 64494, 13522, 1, - "#3cf3fbed34d3", 15420, 64507, 13364, 15603, 64493, 13523, 1, - "#ffffa6990000", 65535, 42662, 0, 65535, 42649, 0, 1, - "#ffff0000fffe", 65535, 0, 65535, 65535, 0, 65534, 1, - "#fffe0000fffe", 65535, 0, 65535, 65534, 0, 65534, 1, - "#fffe00000000", 65535, 0, 0, 65534, 0, 0, 1 -]). -script_frac("0.6"). -fg_bg_colors('black','white'). -dont_reencode("FFDingbests:ZapfDingbats"). -objshadow_info('#c0c0c0',2,2). -page(1,"",1,''). -text('black',90,95,1,1,1,66,20,0,15,5,0,0,0,0,2,66,20,0,0,"",0,0,0,0,110,'',[ -minilines(66,20,0,0,1,0,0,[ -mini_line(66,15,5,0,0,0,[ -str_block(0,66,15,5,0,-1,0,0,0,[ -str_seg('black','Courier-Bold',1,103680,66,15,5,0,-1,0,0,0,0,0, - "U+30FC")]) -]) -])]). -text('black',100,285,1,1,1,66,20,3,15,5,0,0,0,0,2,66,20,0,0,"",0,0,0,0,300,'',[ -minilines(66,20,0,0,1,0,0,[ -mini_line(66,15,5,0,0,0,[ -str_block(0,66,15,5,0,-2,0,0,0,[ -str_seg('black','Courier-Bold',1,103680,66,15,5,0,-2,0,0,0,0,0, - "U+5199")]) -]) -])]). 
-text('black',400,38,2,1,1,119,30,5,12,3,0,0,0,0,2,119,30,0,0,"",0,0,0,0,50,'',[ -minilines(119,30,0,0,1,0,0,[ -mini_line(83,12,3,0,0,0,[ -str_block(0,83,12,3,0,-3,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,83,12,3,0,-3,0,0,0,0,0, - "Adobe-Japan1")]) -]), -mini_line(119,12,3,0,0,0,[ -str_block(0,119,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,119,12,3,0,-1,0,0,0,0,0, - "CID:660 (horizontal)")]) -]) -])]). -text('black',400,118,2,1,1,114,30,8,12,3,0,0,0,0,2,114,30,0,0,"",0,0,0,0,130,'',[ -minilines(114,30,0,0,1,0,0,[ -mini_line(83,12,3,0,0,0,[ -str_block(0,83,12,3,0,-3,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,83,12,3,0,-3,0,0,0,0,0, - "Adobe-Japan1")]) -]), -mini_line(114,12,3,0,0,0,[ -str_block(0,114,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,114,12,3,0,-1,0,0,0,0,0, - "CID:7891 (vertical)")]) -]) -])]). -text('black',400,238,2,1,1,125,30,15,12,3,0,0,0,0,2,125,30,0,0,"",0,0,0,0,250,'',[ -minilines(125,30,0,0,1,0,0,[ -mini_line(83,12,3,0,0,0,[ -str_block(0,83,12,3,0,-3,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,83,12,3,0,-3,0,0,0,0,0, - "Adobe-Japan1")]) -]), -mini_line(125,12,3,0,0,0,[ -str_block(0,125,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,125,12,3,0,-1,0,0,0,0,0, - "CID:2296 (Japanese)")]) -]) -])]). -text('black',400,318,2,1,1,115,30,16,12,3,0,0,0,0,2,115,30,0,0,"",0,0,0,0,330,'',[ -minilines(115,30,0,0,1,0,0,[ -mini_line(67,12,3,0,0,0,[ -str_block(0,67,12,3,0,-3,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,67,12,3,0,-3,0,0,0,0,0, - "Adobe-GB1")]) -]), -mini_line(115,12,3,0,0,0,[ -str_block(0,115,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,115,12,3,0,-1,0,0,0,0,0, - "CID:3967 (Chinese)")]) -]) -])]). -text('black',200,84,2,1,1,116,38,20,16,3,0,0,0,0,2,116,38,0,0,"",0,0,0,0,100,'',[ -minilines(116,38,0,0,1,0,0,[ -mini_line(70,16,3,0,0,0,[ -str_block(0,70,16,3,0,-1,0,0,0,[ -str_seg('black','NewCenturySchlbk-Roman',0,97920,70,16,3,0,-1,0,0,0,0,0, - "Japanese")]) -]), -mini_line(116,16,3,0,0,0,[ -str_block(0,116,16,3,0,-1,0,0,0,[ -str_seg('black','NewCenturySchlbk-Roman',0,97920,116,16,3,0,-1,0,0,0,0,0, - "long-vowel sign")]) -]) -])]). -oval('black','',30,70,280,140,0,1,1,49,0,0,0,0,0,'1',0,[ -]). -oval('black','',30,260,280,330,0,1,1,51,0,0,0,0,0,'1',0,[ -]). -text('black',200,274,2,1,1,85,38,53,16,3,0,0,0,0,2,85,38,0,0,"",0,0,0,0,290,'',[ -minilines(85,38,0,0,1,0,0,[ -mini_line(61,16,3,0,0,0,[ -str_block(0,61,16,3,0,-1,0,0,0,[ -str_seg('black','NewCenturySchlbk-Roman',0,97920,61,16,3,0,-1,0,0,0,0,0, - "Chinese")]) -]), -mini_line(85,16,3,0,0,0,[ -str_block(0,85,16,3,0,-1,0,0,0,[ -str_seg('black','NewCenturySchlbk-Roman',0,97920,85,16,3,0,-1,0,0,0,0,0, - "letter \"sha\"")]) -]) -])]). -box('black','',330,30,560,80,0,1,1,57,0,0,0,0,0,'1',0,[ -]). -box('black','',330,110,560,160,0,1,1,59,0,0,0,0,0,'1',0,[ -]). -box('black','',330,230,560,280,0,1,1,60,0,0,0,0,0,'1',0,[ -]). -box('black','',330,310,560,360,0,1,1,61,0,0,0,0,0,'1',0,[ -]). 
-group([ -poly('black','',4,[ - 506,246,501,235,541,235,536,246],0,2,1,68,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]), -poly('black','',5,[ - 519,238,516,252,529,252,524,275,516,272],0,2,1,69,0,0,0,0,0,0,0,'2',0,0, - "00","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]), -poly('black','',2,[ - 501,261,541,261],0,2,1,70,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]), -poly('black','',2,[ - 519,244,529,244],0,2,1,71,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]) -], -76,0,0,[ -]). -group([ -poly('black','',3,[ - 519,119,524,127,524,152],0,2,1,67,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]) -], -78,0,0,[ -]). -group([ -poly('black','',3,[ - 540,57,509,57,501,49],0,2,1,66,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]) -], -80,0,0,[ -]). -group([ -poly('black','',4,[ - 506,326,501,315,541,315,536,326],0,2,1,90,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]), -poly('black','',5,[ - 519,318,515,332,531,332,526,355,519,352],0,2,1,89,0,0,0,0,0,0,0,'2',0,0, - "00","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]), -poly('black','',2,[ - 501,341,526,341],0,2,1,88,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]), -poly('black','',2,[ - 519,324,529,324],0,2,1,87,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]) -], -134,0,0,[ -]). -poly('black','',2,[ - 270,90,320,70],1,3,1,158,0,0,0,0,0,0,0,'3',0,0, - "0","",[ - 0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[ -]). -poly('black','',2,[ - 280,110,320,130],1,3,1,159,0,0,0,0,0,0,0,'3',0,0, - "0","",[ - 0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[ -]). -poly('black','',2,[ - 270,280,310,250],1,3,1,160,0,0,0,0,0,0,0,'3',0,0, - "0","",[ - 0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[ -]). -poly('black','',2,[ - 270,300,310,330],1,3,1,161,0,0,0,0,0,0,0,'3',0,0, - "0","",[ - 0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[ -]). 
diff --git a/docs/cid.png b/docs/cid.png deleted file mode 100644 index a85f27bb11debe74f9f016996988c734f474434f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2689 zcmV-{3V!v8P)100006P)t-s00030 z|No`gpWOfe3N}eZK~#90?c7Uj+sGLJV7c1XS-`N;&1GE#W)}^dt05g~0fxEV-OB=k zeeJ180K>i%Xi{HdR0y3_VP`Kj0$;*bF%Ns_u_$`VrLybo+5!mi9H4=vI1o=o=;ZHvo49#%@+4kdCjwIJ*MlJf(3o zFYHWnUkgA1KrtnJ)PRq;atcGN@$ly|6wsbxD)^|C(%@_2Qd1c&&s>}DZqF#e%#=p& z)=6dV6S=EqG^8I*DZSN`-daP+Zou;Bt#*6EM(^F6hCFO!y%w=ncZ}|^agxzEn%{)= zqqjD5GA|E5$Ywan`);;-dv%8H{*`hViRr&!;NB(|`HWHG*^Mn{r(fbjpRq~;t6`S} z|7ay_uwIKj5jd;`5tzehOoWbW+-Tbh2e@X$aubbc^J3S!z29z# zb{+3sG8u28#s)#&0+#7UyW7SYu@X--PT@Z-&|jP`Jy^kqDqkL+ ze=z-%B?o0PI>=De#a`)-_xlRo&<>-as|u*A`|Dn_n%WSsVQ;E`T=BlU6ZpU>yP=tC zE3r;88?Q^+W`4u2tAEO8;4+nR8Ry4}vZzQ<8a*b?-1d#tFe=JHbGnS4ijtS1r!hLH ztD*Gm*)r7#~9((uH9Az@b{IMdQGpNLPG%t zH{J-hGSdqmmqCYnY25wh-oZaOm}yTk7?Gcvz3FZ33%E&sL@^}Sh&=a02KvpQ0vTsS z`yXZh9Gx0(aHquhsp1;6M+o|iHMvWSpB}pgwLgabnb;x5PgT#L^$YQ5%Eb8TnP*V@ zr!Z+{V*FJ13~E0a;O;gleroy#t-lV?pp_Uu9U0VdgFi-2il2@Q>iESUtH5_iCcCUXZzI}-eoDFK;akmDcN z7&hSVt;*^6dwvax800t%HWYcdDb^&tzWw3Tdqjg65AuzpnxxjN@9xd5cpr!xgt(Mz zsJhsa)V*qb&!!rrbRA@~s)pL(vND=8Sz7MnWo#@4ayq^H8X%Ul@f^7==+7g;5xVQ5c0$ z7==+7g;5xVQ5c00PsREu2E%sC;#)TafW!5<3?ATjtGUXQ2l&lFNYK!8u8&G@ z#NY2L08Pxuw?0a-OI$i5{(k=$8pk=;N2xYmLXyIWzu$k8+DLf6@7$p@M*RK0^KL7R z5r4m*+HeyeILvNnWTRAJG$!&J{Rv8Ys<0VaIlo~{<~J;4*fT9_;obl&3HX>lr-=1D zV7XmbUD4c|YuFDR!=FmdG%Iz zZ;@goz29$j9K&vEzp(74`1R_Ejt%^bnqZk{V54e;49m0|mVHP3Y;}5`WDw>%;JGBo zFm-4Q8}P>uZul>w`OvkH=3U^~CFs6%^&v7oudW8lBW#f7fArBmsD7MWpC4G-s~eBk zZZ0$vjRO!){ydaJYDkq#C%cVS_e*tVJ`drf*Q@u{pPeXiWJAM=)T9dxuHa zvgH}Hd3|o(BN>`!(B>C9w`F`^TOw!khipb{zU2=&e^rChVl$%iKCMm3H}ndd5nXp; zT}uA8$vJ<=jiaJd&W(VPb=`@1`Q)pgo&D3fThYv9BUYp)c&yI66kp<pdHw4EpxMbrrjZsm0 z_2XPdsjevf#e~&iX$)Rb&X;cIGIUs^fX`JP_jP?&u?1A$ICR)PJqO@7sSLZUYqJvv zczKs(Un0XyV|37;N!?hQIK&T=u+ix-8-XNQX29T>jOfYkFU&u9B^b{+{Nt$zEJn1o zd-gB$;y$l0{9zsKea2)&+YeDT)f%rq?_nd_T}(Ej?by>PvtI*-so;v=XELHKrf8X} z4j*<3uC(z!XOa56w#osJL!`R0gU3O)O9D$xH8e;!6<%8U3cP)Zxi!F@l)Em z6K9-FFaq&Y>bes=-cS5MIQB_fcY@;%XafgOjfGhu*`WV_2CjhvsJeLK8}O+XjGt20 zo!~eO+Q0!+)f|QCBJoqox)aNbDd7NW(1^xQiR1lITnYzJhwqQdL$xBYPvW{0KR$*7 zsO#eBJlEqMx{2)8a5t%HJRIxUJ3rS zFvhhVW7z05#2T;4@LB@pk;>5I*}F$OygH~h;C;zvxXJB!@mFa`84Rb@8P)YHMl}6D v8GL?Y_X>>G6&TtT7}gaSIDHQd`Z(%8O5BJ!PrZ|W00000NkvXXu0mjfRwXe1 diff --git a/docs/index.html b/docs/index.html deleted file mode 100644 index 6b6857c..0000000 --- a/docs/index.html +++ /dev/null @@ -1,427 +0,0 @@ - - - - - -PDFMiner - - - -
- -Last Modified: Wed Jun 25 10:27:52 UTC 2014 - -
- -

PDFMiner

-

-Python PDF parser and analyzer - -

-Homepage -  -Recent Changes -  -PDFMiner API - -

- -

What's It?

-

-PDFMiner is a tool for extracting information from PDF documents. -Unlike other PDF-related tools, it focuses entirely on getting -and analyzing text data. PDFMiner allows one to obtain -the exact location of text in a page, as well as -other information such as fonts or lines. -It includes a PDF converter that can transform PDF files -into other text formats (such as HTML). It has an extensible -PDF parser that can be used for other purposes than text analysis. - -

-

Features

-
    -
  • Written entirely in Python. (for version 2.6 or newer) -
  • Parse, analyze, and convert PDF documents. -
  • PDF-1.7 specification support. (well, almost) -
  • CJK languages and vertical writing scripts support. -
  • Various font types (Type1, TrueType, Type3, and CID) support. -
  • Basic encryption (RC4) support. -
  • PDF to HTML conversion. -
  • Outline (TOC) extraction. -
  • Tagged contents extraction. -
  • Reconstruct the original layout by grouping text chunks. -
-

-PDFMiner is about 20 times slower than -other C/C++-based counterparts such as XPdf. - -

-Online Demo: (pdf -> html conversion webapp)
- -http://pdf2html.tabesugi.net:8080/ - - -

Download

-

-Source distribution:
- -http://pypi.python.org/pypi/pdfminer_six/ - - -

-github:
- -https://github.com/goulu/pdfminer/ - - -

Where to Ask

-

-

-Questions and comments:
- -http://groups.google.com/group/pdfminer-users/ - - -

How to Install

-
    -
  1. Install Python 2.6 or newer. -
  2. Download the PDFMiner source. -
  3. Unpack it. -
  4. Run setup.py to install:
    -
    -# python setup.py install
    -
    -
  5. Do the following test:
    -
    -$ pdf2txt.py samples/simple1.pdf
    -Hello
    -
    -World
    -
    -Hello
    -
    -World
    -
    -H e l l o
    -
    -W o r l d
    -
    -H e l l o
    -
    -W o r l d
    -
    -
  6. Done! -
- -

For CJK languages

-

-In order to process CJK languages, you need to take an additional step
-during installation:

-# make cmap
-python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt
-reading 'cmaprsrc/cid2code_Adobe_CNS1.txt'...
-writing 'CNS1_H.py'...
-...
-(this may take several minutes)
-
-# python setup.py install
-
- -

-On Windows machines which don't have the make command,
-paste the following commands into a command prompt:

-mkdir pdfminer\cmap
-python tools\conv_cmap.py -c B5=cp950 -c UniCNS-UTF8=utf-8 pdfminer\cmap Adobe-CNS1 cmaprsrc\cid2code_Adobe_CNS1.txt
-python tools\conv_cmap.py -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 pdfminer\cmap Adobe-GB1 cmaprsrc\cid2code_Adobe_GB1.txt
-python tools\conv_cmap.py -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 pdfminer\cmap Adobe-Japan1 cmaprsrc\cid2code_Adobe_Japan1.txt
-python tools\conv_cmap.py -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 pdfminer\cmap Adobe-Korea1 cmaprsrc\cid2code_Adobe_Korea1.txt
-python setup.py install
-
- -

Command Line Tools

-

-PDFMiner comes with two handy tools: -pdf2txt.py and dumppdf.py. - -

pdf2txt.py

-

-pdf2txt.py extracts text contents from a PDF file.
-It extracts all the text that is rendered programmatically,
-i.e. text represented as ASCII or Unicode strings.
-It cannot recognize text drawn as images, which would require optical character recognition.
-It also extracts the corresponding locations, font names, font sizes, and writing
-direction (horizontal or vertical) for each text portion.
-You need to provide a password for a protected PDF document when access to it is restricted.
-You cannot extract any text from a PDF document which does not have extraction permission.

-Note: -Not all characters in a PDF can be safely converted to Unicode. - -

Examples

-
-$ pdf2txt.py -o output.html samples/naacl06-shinyama.pdf
-(extract text as an HTML file whose filename is output.html)
-
-$ pdf2txt.py -V -c euc-jp -o output.html samples/jo.pdf
-(extract a Japanese HTML file in vertical writing, CMap is required)
-
-$ pdf2txt.py -P mypassword -o output.txt secret.pdf
-(extract a text from an encrypted PDF file)
-
- -

Options

-
-
-o filename -
Specifies the output file name. -By default, it prints the extracted contents to stdout in text format. -

-

-p pageno[,pageno,...] -
Specifies the comma-separated list of the page numbers to be extracted. -Page numbers start at one. -By default, it extracts text from all the pages. -

-

-c codec -
Specifies the output codec. -

-

-t type -
Specifies the output format. The following formats are currently supported. -
    -
  • text : TEXT format. (Default) -
  • html : HTML format. Not recommended for extraction purposes because the markup is messy. -
  • xml : XML format. Provides the most information. -
  • tag : "Tagged PDF" format. A tagged PDF has its own contents annotated with -HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations. -Tags used here are defined in the PDF specification (See §10.7 "Tagged PDF"). -
-

-

-I image_directory -
Specifies the output directory for image extraction. -Currently only JPEG images are supported. -

-

-M char_margin -
-L line_margin -
-W word_margin -
These are the parameters used for layout analysis.
-In an actual PDF file, text portions might be split into several chunks
-mid-run, depending on the authoring software.
-Therefore, text extraction needs to splice text chunks.
-In the figure below, two text chunks whose distance is closer than
-the char_margin (shown as M) are considered
-continuous and get grouped into one. Also, two lines whose distance is closer than
-the line_margin (L) are grouped
-as a text box, which is a rectangular area that contains a "cluster" of text portions.
-Furthermore, blank characters (spaces) may need to be inserted
-if the distance between two words is greater than the word_margin
-(W), as a blank between words might not be
-represented as a space, but indicated by the positioning of each word.
(A short programmatic sketch that sets the same parameters follows this list of options.)

-Each value is specified not as an actual length, but as a proportion of -the length to the size of each character in question. The default values -are M = 2.0, L = 0.5, and W = 0.1, respectively. - - - - - - - - - - - - - - - - - - - -
-[Figure: sample text "Q u i c k", "b r o w n" and "f o x" laid out with the char_margin (M) and
-word_margin (W) distances marked between chunks, and "j u m p s ..." on the following line
-within the line_margin (L).]

-

-F boxes_flow -
Specifies how much the horizontal and vertical position of a text matters
-when determining the text order. The value should be within the range of
--1.0 (only horizontal position matters) to +1.0 (only vertical position matters).
-The default value is 0.5.

-

-C -
Suppresses object caching.
-This reduces memory consumption but also slows down the process.

-

-n -
Suppress layout analysis. -

-

-A -
Forces layout analysis to be performed on all the text strings,
-including text contained in figures.

-

-V -
Allows vertical writing detection. -

-

-Y layout_mode -
Specifies how the page layout should be preserved. (Currently only applies to HTML format.) -
    -
  • exact : preserve the exact location of each individual character (a large and messy HTML). -
  • normal : preserve the location and line breaks in each text block. (Default) -
  • loose : preserve the overall location of each text block. -
-

-

-E extractdir -
Specifies the extraction directory of embedded files. -

-

-s scale -
Specifies the output scale. Can be used in HTML format only. -

-

-m maxpages -
Specifies the maximum number of pages to extract. -By default, it extracts all the pages in a document. -

-

-P password -
Provides the user password to access PDF contents. -

-

-d -
Increases the debug level. -
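As referenced above under -M, -L and -W, a short sketch of setting the same layout-analysis parameters programmatically through pdfminer.layout.LAParams (the class documented elsewhere in this patch); the values shown are simply the defaults:

from pdfminer.layout import LAParams

# char_margin, line_margin and word_margin correspond to -M, -L and -W;
# boxes_flow corresponds to -F.
laparams = LAParams(char_margin=2.0, line_margin=0.5, word_margin=0.1, boxes_flow=0.5)
# laparams can then be passed to the layout-aware devices, e.g.
# PDFPageAggregator(rsrcmgr, laparams=laparams).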
- -
- -

dumppdf.py

-

-dumppdf.py dumps the internal contents of a PDF file -in pseudo-XML format. This program is primarily for debugging purposes, -but it's also possible to extract some meaningful contents -(such as images). - -

Examples

-
-$ dumppdf.py -a foo.pdf
-(dump all the headers and contents, except stream objects)
-
-$ dumppdf.py -T foo.pdf
-(dump the table of contents)
-
-$ dumppdf.py -r -i6 foo.pdf > pic.jpeg
-(extract a JPEG image)
-
- -

Options

-
-
-a -
Dumps all the objects.
-By default, it only prints the document trailer (like a header).

-

-i objno,objno, ... -
Specifies PDF object IDs to display. -Comma-separated IDs, or multiple -i options are accepted. -

-

-p pageno,pageno, ... -
Specifies the page number to be extracted. -Comma-separated page numbers, or multiple -p options are accepted. -Note that page numbers start at one, not zero. -

-

-r (raw) -
-b (binary) -
-t (text) -
Specifies the output format of stream contents. -Because the contents of stream objects can be very large, -they are omitted when none of the options above is specified. -

-With the -r option, the "raw" stream contents are dumped without decompression.
-With the -b option, the decompressed contents are dumped as a binary blob.
-With the -t option, the decompressed contents are dumped in a text format,
-similar to repr() output. When the -r or -b option is given,
-no stream header is displayed, for ease of saving the contents to a file.

-

-T -
Shows the table of contents. -

-

-E directory -
Extracts embedded files from the pdf into the given directory. -

-

-P password -
Provides the user password to access PDF contents. -

-

-d -
Increases the debug level. -
- -

Changes:

-
    -
  • 2014/09/15: pushed on PyPi
  • -
  • 2014/09/10: pdfminer_six forked from pdfminer since Yusuke didn't want to merge and pdfminer3k is outdated
  • -
- -

TODO

-
    -
  • PEP-8 and -PEP-257 conformance. -
  • Better documentation. -
  • Better text extraction / layout analysis. (writing mode detection, Type1 font file analysis, etc.) -
  • Crypt stream filter support. (More sample documents are needed!) -
- -

Related Projects

- - -

Terms and Conditions

-

-(This is so-called MIT/X License) -

- -Copyright (c) 2004-2013 Yusuke Shinyama <yusuke at cs dot nyu dot edu> -

-Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation -files (the "Software"), to deal in the Software without -restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following -conditions: -

-The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. -

-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY -KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR -PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR -OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -


-
Yusuke Shinyama (yusuke at cs dot nyu dot edu)
- diff --git a/docs/layout.obj b/docs/layout.obj deleted file mode 100644 index 4c8e3cd..0000000 --- a/docs/layout.obj +++ /dev/null @@ -1,391 +0,0 @@ -%TGIF 4.2.2 -state(0,37,100.000,0,0,0,16,1,9,1,1,0,0,0,0,1,1,'Helvetica-Bold',1,69120,0,0,1,5,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0). -% -% @(#)$Header$ -% %W% -% -unit("1 pixel/pixel"). -color_info(19,65535,0,[ - "magenta", 65535, 0, 65535, 65535, 0, 65535, 1, - "red", 65535, 0, 0, 65535, 0, 0, 1, - "green", 0, 65535, 0, 0, 65535, 0, 1, - "blue", 0, 0, 65535, 0, 0, 65535, 1, - "yellow", 65535, 65535, 0, 65535, 65535, 0, 1, - "pink", 65535, 49344, 52171, 65535, 49344, 52171, 1, - "cyan", 0, 65535, 65535, 0, 65535, 65535, 1, - "CadetBlue", 24415, 40606, 41120, 24415, 40606, 41120, 1, - "white", 65535, 65535, 65535, 65535, 65535, 65535, 1, - "black", 0, 0, 0, 0, 0, 0, 1, - "DarkSlateGray", 12079, 20303, 20303, 12079, 20303, 20303, 1, - "#00000000c000", 0, 0, 49344, 0, 0, 49152, 1, - "#820782070000", 33410, 33410, 0, 33287, 33287, 0, 1, - "#3cf3fbee34d2", 15420, 64507, 13364, 15603, 64494, 13522, 1, - "#3cf3fbed34d3", 15420, 64507, 13364, 15603, 64493, 13523, 1, - "#ffffa6990000", 65535, 42662, 0, 65535, 42649, 0, 1, - "#ffff0000fffe", 65535, 0, 65535, 65535, 0, 65534, 1, - "#fffe0000fffe", 65535, 0, 65535, 65534, 0, 65534, 1, - "#fffe00000000", 65535, 0, 0, 65534, 0, 0, 1 -]). -script_frac("0.6"). -fg_bg_colors('black','white'). -dont_reencode("FFDingbests:ZapfDingbats"). -objshadow_info('#c0c0c0',2,2). -rotate_pivot(0,0,0,0). -spline_tightness(1). -page(1,"",1,''). -box('black','',50,45,300,355,2,2,1,0,0,0,0,0,0,'2',0,[ -]). -box('black','',75,75,195,225,2,1,1,10,8,0,0,0,0,'1',0,[ -]). -box('black','',85,105,185,125,2,1,1,18,8,0,0,0,0,'1',0,[ -]). -box('black','',85,105,105,125,2,1,1,19,0,0,0,0,0,'1',0,[ -]). -box('black','',105,105,125,125,2,1,1,20,0,0,0,0,0,'1',0,[ -]). -text('black',95,108,1,1,1,9,15,21,12,3,0,0,0,0,2,9,15,0,0,"",0,0,0,0,120,'',[ -minilines(9,15,0,0,1,0,0,[ -mini_line(9,12,3,0,0,0,[ -str_block(0,9,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica',0,69120,9,12,3,0,-1,0,0,0,0,0, - "A")]) -]) -])]). -text('black',115,108,1,1,1,8,15,28,12,3,0,0,0,0,2,8,15,0,0,"",0,0,0,0,120,'',[ -minilines(8,15,0,0,1,0,0,[ -mini_line(8,12,3,0,0,0,[ -str_block(0,8,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica',0,69120,8,12,3,0,-1,0,0,0,0,0, - "B")]) -]) -])]). -box('black','',125,105,145,125,0,1,1,32,0,0,0,0,0,'1',0,[ -]). -text('black',135,108,1,1,1,9,15,36,12,3,0,0,0,0,2,9,15,0,0,"",0,0,0,0,120,'',[ -minilines(9,15,0,0,1,0,0,[ -mini_line(9,12,3,0,0,0,[ -str_block(0,9,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica',0,69120,9,12,3,0,-1,0,0,0,0,0, - "C")]) -]) -])]). -poly('black','',2,[ - 215,140,215,220],0,3,1,51,0,0,0,0,0,0,0,'3',0,0, - "0","",[ - 0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[ -]). -box('black','',175,265,270,325,0,3,1,65,0,0,0,0,0,'3',0,[ -]). -box('black','',185,270,260,320,0,1,1,69,8,0,0,0,0,'1',0,[ -]). -poly('black','',6,[ - 195,295,215,290,235,310,245,285,225,300,195,295],0,2,1,74,0,0,0,0,0,0,0,'2',0,0, - "00","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]). -box('black','',85,275,140,315,1,2,0,87,0,0,0,0,0,'2',0,[ -]). -text('black',85,23,1,1,1,44,15,93,12,3,0,0,0,0,2,44,15,0,0,"",0,0,0,0,35,'',[ -minilines(44,15,0,0,1,0,0,[ -mini_line(44,12,3,0,0,0,[ -str_block(0,44,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,44,12,3,0,-1,0,0,0,0,0, - "LTPage")]) -]) -])]). 
-text('black',255,133,1,1,1,39,15,100,12,3,0,0,0,0,2,39,15,0,0,"",0,0,0,0,145,'',[ -minilines(39,15,0,0,1,0,0,[ -mini_line(39,12,3,0,0,0,[ -str_block(0,39,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,39,12,3,0,-1,0,0,0,0,0, - "LTLine")]) -]) -])]). -text('black',125,83,1,1,1,42,15,104,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,95,'',[ -minilines(42,15,0,0,1,0,0,[ -mini_line(42,12,3,0,0,0,[ -str_block(0,42,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0, - "LTChar")]) -]) -])]). -text('black',245,53,1,1,1,65,15,108,12,3,0,0,0,0,2,65,15,0,0,"",0,0,0,0,65,'',[ -minilines(65,15,0,0,1,0,0,[ -mini_line(65,12,3,0,0,0,[ -str_block(0,65,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,65,12,3,0,-1,0,0,0,0,0, - "LTTextBox")]) -]) -])]). -text('black',245,88,1,1,1,66,15,110,12,3,0,0,0,0,2,66,15,0,0,"",0,0,0,0,100,'',[ -minilines(66,15,0,0,1,0,0,[ -mini_line(66,12,3,0,0,0,[ -str_block(0,66,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,66,12,3,0,-1,0,0,0,0,0, - "LTTextLine")]) -]) -])]). -text('black',255,243,1,1,1,51,15,112,12,3,0,0,0,0,2,51,15,0,0,"",0,0,0,0,255,'',[ -minilines(51,15,0,0,1,0,0,[ -mini_line(51,12,3,0,0,0,[ -str_block(0,51,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,51,12,3,0,-1,0,0,0,0,0, - "LTFigure")]) -]) -])]). -text('black',140,243,1,1,1,51,15,114,12,3,0,0,0,0,2,51,15,0,0,"",0,0,0,0,255,'',[ -minilines(51,15,0,0,1,0,0,[ -mini_line(51,12,3,0,0,0,[ -str_block(0,51,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,51,12,3,0,-1,0,0,0,0,0, - "LTImage")]) -]) -])]). -text('black',240,223,1,1,1,43,15,116,12,3,0,0,0,0,2,43,15,0,0,"",0,0,0,0,235,'',[ -minilines(43,15,0,0,1,0,0,[ -mini_line(43,12,3,0,0,0,[ -str_block(0,43,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,43,12,3,0,0,0,0,0,0,0, - "LTRect")]) -]) -])]). -text('black',190,333,1,1,1,50,15,118,12,3,0,0,0,0,2,50,15,0,0,"",0,0,0,0,345,'',[ -minilines(50,15,0,0,1,0,0,[ -mini_line(50,12,3,0,0,0,[ -str_block(0,50,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,50,12,3,0,-1,0,0,0,0,0, - "LTCurve")]) -]) -])]). -text('black',170,138,1,1,1,42,15,121,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,150,'',[ -minilines(42,15,0,0,1,0,0,[ -mini_line(42,12,3,0,0,0,[ -str_block(0,42,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0, - "LTText")]) -]) -])]). -box('black','',145,105,165,125,0,1,1,125,8,0,0,0,0,'1',0,[ -]). -poly('black','',2,[ - 105,95,95,110],0,1,1,135,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 165,140,155,115],0,1,1,138,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 215,65,190,80],0,1,1,139,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 215,100,180,115],0,1,1,140,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 235,140,215,150],0,1,1,141,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 220,235,205,265],0,1,1,146,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 235,255,225,275],0,1,1,147,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 195,330,220,300],0,1,1,148,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). 
-poly('black','',2,[ - 125,255,110,280],0,1,1,149,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -text('black',610,33,1,1,1,44,15,151,12,3,0,0,0,0,2,44,15,0,0,"",0,0,0,0,45,'',[ -minilines(44,15,0,0,1,0,0,[ -mini_line(44,12,3,0,0,0,[ -str_block(0,44,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,44,12,3,0,-1,0,0,0,0,0, - "LTPage")]) -]) -])]). -text('black',460,108,1,1,1,65,15,152,12,3,0,0,0,0,2,65,15,0,0,"",0,0,0,0,120,'',[ -minilines(65,15,0,0,1,0,0,[ -mini_line(65,12,3,0,0,0,[ -str_block(0,65,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,65,12,3,0,-1,0,0,0,0,0, - "LTTextBox")]) -]) -])]). -text('black',410,178,1,1,1,66,15,154,12,3,0,0,0,0,2,66,15,0,0,"",0,0,0,0,190,'',[ -minilines(66,15,0,0,1,0,0,[ -mini_line(66,12,3,0,0,0,[ -str_block(0,66,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,66,12,3,0,-1,0,0,0,0,0, - "LTTextLine")]) -]) -])]). -text('black',360,248,1,1,1,42,15,157,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,260,'',[ -minilines(42,15,0,0,1,0,0,[ -mini_line(42,12,3,0,0,0,[ -str_block(0,42,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0, - "LTChar")]) -]) -])]). -text('black',420,248,1,1,1,42,15,159,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,260,'',[ -minilines(42,15,0,0,1,0,0,[ -mini_line(42,12,3,0,0,0,[ -str_block(0,42,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0, - "LTChar")]) -]) -])]). -text('black',480,248,1,1,1,42,15,161,12,3,0,0,0,0,2,42,15,0,0,"",0,0,0,0,260,'',[ -minilines(42,15,0,0,1,0,0,[ -mini_line(42,12,3,0,0,0,[ -str_block(0,42,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,42,12,3,0,0,0,0,0,0,0, - "LTText")]) -]) -])]). -text('black',460,178,1,1,1,12,15,170,12,3,0,0,0,0,2,12,15,0,0,"",0,0,0,0,190,'',[ -minilines(12,15,0,0,1,0,0,[ -mini_line(12,12,3,0,0,0,[ -str_block(0,12,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,12,12,3,0,-1,0,0,0,0,0, - "...")]) -]) -])]). -text('black',520,248,1,1,1,12,15,172,12,3,0,0,0,0,2,12,15,0,0,"",0,0,0,0,260,'',[ -minilines(12,15,0,0,1,0,0,[ -mini_line(12,12,3,0,0,0,[ -str_block(0,12,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,12,12,3,0,-1,0,0,0,0,0, - "...")]) -]) -])]). -text('black',560,108,1,1,1,51,15,174,12,3,0,0,0,0,2,51,15,0,0,"",0,0,0,0,120,'',[ -minilines(51,15,0,0,1,0,0,[ -mini_line(51,12,3,0,0,0,[ -str_block(0,51,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,51,12,3,0,-1,0,0,0,0,0, - "LTFigure")]) -]) -])]). -text('black',635,108,1,1,1,39,15,178,12,3,0,0,0,0,2,39,15,0,0,"",0,0,0,0,120,'',[ -minilines(39,15,0,0,1,0,0,[ -mini_line(39,12,3,0,0,0,[ -str_block(0,39,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,39,12,3,0,-1,0,0,0,0,0, - "LTLine")]) -]) -])]). -text('black',700,108,1,1,1,43,15,180,12,3,0,0,0,0,2,43,15,0,0,"",0,0,0,0,120,'',[ -minilines(43,15,0,0,1,0,0,[ -mini_line(43,12,3,0,0,0,[ -str_block(0,43,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,43,12,3,0,0,0,0,0,0,0, - "LTRect")]) -]) -])]). -text('black',580,178,1,1,1,50,15,182,12,3,0,0,0,0,2,50,15,0,0,"",0,0,0,0,190,'',[ -minilines(50,15,0,0,1,0,0,[ -mini_line(50,12,3,0,0,0,[ -str_block(0,50,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,50,12,3,0,-1,0,0,0,0,0, - "LTCurve")]) -]) -])]). 
-text('black',775,108,1,1,1,51,15,186,12,3,0,0,0,0,2,51,15,0,0,"",0,0,0,0,120,'',[ -minilines(51,15,0,0,1,0,0,[ -mini_line(51,12,3,0,0,0,[ -str_block(0,51,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,51,12,3,0,-1,0,0,0,0,0, - "LTImage")]) -]) -])]). -poly('black','',2,[ - 475,105,590,50],0,1,1,190,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 560,110,595,50],0,1,1,191,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 635,105,600,50],0,1,1,192,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 610,50,700,100],0,1,1,193,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 765,100,630,50],0,1,1,194,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 460,125,425,175],0,1,1,196,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 560,125,570,175],0,1,1,197,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 415,195,370,245],0,1,1,198,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 415,195,420,245],0,1,1,199,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 415,195,475,245],0,1,1,200,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 470,125,485,175],0,1,1,206,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 420,195,510,220],0,1,1,207,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -poly('black','',2,[ - 565,125,635,175],0,1,1,208,0,0,0,0,0,0,0,'1',0,0, - "0","",[ - 0,8,3,0,'8','3','0'],[0,8,3,0,'8','3','0'],[ -]). -text('black',635,178,1,1,1,12,15,215,12,3,0,0,0,0,2,12,15,0,0,"",0,0,0,0,190,'',[ -minilines(12,15,0,0,1,0,0,[ -mini_line(12,12,3,0,0,0,[ -str_block(0,12,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,12,12,3,0,-1,0,0,0,0,0, - "...")]) -]) -])]). diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..9534b01 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/objrel.obj b/docs/objrel.obj deleted file mode 100644 index 12a0f56..0000000 --- a/docs/objrel.obj +++ /dev/null @@ -1,187 +0,0 @@ -%TGIF 4.2.2 -state(0,37,100.000,0,0,0,16,1,9,1,1,1,0,0,2,1,1,'Helvetica-Bold',1,69120,0,0,1,10,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0). 
-% -% @(#)$Header$ -% %W% -% -unit("1 pixel/pixel"). -color_info(19,65535,0,[ - "magenta", 65535, 0, 65535, 65535, 0, 65535, 1, - "red", 65535, 0, 0, 65535, 0, 0, 1, - "green", 0, 65535, 0, 0, 65535, 0, 1, - "blue", 0, 0, 65535, 0, 0, 65535, 1, - "yellow", 65535, 65535, 0, 65535, 65535, 0, 1, - "pink", 65535, 49344, 52171, 65535, 49344, 52171, 1, - "cyan", 0, 65535, 65535, 0, 65535, 65535, 1, - "CadetBlue", 24415, 40606, 41120, 24415, 40606, 41120, 1, - "white", 65535, 65535, 65535, 65535, 65535, 65535, 1, - "black", 0, 0, 0, 0, 0, 0, 1, - "DarkSlateGray", 12079, 20303, 20303, 12079, 20303, 20303, 1, - "#00000000c000", 0, 0, 49344, 0, 0, 49152, 1, - "#820782070000", 33410, 33410, 0, 33287, 33287, 0, 1, - "#3cf3fbee34d2", 15420, 64507, 13364, 15603, 64494, 13522, 1, - "#3cf3fbed34d3", 15420, 64507, 13364, 15603, 64493, 13523, 1, - "#ffffa6990000", 65535, 42662, 0, 65535, 42649, 0, 1, - "#ffff0000fffe", 65535, 0, 65535, 65535, 0, 65534, 1, - "#fffe0000fffe", 65535, 0, 65535, 65534, 0, 65534, 1, - "#fffe00000000", 65535, 0, 0, 65534, 0, 0, 1 -]). -script_frac("0.6"). -fg_bg_colors('black','white'). -dont_reencode("FFDingbests:ZapfDingbats"). -objshadow_info('#c0c0c0',2,2). -rotate_pivot(0,0,0,0). -spline_tightness(1). -page(1,"",1,''). -oval('black','',350,380,450,430,2,2,1,88,0,0,0,0,0,'2',0,[ -]). -poly('black','',2,[ - 270,270,350,230],1,2,1,54,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]). -poly('black','',2,[ - 270,280,350,320],1,2,1,55,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]). -box('black','',350,100,450,150,2,2,1,2,0,0,0,0,0,'2',0,[ -]). -text('black',400,118,1,1,1,84,15,3,12,3,0,0,0,0,2,84,15,0,0,"",0,0,0,0,130,'',[ -minilines(84,15,0,0,1,0,0,[ -mini_line(84,12,3,0,0,0,[ -str_block(0,84,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,84,12,3,0,0,0,0,0,0,0, - "PDFDocument")]) -]) -])]). -box('black','',150,100,250,150,2,2,1,13,0,0,0,0,0,'2',0,[ -]). -text('black',200,118,1,1,1,63,15,14,12,3,0,0,0,0,2,63,15,0,0,"",0,0,0,0,130,'',[ -minilines(63,15,0,0,1,0,0,[ -mini_line(63,12,3,0,0,0,[ -str_block(0,63,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,63,12,3,0,0,0,0,0,0,0, - "PDFParser")]) -]) -])]). -box('black','',350,200,450,250,2,2,1,20,0,0,0,0,0,'2',0,[ -]). -text('black',400,218,1,1,1,88,15,21,12,3,0,0,0,0,2,88,15,0,0,"",0,0,0,0,230,'',[ -minilines(88,15,0,0,1,0,0,[ -mini_line(88,12,3,0,0,0,[ -str_block(0,88,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,88,12,3,0,0,0,0,0,0,0, - "PDFInterpreter")]) -]) -])]). -box('black','',350,300,450,350,2,2,1,23,0,0,0,0,0,'2',0,[ -]). -text('black',400,318,1,1,1,65,15,24,12,3,0,0,0,0,2,65,15,0,0,"",0,0,0,0,330,'',[ -minilines(65,15,0,0,1,0,0,[ -mini_line(65,12,3,0,0,0,[ -str_block(0,65,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,65,12,3,0,-1,0,0,0,0,0, - "PDFDevice")]) -]) -])]). -box('black','',180,250,280,300,2,2,1,29,0,0,0,0,0,'2',0,[ -]). -text('black',230,268,1,1,1,131,15,30,12,3,2,0,0,0,2,131,15,0,0,"",0,0,0,0,280,'',[ -minilines(131,15,0,0,1,0,0,[ -mini_line(131,12,3,0,0,0,[ -str_block(0,131,12,3,0,0,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,131,12,3,0,0,0,0,0,0,0, - "PDFResourceManager")]) -]) -])]). -poly('black','',2,[ - 250,140,350,140],1,2,1,45,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]). -poly('black','',2,[ - 350,110,250,110],1,2,1,46,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]). 
-poly('black','',2,[ - 400,150,400,200],1,2,1,47,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]). -poly('black','',2,[ - 400,250,400,300],1,2,1,56,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]). -poly('black','',2,[ - 400,350,400,380],0,2,1,65,0,0,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]). -text('black',400,388,3,1,1,44,41,71,12,3,0,-2,0,0,2,44,41,0,0,"",0,0,0,0,400,'',[ -minilines(44,41,0,0,1,-2,0,[ -mini_line(44,12,3,0,0,0,[ -str_block(0,44,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,44,12,3,0,-1,0,0,0,0,0, - "Display")]) -]), -mini_line(20,12,3,0,0,0,[ -str_block(0,20,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,20,12,3,0,-1,0,0,0,0,0, - "File")]) -]), -mini_line(23,12,3,0,0,0,[ -str_block(0,23,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,23,12,3,0,-1,0,0,0,0,0, - "etc.")]) -]) -])]). -text('black',300,88,1,1,1,92,15,79,12,3,0,0,0,0,2,92,15,0,0,"",0,0,0,0,100,'',[ -minilines(92,15,0,0,1,0,0,[ -mini_line(92,12,3,0,0,0,[ -str_block(0,92,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,92,12,3,0,-1,0,0,0,0,0, - "request objects")]) -]) -])]). -text('black',300,148,1,1,1,78,15,84,12,3,0,0,0,0,2,78,15,0,0,"",0,0,0,0,160,'',[ -minilines(78,15,0,0,1,0,0,[ -mini_line(78,12,3,0,0,0,[ -str_block(0,78,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,78,12,3,0,-1,0,0,0,0,0, - "store objects")]) -]) -])]). -oval('black','',20,100,120,150,2,2,1,106,0,0,0,0,0,'2',0,[ -]). -text('black',70,118,1,1,1,46,15,107,12,3,0,0,0,0,2,46,15,0,0,"",0,0,0,0,130,'',[ -minilines(46,15,0,0,1,0,0,[ -mini_line(46,12,3,0,0,0,[ -str_block(0,46,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,46,12,3,0,-1,0,0,0,0,0, - "PDF file")]) -]) -])]). -poly('black','',2,[ - 120,120,150,120],0,2,1,114,0,2,0,0,0,0,0,'2',0,0, - "0","",[ - 0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[ -]). -text('black',400,158,1,1,1,84,15,115,12,3,2,0,0,0,2,84,15,0,0,"",0,0,0,0,170,'',[ -minilines(84,15,0,0,1,0,0,[ -mini_line(84,12,3,0,0,0,[ -str_block(0,84,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,84,12,3,0,-1,0,0,0,0,0, - "page contents")]) -]) -])]). -text('black',400,258,1,1,1,129,15,119,12,3,2,0,0,0,2,129,15,0,0,"",0,0,0,0,270,'',[ -minilines(129,15,0,0,1,0,0,[ -mini_line(129,12,3,0,0,0,[ -str_block(0,129,12,3,0,-1,0,0,0,[ -str_seg('black','Helvetica-Bold',1,69120,129,12,3,0,-1,0,0,0,0,0, - "rendering instructions")]) -]) -])]). 
diff --git a/docs/objrel.png b/docs/objrel.png deleted file mode 100644 index 3b9f5b6275f8f3f4f49762002bcbbd3be93db1d2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2038 zcmV8%Z1h@U)6S2&A0%a6;*xo~*Cl?u!A*KD1DUaQzpQ zOUU=>#8wwl8-_q5scGmfF41zV~g`um~;)brOAb6u> zjWf%VH*`w^LGy98@-%PB>vPr>45qV2Pjy!Wfp_8iN3Kzy=PhyYyu*$uiE)N4047R46FNabDERKh6tG!2K z;V6Y+JLwi`8Wh$FULgsENBzS3XZ-}54RZcR(sm8*ar2nJF0WD!bi0|D!PpepT}FSEI}YYU=|_J4Acoje)PJ6 zK)zfHpXa;bDHax;6aU>>@=~+CnWbv?W|EKNV_<7#_esIcntj+YF6<%)-{#W!G0@Ue zmKK22o?YQ!KK*unp}ZV|#5&)hzprM?93*mqhdY%7iI0K0oFV#RjzOpK_B@q%IFp}= zSFVT@?4x?F>dTbqa&FLK$7%8V5j@DJ4`y%QX~7mPr$q;T^uuMFhx0BY5HK z<3Y}Wl!Rf*Qss5$t-c<>_L@z&qk51`A1u_%A=uh$?d~+l`F1{d<=oiFYt@86^xc8J zAJHQKddxwOa*^XImp=uK>m2mT3cZ$#Ts=lYo1NdmdNr_~9VOrjIT}VJ_69A}?hMMq`(6mza9IB^ztB~- zpRSaFkkU$`bGBp<0)$tzLOc4B(HV@}qVSd)B?r!S20J8C*rrC+ft}7^Fcj`%#h`8< z=Yuu(9o6Q1ogLM+aVu)<-qef^$G|xL}M*l>J1EfeI#ez4jpn-KW#Wil~6SWJgVFP z0)>PtfgpdSLqku(quc~e-~|46;b8#@hY_9!gSA2C!QgPu0ENRn+=IbC;Sjjc{TJZ4 z#ry+K7lWnC+ADW)T?E4Gj9JCOpK``Zc|m!eQ1+p)LP+f6JoyNNrRz;+TR(qB z$QkUdRS{9ggWA-8o-5@NBLXFE?{VsAv?>`rSUCmb%hpoUH5Qb%cQ5+`yMPbZ!AUl$>8xVd#dsHL%soeIIDVa|K0dN<0w@Uz#&&VsHd-d_}#>hC{n zy)Q2(?}aU|&`z8zW~Vo##SERvQ1VUge(1flDN(XnqbR?m+pnCwCUje|w^|~~bi;vf zXw!k}sW7Y(WuL)a4_=8*Ivq+DvkgO%GnGcBLX`1+90%(}Pc&#z-fDc_YLwOSNA>Ph z9OlNr+GdM=s=!HRHBi-wgr=M~wegUr*GXIy)^aPG+b@ZtHn);`IzuGk7EQIe`GzR@+Uv~RO6Hx$SKb57<+cOO#)d9u zisG=DXMJ4E%iD(T^idqvEOu0X+udo>=RYo$g`q!n{Y7y|v9Rthg656~hAn>;0BXJ} zaroIkbpZN! zcQQH@9DsZ$L6GlcbPPGT&~-ccMOVnd&L97GC;_K7%xY6F7krIDua`?8S!9UBO@}HXVx<;IPYnx)z7SlE%JzaZ3p7O}Q`L z(^3%Fmb}*~D6H|X5IFe53vTQS0EbVlxv?(*96l%K#=ZdX)|Uf7r=7Uw{>qEB4S=V% zt$M#JV^1&wpk&*r72`aoxmj_@nn}qo^kvO$TpY6JF;hR#qEc`lcmzP_Uuu<}GiX8} zi*cmZGWx3RjsxP5ojld)s}|sR&H!3C z=p{I8&BCF(VX->M7A@l#3soF!Z6Kj#Oo8Ez2Q~Hrx^gTJx^ES9t=@X@xSxmJk6i1p z5nzx&BLMd}IBd+}9tVewa_)2P{Vlcx9m)F;IFhqDs{Vns3!FRHiKMh@gJ&z&4yA*( zfirT+g}@n^tpTbGLe~InwLvQsvFd0`cD?x&vCcuStk7#Y{OZvQ%@hz2y+Y>F-$ON8 USqtW(82|tP07*qoM6N<$f`cLBtN;K2 diff --git a/docs/programming.html b/docs/programming.html deleted file mode 100644 index dc4c186..0000000 --- a/docs/programming.html +++ /dev/null @@ -1,223 +0,0 @@ - - - - - -Programming with PDFMiner - - - -
- -Last Modified: Mon Mar 24 11:49:28 UTC 2014 - -
- -

-[Back to PDFMiner homepage] - -

Programming with PDFMiner

-

-This page explains how to use PDFMiner as a library -from other applications. -

- -

Overview

-

-PDF is evil. Although it is called a PDF
-"document", it's nothing like a Word or HTML document. PDF is more
-like a graphic representation. PDF contents are just a bunch of
-instructions that tell how to place the stuff at each exact
-position on a display or paper. In most cases, it has no logical
-structure such as sentences or paragraphs and it cannot adapt
-itself when the paper size changes. PDFMiner attempts to
-reconstruct some of those structures by guessing from their
-positioning, but nothing is guaranteed to work. Ugly, I
-know. Again, PDF is evil.

-[More technical details about the internal structure of PDF: -"How to Extract Text Contents from PDF Manually" -(part 1) -(part 2) -(part 3)] - -

-Because a PDF file has such a big and complex structure, -parsing a PDF file as a whole is time and memory consuming. However, -not every part is needed for most PDF processing tasks. Therefore -PDFMiner takes a strategy of lazy parsing, which is to parse the -stuff only when it's necessary. To parse PDF files, you need to use at -least two classes: PDFParser and PDFDocument. -These two objects are associated with each other. -PDFParser fetches data from a file, -and PDFDocument stores it. You'll also need -PDFPageInterpreter to process the page contents -and PDFDevice to translate it to whatever you need. -PDFResourceManager is used to store -shared resources such as fonts or images. - -

-Figure 1 shows the relationship between the classes in PDFMiner. - -

-
-Figure 1. Relationships between PDFMiner classes -
- -

Basic Usage

-

-A typical way to parse a PDF file is the following: -

-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfpage import PDFTextExtractionNotAllowed
-from pdfminer.pdfinterp import PDFResourceManager
-from pdfminer.pdfinterp import PDFPageInterpreter
-from pdfminer.pdfdevice import PDFDevice
-
-# Open a PDF file.
-fp = open('mypdf.pdf', 'rb')
-# Create a PDF parser object associated with the file object.
-parser = PDFParser(fp)
-# Create a PDF document object that stores the document structure.
-# Supply the password for initialization.
-document = PDFDocument(parser, password)
-# Check if the document allows text extraction. If not, abort.
-if not document.is_extractable:
-    raise PDFTextExtractionNotAllowed
-# Create a PDF resource manager object that stores shared resources.
-rsrcmgr = PDFResourceManager()
-# Create a PDF device object.
-device = PDFDevice(rsrcmgr)
-# Create a PDF interpreter object.
-interpreter = PDFPageInterpreter(rsrcmgr, device)
-# Process each page contained in the document.
-for page in PDFPage.create_pages(document):
-    interpreter.process_page(page)
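For comparison, a minimal sketch using the high-level wrapper that this patch adds and documents (pdfminer.high_level.extract_text); the file name is only an example:

from pdfminer.high_level import extract_text

# Extract all text from the document in one call.
# For password-protected files, pass password='...'.
text = extract_text('mypdf.pdf')
print(text)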
-
- -

Performing Layout Analysis

-

-Here is a typical way to use the layout analysis function: -

-from pdfminer.layout import LAParams
-from pdfminer.converter import PDFPageAggregator
-
-# Set parameters for analysis.
-laparams = LAParams()
-# Create a PDF page aggregator object.
-device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-interpreter = PDFPageInterpreter(rsrcmgr, device)
-for page in PDFPage.create_pages(document):
-    interpreter.process_page(page)
-    # receive the LTPage object for the page.
-    layout = device.get_result()
-
- -A layout analyzer returns a LTPage object for each page -in the PDF document. This object contains child objects within the page, -forming a tree structure. Figure 2 shows the relationship between -these objects. - -
-
-Figure 2. Layout objects and its tree structure -
- -
-
LTPage -
Represents an entire page. May contain child objects like -LTTextBox, LTFigure, LTImage, LTRect, -LTCurve and LTLine. - -
LTTextBox -
Represents a group of text chunks that can be contained in a rectangular area.
-Note that this box is created by geometric analysis and does not necessarily
-represent a logical boundary of the text.
-It contains a list of LTTextLine objects.
-The get_text() method returns the text content.
LTTextLine -
Contains a list of LTChar objects that represent
-a single text line. The characters are aligned either horizontally
-or vertically, depending on the text's writing mode.
-The get_text() method returns the text content.
LTChar -
LTAnno -
Represent an actual letter in the text as a Unicode string.
-Note that, while an LTChar object has actual boundaries,
-LTAnno objects do not, as these are "virtual" characters,
-inserted by the layout analyzer according to the relationship between two characters
-(e.g. a space).
LTFigure -
Represents an area used by PDF Form objects. PDF Forms can be used to -present figures or pictures by embedding yet another PDF document within a page. -Note that LTFigure objects can appear recursively. - -
LTImage -
Represents an image object. Embedded images can be -in JPEG or other formats, but currently PDFMiner does not -pay much attention to graphical objects. - -
LTLine -
Represents a single straight line. -Could be used for separating text or figures. - -
LTRect -
Represents a rectangle.
-Could be used for framing other pictures or figures.
LTCurve -
Represents a generic Bezier curve. -
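Putting the pieces together, a minimal sketch of walking this tree for the layout obtained from device.get_result() above (only the layout classes described here are assumed):

from pdfminer.layout import LTTextBox, LTChar

# layout is the LTPage object returned by device.get_result().
for element in layout:
    if isinstance(element, LTTextBox):
        print(element.get_text())        # text of the whole box
        for line in element:             # LTTextLine objects
            for obj in line:             # LTChar and LTAnno objects
                if isinstance(obj, LTChar):
                    pass  # obj.get_text() is a single character; obj.bbox is its bounding box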
- -

-Also, check out a more complete example by Denis Papathanasiou (Extracting Text & Images from PDF Files).

Obtaining Table of Contents

-

-PDFMiner provides functions to access the document's table of contents -("Outlines"). - -

-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfdocument import PDFDocument
-
-# Open a PDF document.
-fp = open('mypdf.pdf', 'rb')
-parser = PDFParser(fp)
-document = PDFDocument(parser, password)
-
-# Get the outlines of the document.
-outlines = document.get_outlines()
-for (level,title,dest,a,se) in outlines:
-    print (level, title)
-
- -

-Some PDF documents use page numbers as destinations, while others -use page numbers and the physical location within the page. Since -PDF does not have a logical structure, and it does not provide a -way to refer to any in-page object from the outside, there's no -way to tell exactly which part of text these destinations are -referring to. - -

Extending Functionality

- -

-You can extend PDFPageInterpreter and PDFDevice class -in order to process them differently / obtain other information. - -


-
Yusuke Shinyama
- diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..1e76fda --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1 @@ +sphinx-argparse \ No newline at end of file diff --git a/docs/source/_static/layout_analysis.html b/docs/source/_static/layout_analysis.html new file mode 100644 index 0000000..3db7b00 --- /dev/null +++ b/docs/source/_static/layout_analysis.html @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + +
[HTML figure: the characters of "Quick brown" with arrows marking the char_margin (M) and word_margin (W) distances between character bounding boxes]
\ No newline at end of file diff --git a/docs/source/_static/layout_analysis_group_boxes.html b/docs/source/_static/layout_analysis_group_boxes.html new file mode 100644 index 0000000..23889da --- /dev/null +++ b/docs/source/_static/layout_analysis_group_boxes.html @@ -0,0 +1,23 @@ + + + + + + + + + + + + + +
[HTML figure: two text boxes, "Quick brown / fox" and "jumps ...", with the area between them highlighted]
\ No newline at end of file diff --git a/docs/source/_static/layout_analysis_group_lines.html b/docs/source/_static/layout_analysis_group_lines.html new file mode 100644 index 0000000..d3bd852 --- /dev/null +++ b/docs/source/_static/layout_analysis_group_lines.html @@ -0,0 +1,45 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
[HTML figure: the lines "Quick brown" and "fox" with the gap between their tops marked L1 and the gap between their bottoms marked L2]
\ No newline at end of file diff --git a/docs/layout.png b/docs/source/_static/layout_analysis_output.png similarity index 100% rename from docs/layout.png rename to docs/source/_static/layout_analysis_output.png diff --git a/docs/source/api/commandline.rst b/docs/source/api/commandline.rst new file mode 100644 index 0000000..7e4aba3 --- /dev/null +++ b/docs/source/api/commandline.rst @@ -0,0 +1,25 @@ +.. _api_commandline: + + +Command-line API +**************** + +.. _api_pdf2txt: + +pdf2txt.py +========== + +.. argparse:: + :module: tools.pdf2txt + :func: maketheparser + :prog: python tools/pdf2txt.py + +.. _api_dumppdf: + +dumppdf.py +========== + +.. argparse:: + :module: tools.dumppdf + :func: create_parser + :prog: python tools/dumppdf.py diff --git a/docs/source/api/composable.rst b/docs/source/api/composable.rst new file mode 100644 index 0000000..7ccc184 --- /dev/null +++ b/docs/source/api/composable.rst @@ -0,0 +1,20 @@ +.. _api_composable: + +Composable API +************** + +.. _api_laparams: + +LAParams +======== + +.. currentmodule:: pdfminer.layout +.. autoclass:: LAParams + +Todo: +===== + +- `PDFDevice` + - `TextConverter` + - `PDFPageAggregator` +- `PDFPageInterpreter` \ No newline at end of file diff --git a/docs/source/api/highlevel.rst b/docs/source/api/highlevel.rst new file mode 100644 index 0000000..4f34b46 --- /dev/null +++ b/docs/source/api/highlevel.rst @@ -0,0 +1,21 @@ +.. _api_highlevel: + +High-level functions API +************************ + +.. _api_extract_text: + +extract_text +============ + +.. currentmodule:: pdfminer.high_level +.. autofunction:: extract_text + + +.. _api_extract_text_to_fp: + +extract_text_to_fp +================== + +.. currentmodule:: pdfminer.high_level +.. autofunction:: extract_text_to_fp diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst new file mode 100644 index 0000000..047ed2d --- /dev/null +++ b/docs/source/api/index.rst @@ -0,0 +1,9 @@ +API documentation +***************** + +.. toctree:: + :maxdepth: 2 + + commandline + highlevel + composable diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..a3a6be7 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,61 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import os +import sys +sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../')) + + +# -- Project information ----------------------------------------------------- + +project = 'pdfminer.six' +copyright = '2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman' +author = 'Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman' + +# The full version, including alpha/beta/rc tags +release = '20191020' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinxarg.ext', + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', +] + +# Root rst file +master_doc = 'index' + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..dbec983 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,72 @@ +Welcome to pdfminer.six's documentation! +**************************************** + +.. image:: https://travis-ci.org/pdfminer/pdfminer.six.svg?branch=master + :target: https://travis-ci.org/pdfminer/pdfminer.six + :alt: Travis-ci build badge + +.. image:: https://img.shields.io/pypi/v/pdfminer.six.svg + :target: https://pypi.python.org/pypi/pdfminer.six/ + :alt: PyPi version badge + +.. image:: https://badges.gitter.im/pdfminer-six/Lobby.svg + :target: https://gitter.im/pdfminer-six/Lobby?utm_source=badge&utm_medium + :alt: gitter badge + + +Pdfminer.six is a python package for extracting information from PDF documents. + +Check out the source on `github `_. + +Content +======= + +.. toctree:: + :maxdepth: 2 + + tutorials/index + topics/index + api/index + + +Features +======== + +* Parse all objects from a PDF document into Python objects. +* Analyze and group text in a human-readable way. +* Extract text, images (JPG, JBIG2 and Bitmaps), table-of-contents, tagged + contents and more. +* Support for (almost all) features from the PDF-1.7 specification +* Support for Chinese, Japanese and Korean CJK) languages as well as vertical + writing. +* Support for various font types (Type1, TrueType, Type3, and CID). +* Support for basic encryption (RC4). + + +Installation instructions +========================= + +Before using it, you must install it using Python 2.7 or newer. + +:: + + $ pip install pdfminer.six + +Note that Python 2.7 support is dropped at January, 2020. + +Common use-cases +---------------- + +* :ref:`tutorial_commandline` if you just want to extract text from a pdf once. +* :ref:`tutorial_highlevel` if you want to integrate pdfminer.six with your + Python code. +* :ref:`tutorial_composable` when you want to tailor the behavior of + pdfmine.six to your needs. + + +Contributing +============ + +We welcome any contributors to pdfminer.six! But, before doing anything, take +a look at the `contribution guide +`_. diff --git a/docs/source/topics/converting_pdf_to_text.rst b/docs/source/topics/converting_pdf_to_text.rst new file mode 100644 index 0000000..b192d49 --- /dev/null +++ b/docs/source/topics/converting_pdf_to_text.rst @@ -0,0 +1,132 @@ +.. _topic_pdf_to_text: + +Converting a PDF file to text +***************************** + +Most PDF files look like they contain well structured text. 
But the reality is +that a PDF file does not contain anything that resembles paragraphs, +sentences or even words. When it comes to text, a PDF file is only aware of +the characters and their placement. + +This makes extracting meaningful pieces of text from PDF files difficult. +The characters that compose a paragraph are no different from those that +compose a table, a page footer or the description of a figure. Unlike +other document formats, like a `.txt` file or a Word document, the PDF format +does not contain a stream of text. + +A PDF document consists of a collection of objects that together describe +the appearance of one or more pages, possibly accompanied by additional +interactive elements and higher-level application data. A PDF file contains +the objects making up a PDF document along with associated structural +information, all represented as a single self-contained sequence of bytes. [1]_ + +Layout analysis algorithm +========================= + +PDFMiner attempts to reconstruct some of those structures by using heuristics +on the positioning of characters. This works well for sentences and +paragraphs because meaningful groups of nearby characters can be made. + +The layout analysis consists of three different stages: it groups characters +into words and lines, then it groups lines into boxes and finally it groups +textboxes hierarchically. These stages are discussed in the following +sections. The resulting output of the layout analysis is an ordered hierarchy +of layout objects on a PDF page. + +.. figure:: ../_static/layout_analysis_output.png + :align: center + + The output of the layout analysis is a hierarchy of layout objects. + + +The output of the layout analysis heavily depends on a couple of parameters. +All these parameters are part of the :ref:`api_laparams` class. + +Grouping characters into words and lines +---------------------------------------- + +The first step in going from characters to text is to group characters in a +meaningful way. Each character has an x-coordinate and a y-coordinate for its +bottom-left corner and upper-right corner, i.e. its bounding box. +Pdfminer.six uses these bounding boxes to decide which characters belong together. + +Characters that are both horizontally and vertically close are grouped. How +close they should be is determined by the `char_margin` (M in the figure) and the +`line_overlap` (not in figure) parameters. The horizontal *distance* between the +bounding boxes of two characters should be smaller than the `char_margin` and +the vertical *overlap* between the bounding boxes should be larger than the +`line_overlap`. + + +.. raw:: html + :file: ../_static/layout_analysis.html + +The values of `char_margin` and `line_overlap` are relative to the size of +the bounding boxes of the characters. The `char_margin` is relative to the +maximum width of either one of the bounding boxes, and the `line_overlap` is +relative to the minimum height of either one of the bounding boxes. + +Spaces need to be inserted between characters because the PDF format has no +notion of the space character. A space is inserted if the characters are +further apart than the `word_margin` (W in the figure). The `word_margin` is +relative to the maximum width or height of the new character. Having a smaller +`word_margin` creates smaller words and inserts spaces between characters +more often. Note that the `word_margin` should be smaller than the +`char_margin`, otherwise none of the characters will be separated by a space.
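As a minimal, illustrative sketch (`example.pdf` is a placeholder and the values shown are simply the defaults written out explicitly), these margins are passed to pdfminer.six through :ref:`api_laparams`, for example via :ref:`api_extract_text_to_fp`::

    from io import StringIO

    from pdfminer.high_level import extract_text_to_fp
    from pdfminer.layout import LAParams

    # line_overlap, char_margin and word_margin are the grouping parameters
    # discussed above; the values below are the defaults made explicit.
    laparams = LAParams(line_overlap=0.5, char_margin=2.0, word_margin=0.1)

    output = StringIO()
    with open('example.pdf', 'rb') as pdf_file:
        extract_text_to_fp(pdf_file, output, laparams=laparams)
    print(output.getvalue())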
+ +The result of this stage is a list of lines. Each line consists of a list of +characters. These characters are either original `LTChar` characters that +originate from the PDF file, or inserted `LTAnno` characters that +represent spaces between words or newlines at the end of each line. + +Grouping lines into boxes +------------------------- + +The second step is grouping lines in a meaningful way. Each line has a +bounding box that is determined by the bounding boxes of the characters that +it contains. Like grouping characters, pdfminer.six uses the bounding boxes +to group the lines. + +Lines that are both horizontally overlapping and vertically close are grouped. +How vertically close the lines should be is determined by the `line_margin`. +This margin is specified relative to the height of the bounding box. Lines +are close if the gaps between their tops (see L :sub:`1` in the figure) and bottoms +(see L :sub:`2` in the figure) are smaller +than the absolute line margin, i.e. the `line_margin` multiplied by the +height of the bounding box. + +.. raw:: html + :file: ../_static/layout_analysis_group_lines.html + +The result of this stage is a list of text boxes. Each box consists of a list +of lines. + +Grouping textboxes hierarchically +--------------------------------- + +The last step is to group the text boxes in a meaningful way. This step +repeatedly merges the two text boxes that are closest to each other. + +The closeness of bounding boxes is computed as the area that is between the +two text boxes (the blue area in the figure). In other words, it is the area of +the bounding box that surrounds both text boxes, minus the area of the bounding +boxes of the individual text boxes. + +.. raw:: html + :file: ../_static/layout_analysis_group_boxes.html + + +Working with rotated characters +=============================== + +The algorithm described above assumes that all characters have the same +orientation. However, any writing direction is possible in a PDF. To +accommodate this, pdfminer.six can detect vertical writing with the +`detect_vertical` parameter. This will apply all the grouping steps as if the +PDF was rotated 90 (or 270) degrees. + +References +========== + +.. [1] Adobe Systems Inc. (2007). *PDF reference: Adobe portable document + format, version 1.7.* diff --git a/docs/source/topics/index.rst b/docs/source/topics/index.rst new file mode 100644 index 0000000..30c000a --- /dev/null +++ b/docs/source/topics/index.rst @@ -0,0 +1,7 @@ +Using pdfminer.six +****************** + +.. toctree:: + :maxdepth: 2 + + converting_pdf_to_text diff --git a/docs/source/tutorials/commandline.rst b/docs/source/tutorials/commandline.rst new file mode 100644 index 0000000..2477c65 --- /dev/null +++ b/docs/source/tutorials/commandline.rst @@ -0,0 +1,41 @@ +.. _tutorial_commandline: + +Get started with command-line tools +*********************************** + +pdfminer.six has several tools that can be used from the command line. The +command-line tools are aimed at users that occasionally want to extract text +from a PDF. + +Take a look at the high-level or composable interface if you want to use +pdfminer.six programmatically. + +Examples +======== + +pdf2txt.py +---------- + +:: + + $ python tools/pdf2txt.py example.pdf + all the text from the pdf appears on the command line + +The :ref:`api_pdf2txt` tool extracts all the text from a PDF. It uses layout +analysis with sensible defaults to order and group the text in a meaningful way.
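The output format and destination can also be set; as an illustrative invocation (again with `example.pdf` as a placeholder), HTML output can be written to a file::

    $ python tools/pdf2txt.py -t html -o example.html example.pdf

Here `-t` selects the output type (text, html, xml or tag) and `-o` the file to write to.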
+ +dumppdf.py +---------- + +:: + + $ python tools/dumppdf.py -a example.pdf + + ... + + ... + + +The :ref:`api_dumppdf` tool can be used to extract the internal structure from a +PDF. This tool is primarily for debugging purposes, but that can be useful to +anybody working with PDF's. diff --git a/docs/source/tutorials/composable.rst b/docs/source/tutorials/composable.rst new file mode 100644 index 0000000..971332b --- /dev/null +++ b/docs/source/tutorials/composable.rst @@ -0,0 +1,33 @@ +.. _tutorial_composable: + +Get started using the composable components API +*********************************************** + +The command line tools and the high-level API are just shortcuts for often +used combinations of pdfminer.six components. You can use these components to +modify pdfminer.six to your own needs. + +For example, to extract the text from a PDF file and save it in a python +variable:: + + from io import StringIO + + from pdfminer.converter import TextConverter + from pdfminer.layout import LAParams + from pdfminer.pdfdocument import PDFDocument + from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter + from pdfminer.pdfpage import PDFPage + from pdfminer.pdfparser import PDFParser + + output_string = StringIO() + with open('samples/simple1.pdf', 'rb') as in_file: + parser = PDFParser(in_file) + doc = PDFDocument(parser) + rsrcmgr = PDFResourceManager() + device = TextConverter(rsrcmgr, output_string, laparams=LAParams()) + interpreter = PDFPageInterpreter(rsrcmgr, device) + for page in PDFPage.create_pages(doc): + interpreter.process_page(page) + + print(output_string.getvalue()) + diff --git a/docs/source/tutorials/highlevel.rst b/docs/source/tutorials/highlevel.rst new file mode 100644 index 0000000..15424b9 --- /dev/null +++ b/docs/source/tutorials/highlevel.rst @@ -0,0 +1,67 @@ +.. testsetup:: + + import sys + from pdfminer.high_level import extract_text_to_fp, extract_text + +.. _tutorial_highlevel: + +Get started using the high-level functions +****************************************** + +The high-level API can be used to do common tasks. + +The most simple way to extract text from a PDF is to use +:ref:`api_extract_text`: + +.. doctest:: + + >>> text = extract_text('samples/simple1.pdf') + >>> print(repr(text)) + 'Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o \n\nH e l l o \n\nW o r l d\n\nW o r l d\n\n\x0c' + >>> print(text) + ... # doctest: +NORMALIZE_WHITESPACE + Hello + + World + + World + + Hello + + H e l l o + + H e l l o + + W o r l d + + W o r l d + + + +To read text from a PDF and print it on the command line: + +.. doctest:: + + >>> if sys.version_info > (3, 0): + ... from io import StringIO + ... else: + ... from io import BytesIO as StringIO + >>> output_string = StringIO() + >>> with open('samples/simple1.pdf', 'rb') as fin: + ... extract_text_to_fp(fin, output_string) + >>> print(output_string.getvalue().strip()) + Hello WorldHello WorldHello WorldHello World + +Or to convert it to html and use layout analysis: + +.. doctest:: + + >>> if sys.version_info > (3, 0): + ... from io import StringIO + ... else: + ... from io import BytesIO as StringIO + >>> from pdfminer.layout import LAParams + >>> output_string = StringIO() + >>> with open('samples/simple1.pdf', 'rb') as fin: + ... extract_text_to_fp(fin, output_string, laparams=LAParams(), + ... 
output_type='html', codec=None) diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst new file mode 100644 index 0000000..f1b5b17 --- /dev/null +++ b/docs/source/tutorials/index.rst @@ -0,0 +1,9 @@ +Getting started +*************** + +.. toctree:: + :maxdepth: 2 + + commandline + highlevel + composable diff --git a/docs/style.css b/docs/style.css deleted file mode 100644 index 612e308..0000000 --- a/docs/style.css +++ /dev/null @@ -1,4 +0,0 @@ -blockquote { background: #eeeeee; } -h1 { border-bottom: solid black 2px; } -h2 { border-bottom: solid black 1px; } -.comment { color: darkgreen; } diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 4e51e37..eaf0520 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import logging import re +import sys from .pdfdevice import PDFTextDevice from .pdffont import PDFUnicodeNotDefined from .layout import LTContainer @@ -271,6 +272,8 @@ class HTMLConverter(PDFConverter): def write(self, text): if self.codec: text = text.encode(self.codec) + if sys.version_info < (3, 0): + text = str(text) self.outfp.write(text) return diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index 2ce4276..17172c5 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -1,26 +1,20 @@ -# -*- coding: utf-8 -*- -""" -Functions that encapsulate "usual" use-cases for pdfminer, for use making -bundled scripts and for using pdfminer as a module for routine tasks. -""" - +"""Functions that can be used for the most common use-cases for pdfminer.six""" + import logging -import six import sys +import six + # Conditional import because python 2 is stupid if sys.version_info > (3, 0): from io import StringIO else: from io import BytesIO as StringIO -from .pdfdocument import PDFDocument -from .pdfparser import PDFParser from .pdfinterp import PDFResourceManager, PDFPageInterpreter -from .pdfdevice import PDFDevice, TagExtractor +from .pdfdevice import TagExtractor from .pdfpage import PDFPage from .converter import XMLConverter, HTMLConverter, TextConverter -from .cmapdb import CMapDB from .image import ImageWriter from .layout import LAParams @@ -35,21 +29,25 @@ def extract_text_to_fp(inf, outfp, Takes loads of optional arguments but the defaults are somewhat sane. Beware laparams: Including an empty LAParams is not the same as passing None! Returns nothing, acting as it does on two streams. Use StringIO to get strings. - - output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works properly. - codec: Text decoding codec - laparams: An LAParams object from pdfminer.layout. - Default is None but may not layout correctly. - maxpages: How many pages to stop parsing after - page_numbers: zero-indexed page numbers to operate on. - password: For encrypted PDFs, the password to decrypt. - scale: Scale factor - rotation: Rotation factor - layoutmode: Default is 'normal', see pdfminer.converter.HTMLConverter - output_dir: If given, creates an ImageWriter for extracted images. - strip_control: Does what it says on the tin - debug: Output more logging data - disable_caching: Does what it says on the tin + + :param inf: a file-like object to read PDF structure from, such as a + file handler (using the builtin `open()` function) or a `BytesIO`. + :param outfp: a file-like object to write the text to. + :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works properly. + :param codec: Text decoding codec + :param laparams: An LAParams object from pdfminer.layout. 
Default is None but may not layout correctly. + :param maxpages: How many pages to stop parsing after + :param page_numbers: zero-indexed page numbers to operate on. + :param password: For encrypted PDFs, the password to decrypt. + :param scale: Scale factor + :param rotation: Rotation factor + :param layoutmode: Default is 'normal', see pdfminer.converter.HTMLConverter + :param output_dir: If given, creates an ImageWriter for extracted images. + :param strip_control: Does what it says on the tin + :param debug: Output more logging data + :param disable_caching: Does what it says on the tin + :param other: + :return: """ if '_py2_no_more_posargs' in kwargs is not None: raise DeprecationWarning( @@ -67,7 +65,7 @@ def extract_text_to_fp(inf, outfp, imagewriter = None if output_dir: imagewriter = ImageWriter(output_dir) - + rsrcmgr = PDFResourceManager(caching=not disable_caching) if output_type == 'text': @@ -96,7 +94,7 @@ def extract_text_to_fp(inf, outfp, caching=not disable_caching, check_extractable=True): page.rotate = (page.rotate + rotation) % 360 - interpreter.process_page(page) + interpreter.process_page(page) device.close() diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 4ae7822..8b8e397 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,17 +1,15 @@ import heapq + from .utils import INF from .utils import Plane -from .utils import get_bound -from .utils import uniq -from .utils import fsplit -from .utils import bbox2str -from .utils import matrix2str from .utils import apply_matrix_pt +from .utils import bbox2str +from .utils import fsplit +from .utils import get_bound +from .utils import matrix2str +from .utils import uniq -import six # Python 2+3 compatibility -## IndexAssigner -## class IndexAssigner(object): def __init__(self, index=0): @@ -28,9 +26,33 @@ class IndexAssigner(object): return -## LAParams -## class LAParams(object): + """Parameters for layout analysis + + :param line_overlap: If two characters have more overlap than this they + are considered to be on the same line. The overlap is specified + relative to the minimum height of both characters. + :param char_margin: If two characters are closer together than this + margin they are considered to be part of the same word. If + characters are on the same line but not part of the same word, an + intermediate space is inserted. The margin is specified relative to + the width of the character. + :param word_margin: If two words are are closer together than this + margin they are considered to be part of the same line. A space is + added in between for readability. The margin is specified relative + to the width of the word. + :param line_margin: If two lines are are close together they are + considered to be part of the same paragraph. The margin is + specified relative to the height of a line. + :param boxes_flow: Specifies how much a horizontal and vertical position + of a text matters when determining the order of lines. The value + should be within the range of -1.0 (only horizontal position + matters) to +1.0 (only vertical position matters). + :param detect_vertical: If vertical text should be considered during + layout analysis + :param all_texts: If layout analysis should be performed on text in + figures. 
+ """ def __init__(self, line_overlap=0.5, @@ -54,30 +76,28 @@ class LAParams(object): (self.char_margin, self.line_margin, self.word_margin, self.all_texts)) -## LTItem -## class LTItem(object): + """Interface for things that can be analyzed""" def analyze(self, laparams): """Perform the layout analysis.""" return -## LTText -## class LTText(object): + """Interface for things that have text""" def __repr__(self): return ('<%s %r>' % (self.__class__.__name__, self.get_text())) def get_text(self): + """Text contained in this object""" raise NotImplementedError -## LTComponent -## class LTComponent(LTItem): + """Object with a bounding box""" def __init__(self, bbox): LTItem.__init__(self) @@ -91,10 +111,13 @@ class LTComponent(LTItem): # Disable comparison. def __lt__(self, _): raise ValueError + def __le__(self, _): raise ValueError + def __gt__(self, _): raise ValueError + def __ge__(self, _): raise ValueError @@ -149,9 +172,8 @@ class LTComponent(LTItem): return 0 -## LTCurve -## class LTCurve(LTComponent): + """A generic Bezier curve""" def __init__(self, linewidth, pts, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None): LTComponent.__init__(self, get_bound(pts)) @@ -168,18 +190,22 @@ class LTCurve(LTComponent): return ','.join('%.3f,%.3f' % p for p in self.pts) -## LTLine -## class LTLine(LTCurve): + """A single straight line. + + Could be used for separating text or figures. + """ def __init__(self, linewidth, p0, p1, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None): LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd, stroking_color, non_stroking_color) return -## LTRect -## class LTRect(LTCurve): + """A rectangle. + + Could be used for framing another pictures or figures. + """ def __init__(self, linewidth, bbox, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None): (x0, y0, x1, y1) = bbox @@ -187,9 +213,11 @@ class LTRect(LTCurve): return -## LTImage -## class LTImage(LTComponent): + """An image object. + + Embedded images can be in JPEG, Bitmap or JBIG2. + """ def __init__(self, name, stream, bbox): LTComponent.__init__(self, bbox) @@ -210,9 +238,13 @@ class LTImage(LTComponent): bbox2str(self.bbox), self.srcsize)) -## LTAnno -## class LTAnno(LTItem, LTText): + """Actual letter in the text as a Unicode string. + + Note that, while a LTChar object has actual boundaries, LTAnno objects does + not, as these are "virtual" characters, inserted by a layout analyzer + according to the relationship between two characters (e.g. a space). 
+ """ def __init__(self, text): self._text = text @@ -222,9 +254,8 @@ class LTAnno(LTItem, LTText): return self._text -## LTChar -## class LTChar(LTComponent, LTText): + """Actual letter in the text as a Unicode string.""" def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth, textdisp, ncs, graphicstate): @@ -285,9 +316,8 @@ class LTChar(LTComponent, LTText): return True -## LTContainer -## class LTContainer(LTComponent): + """Object that can be extended and analyzed""" def __init__(self, bbox): LTComponent.__init__(self, bbox) @@ -315,10 +345,7 @@ class LTContainer(LTComponent): return -## LTExpandableContainer -## class LTExpandableContainer(LTContainer): - def __init__(self): LTContainer.__init__(self, (+INF, +INF, -INF, -INF)) return @@ -330,10 +357,7 @@ class LTExpandableContainer(LTContainer): return -## LTTextContainer -## class LTTextContainer(LTExpandableContainer, LTText): - def __init__(self): LTText.__init__(self) LTExpandableContainer.__init__(self) @@ -343,9 +367,12 @@ class LTTextContainer(LTExpandableContainer, LTText): return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText)) -## LTTextLine -## class LTTextLine(LTTextContainer): + """Contains a list of LTChar objects that represent a single text line. + + The characters are aligned either horizontally or vertically, depending on + the text's writing mode. + """ def __init__(self, word_margin): LTTextContainer.__init__(self) @@ -367,7 +394,6 @@ class LTTextLine(LTTextContainer): class LTTextLineHorizontal(LTTextLine): - def __init__(self, word_margin): LTTextLine.__init__(self, word_margin) self._x1 = +INF @@ -393,7 +419,6 @@ class LTTextLineHorizontal(LTTextLine): class LTTextLineVertical(LTTextLine): - def __init__(self, word_margin): LTTextLine.__init__(self, word_margin) self._y0 = -INF @@ -418,12 +443,13 @@ class LTTextLineVertical(LTTextLine): abs(obj.y1-self.y1) < d))] -## LTTextBox -## -## A set of text objects that are grouped within -## a certain rectangular area. -## class LTTextBox(LTTextContainer): + """Represents a group of text chunks in a rectangular area. + + Note that this box is created by geometric analysis and does not necessarily + represents a logical boundary of the text. It contains a list of + LTTextLine objects. + """ def __init__(self): LTTextContainer.__init__(self) @@ -437,7 +463,6 @@ class LTTextBox(LTTextContainer): class LTTextBoxHorizontal(LTTextBox): - def analyze(self, laparams): LTTextBox.analyze(self, laparams) self._objs.sort(key=lambda obj: -obj.y1) @@ -448,7 +473,6 @@ class LTTextBoxHorizontal(LTTextBox): class LTTextBoxVertical(LTTextBox): - def analyze(self, laparams): LTTextBox.analyze(self, laparams) self._objs.sort(key=lambda obj: -obj.x1) @@ -458,10 +482,7 @@ class LTTextBoxVertical(LTTextBox): return 'tb-rl' -## LTTextGroup -## class LTTextGroup(LTTextContainer): - def __init__(self, objs): LTTextContainer.__init__(self) self.extend(objs) @@ -469,7 +490,6 @@ class LTTextGroup(LTTextContainer): class LTTextGroupLRTB(LTTextGroup): - def analyze(self, laparams): LTTextGroup.analyze(self, laparams) # reorder the objects from top-left to bottom-right. @@ -480,7 +500,6 @@ class LTTextGroupLRTB(LTTextGroup): class LTTextGroupTBRL(LTTextGroup): - def analyze(self, laparams): LTTextGroup.analyze(self, laparams) # reorder the objects from top-right to bottom-left. 
@@ -490,10 +509,7 @@ class LTTextGroupTBRL(LTTextGroup): return -## LTLayoutContainer -## class LTLayoutContainer(LTContainer): - def __init__(self, bbox): LTContainer.__init__(self, bbox) self.groups = None @@ -709,9 +725,13 @@ class LTLayoutContainer(LTContainer): return -## LTFigure -## class LTFigure(LTLayoutContainer): + """Represents an area used by PDF Form objects. + + PDF Forms can be used to present figures or pictures by embedding yet + another PDF document within a page. Note that LTFigure objects can appear + recursively. + """ def __init__(self, name, bbox, matrix): self.name = name @@ -734,9 +754,12 @@ class LTFigure(LTLayoutContainer): return -## LTPage -## class LTPage(LTLayoutContainer): + """Represents an entire page. + + May contain child objects like LTTextBox, LTFigure, LTImage, LTRect, + LTCurve and LTLine. + """ def __init__(self, pageid, bbox, rotate=0): LTLayoutContainer.__init__(self, bbox) diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 54925f1..072980e 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -2,13 +2,13 @@ import six +from . import utils from .pdffont import PDFUnicodeNotDefined -from . import utils -## PDFDevice -## class PDFDevice(object): + """Translate the output of PDFPageInterpreter to the output that is needed + """ def __init__(self, rsrcmgr): self.rsrcmgr = rsrcmgr diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index de54835..0e692e2 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -318,9 +318,8 @@ class PDFContentParser(PSStackParser): return -## Interpreter -## class PDFPageInterpreter(object): + """Processor for the content of a PDF page""" def __init__(self, rsrcmgr, device): self.rsrcmgr = rsrcmgr diff --git a/setup.py b/setup.py index a13e4da..06d8106 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,10 @@ setup( 'six', 'sortedcontainers', ], - extras_require={"dev": ["nose", "tox"]}, + extras_require={ + "dev": ["nose", "tox"], + "docs": ["sphinx", "sphinx-argparse"], + }, description='PDF parser and analyzer', long_description=package.__doc__, license='MIT/X', diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 0e3a4ee..49b29b8 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -240,51 +240,51 @@ def create_parser(): help='One or more paths to PDF files.') parser.add_argument( - '-d', '--debug', default=False, action='store_true', + '--debug', '-d', default=False, action='store_true', help='Use debug logging level.') procedure_parser = parser.add_mutually_exclusive_group() procedure_parser.add_argument( - '-T', '--extract-toc', default=False, action='store_true', + '--extract-toc', '-T', default=False, action='store_true', help='Extract structure of outline') procedure_parser.add_argument( - '-E', '--extract-embedded', type=str, + '--extract-embedded', '-E', type=str, help='Extract embedded files') parse_params = parser.add_argument_group( 'Parser', description='Used during PDF parsing') parse_params.add_argument( - "--page-numbers", type=int, default=None, nargs="+", - help="A space-seperated list of page numbers to parse.") + '--page-numbers', type=int, default=None, nargs='+', + help='A space-seperated list of page numbers to parse.') parse_params.add_argument( - "-p", "--pagenos", type=str, - help="A comma-separated list of page numbers to parse. Included for " - "legacy applications, use --page-numbers for more idiomatic " - "argument entry.") + '--pagenos', '-p', type=str, + help='A comma-separated list of page numbers to parse. 
Included for ' + 'legacy applications, use --page-numbers for more idiomatic ' + 'argument entry.') parse_params.add_argument( - '-i', '--objects', type=str, + '--objects', '-i', type=str, help='Comma separated list of object numbers to extract') parse_params.add_argument( - '-a', '--all', default=False, action='store_true', + '--all', '-a', default=False, action='store_true', help='If the structure of all objects should be extracted') parse_params.add_argument( - '-P', '--password', type=str, default='', + '--password', '-P', type=str, default='', help='The password to use for decrypting PDF file.') output_params = parser.add_argument_group( 'Output', description='Used during output generation.') output_params.add_argument( - '-o', '--outfile', type=str, default='-', + '--outfile', '-o', type=str, default='-', help='Path to file where output is written. Or "-" (default) to ' 'write to stdout.') codec_parser = output_params.add_mutually_exclusive_group() codec_parser.add_argument( - '-r', '--raw-stream', default=False, action='store_true', + '--raw-stream', '-r', default=False, action='store_true', help='Write stream objects without encoding') codec_parser.add_argument( - '-b', '--binary-stream', default=False, action='store_true', + '--binary-stream', '-b', default=False, action='store_true', help='Write stream objects with binary encoding') codec_parser.add_argument( - '-t', '--text-stream', default=False, action='store_true', + '--text-stream', '-t', default=False, action='store_true', help='Write stream objects as plain text') return parser diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 41a7e7e..d370a13 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -1,15 +1,9 @@ -#!/usr/bin/env python - -""" -Converts PDF text content (though not images containing text) to plain text, html, xml or "tags". -""" +"""A command line tool for extracting text and images from PDF and output it to plain text, html, xml or tags.""" import argparse import logging -import six import sys +import six -import pdfminer.settings -pdfminer.settings.STRICT = False import pdfminer.high_level import pdfminer.layout from pdfminer.image import ImageWriter @@ -73,28 +67,68 @@ def extract_text(files=[], outfile='-', def maketheparser(): parser = argparse.ArgumentParser(description=__doc__, add_help=True) - parser.add_argument("files", type=str, default=None, nargs="+", help="File to process.") - parser.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.") - parser.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. 
Included for legacy applications, use --page-numbers for more idiomatic argument entry.") - parser.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.") - parser.add_argument("-m", "--maxpages", type=int, default=0, help="Maximum pages to parse") - parser.add_argument("-P", "--password", type=str, default="", help="Decryption password for PDF") - parser.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default \"-\" is stdout)") - parser.add_argument("-t", "--output_type", type=str, default="text", help="Output type: text|html|xml|tag (default is text)") - parser.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding") - parser.add_argument("-s", "--scale", type=float, default=1.0, help="Scale") - parser.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts") - parser.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical") - parser.add_argument("-W", "--word-margin", type=float, default=None, help="LAParams word margin") - parser.add_argument("-M", "--char-margin", type=float, default=None, help="LAParams char margin") - parser.add_argument("-L", "--line-margin", type=float, default=None, help="LAParams line margin") - parser.add_argument("-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow") - parser.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode") - parser.add_argument("-n", "--no-laparams", default=False, action="store_true", help="Pass None as LAParams") - parser.add_argument("-R", "--rotation", default=0, type=int, help="Rotation") - parser.add_argument("-O", "--output-dir", default=None, help="Output directory for images") - parser.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching") - parser.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode") + parser.add_argument("files", type=str, default=None, nargs="+", help="One or more paths to PDF files.") + + parser.add_argument("--debug", "-d", default=False, action="store_true", + help="Use debug logging level.") + parser.add_argument("--disable-caching", "-C", default=False, action="store_true", + help="If caching or resources, such as fonts, should be disabled.") + + parse_params = parser.add_argument_group('Parser', description='Used during PDF parsing') + parse_params.add_argument("--page-numbers", type=int, default=None, nargs="+", + help="A space-seperated list of page numbers to parse.") + parse_params.add_argument("--pagenos", "-p", type=str, + help="A comma-separated list of page numbers to parse. 
Included for legacy applications, " + "use --page-numbers for more idiomatic argument entry.") + parse_params.add_argument("--maxpages", "-m", type=int, default=0, + help="The maximum number of pages to parse.") + parse_params.add_argument("--password", "-P", type=str, default="", + help="The password to use for decrypting PDF file.") + parse_params.add_argument("--rotation", "-R", default=0, type=int, + help="The number of degrees to rotate the PDF before other types of processing.") + + la_params = parser.add_argument_group('Layout analysis', description='Used during layout analysis.') + la_params.add_argument("--no-laparams", "-n", default=False, action="store_true", + help="If layout analysis parameters should be ignored.") + la_params.add_argument("--detect-vertical", "-V", default=False, action="store_true", + help="If vertical text should be considered during layout analysis") + la_params.add_argument("--char-margin", "-M", type=float, default=2.0, + help="If two characters are closer together than this margin they are considered to be part " + "of the same word. The margin is specified relative to the width of the character.") + la_params.add_argument("--word-margin", "-W", type=float, default=0.1, + help="If two words are are closer together than this margin they are considered to be part " + "of the same line. A space is added in between for readability. The margin is " + "specified relative to the width of the word.") + la_params.add_argument("--line-margin", "-L", type=float, default=0.5, + help="If two lines are are close together they are considered to be part of the same " + "paragraph. The margin is specified relative to the height of a line.") + la_params.add_argument("--boxes-flow", "-F", type=float, default=0.5, + help="Specifies how much a horizontal and vertical position of a text matters when " + "determining the order of lines. The value should be within the range of -1.0 (only " + "horizontal position matters) to +1.0 (only vertical position matters).") + la_params.add_argument("--all-texts", "-A", default=True, action="store_true", + help="If layout analysis should be performed on text in figures.") + + output_params = parser.add_argument_group('Output', description='Used during output generation.') + output_params.add_argument("--outfile", "-o", type=str, default="-", + help="Path to file where output is written. Or \"-\" (default) to write to stdout.") + output_params.add_argument("--output_type", "-t", type=str, default="text", + help="Type of output to generate {text,html,xml,tag}.") + output_params.add_argument("--codec", "-c", type=str, default="utf-8", + help="Text encoding to use in output file.") + output_params.add_argument("--output-dir", "-O", default=None, + help="The output directory to put extracted images in. If not given, images are not " + "extracted.") + output_params.add_argument("--layoutmode", "-Y", default="normal", type=str, + help="Type of layout to use when generating html {normal,exact,loose}. If normal, " + "each line is positioned separately in the html. If exact, each character is " + "positioned separately in the html. If loose, same result as normal but with an " + "additional newline after each text line. Only used when output_type is html.") + output_params.add_argument("--scale", "-s", type=float, default=1.0, + help="The amount of zoom to use when generating html file. 
Only used when output_type " + "is html.") + output_params.add_argument("--strip-control", "-S", default=False, action="store_true", + help="Remove control statement from text. Only used when output_type is xml.") return parser diff --git a/tox.ini b/tox.ini index 09c7f80..8a47713 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,11 @@ [tox] -envlist = py{26, 27, 34, 35, 36} +envlist = py{27,34,35,36,37,38} [testenv] -extras = dev -commands = nosetests --nologcapture +extras = + dev + docs +commands = + nosetests --nologcapture + python -m sphinx -b html docs/source docs/build/html + python -m sphinx -b doctest docs/source docs/build/doctest
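A rough sketch for running the same documentation build locally, without tox (assuming the `docs` extra defined in setup.py above)::

    $ pip install -e .[docs]
    $ python -m sphinx -b html docs/source docs/build/html
    $ python -m sphinx -b doctest docs/source docs/build/doctest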