Use visible imports in highlevel.rst documentation (#609)
* Add missing import for extract_text_to_fp
* Replace testsetup with visible imports in documentation
* Remove obsolete check for Python version; Python 2 is not supported anymore
* (Unrelated to this MR) Remove sys from converter.py
* Optimize imports
* (Unrelated to this MR) Fix line length error

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/610/head
parent
1d33c026e4
commit
7f54cefe02
|
@ -1,8 +1,3 @@
|
||||||
.. testsetup::
|
|
||||||
|
|
||||||
import sys
|
|
||||||
from pdfminer.high_level import extract_text_to_fp, extract_text
|
|
||||||
|
|
||||||
.. _tutorial_highlevel:
|
.. _tutorial_highlevel:
|
||||||
|
|
||||||
Extract text from a PDF using Python
|
Extract text from a PDF using Python
|
||||||
|
@ -15,6 +10,7 @@ The most simple way to extract text from a PDF is to use
|
||||||
|
|
||||||
.. doctest::
|
.. doctest::
|
||||||
|
|
||||||
|
>>> from pdfminer.high_level import extract_text
|
||||||
>>> text = extract_text('samples/simple1.pdf')
|
>>> text = extract_text('samples/simple1.pdf')
|
||||||
>>> print(repr(text))
|
>>> print(repr(text))
|
||||||
'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\x0c'
|
'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\x0c'
|
||||||
|
@ -42,10 +38,8 @@ To read text from a PDF and print it on the command line:
|
||||||
|
|
||||||
.. doctest::
|
.. doctest::
|
||||||
|
|
||||||
>>> if sys.version_info > (3, 0):
|
>>> from io import StringIO
|
||||||
... from io import StringIO
|
>>> from pdfminer.high_level import extract_text_to_fp
|
||||||
... else:
|
|
||||||
... from io import BytesIO as StringIO
|
|
||||||
>>> output_string = StringIO()
|
>>> output_string = StringIO()
|
||||||
>>> with open('samples/simple1.pdf', 'rb') as fin:
|
>>> with open('samples/simple1.pdf', 'rb') as fin:
|
||||||
... extract_text_to_fp(fin, output_string)
|
... extract_text_to_fp(fin, output_string)
|
||||||
|
@ -56,10 +50,8 @@ Or to convert it to html and use layout analysis:
|
||||||
|
|
||||||
.. doctest::
|
.. doctest::
|
||||||
|
|
||||||
>>> if sys.version_info > (3, 0):
|
>>> from io import StringIO
|
||||||
... from io import StringIO
|
>>> from pdfminer.high_level import extract_text_to_fp
|
||||||
... else:
|
|
||||||
... from io import BytesIO as StringIO
|
|
||||||
>>> from pdfminer.layout import LAParams
|
>>> from pdfminer.layout import LAParams
|
||||||
>>> output_string = StringIO()
|
>>> output_string = StringIO()
|
||||||
>>> with open('samples/simple1.pdf', 'rb') as fin:
|
>>> with open('samples/simple1.pdf', 'rb') as fin:
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import sys
|
|
||||||
|
|
||||||
from . import utils
|
from . import utils
|
||||||
from .layout import LTChar
|
from .layout import LTChar
|
||||||
|
@ -279,8 +278,6 @@ class HTMLConverter(PDFConverter):
|
||||||
def write(self, text):
|
def write(self, text):
|
||||||
if self.codec:
|
if self.codec:
|
||||||
text = text.encode(self.codec)
|
text = text.encode(self.codec)
|
||||||
if sys.version_info < (3, 0):
|
|
||||||
text = str(text)
|
|
||||||
self.outfp.write(text)
|
self.outfp.write(text)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -66,7 +66,8 @@ def dumpxml(out, obj, codec=None):
|
||||||
out.write('\n</props>\n')
|
out.write('\n</props>\n')
|
||||||
if codec == 'text':
|
if codec == 'text':
|
||||||
data = obj.get_data()
|
data = obj.get_data()
|
||||||
out.write('<data size="%d">%s</data>\n' % (len(data), escape(data)))
|
out.write('<data size="%d">%s</data>\n'
|
||||||
|
% (len(data), escape(data)))
|
||||||
out.write('</stream>')
|
out.write('</stream>')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue