Use visible imports in highlevel.rst documentation (#609)

* Add missing import for extract_text_to_fp

* Replace testsetup with visible imports in documentation

* Remove obsolete check for Python version; Python 2 is no longer supported

* (Unrelated to this MR) Remove sys from converter.py

* Optimize imports

* (Unrelated to this MR) fix line length error

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/610/head
Fiete 2021-08-30 22:17:21 +02:00 committed by GitHub
parent 1d33c026e4
commit 7f54cefe02
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 7 additions and 17 deletions

View File

@@ -1,8 +1,3 @@
.. testsetup::
import sys
from pdfminer.high_level import extract_text_to_fp, extract_text
.. _tutorial_highlevel: .. _tutorial_highlevel:
Extract text from a PDF using Python Extract text from a PDF using Python
@@ -15,6 +10,7 @@ The most simple way to extract text from a PDF is to use
.. doctest:: .. doctest::
>>> from pdfminer.high_level import extract_text
>>> text = extract_text('samples/simple1.pdf') >>> text = extract_text('samples/simple1.pdf')
>>> print(repr(text)) >>> print(repr(text))
'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\x0c' 'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\x0c'
@@ -42,10 +38,8 @@ To read text from a PDF and print it on the command line:
.. doctest:: .. doctest::
>>> if sys.version_info > (3, 0): >>> from io import StringIO
... from io import StringIO >>> from pdfminer.high_level import extract_text_to_fp
... else:
... from io import BytesIO as StringIO
>>> output_string = StringIO() >>> output_string = StringIO()
>>> with open('samples/simple1.pdf', 'rb') as fin: >>> with open('samples/simple1.pdf', 'rb') as fin:
... extract_text_to_fp(fin, output_string) ... extract_text_to_fp(fin, output_string)
@@ -56,10 +50,8 @@ Or to convert it to html and use layout analysis:
.. doctest:: .. doctest::
>>> if sys.version_info > (3, 0): >>> from io import StringIO
... from io import StringIO >>> from pdfminer.high_level import extract_text_to_fp
... else:
... from io import BytesIO as StringIO
>>> from pdfminer.layout import LAParams >>> from pdfminer.layout import LAParams
>>> output_string = StringIO() >>> output_string = StringIO()
>>> with open('samples/simple1.pdf', 'rb') as fin: >>> with open('samples/simple1.pdf', 'rb') as fin:

View File

@@ -1,7 +1,6 @@
import io import io
import logging import logging
import re import re
import sys
from . import utils from . import utils
from .layout import LTChar from .layout import LTChar
@@ -279,8 +278,6 @@ class HTMLConverter(PDFConverter):
def write(self, text): def write(self, text):
if self.codec: if self.codec:
text = text.encode(self.codec) text = text.encode(self.codec)
if sys.version_info < (3, 0):
text = str(text)
self.outfp.write(text) self.outfp.write(text)
return return

View File

@@ -66,7 +66,8 @@ def dumpxml(out, obj, codec=None):
out.write('\n</props>\n') out.write('\n</props>\n')
if codec == 'text': if codec == 'text':
data = obj.get_data() data = obj.get_data()
out.write('<data size="%d">%s</data>\n' % (len(data), escape(data))) out.write('<data size="%d">%s</data>\n'
% (len(data), escape(data)))
out.write('</stream>') out.write('</stream>')
return return