Merge remote-tracking branch 'upstream/master'

pull/142/head
Martin Wolf 2018-06-20 13:27:18 +02:00
commit 26f80715ed
11 changed files with 47 additions and 37 deletions

View File

@@ -1,6 +1,5 @@
language: python
python:
- "2.6"
- "2.7"
- "3.4"
- "3.5"

View File

@@ -38,11 +38,11 @@ How to Install
* Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
* Install
$ pip install pdfminer.six
`pip install pdfminer.six`
* Run the following test:
$ pdf2txt.py samples/simple1.pdf
`pdf2txt.py samples/simple1.pdf`
Command Line Tools
@@ -78,6 +78,7 @@ TODO
* PEP-8 and PEP-257 conformance.
* Better documentation.
* Performance improvements.
Terms and Conditions

View File

@@ -112,7 +112,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
evenodd, gstate.scolor, gstate.ncolor))
return
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
try:
text = font.to_unichr(cid)
assert isinstance(text, six.text_type), str(type(text))
@@ -120,7 +120,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
text = self.handle_undefined_char(font, cid)
textwidth = font.char_width(cid)
textdisp = font.char_disp(cid)
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp)
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp, ncs, graphicstate)
self.cur_item.add(item)
return item.adv
@@ -520,8 +520,9 @@ class XMLConverter(PDFConverter):
render(child)
self.write('</textbox>\n')
elif isinstance(item, LTChar):
self.write('<text font="%s" bbox="%s" size="%.3f">' %
(enc(item.fontname, None), bbox2str(item.bbox), item.size))
self.write('<text font="%s" bbox="%s" colourspace="%s" ncolour="%s" size="%.3f">' %
(enc(item.fontname, None), bbox2str(item.bbox),
item.ncs.name, item.graphicstate.ncolor, item.size))
self.write_text(item.get_text())
self.write('</text>\n')
elif isinstance(item, LTText):

View File

@@ -228,11 +228,13 @@ class LTAnno(LTItem, LTText):
class LTChar(LTComponent, LTText):
def __init__(self, matrix, font, fontsize, scaling, rise,
text, textwidth, textdisp):
text, textwidth, textdisp, ncs, graphicstate):
LTText.__init__(self)
self._text = text
self.matrix = matrix
self.fontname = font.fontname
self.ncs = ncs
self.graphicstate = graphicstate
self.adv = textwidth * fontsize * scaling
# compute the boundary rectangle.
if font.is_vertical():

View File

@@ -1,4 +1,4 @@
import collections
from .psparser import LIT
import six #Python 2+3 compatibility
@@ -21,17 +21,20 @@ class PDFColorSpace(object):
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
if six.PY2:
PREDEFINED_COLORSPACE = {}
for (name, n) in six.iteritems({
'CalRGB': 3,
'CalGray': 1,
'Lab': 3,
'DeviceRGB': 3,
'DeviceCMYK': 4,
'DeviceGray': 1,
'Separation': 1,
'Indexed': 1,
'Pattern': 1,
}) :
PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n)
else:
PREDEFINED_COLORSPACE = collections.OrderedDict()
for (name, n) in [
('DeviceGray', 1), # default value first
('CalRGB', 3),
('CalGray', 1),
('Lab', 3),
('DeviceRGB', 3),
('DeviceCMYK', 4),
('Separation', 1),
('Indexed', 1),
('Pattern', 1),
]:
PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n)

View File

@@ -66,7 +66,7 @@ class PDFDevice(object):
##
class PDFTextDevice(PDFDevice):
def render_string(self, textstate, seq):
def render_string(self, textstate, seq, ncs, graphicstate):
matrix = utils.mult_matrix(textstate.matrix, self.ctm)
font = textstate.font
fontsize = textstate.fontsize
@@ -80,15 +80,16 @@ class PDFTextDevice(PDFDevice):
if font.is_vertical():
textstate.linematrix = self.render_string_vertical(
seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale)
scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
else:
textstate.linematrix = self.render_string_horizontal(
seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale)
scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
return
def render_string_horizontal(self, seq, matrix, pos,
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
font, fontsize, scaling, charspace, wordspace,
rise, dxscale, ncs, graphicstate):
(x, y) = pos
needcharspace = False
for obj in seq:
@@ -100,14 +101,16 @@ class PDFTextDevice(PDFDevice):
if needcharspace:
x += charspace
x += self.render_char(utils.translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid)
font, fontsize, scaling, rise, cid,
ncs, graphicstate)
if cid == 32 and wordspace:
x += wordspace
needcharspace = True
return (x, y)
def render_string_vertical(self, seq, matrix, pos,
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
font, fontsize, scaling, charspace, wordspace,
rise, dxscale, ncs, graphicstate):
(x, y) = pos
needcharspace = False
for obj in seq:
@@ -119,13 +122,14 @@ class PDFTextDevice(PDFDevice):
if needcharspace:
y += charspace
y += self.render_char(utils.translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid)
font, fontsize, scaling, rise, cid,
ncs, graphicstate)
if cid == 32 and wordspace:
y += wordspace
needcharspace = True
return (x, y)
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
return 0

View File

@@ -586,13 +586,13 @@ class PDFPageInterpreter(object):
# setgray-stroking
def do_G(self, gray):
self.graphicstate.color = gray
self.graphicstate.scolor = gray
#self.do_CS(LITERAL_DEVICE_GRAY)
return
# setgray-non-stroking
def do_g(self, gray):
self.graphicstate.color = gray
self.graphicstate.ncolor = gray
#self.do_cs(LITERAL_DEVICE_GRAY)
return
@@ -769,7 +769,7 @@ class PDFPageInterpreter(object):
if settings.STRICT:
raise PDFInterpreterError('No font specified!')
return
self.device.render_string(self.textstate, seq)
self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy())
return
# show

View File

@@ -338,7 +338,7 @@ class PSBaseParser(object):
m = EOL.search(s, i)
if not m:
self._curtoken += s[i:]
return (self._parse_comment, len(s))
return len(s)
j = m.start(0)
self._curtoken += s[i:j]
self._parse1 = self._parse_main

View File

@@ -275,7 +275,7 @@ def decode_text(s):
# enc
def enc(x, codec='ascii'):
"""Encodes a string for SGML/XML/HTML"""
if isinstance(x, bytes):
if six.PY3 and isinstance(x, bytes):
return ''
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
if codec:

View File

@@ -68,7 +68,7 @@ def main(args=None):
P = argparse.ArgumentParser(description=__doc__)
P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")

View File

@@ -66,7 +66,7 @@ def main(args=None):
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
# params for pdf2txt
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs")