Merge remote-tracking branch 'upstream/master'
commit
26f80715ed
|
@ -1,6 +1,5 @@
|
||||||
language: python
|
language: python
|
||||||
python:
|
python:
|
||||||
- "2.6"
|
|
||||||
- "2.7"
|
- "2.7"
|
||||||
- "3.4"
|
- "3.4"
|
||||||
- "3.5"
|
- "3.5"
|
||||||
|
|
|
@ -38,11 +38,11 @@ How to Install
|
||||||
* Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
|
* Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
|
||||||
* Install
|
* Install
|
||||||
|
|
||||||
$ pip install pdfminer.six
|
`pip install pdfminer.six`
|
||||||
|
|
||||||
* Run the following test:
|
* Run the following test:
|
||||||
|
|
||||||
$ pdf2txt.py samples/simple1.pdf
|
`pdf2txt.py samples/simple1.pdf`
|
||||||
|
|
||||||
|
|
||||||
Command Line Tools
|
Command Line Tools
|
||||||
|
@ -78,6 +78,7 @@ TODO
|
||||||
|
|
||||||
* PEP-8 and PEP-257 conformance.
|
* PEP-8 and PEP-257 conformance.
|
||||||
* Better documentation.
|
* Better documentation.
|
||||||
|
* Performance improvements.
|
||||||
|
|
||||||
|
|
||||||
Terms and Conditions
|
Terms and Conditions
|
||||||
|
|
|
@ -112,7 +112,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
evenodd, gstate.scolor, gstate.ncolor))
|
evenodd, gstate.scolor, gstate.ncolor))
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
|
||||||
try:
|
try:
|
||||||
text = font.to_unichr(cid)
|
text = font.to_unichr(cid)
|
||||||
assert isinstance(text, six.text_type), str(type(text))
|
assert isinstance(text, six.text_type), str(type(text))
|
||||||
|
@ -120,7 +120,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
text = self.handle_undefined_char(font, cid)
|
text = self.handle_undefined_char(font, cid)
|
||||||
textwidth = font.char_width(cid)
|
textwidth = font.char_width(cid)
|
||||||
textdisp = font.char_disp(cid)
|
textdisp = font.char_disp(cid)
|
||||||
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp)
|
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp, ncs, graphicstate)
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return item.adv
|
return item.adv
|
||||||
|
|
||||||
|
@ -520,8 +520,9 @@ class XMLConverter(PDFConverter):
|
||||||
render(child)
|
render(child)
|
||||||
self.write('</textbox>\n')
|
self.write('</textbox>\n')
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
self.write('<text font="%s" bbox="%s" size="%.3f">' %
|
self.write('<text font="%s" bbox="%s" colourspace="%s" ncolour="%s" size="%.3f">' %
|
||||||
(enc(item.fontname, None), bbox2str(item.bbox), item.size))
|
(enc(item.fontname, None), bbox2str(item.bbox),
|
||||||
|
item.ncs.name, item.graphicstate.ncolor, item.size))
|
||||||
self.write_text(item.get_text())
|
self.write_text(item.get_text())
|
||||||
self.write('</text>\n')
|
self.write('</text>\n')
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
|
|
|
@ -228,11 +228,13 @@ class LTAnno(LTItem, LTText):
|
||||||
class LTChar(LTComponent, LTText):
|
class LTChar(LTComponent, LTText):
|
||||||
|
|
||||||
def __init__(self, matrix, font, fontsize, scaling, rise,
|
def __init__(self, matrix, font, fontsize, scaling, rise,
|
||||||
text, textwidth, textdisp):
|
text, textwidth, textdisp, ncs, graphicstate):
|
||||||
LTText.__init__(self)
|
LTText.__init__(self)
|
||||||
self._text = text
|
self._text = text
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
self.fontname = font.fontname
|
self.fontname = font.fontname
|
||||||
|
self.ncs = ncs
|
||||||
|
self.graphicstate = graphicstate
|
||||||
self.adv = textwidth * fontsize * scaling
|
self.adv = textwidth * fontsize * scaling
|
||||||
# compute the boundary rectangle.
|
# compute the boundary rectangle.
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
|
import collections
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
|
|
||||||
import six #Python 2+3 compatibility
|
import six #Python 2+3 compatibility
|
||||||
|
@ -21,17 +21,20 @@ class PDFColorSpace(object):
|
||||||
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||||
|
|
||||||
|
|
||||||
PREDEFINED_COLORSPACE = {}
|
if six.PY2:
|
||||||
for (name, n) in six.iteritems({
|
PREDEFINED_COLORSPACE = {}
|
||||||
'CalRGB': 3,
|
else:
|
||||||
'CalGray': 1,
|
PREDEFINED_COLORSPACE = collections.OrderedDict()
|
||||||
'Lab': 3,
|
|
||||||
'DeviceRGB': 3,
|
for (name, n) in [
|
||||||
'DeviceCMYK': 4,
|
('DeviceGray', 1), # default value first
|
||||||
'DeviceGray': 1,
|
('CalRGB', 3),
|
||||||
'Separation': 1,
|
('CalGray', 1),
|
||||||
'Indexed': 1,
|
('Lab', 3),
|
||||||
'Pattern': 1,
|
('DeviceRGB', 3),
|
||||||
}) :
|
('DeviceCMYK', 4),
|
||||||
|
('Separation', 1),
|
||||||
|
('Indexed', 1),
|
||||||
|
('Pattern', 1),
|
||||||
|
]:
|
||||||
PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n)
|
PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n)
|
||||||
|
|
|
@ -66,7 +66,7 @@ class PDFDevice(object):
|
||||||
##
|
##
|
||||||
class PDFTextDevice(PDFDevice):
|
class PDFTextDevice(PDFDevice):
|
||||||
|
|
||||||
def render_string(self, textstate, seq):
|
def render_string(self, textstate, seq, ncs, graphicstate):
|
||||||
matrix = utils.mult_matrix(textstate.matrix, self.ctm)
|
matrix = utils.mult_matrix(textstate.matrix, self.ctm)
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
fontsize = textstate.fontsize
|
fontsize = textstate.fontsize
|
||||||
|
@ -80,15 +80,16 @@ class PDFTextDevice(PDFDevice):
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
textstate.linematrix = self.render_string_vertical(
|
textstate.linematrix = self.render_string_vertical(
|
||||||
seq, matrix, textstate.linematrix, font, fontsize,
|
seq, matrix, textstate.linematrix, font, fontsize,
|
||||||
scaling, charspace, wordspace, rise, dxscale)
|
scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
|
||||||
else:
|
else:
|
||||||
textstate.linematrix = self.render_string_horizontal(
|
textstate.linematrix = self.render_string_horizontal(
|
||||||
seq, matrix, textstate.linematrix, font, fontsize,
|
seq, matrix, textstate.linematrix, font, fontsize,
|
||||||
scaling, charspace, wordspace, rise, dxscale)
|
scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string_horizontal(self, seq, matrix, pos,
|
def render_string_horizontal(self, seq, matrix, pos,
|
||||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
font, fontsize, scaling, charspace, wordspace,
|
||||||
|
rise, dxscale, ncs, graphicstate):
|
||||||
(x, y) = pos
|
(x, y) = pos
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
|
@ -100,14 +101,16 @@ class PDFTextDevice(PDFDevice):
|
||||||
if needcharspace:
|
if needcharspace:
|
||||||
x += charspace
|
x += charspace
|
||||||
x += self.render_char(utils.translate_matrix(matrix, (x, y)),
|
x += self.render_char(utils.translate_matrix(matrix, (x, y)),
|
||||||
font, fontsize, scaling, rise, cid)
|
font, fontsize, scaling, rise, cid,
|
||||||
|
ncs, graphicstate)
|
||||||
if cid == 32 and wordspace:
|
if cid == 32 and wordspace:
|
||||||
x += wordspace
|
x += wordspace
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
return (x, y)
|
return (x, y)
|
||||||
|
|
||||||
def render_string_vertical(self, seq, matrix, pos,
|
def render_string_vertical(self, seq, matrix, pos,
|
||||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
font, fontsize, scaling, charspace, wordspace,
|
||||||
|
rise, dxscale, ncs, graphicstate):
|
||||||
(x, y) = pos
|
(x, y) = pos
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
|
@ -119,13 +122,14 @@ class PDFTextDevice(PDFDevice):
|
||||||
if needcharspace:
|
if needcharspace:
|
||||||
y += charspace
|
y += charspace
|
||||||
y += self.render_char(utils.translate_matrix(matrix, (x, y)),
|
y += self.render_char(utils.translate_matrix(matrix, (x, y)),
|
||||||
font, fontsize, scaling, rise, cid)
|
font, fontsize, scaling, rise, cid,
|
||||||
|
ncs, graphicstate)
|
||||||
if cid == 32 and wordspace:
|
if cid == 32 and wordspace:
|
||||||
y += wordspace
|
y += wordspace
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
return (x, y)
|
return (x, y)
|
||||||
|
|
||||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -586,13 +586,13 @@ class PDFPageInterpreter(object):
|
||||||
|
|
||||||
# setgray-stroking
|
# setgray-stroking
|
||||||
def do_G(self, gray):
|
def do_G(self, gray):
|
||||||
self.graphicstate.color = gray
|
self.graphicstate.scolor = gray
|
||||||
#self.do_CS(LITERAL_DEVICE_GRAY)
|
#self.do_CS(LITERAL_DEVICE_GRAY)
|
||||||
return
|
return
|
||||||
|
|
||||||
# setgray-non-stroking
|
# setgray-non-stroking
|
||||||
def do_g(self, gray):
|
def do_g(self, gray):
|
||||||
self.graphicstate.color = gray
|
self.graphicstate.ncolor = gray
|
||||||
#self.do_cs(LITERAL_DEVICE_GRAY)
|
#self.do_cs(LITERAL_DEVICE_GRAY)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -769,7 +769,7 @@ class PDFPageInterpreter(object):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFInterpreterError('No font specified!')
|
raise PDFInterpreterError('No font specified!')
|
||||||
return
|
return
|
||||||
self.device.render_string(self.textstate, seq)
|
self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy())
|
||||||
return
|
return
|
||||||
|
|
||||||
# show
|
# show
|
||||||
|
|
|
@ -338,7 +338,7 @@ class PSBaseParser(object):
|
||||||
m = EOL.search(s, i)
|
m = EOL.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += s[i:]
|
||||||
return (self._parse_comment, len(s))
|
return len(s)
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
self._curtoken += s[i:j]
|
self._curtoken += s[i:j]
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
|
|
|
@ -275,7 +275,7 @@ def decode_text(s):
|
||||||
# enc
|
# enc
|
||||||
def enc(x, codec='ascii'):
|
def enc(x, codec='ascii'):
|
||||||
"""Encodes a string for SGML/XML/HTML"""
|
"""Encodes a string for SGML/XML/HTML"""
|
||||||
if isinstance(x, bytes):
|
if six.PY3 and isinstance(x, bytes):
|
||||||
return ''
|
return ''
|
||||||
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
|
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
|
||||||
if codec:
|
if codec:
|
||||||
|
|
|
@ -68,7 +68,7 @@ def main(args=None):
|
||||||
P = argparse.ArgumentParser(description=__doc__)
|
P = argparse.ArgumentParser(description=__doc__)
|
||||||
P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
|
P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
|
||||||
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
|
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
|
||||||
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
|
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
|
||||||
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
|
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
|
||||||
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
|
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
|
||||||
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
|
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
|
||||||
|
|
|
@ -66,7 +66,7 @@ def main(args=None):
|
||||||
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
|
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
|
||||||
|
|
||||||
# params for pdf2txt
|
# params for pdf2txt
|
||||||
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
|
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
|
||||||
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
|
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
|
||||||
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
|
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
|
||||||
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs")
|
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs")
|
||||||
|
|
Loading…
Reference in New Issue