diff --git a/.travis.yml b/.travis.yml index d4107e0..62b15ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: python python: - - "2.6" - "2.7" - "3.4" - "3.5" diff --git a/README.md b/README.md index f49dbca..2a96278 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,11 @@ How to Install * Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six) * Install - $ pip install pdfminer.six + `pip install pdfminer.six` * Run the following test: - $ pdf2txt.py samples/simple1.pdf + `pdf2txt.py samples/simple1.pdf` Command Line Tools @@ -78,6 +78,7 @@ TODO * PEP-8 and PEP-257 conformance. * Better documentation. + * Performance improvements. Terms and Conditions diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 02545e8..4e51e37 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -112,7 +112,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): evenodd, gstate.scolor, gstate.ncolor)) return - def render_char(self, matrix, font, fontsize, scaling, rise, cid): + def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): try: text = font.to_unichr(cid) assert isinstance(text, six.text_type), str(type(text)) @@ -120,7 +120,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): text = self.handle_undefined_char(font, cid) textwidth = font.char_width(cid) textdisp = font.char_disp(cid) - item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp) + item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp, ncs, graphicstate) self.cur_item.add(item) return item.adv @@ -520,8 +520,9 @@ class XMLConverter(PDFConverter): render(child) self.write('\n') elif isinstance(item, LTChar): - self.write('' % - (enc(item.fontname, None), bbox2str(item.bbox), item.size)) + self.write('' % + (enc(item.fontname, None), bbox2str(item.bbox), + item.ncs.name, item.graphicstate.ncolor, item.size)) self.write_text(item.get_text()) self.write('\n') elif isinstance(item, LTText): diff --git 
a/pdfminer/layout.py b/pdfminer/layout.py index 451d4e4..587c221 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -228,11 +228,13 @@ class LTAnno(LTItem, LTText): class LTChar(LTComponent, LTText): def __init__(self, matrix, font, fontsize, scaling, rise, - text, textwidth, textdisp): + text, textwidth, textdisp, ncs, graphicstate): LTText.__init__(self) self._text = text self.matrix = matrix self.fontname = font.fontname + self.ncs = ncs + self.graphicstate = graphicstate self.adv = textwidth * fontsize * scaling # compute the boundary rectangle. if font.is_vertical(): diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index 6fe6eaa..ba09ba2 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -1,4 +1,4 @@ - +import collections from .psparser import LIT import six #Python 2+3 compatibility @@ -21,17 +21,20 @@ class PDFColorSpace(object): return '' % (self.name, self.ncomponents) -PREDEFINED_COLORSPACE = {} -for (name, n) in six.iteritems({ - 'CalRGB': 3, - 'CalGray': 1, - 'Lab': 3, - 'DeviceRGB': 3, - 'DeviceCMYK': 4, - 'DeviceGray': 1, - 'Separation': 1, - 'Indexed': 1, - 'Pattern': 1, -}) : +if six.PY2: + PREDEFINED_COLORSPACE = {} +else: + PREDEFINED_COLORSPACE = collections.OrderedDict() + +for (name, n) in [ + ('DeviceGray', 1), # default value first + ('CalRGB', 3), + ('CalGray', 1), + ('Lab', 3), + ('DeviceRGB', 3), + ('DeviceCMYK', 4), + ('Separation', 1), + ('Indexed', 1), + ('Pattern', 1), +]: PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n) - \ No newline at end of file diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index dfdc930..ed54fd2 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -66,7 +66,7 @@ class PDFDevice(object): ## class PDFTextDevice(PDFDevice): - def render_string(self, textstate, seq): + def render_string(self, textstate, seq, ncs, graphicstate): matrix = utils.mult_matrix(textstate.matrix, self.ctm) font = textstate.font fontsize = textstate.fontsize @@ -80,15 +80,16 @@ class 
PDFTextDevice(PDFDevice): if font.is_vertical(): textstate.linematrix = self.render_string_vertical( seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale) + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) else: textstate.linematrix = self.render_string_horizontal( seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale) + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) return def render_string_horizontal(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, rise, dxscale): + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): (x, y) = pos needcharspace = False for obj in seq: @@ -100,14 +101,16 @@ class PDFTextDevice(PDFDevice): if needcharspace: x += charspace x += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid) + font, fontsize, scaling, rise, cid, + ncs, graphicstate) if cid == 32 and wordspace: x += wordspace needcharspace = True return (x, y) def render_string_vertical(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, rise, dxscale): + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): (x, y) = pos needcharspace = False for obj in seq: @@ -119,13 +122,14 @@ class PDFTextDevice(PDFDevice): if needcharspace: y += charspace y += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid) + font, fontsize, scaling, rise, cid, + ncs, graphicstate) if cid == 32 and wordspace: y += wordspace needcharspace = True return (x, y) - def render_char(self, matrix, font, fontsize, scaling, rise, cid): + def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): return 0 diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 0c2328d..a14f64a 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -586,13 +586,13 @@ class 
PDFPageInterpreter(object): # setgray-stroking def do_G(self, gray): - self.graphicstate.color = gray + self.graphicstate.scolor = gray #self.do_CS(LITERAL_DEVICE_GRAY) return # setgray-non-stroking def do_g(self, gray): - self.graphicstate.color = gray + self.graphicstate.ncolor = gray #self.do_cs(LITERAL_DEVICE_GRAY) return @@ -769,7 +769,7 @@ class PDFPageInterpreter(object): if settings.STRICT: raise PDFInterpreterError('No font specified!') return - self.device.render_string(self.textstate, seq) + self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy()) return # show diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 9b214af..7a28d84 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -338,7 +338,7 @@ class PSBaseParser(object): m = EOL.search(s, i) if not m: self._curtoken += s[i:] - return (self._parse_comment, len(s)) + return len(s) j = m.start(0) self._curtoken += s[i:j] self._parse1 = self._parse_main diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 9cbcbb3..0e948c2 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -275,7 +275,7 @@ def decode_text(s): # enc def enc(x, codec='ascii'): """Encodes a string for SGML/XML/HTML""" - if isinstance(x, bytes): + if six.PY3 and isinstance(x, bytes): return '' x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;') if codec: diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 1e8ec0b..3111c5c 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -68,7 +68,7 @@ def main(args=None): P = argparse.ArgumentParser(description=__doc__) P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.") P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.") - P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. 
Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.") + P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.") P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.") P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse") P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF") diff --git a/tools/pdfdiff.py b/tools/pdfdiff.py index 17e41e8..b01e2f4 100644 --- a/tools/pdfdiff.py +++ b/tools/pdfdiff.py @@ -66,7 +66,7 @@ def main(args=None): P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.") # params for pdf2txt - P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.") + P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.") P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.") P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse") P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs")