From ecc4d056754a528706f7bf43f8754c448ea2dd4b Mon Sep 17 00:00:00 2001 From: speedplane Date: Tue, 11 Nov 2014 23:34:33 -0500 Subject: [PATCH 01/28] Fix a unicode conversion bug. See https://github.com/euske/pdfminer/issues/75 --- pdfminer/psparser.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index c1ebe93..be715af 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -343,7 +343,15 @@ class PSBaseParser(object): self.hex = b'' self._parse1 = self._parse_literal_hex return j+1 - self._add_token(LIT(unicode(self._curtoken))) + + try: + # Try to interpret the token as a utf-8 string + utoken = self._curtoken.decode('utf-8') + except UnicodeDecodeError: + # We failed, there is possibly a corrupt PDF here. + if STRICT: raise + utoken = "" + self._add_token(LIT(utoken)) self._parse1 = self._parse_main return j From 1067cb9f9f433d6843ed8acffa08394bdbba3ad2 Mon Sep 17 00:00:00 2001 From: speedplane Date: Tue, 11 Nov 2014 23:36:26 -0500 Subject: [PATCH 02/28] Use a .gitignore file. --- .gitignore | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ed8045c --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +# Intermediate documents +*.xps + +# Password and Key Files +*.pem +*.p12 + +# Compiled source # +################### +*.pyc +*.com +*.class +*.dll +*.exe +*.o +*.so + +# Mecurial Files? +*.i +*.d +*.mo +*.hg/ + +# Python data files # +*.shelf +*.shelve + +# Don't track these files, they are output from scripts + +# Subversion files +*.svn-base +all-wcprops +entries + +# Logs and databases # +###################### +bulkloader-log-* +*.log +*.sql +*.sql3 +*.sql3-journal +*.sqlite + +# OS generated files # +###################### +.DS_Store? +ehthumbs.db +Icon? 
+Thumbs.db + +# tmp files # +############# +~$*.doc +~WRL*.tmp \ No newline at end of file From 36977fbe0802b994069c800f9c55d253137c0aef Mon Sep 17 00:00:00 2001 From: speedplane Date: Tue, 11 Nov 2014 23:36:58 -0500 Subject: [PATCH 03/28] Add debug flags for much of the debug output. --- pdfminer/pdfdocument.py | 20 ++++++++++++++------ pdfminer/pdfinterp.py | 12 ++++++++---- pdfminer/pdfpage.py | 6 ++++-- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 2c3c274..66b575a 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -65,6 +65,8 @@ LITERAL_CATALOG = LIT('Catalog') ## class PDFBaseXRef(object): + debug = False + def get_trailer(self): raise NotImplementedError @@ -122,7 +124,7 @@ class PDFXRef(PDFBaseXRef): if use != b'n': continue self.offsets[objid] = (None, long(pos), int(genno)) - logging.info('xref objects: %r' % self.offsets) + if self.debug: logging.info('xref objects: %r' % self.offsets) self.load_trailer(parser) return @@ -173,7 +175,7 @@ class PDFXRefFallback(PDFXRef): if line.startswith(b'trailer'): parser.seek(pos) self.load_trailer(parser) - logging.info('trailer: %r' % self.get_trailer()) + if self.debug: logging.info('trailer: %r' % self.get_trailer()) break m = self.PDFOBJ_CUE.match(line) if not m: @@ -212,6 +214,8 @@ class PDFXRefFallback(PDFXRef): ## class PDFXRefStream(PDFBaseXRef): + debug = False + def __init__(self): self.data = None self.entlen = None @@ -238,7 +242,8 @@ class PDFXRefStream(PDFBaseXRef): self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.attrs - logging.info('xref stream: objid=%s, fields=%d,%d,%d' % + if self.debug: + logging.info('xref stream: objid=%s, fields=%d,%d,%d' % (', '.join(map(repr, self.ranges)), self.fl1, self.fl2, self.fl3)) return @@ -761,7 +766,8 @@ class PDFDocument(object): prev = line else: raise PDFNoValidXRef('Unexpected EOF') - logging.info('xref found: pos=%r' % prev) 
+ if self.debug: + logging.info('xref found: pos=%r' % prev) return long(prev) # read xref table @@ -773,7 +779,8 @@ class PDFDocument(object): (pos, token) = parser.nexttoken() except PSEOF: raise PDFNoValidXRef('Unexpected EOF') - logging.info('read_xref_from: start=%d, token=%r' % (start, token)) + if self.debug: + logging.info('read_xref_from: start=%d, token=%r' % (start, token)) if isinstance(token, int): # XRefStream: PDF-1.5 parser.seek(pos) @@ -787,7 +794,8 @@ class PDFDocument(object): xref.load(parser) xrefs.append(xref) trailer = xref.get_trailer() - logging.info('trailer: %r' % trailer) + if self.debug: + logging.info('trailer: %r' % trailer) if 'XRefStm' in trailer: pos = int_value(trailer['XRefStm']) self.read_xref_from(parser, pos, xrefs) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 3b368e0..3f3f393 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -139,6 +139,8 @@ class PDFResourceManager(object): allocated multiple times. """ + debug = False + def __init__(self, caching=True): self.caching = caching self._cached_fonts = {} @@ -167,7 +169,8 @@ class PDFResourceManager(object): if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: - logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) + if self.debug: + logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) if STRICT: if spec['Type'] is not LITERAL_FONT: raise PDFFontError('Type is not /Font') @@ -799,7 +802,7 @@ class PDFPageInterpreter(object): if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return - logging.info('Processing xobj: %r' % xobj) + if self.debug: logging.info('Processing xobj: %r' % xobj) subtype = xobj.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() @@ -822,7 +825,7 @@ class PDFPageInterpreter(object): return def process_page(self, page): - logging.info('Processing page: %r' % page) + if self.debug: logging.info('Processing page: 
%r' % page) (x0, y0, x1, y1) = page.mediabox if page.rotate == 90: ctm = (0, -1, 1, 0, -y0, x1) @@ -841,7 +844,8 @@ class PDFPageInterpreter(object): # Render the content streams. # This method may be called recursively. def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): - logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % + if self.debug: + logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % (resources, streams, ctm)) self.init_resources(resources) self.init_state(ctm) diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index fcdf17b..a48767c 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -39,6 +39,8 @@ class PDFPage(object): beads: a chain that represents natural reading order. """ + debug = False + def __init__(self, doc, pageid, attrs): """Initialize a page object. @@ -86,12 +88,12 @@ class PDFPage(object): if k in klass.INHERITABLE_ATTRS and k not in tree: tree[k] = v if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: - logging.info('Pages: Kids=%r' % tree['Kids']) + if klass.debug: logging.info('Pages: Kids=%r' % tree['Kids']) for c in list_value(tree['Kids']): for x in search(c, tree): yield x elif tree.get('Type') is LITERAL_PAGE: - logging.info('Page: %r' % tree) + if klass.debug: logging.info('Page: %r' % tree) yield (objid, tree) pages = False if 'Pages' in document.catalog: From 75206ba18d086684484a31951d9bb8ff6270513c Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Tue, 9 Dec 2014 22:49:13 +0900 Subject: [PATCH 04/28] Removed: .gitignore --- .gitignore | 54 ------------------------------------------------------ 1 file changed, 54 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index ed8045c..0000000 --- a/.gitignore +++ /dev/null @@ -1,54 +0,0 @@ -# Intermediate documents -*.xps - -# Password and Key Files -*.pem -*.p12 - -# Compiled source # -################### -*.pyc -*.com -*.class -*.dll -*.exe -*.o -*.so - -# 
Mecurial Files? -*.i -*.d -*.mo -*.hg/ - -# Python data files # -*.shelf -*.shelve - -# Don't track these files, they are output from scripts - -# Subversion files -*.svn-base -all-wcprops -entries - -# Logs and databases # -###################### -bulkloader-log-* -*.log -*.sql -*.sql3 -*.sql3-journal -*.sqlite - -# OS generated files # -###################### -.DS_Store? -ehthumbs.db -Icon? -Thumbs.db - -# tmp files # -############# -~$*.doc -~WRL*.tmp \ No newline at end of file From 01121124587d99601cf3368e9f82f096a9e5a98f Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Tue, 9 Dec 2014 22:55:47 +0900 Subject: [PATCH 05/28] Fixed: crash on invalid chr number. --- pdfminer/psparser.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index be715af..7232421 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -122,7 +122,7 @@ KEYWORD_DICT_END = KWD(b'>>') def literal_name(x): if not isinstance(x, PSLiteral): if STRICT: - raise PSTypeError('Literal required: %r' % x) + raise PSTypeError('Literal required: %r' % (x,)) else: return str(x) return x.name @@ -131,7 +131,7 @@ def literal_name(x): def keyword_name(x): if not isinstance(x, PSKeyword): if STRICT: - raise PSTypeError('Keyword required: %r' % x) + raise PSTypeError('Keyword required: %r' % (x,)) else: return str(x) return x.name @@ -361,7 +361,10 @@ class PSBaseParser(object): self.hex += c return i+1 if self.hex: - self._curtoken += chr(int(self.hex, 16)) + try: + self._curtoken += chr(int(self.hex, 16)) + except ValueError: + pass self._parse1 = self._parse_literal return i @@ -446,7 +449,10 @@ class PSBaseParser(object): self.oct += c return i+1 if self.oct: - self._curtoken += chr(int(self.oct, 8)) + try: + self._curtoken += chr(int(self.oct, 8)) + except ValueError: + pass self._parse1 = self._parse_string return i if c in ESC_STRING: @@ -479,9 +485,12 @@ class PSBaseParser(object): return len(s) 
j = m.start(0) self._curtoken += s[i:j] - token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)), - SPC.sub(b'', self._curtoken)) - self._add_token(token) + try: + token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)), + SPC.sub(b'', self._curtoken)) + self._add_token(token) + except ValueError: + pass self._parse1 = self._parse_main return j @@ -491,7 +500,7 @@ class PSBaseParser(object): self.charpos = self._parse1(self.buf, self.charpos) token = self._tokens.pop(0) if self.debug: - logging.debug('nexttoken: %r' % token) + logging.debug('nexttoken: %r' % (token,)) return token @@ -532,7 +541,7 @@ class PSStackParser(PSBaseParser): def add_results(self, *objs): if self.debug: - logging.debug('add_results: %r' % objs) + logging.debug('add_results: %r' % (objs,)) self.results.extend(objs) return @@ -585,7 +594,7 @@ class PSStackParser(PSBaseParser): try: (pos, objs) = self.end_type('d') if len(objs) % 2 != 0: - raise PSSyntaxError('Invalid dictionary construct: %r' % objs) + raise PSSyntaxError('Invalid dictionary construct: %r' % (objs,)) # construct a Python dictionary. d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None) self.push((pos, d)) @@ -613,7 +622,7 @@ class PSStackParser(PSBaseParser): self.flush() obj = self.results.pop(0) if self.debug: - logging.debug('nextobject: %r' % obj) + logging.debug('nextobject: %r' % (obj,)) return obj From 5cbdd915c7ea64fa7949518618d9d66eb186ae44 Mon Sep 17 00:00:00 2001 From: speedplane Date: Thu, 11 Dec 2014 00:53:33 -0500 Subject: [PATCH 06/28] Remove the dependency on python2. Also, allow tests to be run on cygwin by checking for it, and converting unix2dos line endings. 
--- samples/Makefile | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/samples/Makefile b/samples/Makefile index d1b06c4..7984c37 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,9 +1,15 @@ # GNUMakefile for test +UNAME_S := $(shell uname -o) +CNVTXT=: +ifeq ($(UNAME_S),Cygwin) + CNVTXT=unix2dos +endif + RM=rm -f CMP=: ECHO=echo -PYTHON=python2 +PYTHON=python PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 -V @@ -40,10 +46,13 @@ tests: for i in $(TESTS); do \ $(ECHO) $$i; \ $(PDF2TXT) -t html -o $$i.html $$i.pdf || exit 1; \ + $(CNVTXT) $$i.html || exit 1; \ $(CMP) $$i.html $$i.html.ref || exit 1; \ $(PDF2TXT) -t xml -o $$i.xml $$i.pdf || exit 1; \ + $(CNVTXT) $$i.xml || exit 1; \ $(CMP) $$i.xml $$i.xml.ref || exit 1; \ - $(PDF2TXT) -t text -o $$i.txt $$i.pdf || eixt 1; \ + $(PDF2TXT) -t text -o $$i.txt $$i.pdf || exit 1; \ + $(CNVTXT) $$i.txt || exit 1; \ $(CMP) $$i.txt $$i.txt.ref || exit 1; \ done From 45170e718396502c45d13cb44bfde2b6f95e576f Mon Sep 17 00:00:00 2001 From: speedplane Date: Tue, 11 Nov 2014 23:49:13 -0500 Subject: [PATCH 07/28] There are a number of relatively complex changes here. Comments are in order of where the change appears. 1. When detecting text in a horizontal line, we already add a space between words if separated by more than word_margin apart. However now, we only do it if there is not already an existing space. This prevents multiple spaces being placed between words. 2. Detect a horizontal line if the line is zero width. This improves our detection of horizontal lines when looking for both horizontal and vertical. 3. Don't detect a vertical line if the previous letter is whitespace. Prevents double spaces being caught as vert lines. 4. Improve upon an unfortunate O(N^2) algorithm which I have seen taking many minutes to execute. Unfortunately, while the "fix" reduces algorithmic complexity, it isn't technically correct, so we only do it when we know things will take a long time. 
--- pdfminer/layout.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 32b706f..b2dde8c 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -368,9 +368,17 @@ class LTTextLineHorizontal(LTTextLine): def add(self, obj): if isinstance(obj, LTChar) and self.word_margin: + # Add a space between words if separated by more than word_margin + # apart. Use the max of obj width and height so narrow letters + # (i, l) are treated like wider letters, reducing extra spaces margin = self.word_margin * max(obj.width, obj.height) if self._x1 < obj.x0-margin: - LTContainer.add(self, LTAnno(' ')) + # But only do it if there is not already a space. + last_was_alpha = self._objs and \ + isinstance(self._objs[-1], LTChar) and \ + self._objs[-1]._text == ' ' + if not last_was_alpha: + LTContainer.add(self, LTAnno(' ')) self._x1 = obj.x1 LTTextLine.add(self, obj) return @@ -513,7 +521,9 @@ class LTLayoutContainer(LTContainer): (min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1)) and (obj0.hdistance(obj1) < - max(obj0.width, obj1.width) * laparams.char_margin)) + max(obj0.width, obj1.width) * laparams.char_margin) or + # If the line is zero width, default to horizontal + (max(obj0.width, obj1.width) == 0 and obj1.x0 >= obj0.x0)) # valign: obj0 and obj1 is vertically aligned. # @@ -535,7 +545,10 @@ class LTLayoutContainer(LTContainer): (min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1)) and (obj0.vdistance(obj1) < - max(obj0.height, obj1.height) * laparams.char_margin)) + max(obj0.height, obj1.height) * laparams.char_margin) and + # Don't start a vertical line if the previous letter is + # whitspace. Prevents double spaces being caught as vert lines. 
+ (line or obj0._text.strip())) if ((halign and isinstance(line, LTTextLineHorizontal)) or (valign and isinstance(line, LTTextLineVertical))): @@ -631,8 +644,19 @@ class LTLayoutContainer(LTContainer): (c,d,_,_) = t return (c,d) - # XXX this still takes O(n^2) :( + # The algorithm below still takes O(n^2) :( + # For now, if we have many boxes, split them into two and perform them + # separately. This will cause bugs, but will prevent hanging. + if len(boxes) > 100: + boxes = sorted(boxes, key=lambda obj: obj.y0) + # Divide in two, then perform grouping + # print "Making Recursive Call %d"%len(boxes) + bot_boxes = self.group_textboxes(laparams, boxes[:len(boxes)/2]) + top_boxes = self.group_textboxes(laparams, boxes[len(boxes)/2:]) + boxes = bot_boxes + top_boxes + dists = [] + # Calculate the distance between each box for i in xrange(len(boxes)): obj1 = boxes[i] for j in xrange(i+1, len(boxes)): @@ -642,16 +666,20 @@ class LTLayoutContainer(LTContainer): dists = csort(dists, key=key_obj) plane = Plane(self.bbox) plane.extend(boxes) + # Start with the two closest objects while dists: (c, d, obj1, obj2) = dists.pop(0) + # If there are any objects in between, then skip these two if c == 0 and isany(obj1, obj2): dists.append((1, d, obj1, obj2)) continue + # Group these two closest objects if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): group = LTTextGroupTBRL([obj1, obj2]) else: group = LTTextGroupLRTB([obj1, obj2]) + # Remove the two individual objects plane.remove(obj1) plane.remove(obj2) dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists From 806ee603ff2b3edec0f7749fa5ee2e4636d10e47 Mon Sep 17 00:00:00 2001 From: speedplane Date: Fri, 12 Dec 2014 00:35:38 -0500 Subject: [PATCH 08/28] More fixes to layout. The compute neighbors function for horizontal lines is only intended to find neighbors on differing lines. However, it's entirely possible that horizontal neighbors could appear. 
This commit finds horizontal neighbors in a horizontal line and merges them together into a single horizontal line if necessary. This leads to much better text extraction if the PDF was created in a funky way. For example (test case coming), I have seen PDFs which are written almost like vertical columns, but the text is entirely horizontal. --- pdfminer/layout.py | 62 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index b2dde8c..3d52859 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import logging from .utils import INF from .utils import Plane from .utils import get_bound @@ -355,6 +356,9 @@ class LTTextLine(LTTextContainer): LTContainer.add(self, LTAnno('\n')) return + def group_line_neighbors(self, objs, ratio): + return objs + def find_neighbors(self, plane, ratio): raise NotImplementedError @@ -383,14 +387,52 @@ class LTTextLineHorizontal(LTTextLine): LTTextLine.add(self, obj) return + def is_neighbor(self, obj, d, same_line = False): + # Horizontal lines can only connect with horizontal lines + if not isinstance(obj, LTTextLineHorizontal): + return False + # Ensure they are vertically close + if abs(obj.height-self.height) >= d: + return False + # Ensure that they have similar start or stop x positions + if not (abs(obj.x0-self.x0) < d or + abs(obj.x1-self.x1) < d or + # Or that they intersect eachother horizontally. + (obj.x0 < self.x0 and obj.x1 > self.x0) or + (obj.x0 > self.x0 and obj.x0 < self.x1)): + return False + if same_line and not ( + # Ensure they have similar + (obj.y0 == self.y0) or (obj.y1 == self.y1) or + (obj.y0 < self.y0 and obj.y0 > self.y1) or + (obj.y0 > self.y0 and obj.y1 < self.y0)): + return False + return True + + def group_line_neighbors(self, objs, ratio): + ''' + Given a set of objects that may or may not be on the same line as this, + add the objects that are on the same line. 
+ + Return the objects that are not on the same line. + ''' + d = ratio*self.height + other_lines = [] + for o in objs: + if o == self: + other_lines.append(o) + elif self.is_neighbor(o, d, same_line=True): + [self.add(oc) for oc in o] + # Clear out the old line + o._objs = [] + else: + other_lines.append(o) + return other_lines + def find_neighbors(self, plane, ratio): d = ratio*self.height objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d)) - return [obj for obj in objs - if (isinstance(obj, LTTextLineHorizontal) and - abs(obj.height-self.height) < d and - (abs(obj.x0-self.x0) < d or - abs(obj.x1-self.x1) < d))] + return [o for o in objs if self.is_neighbor(o, d)] class LTTextLineVertical(LTTextLine): @@ -577,19 +619,25 @@ class LTLayoutContainer(LTContainer): yield line return - # group_textlines: group neighboring lines to textboxes. + # group_textlines: group neighbouring lines to textboxes. def group_textlines(self, laparams, lines): plane = Plane(self.bbox) plane.extend(lines) boxes = {} + # for line in plane: + # print "line", ("".join([s._text for s in line])).encode('ascii', 'ignore') for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) - if line not in neighbors: continue + if line not in neighbors: + logging.error("Line cannot find itself: %s"%line) + continue + neighbors = line.group_line_neighbors(neighbors, laparams.line_margin) members = [] for obj1 in neighbors: members.append(obj1) if obj1 in boxes: members.extend(boxes.pop(obj1)) + # print "members: ", ["".join([o._text for o in line]) for line in members] if isinstance(line, LTTextLineHorizontal): box = LTTextBoxHorizontal() else: From 2199c254930c900790512cfbd77cca8a23a9c633 Mon Sep 17 00:00:00 2001 From: speedplane Date: Fri, 12 Dec 2014 00:29:57 -0500 Subject: [PATCH 09/28] Add my own .gitignore. 
--- .gitignore | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..db4561e --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ From 69afd3dd30ef6c8c6ace1ad8fe653763bbb002bd Mon Sep 17 00:00:00 2001 From: speedplane Date: Sun, 14 Dec 2014 01:23:44 -0500 Subject: [PATCH 10/28] Use a .gitignore file. --- .gitignore | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/.gitignore b/.gitignore index db4561e..c132931 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,58 @@ docs/_build/ # PyBuilder target/ +======= +# Intermediate documents +*.xps + +# Password and Key Files +*.pem +*.p12 + +# Compiled source # +################### +*.pyc +*.com +*.class +*.dll +*.exe +*.o +*.so + +# Mecurial Files? +*.i +*.d +*.mo +*.hg/ + +# Python data files # +*.shelf +*.shelve + +# Don't track these files, they are output from scripts + +# Subversion files +*.svn-base +all-wcprops +entries + +# Logs and databases # +###################### +bulkloader-log-* +*.log +*.sql +*.sql3 +*.sql3-journal +*.sqlite + +# OS generated files # +###################### +.DS_Store? 
+ehthumbs.db +Icon? +Thumbs.db + +# tmp files # +############# +~$*.doc +~WRL*.tmp \ No newline at end of file From 56094183514b9772f67bb1eadd96f539b56cdf4c Mon Sep 17 00:00:00 2001 From: speedplane Date: Sun, 14 Dec 2014 01:29:39 -0500 Subject: [PATCH 11/28] Add gz to gitignore. --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c132931..c81d6cf 100644 --- a/.gitignore +++ b/.gitignore @@ -106,4 +106,6 @@ Thumbs.db # tmp files # ############# ~$*.doc -~WRL*.tmp \ No newline at end of file +~WRL*.tmp + +*.gz \ No newline at end of file From 1dbe9ff7e7a71b0eaad39963ea8514b88d26c7c6 Mon Sep 17 00:00:00 2001 From: Ashley Blackmore Date: Wed, 18 Feb 2015 18:35:53 +0100 Subject: [PATCH 12/28] Update setup.py Install missing pycrypto lib --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index c9962fe..51779e7 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,9 @@ PDF parser that can be used for other purposes instead of text analysis.''', author='Yusuke Shinyama', author_email='yusuke at cs dot nyu dot edu', url='http://euske.github.io/pdfminer/index.html', + install_requires=[ + 'pycrypto', + ], packages=[ 'pdfminer', ], From 14fd0fd2d6ef4e709731377decc6a8c119e5e9d6 Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Sun, 5 Apr 2015 19:02:02 +0900 Subject: [PATCH 13/28] Fixed: #84 (fontname was in unicode) --- pdfminer/converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 3e515d6..28c2abf 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -315,7 +315,7 @@ class HTMLConverter(PDFConverter): if self._font is not None: self.write('') self.write('' % - (fontname, fontsize * self.scale * self.fontscale)) + (enc(fontname), fontsize * self.scale * self.fontscale)) self._font = font self.write_text(text) return From 9af4fe85e1427ec12be57d4ec7604a1973d26288 Mon Sep 17 00:00:00 2001 From: 
Pablo Castellano Date: Sun, 14 Jun 2015 17:01:03 +0200 Subject: [PATCH 14/28] README: Changed line about Python 3 support --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 30aa5db..82a0d35 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Features How to Install -------------- - * Install Python 2.6 or newer. (**Python 3 is not supported.**) + * Install Python 2.6 or newer. (**For Python 3 support have a look at [pdfminer.six](https://github.com/goulu/pdfminer)**). * Download the source code. * Unpack it. * Run `setup.py`: From 63c9378b8b2f9d9d09c4686cf654ef68294e7764 Mon Sep 17 00:00:00 2001 From: Ivan Pozdeev Date: Mon, 10 Aug 2015 03:14:51 +0300 Subject: [PATCH 15/28] make ValueError's descriptive --- pdfminer/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index b53c1c1..307c5e7 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -11,7 +11,7 @@ from sys import maxint as INF def apply_png_predictor(pred, colors, columns, bitspercomponent, data): if bitspercomponent != 8: # unsupported - raise ValueError(bitspercomponent) + raise ValueError("Unsupported `bitspercomponent': %d"%bitspercomponent) nbytes = colors*columns*bitspercomponent//8 i = 0 buf = b'' @@ -43,7 +43,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): line2 += chr(c) else: # unsupported - raise ValueError(ft) + raise ValueError("Unsupported predictor value: %d"%ft) buf += line2 line0 = line2 return buf From 63bb3caec28113354afb23739a400ea2f3a6aff1 Mon Sep 17 00:00:00 2001 From: lucanaso Date: Wed, 9 Dec 2015 16:47:32 +0100 Subject: [PATCH 16/28] Fixed for rendering non breaking spaces (cid:160) As stated in the PDF specification ISO 32000-1, table in Annex D.2 Latin Character Set and Encodings page 653 to 656 (available here: http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf): "The SPACE character shall 
also be encoded as 312 in MacRomanEncoding and as 240 in WinAnsiEncoding. This duplicate code shall signify a nonbreaking space; it shall be typographically the same as (U+003A) SPACE." The duplicate key was missing, therefore PDFMiner was returning the string "(cid:160)". This fix adds the duplicate key in latin_enc.py glyphlist.py does not need to be modified as it already contains a key for non breaking space https://github.com/lucanaso/pdfminer/blob/master/pdfminer/glyphlist.py#L2755. --- pdfminer/latin_enc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pdfminer/latin_enc.py b/pdfminer/latin_enc.py index 41d219c..52dadc1 100644 --- a/pdfminer/latin_enc.py +++ b/pdfminer/latin_enc.py @@ -162,6 +162,7 @@ ENCODING = [ ('mu', None, 181, 181, 181), ('multiply', None, None, 215, 215), ('n', 110, 110, 110, 110), + ('nbspace', None, 202, 160, None), ('nine', 57, 57, 57, 57), ('ntilde', None, 150, 241, 241), ('numbersign', 35, 35, 35, 35), From b0b8818a41652bc54c13d4643649c04f5a353f0e Mon Sep 17 00:00:00 2001 From: speedplane Date: Mon, 13 Jun 2016 23:35:11 -0400 Subject: [PATCH 17/28] Fix a bug with pdfminer which occurs when two or more filters are applied to a stream, even though no parameters are specified. The code would previously drop all of the streams after the first due to misapplication of the zip function. --- pdfminer/pdftypes.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 10c0777..af1435b 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -223,8 +223,13 @@ class PDFStream(PDFObject): return [] if not isinstance(filters, list): filters = [filters] - if not isinstance(params, list): + if not params: + # Make sure the parameters list is the same as filters. 
+ params = [{}]*len(filters) + elif not isinstance(params, list): params = [params] + if STRICT and len(params) != len(filters): + raise PDFException("Parameters len filter mismatch") return zip(filters, params) def decode(self): From 2049462f6ff2f2be1efc4c306fb7947d16c173a1 Mon Sep 17 00:00:00 2001 From: speedplane Date: Mon, 13 Jun 2016 23:42:21 -0400 Subject: [PATCH 18/28] Revert changes unrelated to this branch. --- .gitignore | 111 --------------------------------------------- pdfminer/layout.py | 100 +++++----------------------------------- 2 files changed, 12 insertions(+), 199 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index c81d6cf..0000000 --- a/.gitignore +++ /dev/null @@ -1,111 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.cache -nosetests.xml -coverage.xml - -# Translations -*.mo -*.pot - -# Django stuff: -*.log - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ -======= -# Intermediate documents -*.xps - -# Password and Key Files -*.pem -*.p12 - -# Compiled source # -################### -*.pyc -*.com -*.class -*.dll -*.exe -*.o -*.so - -# Mecurial Files? 
-*.i -*.d -*.mo -*.hg/ - -# Python data files # -*.shelf -*.shelve - -# Don't track these files, they are output from scripts - -# Subversion files -*.svn-base -all-wcprops -entries - -# Logs and databases # -###################### -bulkloader-log-* -*.log -*.sql -*.sql3 -*.sql3-journal -*.sqlite - -# OS generated files # -###################### -.DS_Store? -ehthumbs.db -Icon? -Thumbs.db - -# tmp files # -############# -~$*.doc -~WRL*.tmp - -*.gz \ No newline at end of file diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 3d52859..9426ad3 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -import logging from .utils import INF from .utils import Plane from .utils import get_bound @@ -356,9 +355,6 @@ class LTTextLine(LTTextContainer): LTContainer.add(self, LTAnno('\n')) return - def group_line_neighbors(self, objs, ratio): - return objs - def find_neighbors(self, plane, ratio): raise NotImplementedError @@ -372,67 +368,21 @@ class LTTextLineHorizontal(LTTextLine): def add(self, obj): if isinstance(obj, LTChar) and self.word_margin: - # Add a space between words if separated by more than word_margin - # apart. Use the max of obj width and height so narrow letters - # (i, l) are treated like wider letters, reducing extra spaces margin = self.word_margin * max(obj.width, obj.height) if self._x1 < obj.x0-margin: - # But only do it if there is not already a space. 
- last_was_alpha = self._objs and \ - isinstance(self._objs[-1], LTChar) and \ - self._objs[-1]._text == ' ' - if not last_was_alpha: - LTContainer.add(self, LTAnno(' ')) + LTContainer.add(self, LTAnno(' ')) self._x1 = obj.x1 LTTextLine.add(self, obj) return - def is_neighbor(self, obj, d, same_line = False): - # Horizontal lines can only connect with horizontal lines - if not isinstance(obj, LTTextLineHorizontal): - return False - # Ensure they are vertically close - if abs(obj.height-self.height) >= d: - return False - # Ensure that they have similar start or stop x positions - if not (abs(obj.x0-self.x0) < d or - abs(obj.x1-self.x1) < d or - # Or that they intersect eachother horizontally. - (obj.x0 < self.x0 and obj.x1 > self.x0) or - (obj.x0 > self.x0 and obj.x0 < self.x1)): - return False - if same_line and not ( - # Ensure they have similar - (obj.y0 == self.y0) or (obj.y1 == self.y1) or - (obj.y0 < self.y0 and obj.y0 > self.y1) or - (obj.y0 > self.y0 and obj.y1 < self.y0)): - return False - return True - - def group_line_neighbors(self, objs, ratio): - ''' - Given a set of objects that may or may not be on the same line as this, - add the objects that are on the same line. - - Return the objects that are not on the same line. 
- ''' - d = ratio*self.height - other_lines = [] - for o in objs: - if o == self: - other_lines.append(o) - elif self.is_neighbor(o, d, same_line=True): - [self.add(oc) for oc in o] - # Clear out the old line - o._objs = [] - else: - other_lines.append(o) - return other_lines - def find_neighbors(self, plane, ratio): d = ratio*self.height objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d)) - return [o for o in objs if self.is_neighbor(o, d)] + return [obj for obj in objs + if (isinstance(obj, LTTextLineHorizontal) and + abs(obj.height-self.height) < d and + (abs(obj.x0-self.x0) < d or + abs(obj.x1-self.x1) < d))] class LTTextLineVertical(LTTextLine): @@ -563,9 +513,7 @@ class LTLayoutContainer(LTContainer): (min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1)) and (obj0.hdistance(obj1) < - max(obj0.width, obj1.width) * laparams.char_margin) or - # If the line is zero width, default to horizontal - (max(obj0.width, obj1.width) == 0 and obj1.x0 >= obj0.x0)) + max(obj0.width, obj1.width) * laparams.char_margin)) # valign: obj0 and obj1 is vertically aligned. # @@ -587,10 +535,7 @@ class LTLayoutContainer(LTContainer): (min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1)) and (obj0.vdistance(obj1) < - max(obj0.height, obj1.height) * laparams.char_margin) and - # Don't start a vertical line if the previous letter is - # whitspace. Prevents double spaces being caught as vert lines. - (line or obj0._text.strip())) + max(obj0.height, obj1.height) * laparams.char_margin)) if ((halign and isinstance(line, LTTextLineHorizontal)) or (valign and isinstance(line, LTTextLineVertical))): @@ -619,25 +564,19 @@ class LTLayoutContainer(LTContainer): yield line return - # group_textlines: group neighbouring lines to textboxes. + # group_textlines: group neighboring lines to textboxes. 
def group_textlines(self, laparams, lines): plane = Plane(self.bbox) plane.extend(lines) boxes = {} - # for line in plane: - # print "line", ("".join([s._text for s in line])).encode('ascii', 'ignore') for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) - if line not in neighbors: - logging.error("Line cannot find itself: %s"%line) - continue - neighbors = line.group_line_neighbors(neighbors, laparams.line_margin) + if line not in neighbors: continue members = [] for obj1 in neighbors: members.append(obj1) if obj1 in boxes: members.extend(boxes.pop(obj1)) - # print "members: ", ["".join([o._text for o in line]) for line in members] if isinstance(line, LTTextLineHorizontal): box = LTTextBoxHorizontal() else: @@ -692,19 +631,8 @@ class LTLayoutContainer(LTContainer): (c,d,_,_) = t return (c,d) - # The algorithm below still takes O(n^2) :( - # For now, if we have many boxes, split them into two and perform them - # separately. This will cause bugs, but will prevent hanging. 
- if len(boxes) > 100: - boxes = sorted(boxes, key=lambda obj: obj.y0) - # Divide in two, then perform grouping - # print "Making Recursive Call %d"%len(boxes) - bot_boxes = self.group_textboxes(laparams, boxes[:len(boxes)/2]) - top_boxes = self.group_textboxes(laparams, boxes[len(boxes)/2:]) - boxes = bot_boxes + top_boxes - + # XXX this still takes O(n^2) :( dists = [] - # Calculate the distance between each box for i in xrange(len(boxes)): obj1 = boxes[i] for j in xrange(i+1, len(boxes)): @@ -714,20 +642,16 @@ class LTLayoutContainer(LTContainer): dists = csort(dists, key=key_obj) plane = Plane(self.bbox) plane.extend(boxes) - # Start with the two closest objects while dists: (c, d, obj1, obj2) = dists.pop(0) - # If there are any objects in between, then skip these two if c == 0 and isany(obj1, obj2): dists.append((1, d, obj1, obj2)) continue - # Group these two closest objects if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): group = LTTextGroupTBRL([obj1, obj2]) else: group = LTTextGroupLRTB([obj1, obj2]) - # Remove the two individual objects plane.remove(obj1) plane.remove(obj2) dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists @@ -801,4 +725,4 @@ class LTPage(LTLayoutContainer): def __repr__(self): return ('<%s(%r) %s rotate=%r>' % (self.__class__.__name__, self.pageid, - bbox2str(self.bbox), self.rotate)) + bbox2str(self.bbox), self.rotate)) \ No newline at end of file From 549b5607651625d45867d867f33d78a4cbc65352 Mon Sep 17 00:00:00 2001 From: speedplane Date: Mon, 13 Jun 2016 23:44:54 -0400 Subject: [PATCH 19/28] Revert changes unrelated to this feature. 
--- samples/Makefile | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/samples/Makefile b/samples/Makefile index 7984c37..a9d9ae0 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,15 +1,9 @@ # GNUMakefile for test -UNAME_S := $(shell uname -o) -CNVTXT=: -ifeq ($(UNAME_S),Cygwin) - CNVTXT=unix2dos -endif - RM=rm -f CMP=: ECHO=echo -PYTHON=python +PYTHON=python2 PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 -V @@ -46,13 +40,10 @@ tests: for i in $(TESTS); do \ $(ECHO) $$i; \ $(PDF2TXT) -t html -o $$i.html $$i.pdf || exit 1; \ - $(CNVTXT) $$i.html || exit 1; \ $(CMP) $$i.html $$i.html.ref || exit 1; \ $(PDF2TXT) -t xml -o $$i.xml $$i.pdf || exit 1; \ - $(CNVTXT) $$i.xml || exit 1; \ $(CMP) $$i.xml $$i.xml.ref || exit 1; \ - $(PDF2TXT) -t text -o $$i.txt $$i.pdf || exit 1; \ - $(CNVTXT) $$i.txt || exit 1; \ + $(PDF2TXT) -t text -o $$i.txt $$i.pdf || eixt 1; \ $(CMP) $$i.txt $$i.txt.ref || exit 1; \ done @@ -74,4 +65,4 @@ clean: done -for i in $(CRYPTS); do \ $(RM) $$i.1.xml $$i.2.xml; \ - done + done \ No newline at end of file From dcf07272a175f0c2911c62986e16ac9b4afeb1a6 Mon Sep 17 00:00:00 2001 From: speedplane Date: Mon, 13 Jun 2016 23:46:30 -0400 Subject: [PATCH 20/28] Revert changes unrelated to this feature. --- samples/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/Makefile b/samples/Makefile index a9d9ae0..d1b06c4 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -65,4 +65,4 @@ clean: done -for i in $(CRYPTS); do \ $(RM) $$i.1.xml $$i.2.xml; \ - done \ No newline at end of file + done From 395cdd7538b7bb1a61573862a7c403e8d088f3fc Mon Sep 17 00:00:00 2001 From: Daniel Berthereau Date: Mon, 27 Jun 2016 00:00:00 +0200 Subject: [PATCH 21/28] Fixed tests. 
--- samples/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/Makefile b/samples/Makefile index d1b06c4..ed84865 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -43,7 +43,7 @@ tests: $(CMP) $$i.html $$i.html.ref || exit 1; \ $(PDF2TXT) -t xml -o $$i.xml $$i.pdf || exit 1; \ $(CMP) $$i.xml $$i.xml.ref || exit 1; \ - $(PDF2TXT) -t text -o $$i.txt $$i.pdf || eixt 1; \ + $(PDF2TXT) -t text -o $$i.txt $$i.pdf || exit 1; \ $(CMP) $$i.txt $$i.txt.ref || exit 1; \ done From 29260020176f473e72f2049a553049f6d883a0eb Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 8 Sep 2016 16:34:53 +0530 Subject: [PATCH 22/28] Replace old Adobe glyphlist link --- pdfminer/glyphlist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/glyphlist.py b/pdfminer/glyphlist.py index b4b449c..10e1008 100644 --- a/pdfminer/glyphlist.py +++ b/pdfminer/glyphlist.py @@ -7,7 +7,7 @@ Unicode characters instead of using decimal/hex character code. The following data was taken by - $ wget http://www.adobe.com/devnet/opentype/archives/glyphlist.txt + $ wget https://partners.adobe.com/public/developer/en/opentype/glyphlist.txt $ python tools/conv_glyphlist.py glyphlist.txt > glyphlist.py """ From 647a6c653cb05c6d74a13a462672aeaf03c2a7d3 Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Sun, 11 Sep 2016 23:38:18 +0900 Subject: [PATCH 23/28] Added: LICENSE --- LICENSE | 22 ++++++++++++++++++++++ README.md | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3940067 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2004-2016 Yusuke Shinyama + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, 
sublicense, and/or +sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index 30aa5db..4adde82 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ Terms and Conditions (This is so-called MIT/X License) -Copyright (c) 2004-2014 Yusuke Shinyama +Copyright (c) 2004-2016 Yusuke Shinyama Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation From 64fe538b24eca8b7fa0645cdc74e111b642b6051 Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Sun, 11 Sep 2016 23:43:22 +0900 Subject: [PATCH 24/28] Fixed: #114 (UnicodeEncodeError in PSLiteral) --- pdfminer/psparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 7232421..825edcc 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -60,7 +60,7 @@ class PSLiteral(PSObject): return def __repr__(self): - return '/%s' % self.name + return '/%r' % self.name ## PSKeyword From 177a4ab937922423e8df89506880a68fe436c13e Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Sun, 11 Sep 2016 23:52:13 +0900 Subject: [PATCH 25/28] Fixed: #132 (PDFStream.get_filters: support multiple parameterless filters) --- pdfminer/pdftypes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 
deletions(-) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index af1435b..5eda1fe 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -223,11 +223,9 @@ class PDFStream(PDFObject): return [] if not isinstance(filters, list): filters = [filters] - if not params: + if not isinstance(params, list): # Make sure the parameters list is the same as filters. - params = [{}]*len(filters) - elif not isinstance(params, list): - params = [params] + params = [params]*len(filters) if STRICT and len(params) != len(filters): raise PDFException("Parameters len filter mismatch") return zip(filters, params) From 5816b55023648f85c2b70d5352edae4cce797426 Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Mon, 12 Sep 2016 00:14:58 +0900 Subject: [PATCH 26/28] MANIFEST: include LICENSE --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 985dd51..910eee5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include Makefile +include LICENSE include *.txt include *.py graft cmaprsrc From 5ddbecb551e93d81177ef8558fdf0427a5109f96 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Tue, 13 Sep 2016 16:25:09 +0200 Subject: [PATCH 27/28] Fix typos --- Makefile | 2 +- cmaprsrc/README.txt | 2 +- pdfminer/converter.py | 2 +- pdfminer/pdftypes.py | 2 +- pdfminer/psparser.py | 2 +- tools/pdf2html.cgi | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 2783a0f..0ffd84f 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -## Makefile (for maintainance purpose) +## Makefile (for maintenance purpose) ## PACKAGE=pdfminer diff --git a/cmaprsrc/README.txt b/cmaprsrc/README.txt index 3cfb23e..a003127 100644 --- a/cmaprsrc/README.txt +++ b/cmaprsrc/README.txt @@ -5,7 +5,7 @@ to decode text data written in CJK (Chinese, Japanese, Korean) language. 
CMap resources are now available freely from Adobe web site: http://opensource.adobe.com/wiki/display/cmap/CMap+Resources -The follwing files were extracted from the downloadable tarballs: +The following files were extracted from the downloadable tarballs: cid2code_Adobe_CNS1.txt: http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 28c2abf..30ceb22 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -185,7 +185,7 @@ class TextConverter(PDFConverter): return # Some dummy functions to save memory/CPU when all that is wanted - # is text. This stops all the image and drawing ouput from being + # is text. This stops all the image and drawing output from being # recorded and taking up RAM. def render_image(self, name, stream): if self.imagewriter is None: diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 5eda1fe..20d981d 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -110,7 +110,7 @@ def decipher_all(decipher, objid, genno, x): return x -# Type cheking +# Type checking def int_value(x): x = resolve1(x) if not isinstance(x, int): diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 825edcc..7270b45 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -247,7 +247,7 @@ class PSBaseParser(object): return (linepos, linebuf) def revreadlines(self): - """Fetches a next line backword. + """Fetches a next line backward. This is used to locate the trailers at the end of a file. """ diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi index bfd591f..4ddba6d 100755 --- a/tools/pdf2html.cgi +++ b/tools/pdf2html.cgi @@ -197,7 +197,7 @@ class WebApp(object): convert(item.file, self.outfp, tmppath, pagenos=pagenos, codec=self.codec, maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html) except Exception, e: - self.put('

Sorry, an error has occured: %s' % q(repr(e))) + self.put('

Sorry, an error has occurred: %s' % q(repr(e))) self.logger.error('convert: %r: path=%r: %s' % (e, traceback.format_exc())) finally: try: From 8150458718e9024c80b00e74965510b20206e588 Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Mon, 26 Sep 2016 18:06:34 +0900 Subject: [PATCH 28/28] Added: a simpler ordering mode when 1 -Last Modified: Wed Jun 25 10:27:52 UTC 2014 +Last Modified: Mon Sep 26 09:04:15 UTC 2016 @@ -268,6 +268,7 @@ are M = 2.0, L = 0.5, and W = 0.1, respectively.

Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters). +When this value is out of the range (e.g. +2), a simpler ordering rule is used. The default value is 0.5.

-C diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 9426ad3..6477eff 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -676,13 +676,20 @@ class LTLayoutContainer(LTContainer): for obj in empties: obj.analyze(laparams) textboxes = list(self.group_textlines(laparams, textlines)) - if textboxes: + if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 and textboxes: self.groups = self.group_textboxes(laparams, textboxes) assigner = IndexAssigner() for group in self.groups: group.analyze(laparams) assigner.run(group) textboxes.sort(key=lambda box: box.index) + else: + def getkey(box): + if isinstance(box, LTTextBoxVertical): + return (0, -box.x1, box.y0) + else: + return (1, box.y0, box.x0) + textboxes.sort(key=getkey) self._objs = textboxes + otherobjs + empties return @@ -725,4 +732,4 @@ class LTPage(LTLayoutContainer): def __repr__(self): return ('<%s(%r) %s rotate=%r>' % (self.__class__.__name__, self.pageid, - bbox2str(self.bbox), self.rotate)) \ No newline at end of file + bbox2str(self.bbox), self.rotate))