From 6413eb7de4a0d9e96d0605d4c0d8f1680a8ad0ca Mon Sep 17 00:00:00 2001 From: Jim Morrison Date: Tue, 24 Jan 2012 16:18:36 -0800 Subject: [PATCH 1/3] Deal with CMYK images by converting them to RGB. PIL does not invert CMYK images as of PIL 1.1.7, so the invert happens in ImageWriter. --- pdfminer/image.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pdfminer/image.py b/pdfminer/image.py index 9faf0c0..c5f85c7 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -1,9 +1,13 @@ #!/usr/bin/env python2 +import cStringIO +import logging import sys import struct import os, os.path +from PIL import Image +from PIL import ImageChops from pdftypes import LITERALS_DCT_DECODE -from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB +from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK def align32(x): return ((x+3)/4)*4 @@ -77,7 +81,15 @@ class ImageWriter(object): path = os.path.join(self.outdir, name) fp = file(path, 'wb') if ext == '.jpg': - fp.write(stream.get_rawdata()) + raw_data = stream.get_rawdata() + if LITERAL_DEVICE_CMYK in image.colorspace: + ifp = cStringIO.StringIO(raw_data) + i = Image.open(ifp) + i = ImageChops.invert(i) + i = i.convert('RGB') + i.save(fp, 'JPEG') + else: + fp.write(raw_data) elif image.bits == 1: bmp = BMPWriter(fp, 1, width, height) data = stream.get_data() From 89c81db295993a34d9f3488d6ec0a04ebc505017 Mon Sep 17 00:00:00 2001 From: Humberto Pereira Date: Mon, 19 Mar 2012 16:42:58 -0300 Subject: [PATCH 2/3] PDFDocument.lookup_names.lookup didn't find 'Names' in some files --- pdfminer/pdfparser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index f034e65..1aa9aeb 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -553,10 +553,10 @@ class PDFDocument(object): if 'Limits' in d: (k1,k2) = list_value(d['Limits']) if key < k1 or k2 < key: return None - if 'Names' in d: - objs = list_value(d['Names']) - names = dict(choplist(2, objs)) - return names[key] + if 'Names' in d: + objs = list_value(d['Names']) + names = dict(choplist(2, objs)) + return names[key] if 'Kids' in d: for c in list_value(d['Kids']): v = lookup(dict_value(c)) From f77f196cd332307ae89f61f7861b8a73923a7835 Mon Sep 17 00:00:00 2001 From: jcushman Date: Fri, 22 Jun 2012 18:11:45 -0300 Subject: [PATCH 3/3] 2x faster group_textboxes function. --- pdfminer/layout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 14383f1..7e0f312 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -609,8 +609,8 @@ class LTLayoutContainer(LTContainer): group = LTTextGroupLRTB([obj1,obj2]) plane.remove(obj1) plane.remove(obj2) - dists = [ (c,d,o1,o2) for (c,d,o1,o2) in dists - if o1 in plane and o2 in plane ] + # this line is optimized -- don't change without profiling + dists = [ n for n in dists if n[2] in plane._objs and n[3] in plane._objs ] for other in plane: dists.append((0, dist(group,other), group, other)) dists.sort()