diff --git a/pdfminer/Makefile b/pdfminer/Makefile index 2ae0f6e..1d1661b 100644 --- a/pdfminer/Makefile +++ b/pdfminer/Makefile @@ -1,7 +1,9 @@ # Makefile for pdfminer +RM=rm -f + all: clean: - -rm *.pyc *.pyo + -$(RM) *.pyc *.pyo cd cmap && make clean diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index b4d4f92..24e097c 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -69,21 +69,27 @@ class PDFTextDevice(PDFDevice): scaling = textstate.scaling * .01 charspace = textstate.charspace * scaling wordspace = textstate.wordspace * scaling + if font.is_multibyte(): + wordspace = 0 dxscale = .001 * fontsize * scaling + if font.is_vertical(): + textstate.linematrix = self.render_string_vertical( + seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale) + else: + textstate.linematrix = self.render_string_horizontal( + seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale) + return + + def render_string_horizontal(self, seq, matrix, (x,y), + font, fontsize, scaling, charspace, wordspace, dxscale): chars = [] needspace = False - (x,y) = textstate.linematrix for obj in seq: if isinstance(obj, int) or isinstance(obj, float): (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, fontsize, charspace, scaling, chars) - x += dx + x += dx - obj*dxscale y += dy - d = -obj*dxscale - if font.is_vertical(): - y += d - else: - x += d chars = [] needspace = False else: @@ -94,31 +100,58 @@ class PDFTextDevice(PDFDevice): (cidcoding, cid) = e.args char = self.handle_undefined_char(cidcoding, cid) chars.append((char, cid)) - if cid == 32 and textstate.wordspace and not font.is_multibyte(): + if cid == 32 and wordspace: if needspace: - if font.is_vertical(): - y += charspace - else: - x += charspace + x += charspace (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, fontsize, charspace, scaling, chars) needspace = True - x += dx + x += dx + wordspace y += dy - if font.is_vertical(): - y += wordspace - else: - x += wordspace chars = [] if chars: if needspace: - if font.is_vertical(): - y += charspace - else: - x += charspace + x += charspace (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, fontsize, charspace, scaling, chars) x += dx y += dy - textstate.linematrix = (x,y) - return + return (x, y) + + def render_string_vertical(self, seq, matrix, (x,y), + font, fontsize, scaling, charspace, wordspace, dxscale): + chars = [] + needspace = False + for obj in seq: + if isinstance(obj, int) or isinstance(obj, float): + (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, + fontsize, charspace, scaling, chars) + x += dx + y += dy - obj*dxscale + chars = [] + needspace = False + else: + for cid in font.decode(obj): + try: + char = font.to_unichr(cid) + except PDFUnicodeNotDefined, e: + (cidcoding, cid) = e.args + char = self.handle_undefined_char(cidcoding, cid) + chars.append((char, cid)) + if cid == 32 and wordspace: + if needspace: + y += charspace + (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, + fontsize, charspace, scaling, chars) + needspace = True + x += dx + y += dy + wordspace + chars = [] + if chars: + if needspace: + y += charspace + (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, + fontsize, charspace, scaling, chars) + x += dx + y += dy + return (x, y) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index e6bdf16..5158761 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -668,6 +668,11 @@ class PDFPageInterpreter(object): def do_ID(self): # never called return def do_EI(self, obj): + if 'W' in obj and 'H' in obj: + iobjid = str(id(obj)) + self.device.begin_figure(iobjid, (0,0,1,1), MATRIX_IDENTITY) + self.device.render_image(iobjid, obj) + self.device.end_figure(iobjid) return # invoke an XObject diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 78f7b5d..a4f7da1 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -169,10 +169,13 @@ class PDFStream(PDFObject): def __contains__(self, name): return name in self.attrs + def __getitem__(self, name): return self.attrs[name] + def get(self, name, default=None): return self.attrs.get(name, default) + def get_any(self, names, default=None): for name in names: if name in self.attrs: @@ -216,6 +219,9 @@ class PDFStream(PDFObject): data = asciihexdecode(data) elif f in LITERALS_RUNLENGTH_DECODE: data = rldecode(data) + elif f in LITERALS_CCITTFAX_DECODE: + #data = ccittfaxdecode(data) + raise PDFNotImplementedError('Unsupported filter: %r' % f) elif f == LITERAL_CRYPT: # not yet.. raise PDFNotImplementedError('/Crypt filter is unsupported') diff --git a/samples/Makefile b/samples/Makefile index fe7f4dc..b7490d6 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,5 +1,7 @@ # GNUMakefile for test +RM=rm -f +CMP=cmp PYTHON=python PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 @@ -39,18 +41,21 @@ XMLS= \ all: htmls texts xmls clean: - -rm $(HTMLS) - -rm $(TEXTS) - -rm $(XMLS) + -$(RM) $(HTMLS) + -$(RM) $(TEXTS) + -$(RM) $(XMLS) htmls: $(HTMLS) texts: $(TEXTS) xmls: $(XMLS) .SUFFIXES: .pdf .html .xml .txt + .pdf.html: $(PDF2TXT) -t html $< > $@ + .pdf.xml: $(PDF2TXT) -t xml $< > $@ + .pdf.txt: $(PDF2TXT) -t text $< > $@ diff --git a/tools/Makefile b/tools/Makefile index 8299398..1f232e0 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,6 +1,8 @@ # Makefile for tools +RM=rm -f + all: clean: - -rm *.pyc *.pyo + -$(RM) *.pyc *.pyo