diff --git a/MANIFEST.in b/MANIFEST.in index d996938..26ba2aa 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,9 @@ include Makefile include LICENSE +include *.txt include *.md include *.py +graft cmaprsrc graft docs graft pdfminer graft samples diff --git a/Makefile b/Makefile index 2b6a4eb..640624c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -## Makefile (for maintainance purpose) +## Makefile (for maintenance purpose) ## PACKAGE=pdfminer diff --git a/cmaprsrc/README.txt b/cmaprsrc/README.txt index 3cfb23e..a003127 100644 --- a/cmaprsrc/README.txt +++ b/cmaprsrc/README.txt @@ -5,7 +5,7 @@ to decode text data written in CJK (Chinese, Japanese, Korean) language. CMap resources are now available freely from Adobe web site: http://opensource.adobe.com/wiki/display/cmap/CMap+Resources -The follwing files were extracted from the downloadable tarballs: +The following files were extracted from the downloadable tarballs: cid2code_Adobe_CNS1.txt: http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 1e15542..baf78b7 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -214,7 +214,7 @@ class TextConverter(PDFConverter): return # Some dummy functions to save memory/CPU when all that is wanted - # is text. This stops all the image and drawing ouput from being + # is text. This stops all the image and drawing output from being # recorded and taking up RAM. def render_image(self, name, stream): if self.imagewriter is None: @@ -349,7 +349,7 @@ class HTMLConverter(PDFConverter): if self._font is not None: self.write('') self.write('' % - (fontname, fontsize * self.scale * self.fontscale)) + (enc(fontname), fontsize * self.scale * self.fontscale)) self._font = font self.write_text(text) return diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 88fe370..198d034 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -682,13 +682,20 @@ class LTLayoutContainer(LTContainer): for obj in empties: obj.analyze(laparams) textboxes = list(self.group_textlines(laparams, textlines)) - if textboxes: + if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 and textboxes: self.groups = self.group_textboxes(laparams, textboxes) assigner = IndexAssigner() for group in self.groups: group.analyze(laparams) assigner.run(group) textboxes.sort(key=lambda box: box.index) + else: + def getkey(box): + if isinstance(box, LTTextBoxVertical): + return (0, -box.x1, box.y0) + else: + return (1, box.y0, box.x0) + textboxes.sort(key=getkey) self._objs = textboxes + otherobjs + empties return diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi index ea04615..265866c 100755 --- a/tools/pdf2html.cgi +++ b/tools/pdf2html.cgi @@ -197,7 +197,7 @@ class WebApp(object): convert(item.file, self.outfp, tmppath, pagenos=pagenos, codec=self.codec, maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html) except Exception, e: - self.put('

Sorry, an error has occured: %s' % q(repr(e))) + self.put('

Sorry, an error has occurred: %s' % q(repr(e))) self.logger.error('convert: %r: path=%r: %s' % (e, traceback.format_exc())) finally: try: