diff --git a/Makefile b/Makefile index 625ea39..5b53e22 100644 --- a/Makefile +++ b/Makefile @@ -30,15 +30,9 @@ commit: clean check: cd $(PACKAGE) && make check -sdist: clean - $(PYTHON) setup.py sdist - register: clean $(PYTHON) setup.py sdist upload register -VERSION=`$(PYTHON) $(PACKAGE)/__init__.py` -DISTFILE=$(PACKAGE)-$(VERSION).tar.gz WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE) -publish: sdist - $(CP) dist/$(DISTFILE) $(WEBDIR) - $(CP) docs/*.html $(WEBDIR)/index.html +publish: + $(CP) docs/*.html $(WEBDIR) diff --git a/docs/miningpdf.html b/docs/miningpdf.html new file mode 100644 index 0000000..7f2372f --- /dev/null +++ b/docs/miningpdf.html @@ -0,0 +1,121 @@ + + + + +Mining PDF files + + + +

Mining PDF files

+

+ +

+Homepage + +

+ +Last Modified: Sat Nov 14 21:09:01 JST 2009 + +
+ +

What is PDF?

+

+

What PDF is ...

+ + +

What PDF is not ...

+ + +

Structure of PDF

+

+From a data structure's point of view, PDF is a total mess in the +computer history. Originally, Adobe had a document format called +PostScript (which is also more like "graphics" format rather than +text format). It has nice graphic representation and is able to +express commercial quality typesetting. However, it has to be for +a specific printer and its file size tends to get bloated because +almost everything is represented as text. PDF is Adobe's attempt +to create a less printer dependent format with a reduced data size +(that's why it was named "portable" document format). To some +degree, PDF can be seen as a "compressed" version of PostScript +with seekable index tables. Since its drawing model and concepts +(coordinates, color spaces, etc.) remain pretty much the same +as its predecessor, Adobe decided to reuse the original PostScript +notation partially in PDF. However, this eclectic position ended +up with a disastrous situation. + +

Format Disaster

+

+When designing a data format, there are two different strategies: +using text or using binary. They both have obvious merits and +demerits. The biggest merit of having textual representation is +that it is human readable and can be modified with any text +editor. One demerit of textual representation is its bloated size, +especially if you want to put something like pictures and +multimedia data like audio or video. Another demerit of textual +representation is that you need a program to serialize/deserialize +(parse) the data, which can be very complex and buggy. On the +other hand, binary representation normally doesn't require a +complex parser and takes much less space than text. However, +it is not readable by humans. Now, Adobe decided to take the +good parts from both worlds by making PDF a partially text and +partially binary format, and as a result, PDF inherits the +drawbacks of both worlds without having much of their merits, i.e. +PDF is a human *unreadable* document format that still requires a +complex and error-prone parser and has a bloated file size. +

+Adobe has probably been aware of this problem from early on, and +they tried to fix this over the years. So they gradually dropped text +representations and inclined more toward binaries. For example, +in PDF specification 1.5, they introduced a new notation called +"object stream" (which is different from a "stream object" that +was already there in the specification). + +However, by this time there were already tons of PDFs that had been +produced under the original standard, which every PDF +viewer is still required to support. + +

Problem of Text Extraction from PDF Documents

+

+Many people tend to think that a PDF document is somewhat similar +to a Word or HTML document, which is not true. In fact, the primary +focus of PDF is printing and showing on a computer display, so +it is extremely versatile for showing the details of "looks" +of text typography, pictures and graphics. All the text in a PDF document is +just a bunch of string objects floating at various locations on a +blank slate. There is no text flow control and no contextual clue +about its content, except for a few special "tagged" PDF documents with +extra annotations that denote headlines or page boundaries, which +require specialized tools to create. +

+(OpenOffice, for example, has ability to create tagged PDF +documents. But the degree of the annotations is varied depending +on its implementation, and in many cases it is not possible to +obtain the full layout information by only using tags.) +

+Apart from tagged documents, PDF doesn't care about the order of text +strings rendered on a page. You can completely jumble up every +string in a PDF and still make it look like a +perfect document on the surface. Even worse, PDF allows a word to +be split in the middle and drawn as multiple unrelated strings in +order to represent precise text positioning. For example, a +certain word processor creates a PDF that splits the word +"You" into two separate strings "Y" and "ou" because of the subtle +kerning between the letters. +

+So there's a huge problem associated with extracting text properly +from PDF files. It requires almost the same kind of analysis +as optical character recognition (OCR). + + +


+
Yusuke Shinyama
+ diff --git a/pdfminer/cmap.py b/pdfminer/cmap.py index 695ad51..f80284c 100644 --- a/pdfminer/cmap.py +++ b/pdfminer/cmap.py @@ -15,7 +15,6 @@ import sys import re import os import os.path -from sys import stderr from struct import pack, unpack from psparser import PSStackParser from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF @@ -24,8 +23,7 @@ from psparser import literal_name, keyword_name from fontmetrics import FONT_METRICS from latin_enc import ENCODING from glyphlist import charname2unicode -from utils import choplist -from utils import nunpack +from utils import choplist, nunpack try: import cdb except ImportError: @@ -38,16 +36,19 @@ class CMapError(Exception): pass ## find_cmap_path ## def find_cmap_path(): - try: - return os.environ['CMAP_PATH'] - except KeyError: - pass - basedir = os.path.dirname(__file__) - return os.path.join(basedir, 'CMap') + """Returns the location of CMap directory.""" + for path in (os.environ['CMAP_PATH'], + os.path.join(os.path.dirname(__file__), 'CMap')): + if os.path.isdir(path): + return path + raise IOError +## name2unicode +## STRIP_NAME = re.compile(r'[0-9]+') def name2unicode(name): + """Converts Adobe glyph names to Unicode numbers.""" if name in charname2unicode: return charname2unicode[name] m = STRIP_NAME.search(name) @@ -97,7 +98,7 @@ class CMap(object): def decode(self, bytes): if self.debug: - print >>stderr, 'decode: %r, %r' % (self, bytes) + print >>sys.stderr, 'decode: %r, %r' % (self, bytes) x = '' for c in bytes: if x: @@ -179,7 +180,7 @@ class CDBCMap(CMap): def decode(self, bytes): if self.debug: - print >>stderr, 'decode: %r, %r' % (self, bytes) + print >>sys.stderr, 'decode: %r, %r' % (self, bytes) x = '' for c in bytes: if x: @@ -227,11 +228,11 @@ class CMapDB(object): cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb') if os.path.exists(cdbname): if 1 <= self.debug: - print >>stderr, 'Opening: CDBCMap %r...' % cdbname + print >>sys.stderr, 'Opening: CDBCMap %r...' 
% cdbname cmap = CDBCMap(cdbname) elif os.path.exists(fname): if 1 <= self.debug: - print >>stderr, 'Reading: CMap %r...' % fname + print >>sys.stderr, 'Reading: CMap %r...' % fname cmap = CMap() fp = file(fname, 'rb') CMapParser(cmap, fp).run() @@ -423,10 +424,11 @@ class EncodingDB(object): ## CMap -> CMapCDB conversion ## -def dumpcdb(cmap, cdbfile, verbose=1): +def dump_cdb(cmap, cdbfile, verbose=1): + """Writes a CMap object into a cdb file.""" m = cdb.cdbmake(cdbfile, cdbfile+'.tmp') if verbose: - print >>stderr, 'Writing: %r...' % cdbfile + print >>sys.stderr, 'Writing: %r...' % cdbfile for (k,v) in cmap.getall_attrs(): m.add('/'+k, repr(v)) for (code,cid) in cmap.getall_code2cid(): @@ -437,44 +439,55 @@ def dumpcdb(cmap, cdbfile, verbose=1): return def convert_cmap(cmapdir, outputdir, force=False): + """Convert all CMap source files in a directory into cdb files.""" CMapDB.initialize(cmapdir) for fname in os.listdir(cmapdir): if '.' in fname: continue cmapname = os.path.basename(fname) cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb') if not force and os.path.exists(cdbname): - print >>stderr, 'Skipping: %r' % cmapname + print >>sys.stderr, 'Skipping: %r' % cmapname continue - print >>stderr, 'Reading: %r...' % cmapname + print >>sys.stderr, 'Reading: %r...' % cmapname cmap = CMapDB.get_cmap(cmapname) - dumpcdb(cmap, cdbname) + dump_cdb(cmap, cdbname) return def main(argv): + """Converts CMap files into cdb files. 
+ + usage: python -m pdfminer.cmap [-f] [cmap_dir [output_dir]] + """ + import getopt def usage(): - print 'usage: %s [-D outputdir] [-f] cmap_dir' % argv[0] + print 'usage: %s [-f] [cmap_dir [output_dir]]' % argv[0] return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'C:D:f') + (opts, args) = getopt.getopt(argv[1:], 'f') except getopt.GetoptError: return usage() if args: cmapdir = args.pop(0) else: - cmapdir = find_cmap_path() - outputdir = cmapdir + try: + cmapdir = find_cmap_path() + except IOError: + print >>sys.stderr, 'cannot find CMap directory' + return 1 + if args: + outputdir = args.pop(0) + else: + outputdir = cmapdir force = False for (k, v) in opts: if k == '-f': force = True - elif k == '-C': cmapdir = v - elif k == '-D': outputdir = v if not os.path.isdir(cmapdir): - print >>stderr, 'directory does not exist: %r' % cmapdir - return 111 + print >>sys.stderr, 'directory does not exist: %r' % cmapdir + return 1 if not os.path.isdir(outputdir): - print >>stderr, 'directory does not exist: %r' % outputdir - return 111 + print >>sys.stderr, 'directory does not exist: %r' % outputdir + return 1 return convert_cmap(cmapdir, outputdir, force=force) if __name__ == '__main__': sys.exit(main(sys.argv))