#!/usr/bin/env python
"""
Split a PostgreSQL plain dump file into files for each of its objects.

This is used to compare two SQL database contents: both contents are dumped to a
plain text file, and they are then processed through this script which generates
a file for each database object, which can then be compared using a directory
diff program.
"""

# stdlib imports
import os, re
from os.path import join, exists


# Note: this code is pulled straight out of the xxdiff library.
# Copied here to avoid the dependency.
sec_re = re.compile('^-- (?:Data for )?Name:\s*([^\s;]+);\s*Type:\s*([^;]+);(.*)$', re.M)
com_re = re.compile('--.*$', re.M)
ct_re = re.compile('^CREATE TABLE.*?(\\().*?(\\);)', re.M|re.S)

def parse_dump(dbdump, sort_columns=False):
    """
    Parse a PostgreSQL database dump, extracting all its section into a list of
    (name, type, contents) tuples. The entries are built from the Name and Type
    fields in the header comments. Contents of data chunks for tables are
    returned as type 'DATA', so if you're going to turn this into a map, be
    careful to filter out the 'data' entries or concatenate them to the schema
    (whichever is appropriate for your application).
    """
    # Class to contain info about chunks.
    class Chunk:
        def __init__(self, mo):
            self.mo = mo
            self.name, self.typ = mo.group(1, 2)

    # Parse chunks.
    chunks = map(lambda mo: Chunk(mo), sec_re.finditer(dbdump))
    for c1, c2 in consepairs(chunks):
        c1.contents = dbdump[c1.mo.end():c2.mo.start()]
    chunks[-1].contents = dbdump[chunks[-1].mo.end():]

    # Remove comments in the contents.
    for c in chunks:
        c.descline = c.mo.group(0)
        c.contents = com_re.sub('', c.contents).strip() + '\n'
        c.mo = None # release the match objects

        # Sort columns in CREATE TABLE statements.
        if sort_columns:
            mo = ct_re.match(c.contents)
            if mo:
                pre, post = c.contents[:mo.end(1)], c.contents[mo.start(2):]
                columns = c.contents[mo.end(1):mo.start(2)].strip()
                line_cols = map(lambda x: x.endswith(',') and x or '%s,' % x,
                                map(str.strip, columns.splitlines()))
                line_cols.sort()
                c.contents = (pre + '\n' +
                              ''.join('   %s\n' % x for x in line_cols) +
                              post)

    return list((c.name, c.typ, c.descline + '\n\n' + c.contents)
                for c in chunks)

# Note: this code is pulled straight out of the xxdiff library.
# Copied here to avoid the dependency.
def consepairs(seq):
    """
    Iterate over consecutive pairs.
    """
    it = iter(seq)
    prev = it.next()
    for el in it:
        yield prev, el
        prev = el

def main():
    import optparse
    parser = optparse.OptionParser(__doc__.strip())
    opts, args = parser.parse_args()

    if len(args) not in (1, 2):
        parser.error("You must specify a single dumpfile, "
                     "and optionally an output directory.")
    fn = args[0]
    outdir = args[1] if len(args) == 2 else os.getcwd()

    if not exists(outdir):
        os.makedirs(outdir)

    contents = parse_dump(open(fn).read())
    for name, type, contents in contents:
        fn = '%s.%s' % (name, type.replace(' ', '_').lower())
        print fn
        f = open(join(outdir, fn), 'w')
        f.write(contents)
        f.close()

if __name__ == '__main__':
    main()

