#!/usr/bin/env python
#
#  Copyright (C) 2006  Martin Blais
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""svndropempty [<options>] [<path> ...]

A Subversion dumpfile filter that will drop empty revisions from the stream.

Credits
-------

Some source code taken from Simon Tatham's svndumpfilter2.
"""

__author__ = 'Martin Blais <blais@furius.ca>'
# CREDITS:
#
# "Patrick Dreyer" <pdreyer at icomasoft dot com>:
#    Added and tested the --renumber-revs option.
#
# "Roger Norling" <Roger dot Norling at miun dot se>
#    Submitted a patch to fix a bug in renumbering.

# stdlib imports
import sys, os, re, string, hashlib
from os.path import basename


# Dump-format versions that this filter accepts.
# Note: v3 does not really exist, see this for details:
# http://svn.haxx.se/dev/archive-2004-11/1111.shtml
__supported_versions__ = ('2', '3')

# Maps a dump-format version string to an integer format number; note that
# '3' maps to the same number as '2'.
fmtequiv = {'1': 1, '2': 2, '3': 2}

# Module-level flag, initially off.
format_warning = False


# Note: from Simon Tatham.
class Lump:
    """
    A single lump of RFC822-ish-headers-plus-data read from an SVN dump file.

    Headers and properties are each stored twice: as a list of names that
    remembers the order in which they were first seen, and as a dict that
    holds the current values.

    Attributes:
      hdrlist: header names, in insertion order.
      hdrdict: header name -> header value.
      prop: the serialized properties block ('' when there is none).
      text: the text contents ('' when there is none).
      proplist: property names, in the order they were parsed or set.
      propdict: property name -> value; None marks a deleted property.
    """
    def __init__(self):
        self.hdrlist = []
        self.hdrdict = {}
        self.prop = ""
        self.text = ""
        self.proplist = []
        self.propdict = {}

    def sethdr(self, key, val):
        """
        Set header 'key' to 'val'.  A new header is appended at the end of the
        header order; an existing one keeps its position.
        """
        if key not in self.hdrdict:
            self.hdrlist.append(key)
        self.hdrdict[key] = val

    def delhdr(self, key):
        """
        Delete the header 'key'.  Does nothing if the header is not set.
        """
        if key in self.hdrdict:
            del self.hdrdict[key]
            self.hdrlist.remove(key)

    def propparse(self):
        """
        Parse the properties block held in 'self.prop' and record the result
        in 'self.proplist' and 'self.propdict'.

        The block is a sequence of records terminated by 'PROPS-END'.  A 'K'
        record (name) is followed by a 'V' record (value); a 'D' record (name
        only) is stored with a value of None.

        Raises RuntimeError on an unrecognised record.  A record whose
        declared length does not line up with its terminating newline fails
        an assertion (or raises IndexError if the block is cut short).
        """
        index = 0
        while True:
            marker = self.prop[index:index+2]
            if marker == "K ":
                wantval = True
            elif marker == "D ":
                wantval = False
            elif self.prop[index:index+9] == "PROPS-END":
                break
            else:
                raise RuntimeError("Unrecognised record in props section")

            name, index = self._read_counted(index)
            if wantval:
                assert self.prop[index:index+2] == "V "
                prop, index = self._read_counted(index)
            else:
                prop = None
            self.proplist.append(name)
            self.propdict[name] = prop

    def _read_counted(self, index):
        """
        Read one length-prefixed record starting at 'index' in 'self.prop'.

        The record is a two-character tag, a decimal length and a newline,
        followed by exactly that many characters of payload and a newline.
        Returns the payload and the index just past the record.
        """
        nlpos = self.prop.find("\n", index)
        assert nlpos > 0
        length = int(self.prop[index+2:nlpos])
        # The payload must be followed immediately by a newline.
        assert self.prop[nlpos+1+length] == "\n"
        return self.prop[nlpos+1:nlpos+1+length], nlpos+2+length

    def setprop(self, key, val):
        """
        Set property 'key' to 'val'.  A new property is appended at the end of
        the property order; an existing one keeps its position.
        """
        if key not in self.propdict:
            self.proplist.append(key)
        self.propdict[key] = val

    def delprop(self, key):
        """
        Delete property 'key'.  Does nothing if the property is not set.
        """
        if key in self.propdict:
            del self.propdict[key]
            self.proplist.remove(key)

    def correct_headers(self):
        """
        Rebuild 'self.prop' from the property list and bring the length and
        checksum headers in line with the current contents.

        Note: this reads the module-global 'opts' (set by main()) for its
        'prune_properties' flag; when that flag is on, a lump without any
        property gets no properties block at all.
        """
        # First reconstitute the properties block.
        chunks = []
        if not opts.prune_properties or len(self.proplist) > 0:
            for key in self.proplist:
                val = self.propdict[key]
                if val is None:
                    chunks.append("D %d\n%s\n" % (len(key), key))
                else:
                    chunks.append("K %d\n%s\n" % (len(key), key))
                    chunks.append("V %d\n%s\n" % (len(val), val))
            chunks.append("PROPS-END\n")
        self.prop = "".join(chunks)

        # Now fix up the content length headers.
        if len(self.prop) > 0:
            self.sethdr("Prop-content-length", str(len(self.prop)))
        else:
            self.delhdr("Prop-content-length")

        # A file that is added without being copied keeps its text headers
        # even when its text is empty.
        is_plain_file_add = (
            self.hdrdict.get('Node-action', None) == 'add' and
            self.hdrdict.get('Node-kind', None) == 'file' and
            not self.hdrdict.get('Node-copyfrom-path', None))

        if len(self.text) > 0 or is_plain_file_add:
            self.sethdr("Text-content-length", str(len(self.text)))
            m = hashlib.md5()
            m.update(self.text)
            self.sethdr("Text-content-md5", m.hexdigest())
        else:
            self.delhdr("Text-content-length")
            self.delhdr("Text-content-md5")

        if len(self.prop) > 0 or len(self.text) > 0:
            self.sethdr("Content-length", str(len(self.prop)+len(self.text)))
        else:
            self.delhdr("Content-length")


format_re = re.compile(r'SVN-fs-dump-format-version: (\d+)\s*$')
uuid_re = re.compile(r'UUID: ([0-9a-f\-]+)\s*$')

def read_dump_header(f):
    """
    Match and read a dumpfile's header and return the format version and the
    file's UUID.

    Four lines are consumed from 'f': the format version line, one line that
    is skipped, the UUID line, and one more line that is skipped.

    Returns a tuple (version, uuid, text) where 'version' and 'uuid' are the
    strings captured from their respective lines, and 'text' is the header
    text to copy to the output: each of the two matched lines followed by an
    empty line.

    Raises SystemExit if either the version line or the UUID line does not
    have the expected form.
    """
    version_line = f.readline()
    mo_version = format_re.match(version_line)
    if mo_version is None:
        raise SystemExit("Error: Parsing dump format version: %s" %
                         version_line)
    f.readline()

    uuid_line = f.readline()
    mo_uuid = uuid_re.match(uuid_line)
    if mo_uuid is None:
        raise SystemExit("Error: Parsing dump UUID: %s" % uuid_line)
    f.readline()

    # The matched lines still carry their own newline, so the extra '\n' after
    # each one recreates the empty separator lines.
    text = '%s\n%s\n' % (mo_version.string, mo_uuid.string)
    return mo_version.group(1), mo_uuid.group(1), text


header_re = re.compile(r'([a-zA-Z0-9\-]+): (.*)$')

# Note: from Simon Tatham.
def read_rfc822_headers(f):
    """
    Read a set of RFC822 headers from the given file.

    Empty lines found before the first header are skipped; the empty line that
    follows the headers ends them and is consumed.

    Returns a tuple (lump, lines): a new Lump with the headers set, and the
    list of original header lines that were parsed (newlines included).  If
    the end of the file is reached before the headers are concluded, returns
    (None, []) instead.

    Raises SystemExit if a line does not look like a 'Name: value' header.
    """
    ret = Lump()

    lines = []
    while True:
        s = f.readline()
        if not s:
            return None, [] # end of file

        # Watch for the newline char that ends the headers.
        if s == '\n':
            if len(ret.hdrlist) > 0:
                break # newline after headers ends them
            continue # newline before headers is simply ignored

        lines.append(s)

        mo = header_re.match(s)
        if mo is None:
            raise SystemExit("Error: Parsing header: %s" % s)

        key, val = mo.groups()
        ret.sethdr(key, val)

    return ret, lines

# Note: from Simon Tatham.
def read_lump(f):
    """
    Read a single lump from the given file.

    Note: there is a single empty line that is used to conclude the RFC headers,
    and it is not part of the rest.  Then you have the properties, which are of
    exactly the property length, and right away follows the contents of exactly
    the length of the content length.  Then follows two newline characters and
    then the next lump starts.

    Returns the Lump that was read, with its properties parsed, or None when
    there is nothing left to read.  The lump also gets an 'orig_text'
    attribute: the header lines as read, followed by the properties block and
    the text (the empty line that ends the headers is not included).
    """
    lump, lines = read_rfc822_headers(f)
    if lump is None:
        return None

    # Missing length headers mean that the corresponding section is absent.
    pcl = int(lump.hdrdict.get("Prop-content-length", "0"))
    tcl = int(lump.hdrdict.get("Text-content-length", "0"))
    if pcl > 0:
        lump.prop = f.read(pcl)
        lump.propparse()
    if tcl > 0:
        lump.text = f.read(tcl)

    # The header lines still end with their own newline, so they are simply
    # concatenated; joining them with a line separator would double every
    # line break.
    lump.orig_text = "".join(lines) + lump.prop + lump.text

    return lump


def write_lump(f, lump):
    """
    Write a single lump to the given file.

    The lump's headers are first corrected from its current contents, then
    written in order, followed by an empty line, the properties block and the
    text.  A revision lump is followed by one newline, any other lump by two.
    """
    # Make sure that the lengths are adjusted appropriately.
    lump.correct_headers()
    for key in lump.hdrlist:
        f.write(key + ": " + lump.hdrdict[key] + "\n")
    f.write("\n")

    # Render the payload.
    f.write(lump.prop)
    f.write(lump.text)

    # Add newlines at the end of chunks, for readers.
    f.write('\n')
    if "Revision-number" not in lump.hdrdict:
        f.write('\n')


class LumpReader(object):
    """
    A reader of lumps that supports a bit of lookahead: a lump that was read
    can be pushed back, to be served again by the next call to next().

    It also keeps statistics on the lumps read from the file (lumps served
    again after a pushback are not counted twice).
    """
    def __init__(self, fr):
        # File to read from.
        self.fr = fr

        # Stack of lumps that have been pushed back and that are waiting to
        # be served; the most recently pushed one is served first.
        self.lumpsread = []

        # Statistics on the number of lumps read: total, revision lumps, and
        # all the others.
        self.nbread = 0
        self.nbread_revs = 0
        self.nbread_other = 0

    def next(self):
        """
        Returns the next lump in line, or None when the input is exhausted.

        Note: the debug trace reads the module-global 'opts' set by main().
        """
        if self.lumpsread:
            return self.lumpsread.pop()

        lump = read_lump(self.fr)
        if lump is not None:
            self.nbread += 1
            if 'Revision-number' in lump.hdrdict:
                self.nbread_revs += 1
                if opts.debug:
                    sys.stderr.write("Revision: %s\n" %
                                     lump.hdrdict['Revision-number'])
            else:
                self.nbread_other += 1
                if opts.debug:
                    sys.stderr.write("  Lump\n")

        return lump

    def pushback(self, lump):
        """
        Pushes back one lump on the list of lumps to be read next.
        """
        self.lumpsread.append(lump)

    def iter_rev(self):
        """
        Returns an iterator over the current lumps, up to (and not including)
        the next revision lump.
        """
        return RevLumpReader(self)


class RevLumpReader(object):
    """
    Iterator of lumps that stops when it gets at another revision.

    The lumps are pulled from 'reader', an object that provides next()
    (returning None when exhausted) and pushback().
    """
    def __init__(self, reader):
        self.reader = reader

    def __iter__(self):
        return self

    def next(self):
        """
        Returns the next lump of the current revision.

        Raises StopIteration at the end of the input, or when the next lump
        starts a new revision; in the latter case the revision lump is pushed
        back on the reader so that it is not lost.
        """
        lump = self.reader.next()
        if lump is None:
            raise StopIteration

        if 'Revision-number' in lump.hdrdict:
            self.reader.pushback(lump)
            raise StopIteration

        return lump

    # Same method under the name used by the Python 3 iterator protocol.
    __next__ = next


def adjust_copyfrom(lump, revs, flog, renumbering):
    """
    Adjust the Node-copyfrom-rev value to the last valid revision before it, as
    given in the sorted list 'revs'.

    The problem is that if the source revision is not present in the repository,
    Subversion cannot load it into a repository.  You would get the following
    error message::

       svnadmin: Relative source revision -165 is not available in
                 current repository

    Arguments:
      lump: the lump to adjust, in place; a lump without a Node-copyfrom-rev
        header is left alone.
      revs: the valid output revision numbers, in increasing order.
      flog: file to write the debug trace to.
      renumbering: mapping of original revision numbers to output revision
        numbers; only used when revisions are being renumbered.

    Note: this reads the module-global 'opts' set by main() for its
    'renumber_revs' and 'debug' flags.

    Raises SystemExit if no revision in 'revs' is at or before the source
    revision.
    """
    if 'Node-copyfrom-rev' not in lump.hdrdict:
        return

    fromrev = int(lump.hdrdict['Node-copyfrom-rev'])
    # Roger Norling: lookup what number we saved the original rev number as
    if opts.renumber_revs:
        fromrev = renumbering[fromrev]

    # Search from the most recent revision down for the first one that does
    # not come after the source revision.
    for rev in reversed(revs):
        if rev <= fromrev:
            break
    else:
        raise SystemExit("Error: Fatal error looking for revision '%s' in "
                         "valid revisions" % fromrev)

    # Sanity check.
    assert rev <= fromrev
    if opts.debug:
        flog.write('   Adjusted %s to %s\n' % (fromrev, rev))
    lump.hdrdict['Node-copyfrom-rev'] = str(rev)


def parse_options():
    """
    Build the command-line parser, parse the command line and return the
    resulting options object.

    As a side effect the module-global 'progname' is set to the base name of
    the running script.  Positional arguments are accepted and ignored.  The
    returned options always have 'prune_properties' turned on.
    """
    global progname
    progname = basename(sys.argv[0])

    import optparse
    cmdline = optparse.OptionParser(__doc__.strip())

    # All the options are plain on/off flags; register them in display order.
    flags = (
        ('--quiet', "Do not display filtering statistics."),
        ('--debug', optparse.SUPPRESS_HELP),
        ('--renumber-revs', "Renumber revisions left after filtering."),
    )
    for flag, helptext in flags:
        cmdline.add_option(flag, action='store_true', help=helptext)

    options, _ = cmdline.parse_args()

    # Not exposed on the command line.
    options.prune_properties = True

    return options

def main():
    """
    Main program that just reads the lumps and copies them out.

    The dump is read from stdin and written to stdout.  A revision lump that
    is not followed by any other lump before the next revision (or the end of
    the input) is considered empty and is not written out.  Unless --quiet is
    given, a summary of the dropped revisions is written to stderr.

    Raises SystemExit if the dump file format is not supported.
    """
    global opts
    opts = parse_options()

    # Open in and out files.
    fr = sys.stdin
    fw = sys.stdout
    flog = sys.stderr

    # Read the dumpfile header, and refuse an unsupported format before
    # anything gets written to the output.
    version, _uuid, text = read_dump_header(fr)
    if version not in __supported_versions__:
        # Note: you could update this script easily to support other formats, it
        # will probably be trivial to do so.
        raise SystemExit("Error: dump file in format '%s' not supported." %
                         version)
    fw.write(text)

    # Original numbers of the revisions that were dropped.
    dropped = []

    # Process the dump file.
    reader = LumpReader(fr)
    revs = [] # Valid output revisions (i.e. not dropped).
    renumbering = {} # [Roger Norling]: used for looking up renumbered revs
    nb_written = 0
    while True:
        # Read one revision at a time.
        revlump = reader.next()
        if revlump is None:
            break # At EOF

        orig_revno = revno = int(revlump.hdrdict['Revision-number'])

        if opts.renumber_revs:
            # [Roger Norling]: increase revision number. remember original rev
            # number, but assume the rev will be dropped.
            renumbering[orig_revno] = len(revs)
            revno = len(revs) + 1
            revlump.hdrdict['Revision-number'] = str(revno)

        # Read the first lump after the revision, if there is one.
        reviter = reader.iter_rev()
        try:
            lump = reviter.next()
        except StopIteration:
            # Empty revision.  Report it under its original number: when
            # renumbering, the new number gets reused by the next revision
            # that is kept.
            dropped.append(orig_revno)
            continue

        # The revision contains some lumps: write the revision lump.
        write_lump(fw, revlump)
        revs.append(revno)
        nb_written += 1

        # [Roger Norling]: since the revision wasn't dropped, adjust
        # renumbering.
        if opts.renumber_revs:
            renumbering[orig_revno] = revno

        # Write the first lump, and then all the remaining lumps within the
        # revision.
        adjust_copyfrom(lump, revs, flog, renumbering)
        write_lump(fw, lump)
        nb_written += 1
        for lump in reviter:
            adjust_copyfrom(lump, revs, flog, renumbering)
            write_lump(fw, lump)
            nb_written += 1

    fr.close()
    fw.close()

    # Sanity check.
    assert len(revs) + len(dropped) == reader.nbread_revs
    assert (reader.nbread == nb_written + len(dropped))

    if not opts.quiet:
        # Print summary of dropped nodes.
        flog.write('Dropped %d revisions(s):\n' % len(dropped))
        for revno in dropped:
            flog.write("   '%d'\n" % revno)
        flog.write('\n')


# Script entry point: run the filter only when this file is executed directly.
if __name__ == '__main__':
    main()