#!/usr/bin/env python
"""
Given a root directory, recurse in it and find all the duplicate files, files
that have the same contents, but not necessarily the same filename.
"""

# stdlib imports
import sys, os
from os.path import *
from collections import defaultdict
from filecmp import cmp


def main():
    import optparse
    parser = optparse.OptionParser(__doc__.strip())

    parser.add_option('-i', '--ignore', action='append', default=[],
                      help="Prune files/directories with the given name.")

    parser.add_option('-s', '--same-name-only', action='store_true',
                      help="If set, only find duplicates that have the "
                      "same basename.")

    parser.add_option('-a', '--action', action='store', type='choice', choices=('nothing', 'delete', 'hardlink'),
                      help="Action to carry out: delete: delete longest filename, hardlink: replace with hard links.")

    parser.add_option('-L', '--list-hard-links', action='store_true',
                      help="By default, this ignores hard links; list hard-linked files as if different.")

    opts, args = parser.parse_args()

    if not args:
        parser.error("You must specify directories to search.")
    find_duplicates(args, opts.same_name_only, opts.action, opts.list_hard_links, opts.ignore)


def find_files(args, ignores):
    for ddir in args:
        if ddir in ignores:
            continue
        if isdir(ddir):
            for root, dirs, files in os.walk(ddir):
                for dn in dirs:
                    if dn in ignores:
                        dirs.remove(dn)
                for fn in files:
                    if fn in ignores:
                        continue
                    yield join(root, fn)
        else:
            yield ddir


def find_duplicates(dirs, same_name_only, action, list_hard_links, ignores):
    "Find the duplicate files in the given root directory."

    # A map of size -> filename.
    allfiles = []
    szmap = defaultdict(list)

    # Organize all filenames according to size.
    for fn in find_files(dirs, ignores):
        if not isfile(fn):
            continue
        allfiles.append(fn)
        sz = getsize(fn)
        szmap[sz].append(fn)

    # Compare all sets of files with the same size.
    dups = []
    for sz, files in sorted(szmap.iteritems()):
        if len(files) <= 1:
            continue
        ## print >> sys.stderr, sz, len(files)

        files.sort()
        for i in xrange(0, len(files)-1):
            fn1 = files[i]
            for fn2 in files[i+1:]:
                if same_name_only and basename(fn1) != basename(fn2):
                    continue

                if not list_hard_links:
                    st1 = os.stat(fn1)
                    st2 = os.stat(fn2)
                    if st1.st_ino == st2.st_ino:
                        continue

                if cmp(fn1, fn2, shallow=False):
                    dups.append((fn1, fn2, sz))

    dups.sort()
    for fn1, fn2, sz in dups:
        print sz
        print '   ', fn1
        print '   ', fn2
        print

    if action == 'nothing':
        pass

    elif action == 'delete':
        remlist = []
        for fn1, fn2, sz in dups:
            if len(fn1) == len(fn2):
                print >> sys.stderr, "Warning: Can't decide which to delete:"
                print >> sys.stderr, "  ", fn1
                print >> sys.stderr, "  ", fn2
                continue
            remlist.append(fn1 if len(fn1) > len(fn2) else fn2)

        for fn in remlist:
            print '   ', fn

        print '\nAre you sure?',
        if raw_input().lower() in ('y', 'yes'):
            for fn in remlist:
                print 'Deleting', fn
                os.remove(fn)


    elif action == 'hardlink':
        for fn1, fn2, sz in dups:
            print "Replacing '%s' with a hard link." % fn2
            os.remove(fn2)
            os.link(fn1, fn2)


if __name__ == '__main__':
    main()

