#!/usr/bin/python
"""
find_duplicate_code -- Find similar code fragments
"""
##
## Usage:
##     find_duplicate_code [OPTION]... PATHNAME...
## 
## Options:
##     --abstract-scalars   unify numbers and strings in input text (default: no)
##     --abstract-symbols   unify identifier names in input text (default: no)
##     --amalgam            build a big file from all known source files to
##                          compare each file with each other file
##     -t N, --treshold=N   set the min. number of duplicate lines (default: 4)
##     -o F, --output=F     write log to file F instead of stdout
##     -n, --dry-run        do not actually compare, only find and read files
##     --help, --version    print help or version information
## 
## Example: Find duplicate code in known source files
##
##      $ find_duplicate_code /path/to/sources
##
## Example: Find duplicates in all files of the working directory
## 
##      $ find_duplicate_code *
## 
## Example: Find duplicate code lines in C++-files
##
##      $ find /path/to/c++files -type f -regex '^.+\(cpp\|h\)$' | \
##          xargs find_duplicate_code --treshold=5
##
## Example: Output
## 
## Here find_duplicate_code has examined the file libutils.sh. Three "chunks"
## (runs of consecutive code lines) have been found that are duplicated at
## least once: chunks 1 and 3 are duplicated once, chunk 2 is duplicated
## twice. As you can see the output is structured and can be processed
## further; a minimal parsing sketch follows the example output below.
## 
##   libutils.sh: 1180 line(s)
##     chunk 1, 215-218 (4)
##       215:  local tpath="${path%/}"
##       216:  while [ "${tpath}" != "${path}" ]; do
##       217:  tpath="${path}"; path="${tpath%/}"
##       218:  done
##         duplicated at lines 231-234 (4)
##     chunk 2, 224-226 (3)
##       224:  {        #
##       225:  # @param PATH
##       226:  #
##         duplicated at lines 248-250 (3)
##         duplicated at lines 259-261 (3)
##     chunk 3, 248-250 (3)
##       248:  {        #
##       249:  # @param PATH
##       250:  #
##         duplicated at lines 259-261 (3)
## 
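## Because the output is line-oriented it can be post-processed with a few
## lines of Python. A minimal sketch (not part of this script; it assumes
## exactly the "chunk"/"duplicated at lines" format shown above, and the
## name parse_report.py is made up):
##
##      import re, sys
##      chunk_re = re.compile(r'^\s+chunk (\d+), (\d+)-(\d+) \((\d+)\)')
##      dup_re   = re.compile(r'^\s+duplicated at lines (\d+)-(\d+) \((\d+)\)')
##      for line in sys.stdin:
##          m = chunk_re.match(line)
##          if m:
##              print "chunk %s spans lines %s-%s" % m.group(1, 2, 3)
##          m = dup_re.match(line)
##          if m:
##              print "  copy at lines %s-%s" % m.group(1, 2)
##
##      $ find_duplicate_code libutils.sh | python parse_report.py
## 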
## Notes:
## 
## The comparison algorithm is line-based and is described in section
## "ALGORITHM" below.
## 
## This version of the script does not yet implement "--amalgam" and
## "--abstract-symbols".
##
#########################
## $Author: Andreas Spindler <info@visualco.de>$
## $Created: 2011-08-11 16:54:03$
## $Maintained at: http://www.visualco.de$

import sys, os, locale, re, string, time

VERSION = '0.3'
MIN_PYTHON_VERSION = 2.4                        # minimum Python version required

opt_abstract_scalars = False
opt_abstract_symbols = False
opt_treshold = 4
opt_similarity = 0.95         # how similar two blocks must be to count as duplicates (currently unused)
opt_verbose = 0
opt_dry_run = 0
opt_amalgam = 0
opt_known_globs = ('*.[cChH]', '*.cs', '*.c[px][px]', 
                   '*.java', '*.js',
                   '*.p[lym]', 
                   '*.txt',
                   '*.sh', '*.el', '*.inc',
                   '*.vbs'
                   )

########################################################################
##
## DOMAIN-LEVEL TYPES
##
########################################################################

class Document:
    """Represents a source file to be compared with others."""
    def __init__(self, f):
        if f == '-' or f is None:
            self.pathname = f
            self.dirname = None
        else:
            assert isinstance(f, str)
            f = os.path.abspath(f)
            if os.path.isdir(f):
                raise Exception(f + " is a directory")
            if not os.path.exists(f):
                raise Exception(f + " not found")
            self.pathname = os.path.realpath(f) # absolute, real (existing) pathname
            self.dirname = os.path.dirname(self.pathname) # absolute, real directory name
        self.pieces = None                     # per input line: list of string pieces
        self.lines = []                        # pieces joined into abstract lines
        self.tokens = []                       # lines -> token stream

    def Slurp(self):
        """Read all input lines."""
        if self.pathname == '-':
            self.slurped = [ line.strip() for line in sys.stdin.readlines() ]
        else:
            if not os.path.exists(self.pathname):
                raise Exception(self.pathname + " cannot be opened for reading")
            self.slurped = [ line.strip() for line in open(self.pathname, 'r').readlines() ]

    def Parse(self):
        """Tokenize  input lines. Split at spaces, preserving quoted
        strings. When ABSTRACT_SCALARS is true insert abstractions for scalars
        too: NUMBER(<value>) and STRING(<value>)"""
        # Note that neither module `shlex' nor `csv' keep quoted characters,
        # hence the below regular expression. It does not handle \" properly
        # (e.g. test"\"file" or \"test"). With backreferences it might by
        # faster.
        self.pieces = []
        self.lines = []
        for l in self.slurped:
            p = []
            if len(l):
                fragments = [ s for s in re.split("( |\\\".*?\\\"|'.*?')", l) if s.strip() ]
                words = [ ]
                for s in fragments:
                    # Fragments are the line split at spaces and quoted strings.
                    # TODO: Split each fragment `s' further at whitespace to
                    # get a proper word list.
                    words.append(s)
                for w in words:
                    if not len(w):
                        continue
                    if len(w) == 1 and w[0] == ' ':
                        continue
                    if opt_abstract_scalars:
                        if w[0] == '"' or w[0] == "'":
                            w = 'STRING'       # unify string literals
                        elif re.match(r'^\d+(\.\d+)?$', w):
                            w = 'NUMBER'       # unify purely numeric pieces
                    if opt_abstract_symbols:
                        if getattr(self, 'language', None):
                            pass # TODO: from the word list change each name
                                 # that is not a keyword to "identifier". It is
                                 # therefore necessary to know the language of
                                 # the input text.
                    p.append(w)
            self.pieces.append(p)
            self.lines.append(string.join(p,''))
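
        # Illustration (not executed): given the input line
        #
        #     x = foo("hello") + 42
        #
        # `Parse' stores the abstract line 'x=foo("hello")+42'. With
        # `--abstract-scalars' the literals are unified, giving
        # 'x=foo(STRING)+NUMBER', so two lines that differ only in their
        # scalar values compare as equal.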

class DocumentSet:
    """Represents all unique input files."""
    def __init__(self):
        self.inputfiles = []
        self.documents = []                         # list of `Document' objects
        self.outfile = '-'                          # where to write the result
        self.home = os.environ.get('HOME')
        if self.home is not None:
            self.config_filename = os.path.join(self.home, '.config-file')
    def SetInput(self, f):
        """Find all concrete filenames in F.  F can be a list, tuple, directory-name or filename."""
        if isinstance(f, list) or isinstance(f, tuple):  # flatten lists/tuples
            for i in f:
                self.SetInput(i)
        else:
            if isinstance(f, str):
                if os.path.isdir(f):
                    # Recursively collect known source files (as defined by
                    # `opt_known_globs').
                    if opt_verbose:
                        print os.path.abspath(f) + ": found directory"
                        sys.stdout.flush()
                    import glob      # http://docs.python.org/library/glob.html
                    # Pick up matching files once per directory ...
                    self.SetInput([ glob.glob(os.path.join(f, pattern))
                                    for pattern in opt_known_globs ])
                    # ... then recurse into subdirectories.
                    for x in os.listdir(f):
                        x = os.path.join(f, x)
                        if os.path.isdir(x):
                            self.SetInput(x)
                else:                                 # `f' is not a directory.
                    if opt_verbose:
                        print os.path.abspath(f) + ": found source file"
                        sys.stdout.flush()
                    self.inputfiles.append(f)
            else:                                  # assume file object (stdin)
                self.inputfiles.append('-')

    def SetOutput(self, f):
        if isinstance(f, str):
            self.outfile = os.path.abspath(f)
        else:                                      # assume file object or None
            self.outfile = '-'

    def CompileDocuments(self):
        """Create `Document' per input files."""
        self.documents = []
        for f in self.inputfiles:
            self.documents.append(Document(f))
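
# Typical use of the classes above (an illustrative sketch of what `main()'
# further below does; the file names are made up):
#
#     ds = DocumentSet()
#     ds.SetInput(['/path/to/sources', 'util.c'])   # directories are searched
#     ds.CompileDocuments()                         # one `Document' per file
#     for d in ds.documents:
#         d.Slurp()                                 # read the raw lines
#         d.Parse()                                 # build abstract lines
#         find_duplicate_code_ranges(d)             # print duplicate chunks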

########################################################################
##
## UTILITY FUNCTIONS
##
########################################################################

def symbolize(s):
    """Drop non-symbol characters and convert to lowercase."""
    return re.sub(r'(?u)[^\w\-_]', '', s).lower()
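
# For example, non-symbol characters are dropped and case is folded:
#     symbolize('Hello, World!')  ->  'helloworld'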

def is_array(obj):
    """Return True if OBJ is list or tuple type."""
    return isinstance(obj, list) or isinstance(obj, tuple)

def strip_list(s):
    """Return list with empty items from start and end of list removed."""
    for i in range(len(s)):                                   # strip from head
        if s[i]: break
    else:                                # `for' completed: all items are empty
        return []
    s = s[i:]
    for i in range(len(s) - 1, -1, -1):                       # strip from tail
        if s[i]: break
    else:
        return []
    return s[:i+1]
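
# For example:
#     strip_list(['', '', 'a', 'b', ''])  ->  ['a', 'b']
#     strip_list(['', ''])                ->  []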

def uniqify(l):
    """Remove duplicates from a list."""
    return list(set(l))

def join_string_list(lines1, lines2):
    """
    Append list or tuple of strings LINES2 to list LINES1. Join the
    last non-blank item in 'lines1' with the first non-blank item in
    LINES2 into a single string.
    """
    assert is_array(lines1)
    assert is_array(lines2)
    lines1 = strip_list(lines1)
    lines2 = strip_list(lines2)
    if not lines1 or not lines2:
        return list(lines1) + list(lines2)
    result = list(lines1[:-1])
    result.append(lines1[-1] + lines2[0])
    result += list(lines2[1:])
    return result
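
# For example:
#     join_string_list(['a', 'b'], ['c', 'd'])  ->  ['a', 'bc', 'd']
#     join_string_list(['a', ''], ['', 'b'])    ->  ['ab']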

def time_string(t):
    """Convert seconds since the Epoch to formatted local time
    string."""
    t = time.localtime(t)
    s = time.strftime('%H:%M:%S',t)
    if time.daylight and t.tm_isdst > 0:
        result = s + ' ' + time.tzname[1]
    else:
        result = s + ' ' + time.tzname[0]
    # Attempt to decode the local time string using the locale's encoding.
    try:
        result = result.decode(locale.getdefaultlocale()[1])
    except Exception:
        pass
    return result

def date_string(t):
    """Convert seconds since the Epoch to formatted local date
    string."""
    t = time.localtime(t)
    return time.strftime('%Y-%m-%d',t)

def file_exists(fname, dirname):
    """True if file FNAME resides inside dirname."""
    assert os.path.isfile(fname)
    # Empty dirname (not to be confused with None) is the current
    # dirname.
    if dirname == '':
        dirname = os.getcwd()
    else:
        assert os.path.isdir(dirname)
        dirname = os.path.realpath(dirname)
    fname = os.path.realpath(fname)
    # `commonprefix' compares plain strings, so terminate the directory name
    # with a separator to avoid false positives such as /foo/barbaz being
    # reported as inside /foo/bar.
    dirname = dirname.rstrip(os.sep) + os.sep
    return os.path.commonprefix((dirname, fname)) == dirname

########################################################################
##
## ALGORITHM
##
########################################################################

def find_duplicate_code_ranges(D):
    """Find duplicate lines in a `Document' D. Prints each duplicated chunk
and returns a list of (chunk_start, chunk_end, dup_start, dup_end, size)
tuples, or `None' if no duplicates were found."""
    #
    # Algorithm:
    #
    #     for i = first to last line
    #         a = line indexed by i
    #         if a not empty
    #             k = i
    #             for j = i + 1 to last line
    #                 b = line indexed by j
    #                 if b not empty
    #                     if a == b
    #                         push original text indexed by k to c
    #                         k = k + 1
    #                         a = line indexed by k
    #                     else
    #                         if size of c >= threshold
    #                             print information               
    #                             k = i (reset to first line to be matched)
    #                             clear c
    #
    # Notes:
    #
    # Possibly this algorithm has been found earlier; I don't know. It
    # reads input files line by line. For an exact analysis, however,
    # I think it is more efficient to compare token streams. The
    # related algorithm then:
    # 
    # - In functions, find statements (if, while, for etc.)
    # 
    # - Ignore local variables and local typedefs
    # 
    # - Let each statement form the root of an abstract syntax tree
    # 
    # - When comparing functions compare the set of AST's per function
    #
    # This algorithm is more efficient, since it produces fewer
    # "false positives" than comparing line-by-line. Minimizing
    # false positives becomes relevant when comparing millions of
    # code lines.
    #
    assert opt_treshold > 1
    R = []
    i = j = 0
    end = len(D.lines)
    chunk_num = 0
    while i < end:                                           # from top to down
        a = D.lines[i]
        chunk_size = 0
        #print "% 5u: *** '%s'" % (i + 1, D.lines[i])
        if len(a):
            k = i
            j = i + 1
            c = []
            count = 0                                                 
            while j < end:
                b = D.lines[j]
                # if chunk_size:
                #     print "j=%d  k=%d  b='%s'   a = '%s'" %(j, k, b, a)
                if not len(b):                             # ignore empty lines
                    count += 1
                    j += 1
                elif b == a:
                    c.append(D.slurped[k])
                    count += 1
                    k += 1                      # advance to next line to match
                    a = D.lines[k]
                    j += 1
                else:                       # some non-empty line did not match
                    if len(c) >= opt_treshold:
                        if not chunk_size:
                            chunk_size = len(c)
                            chunk_num += 1
                            print "    chunk %u, %u-%u (%u)" % (chunk_num, i + 1, k, chunk_size)
                            for l in range(i, k):
                                print "% 9u:  %s" % (l + 1, c[l - i])
                        start = j - count + 1
                        print "        duplicated at lines %u-%u (%u)" % (start, j, count)
                        R.append((i + 1, k, start, j, count))
                    else:
                        j += 1
                    ## Back to first line that must match.
                    k = i
                    a = D.lines[k]
                    c = []
                    count = 0
            ## Jump over the reported chunk, or go to the next line.
            i += max(1, chunk_size)
            if chunk_size:
                assert chunk_size >= opt_treshold
                # print "    continuing at line %u"
        else:
            i += 1
    if len(R):
        return R
    return None
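
# With the example shown in the file header the returned list would contain,
# among others, the tuple (215, 218, 231, 234, 4): the chunk at lines 215-218
# is repeated at lines 231-234.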

########################################################################
##
## MAIN CODE
##
########################################################################

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

Ds = DocumentSet()

def main(cmd, opts, rawfilenames):
    """
    Execute CMD with command-line options and arguments. OPTS and
    RAWFILENAMES conform to getopt values.
    """
    # Locate the executable and configuration files directory.
    global Ds
    if not os.path.exists(cmd):
        raise Exception("Non-existing command '%s'" % cmd)

    # Application-related options and input files. Throws unless a file exists.
    for k, v in opts:
        if k in ('-o', '--output'):
            Ds.SetOutput(v)
            if isinstance(v, str):
                sys.stdout = open(Ds.outfile, 'w')  # redirect the log to file F
            else:
                sys.stdout = v                      # bind stdout to file object
    rawfilenames = uniqify(rawfilenames)
    Ds.SetInput(rawfilenames)
    for v in Ds.inputfiles:
        if not isinstance(v, str):
            sys.stdin = v                           # bind stdin to file object
    Ds.CompileDocuments()

    # Do the work:
    #     (1) slurp input files, create `Document' objects
    #     (2) split lines at words into strings
    #     (3) remove comments, optimize lines
    #     (4) find duplicates
    if opt_verbose:
        t = time.time()
        print time_string(t), date_string(t)
        print 'input paths = %s' % (string.join(Ds.inputfiles))
        print 'output log = %s' % (Ds.outfile)
    sys.stdout.flush()

    # Step 1-2
    for D in Ds.documents:
        print "%s: reading" % (D.pathname)
        sys.stdout.flush()
        D.Slurp() 
        D.Parse()

    # Step 3
    for D in Ds.documents:
        pass # TODO: when a line contains no [A-Za-z0-9"'] assume it contains
             # operators/gibberish/braces. Then concatenate it with the
             # previous line and keep this line empty (so that the indices of
             # `D.slurped' and `D.lines' stay identical).

    # Step 4
    for D in Ds.documents:
        print '%s: %u line(s)' % (D.pathname, len(D.lines))
        sys.stdout.flush()
        if not opt_dry_run:
            find_duplicate_code_ranges(D)
            sys.stdout.flush()
    return 0

if __name__ == '__main__':
    # Parse command-line options, then process trivial options, then `main()'.
    import getopt
    R = 0
    if float(sys.version[:3]) < MIN_PYTHON_VERSION:
        sys.stderr.write('FAILED: Python %s or better required\n' % MIN_PYTHON_VERSION)
        sys.exit(1)
    else:
        stdin, stdout = sys.stdin, sys.stdout
        try:
            try:
                cmd = sys.argv[0]
                opts, rest = getopt.getopt(sys.argv[1:],
                                           'ht:nvo:',
                                           ['help', 'dry-run', 'verbose', 'version',
                                            'abstract-scalars', 'abstract-symbols', 'amalgam',
                                            'treshold=',
                                            'output='])
            except getopt.GetoptError, msg:
                raise Usage(msg)
            for k, v in opts:
                if k in ('--help', '-h'):
                    raise Usage("No man-page available")
                elif k == '--version':
                    ## print(cmd + ' ' + VERSION)
                    print(VERSION)
                    sys.exit(0)
                elif k in ('--dry-run', '-n'):
                    opt_dry_run = 1
                elif k in ('--verbose', '-v'):
                    opt_verbose = 1
                elif k == '--amalgam':
                    opt_amalgam = 1
                elif k == '--abstract-scalars':
                    opt_abstract_scalars = True
                elif k == '--abstract-symbols':
                    opt_abstract_symbols = True
                elif k in ('--treshold', '-t'):
                    opt_treshold = int(v)
            if len(rest) == 0:
                raise Usage('Missing file argument(s)')
            if opt_abstract_symbols:
                raise Usage('Sorry, "--abstract-symbols" not yet implemented in this version')
            if opt_amalgam:
                raise Usage('Sorry, "--amalgam" not yet implemented in this version')
            stdout.flush()
            try: R = main(cmd, opts, rest)
            except KeyboardInterrupt: R = 1
        except Usage, err:
            print >>sys.stderr, """
%(msg)s
Usage:
    %(cmd)s [OPTION]... PATHNAME...

Options:
    --abstract-scalars   unify numbers and strings in input text (default: no)
    -t N, --treshold=N   set the min. number of duplicate lines (default: 4)
    -o F, --output=F     write log to file F instead of stdout
    -n, --dry-run        do not actually compare, only find and read files
    -v, --verbose        print progress information
    --help, --version    print help or version information

Description:

Duplicate code lines are only found per file unless the `--amalgam' option is
specified. Each PATHNAME can be a directory or file name. For a directory all
known source files in it are searched recursively.

Known source files:
    %(globs)s

Examples:

Find duplicate code in all files of the current directory
    $ find_duplicate_code *

Find duplicate code in all known source files
    $ find_duplicate_code /path/to/sources

Analyze C++-files
    $ find /path/to/c++files -type f -regex '^.+\(cpp\|h\)$' | \\
          xargs find_duplicate_code --treshold=5

""" % {"cmd": cmd,
       "msg": err.msg,
       "globs": opt_known_globs}
            R = 2
        finally:
            sys.stdin, sys.stdout = stdin, stdout
        if R == 0:
            print "OK(%d)" % (R)
        else:
            print "FAILED(%d)" % (R)
        sys.exit(R)

### find_duplicate_code ends here