#! /usr/bin/env python

import argparse
import math
import os
import re
import sys

# `fractions.gcd` was deprecated in Python 3.5 and removed in Python 3.9:
# prefer the `math` implementation, and only fall back to the legacy one
# on interpreters that predate `math.gcd`.
try:
    from math import gcd
except ImportError:
    from fractions import gcd

# Command line interface.  The arguments are parsed as soon as the
# module is loaded: `args` is read by the code below.
parser = argparse.ArgumentParser(
    description='''Compare benches generated by 'vcsn score'.''',
    epilog='''Install the `colorama` Python module to get colored output.''')
# One or more score files; one column of scores is displayed per file.
parser.add_argument('file', nargs='+',
                    type=str, default=None,
                    help='Bench file (from vcsn score) to compare')
# By default benches whose valid scores are all equal are not displayed.
parser.add_argument('-a', '--all', action='store_true',
                    help='Report also benches with no differences')
# 'auto' enables colors only when stdout is a terminal.
parser.add_argument('-c', '--color', dest='color', action='store',
                    default='auto',
                    choices=['auto', 'always', 'never'],
                    help='Whether to use colors in the output')
# argparse applies `type` to string defaults too, so `args.only` is a
# compiled pattern even when the option is not given.
parser.add_argument('-O', '--only', metavar='RE',
                    type=re.compile, default='.*',
                    help='Report only benches whose title is matched by RE')
# Percentage of the average score of a bench: scores at least that far
# below (resp. above) the average are highlighted.
parser.add_argument('-t', '--threshold', metavar='PERCENT',
                    type=float, default=10,
                    help='''Highlight good and bad scores with associated
                    threshold.  Defaults to 10%%.''')
# When given, a CSV report is produced instead of the text table.  The
# values '' and '-' denote the standard output.
parser.add_argument('--csv', type=str,
                    help='Create a csv file')
args = parser.parse_args()

# Colors support.  The escape sequences stay empty (i.e., no coloring)
# unless colors are requested and the optional `colorama` module is
# installed.
green = ''
red = ''
std = ''
if args.color == 'always' or (args.color == 'auto' and sys.stdout.isatty()):
    try:
        from colorama import Fore, Style
    except ImportError:
        # colorama is optional: silently fall back to uncolored output.
        # Only a missing module is tolerated; anything else (including
        # KeyboardInterrupt) is no longer swallowed.
        pass
    else:
        green = Fore.GREEN + Style.BRIGHT
        red = Fore.RED + Style.BRIGHT
        std = Style.RESET_ALL

# bench title => file name => {'value': score, 'num': iterations}.
bench = {}
# bench title => number of iterations (e.g., `20` for 20x).
number = {}
# Not used by the code visible in this script.
benc_csv = {}


def lcm(numbers):
    '''Return the least common multiple of the integers in `numbers`.

    `numbers` is any iterable of integers.  The result is 1 when it is
    empty, and 0 as soon as one of the numbers is 0.

    Relies on `math.gcd`, since `fractions.gcd` was removed in
    Python 3.9.
    '''
    res = 1
    for num in numbers:
        divisor = math.gcd(num, res)
        # gcd(0, 0) is 0: the lcm is then 0, do not divide by zero.
        res = (num * res) // divisor if divisor else 0
    return res


def normalize(k):
    '''Normalize the bench title `k`, i.e., fix errors, update APIs etc.

    `k` is the description part of a score line, such as

        a.is_proper()        # a = "", 200000x

    Titles generated by older versions are rewritten to the current
    conventions, so that a given bench has the same title in every
    file.  The rewriting rules are applied in sequence, and some of
    them work on the result of previous ones: their order matters.

    Return the normalized title, formatted as `CODE # SETUP`, with
    CODE padded to 20 characters.

    NOTE: a title without any `#` makes the final formatting raise an
    IndexError, and whatever follows a second `#` is dropped by it.
    '''
    # Separate with ' # ' only.
    k = ' # '.join(map(str.strip, k.split('#', 2)))

    # The right symbol for repeated &.
    k = re.sub(r'a\*\*(\d+) ', r'a & \1', k)
    # Fix: spello.
    k = re.sub(r'de_buijn', 'de_bruijn', k)
    # Fix: extraneous paren.
    k = re.sub(r'ladybird\(21\)\)', 'ladybird(21)', k)
    # Fix: Incorrect use of .format.
    k = re.sub(r'(a.(?:product|shuffle)\(a\) # a = std\(\{\}\).format\(r\))',
               lambda m: m.group(1).replace('{}', '[a-e]?{50})'),
               k)
    # Fix: now use 's' to denote a string, instead of 'a'.
    k = re.sub(r'read\(a\) # a =',
               r'read(s) # s =', k)
    # Now we display the number of repetitions.
    k = re.sub(r'(# a = de_bruijn\(150\))$', r'\1, 1000x', k)
    k = re.sub(r'(# e = "\(\\e\+a\)" \* 500)$', r'\1, 100x', k)
    k = re.sub(
        r'(# r = b\.expression\("\(\\e\+a\)" \* 500\))$', r'\1, 1000x', k)
    # Now, instead of "   on [a-z]  -> Z", ", c = [a-z] -> Z".
    k = re.sub(r' +on (\[.*?\][?*]?) *-> *([BQZ])',
               r', c = \1 -> \2', k)
    k = re.sub(r'a = lal\(a-zA-Z0-9\).ladybird\(18\)',
               r'a = ladybird(18), c = [a-zA-Z0-9] -> B', k)
    # We never worked on Q in score, it was a typo.  And working with
    # B is good enough anyway and more relevant.
    k = re.sub(r'(determinize.*de_bruijn\(\d+\)), c = \[abc\] -> [BQ]',
               r'\1', k)

    # derived_term.
    k = k.replace('derived_term()', 'derived_term("derivation")')
    k = k.replace('linear()', 'derived_term("expansion")')

    # For a while we displayed 'a.sort() # a = std([a-e]?{600})' but
    # were actually running 'a.shortest(5)'.
    k = re.sub(r'a.sort\(\) (# a = std\(\[a-e\]\?\{600\}\))',
               r'a.shortest(5) \1', k)
    # and we were not reporting the context, although it's not B.
    k = re.sub(r'(a\.shortest\(5\) # a = std\(\[a-e\]\?\{600\}\))$',
               r'\1, c = [a-e] -> Z',
               k)

    # The syntax of contexts has changed.
    k = re.sub(r'lal_char\(abc\)(_|, )b', '[abc] -> B', k)

    # Renamed algorithms and types.
    k = k.replace('ratexp', 'expression')

    k = k.replace('a.num_sccs', 'a.scc')

    k = k.replace('a.accessible ', 'a.accessible() ')

    # Now we display the context.
    k = re.sub(r'(a.minimize\("(moore|signature)"\) # a = std\(.*?\))$',
               r'\1, c = [a-k] -> B',
               k)

    # This also applies to the titles fixed by the `.format` rule above.
    k = k.replace('product', 'conjunction')

    # Now we display the arguments of `expression`: first the
    # identities, then "naive".
    k = k.replace('a.expression()', 'a.expression("associative")')
    k = re.sub(r'(a\.expression\("\w+")\)',
               r'\1, "naive")',
               k)

    # has_twins_property is benched on an expression using the
    # associative identities.
    k = re.sub(r'(a.has_twins_property.* # a = std\([^,]*?)\)',
               r'\1, "associative")', k)

    # has_twins_property was run on Zmin, but with Q displayed.
    k = re.sub(r'(a.has_twins_property.* # .*)Q,',
               r'\1Zmin,', k)

    # is_ambiguous and is_cycle_ambiguous run on Z, but with B
    # displayed.
    k = re.sub(r'(a.is_(cycle_)?ambiguous.* # .*)B,',
               r'\1Z,', k)

    # ZMIN -> Zmin.
    k = re.sub(r'([NRZ])MIN', r'\1min', k)

    # Nicer notation for tuples.
    k = k.replace("'(a, x)'{2000}'(b, y)'", "(a|x){2000}(b|y)")

    # Useless parens.
    k = k.replace("(['(a,x)'-'(b,y)']*){600}", "['(a,x)'-'(b,y)']*{600}")
    k = k.replace("(['(a,x)'-'(b,y)']{1000})*", "['(a,x)'-'(b,y)']{1000}*")

    # Use of `;` instead of `,`.
    k = re.sub(r'(a\.compose\(a2\).*);', r'\1,', k)

    # It makes more sense to run these algos on Nmin, because we can
    # use the fastest implementations without having to check for
    # preconditions.  In the past, we used the fastest implementations
    # blindly.
    k = re.sub(r'(lightest_automaton.* ->) Zmin', r'\1 Nmin', k)

    # For a while, we thought we were working on a different
    # expression, but it was reusing the previous one.  Easy to see
    # since the context is wrong: it does not go up to `z`.  And use
    # Nmin now.
    k = k.replace('a.lightest() # a = std([a-z]?{300}), c = [a-e] -> Z',
                  'a.lightest() # a = std([a-e]?{150}), c = [a-e] -> Nmin')

    # Final format: the code padded to 20 characters, then the setup.
    k = '{:20s} # {}'.format(*map(str.strip, k.split('#', 2)))
    return k


def read_file(fn):
    '''Read one `vcsn score` generated file named `fn`.  Store in `bench`.

    Each line looks like:

        0.12s: a.is_proper()        # a = "", 200000x

    So split in `v` (0.12s) and `k` for the rest, normalized.  The
    number of iterations (`200000x`) is removed from `k` and stored
    aside; it defaults to 1 when not specified.  Scores that do not
    end with `s` (e.g., `N/A`, `FAIL`) are stored as strings.

    Blank lines are skipped.  A non-blank line without `:` raises a
    ValueError.
    '''
    # The trailing number of iterations, e.g. `, 200000x`.
    iterations = re.compile(', ([0-9]+)x')
    with open(fn) as f:
        for line in f:
            # A blank line carries no bench: skip it rather than
            # failing to unpack it below.
            if not line.strip():
                continue
            v, k = map(str.strip, line.split(':', 1))
            # Get rid of "s", we know the unit.  And make it a float.
            # `endswith` is safe even when the score is empty.
            if v.endswith('s'):
                v = float(v[:-1])
            # Fix errors in algo descriptions.
            k = normalize(k)
            # Number of iterations of the test.
            num = iterations.search(k)
            num = int(num.group(1)) if num else 1
            k = iterations.sub('', k)
            bench.setdefault(k, {})[fn] = {'value': v, 'num': num}


def read_files(files):
    '''Load all the score `files` into `bench`, then bring the scores
    of each bench to a common number of iterations, saved in `number`.

    E.g., if some test was run 2x and then 5x, both results are
    converted to 10x.  Scores that are `N/A` or `FAIL` are left alone.
    '''
    for fn in files:
        read_file(fn)

    invalid = ['N/A', 'FAIL']
    for title, runs in bench.items():
        # The iteration counts of the runs which do have a score.
        counts = [run['num']
                  for run in runs.values() if run['value'] not in invalid]
        number[title] = lcm(counts)
        # Scale each valid score up to the common number of iterations.
        for run in runs.values():
            if run['value'] in invalid:
                continue
            run['value'] *= number[title] // run['num']
            run['num'] = number[title]


def csv(keys, fn):
    '''Save the benches about `keys` in CSV format in file named `fn`.

    If `fn` is '' or '-', write on the standard output instead.

    The first row holds the bench titles (always quoted, with inner
    double quotes doubled) after a leading blank cell.  Then comes one
    row per score file: its base name, followed by its score for each
    bench, or an empty cell when the file has no such bench.
    '''
    def quote(field):
        # CSV quoting: wrap in double quotes, and double the inner ones.
        return '"' + field.replace('"', '""') + '"'

    def dump(out):
        out.write(' ,' + ','.join(quote(k) for k in keys) + '\n')
        for f in args.file:
            cells = []
            for k in keys:
                entry = bench.get(k, {}).get(f)
                # An entry is a dict: only its score goes in the cell.
                cells.append('' if entry is None else str(entry['value']))
            out.write(os.path.basename(f) + ',' + ','.join(cells) + '\n')

    if fn in ['', '-']:
        dump(sys.stdout)
    else:
        # 'w', not 'wa': Python 3 rejects modes that mix write and
        # append.  The context manager closes the file.
        with open(fn, 'w') as out:
            dump(out)


def text(keys):
    '''Print the table of the scores of the benches `keys`.

    There is one line per bench, with one score per file of
    `args.file` (`N/A` when the file has no such bench), followed by
    the title of the bench and its number of iterations when it is
    not 1.  Scores at least `args.threshold` percent below
    (resp. above) the average of the valid scores are displayed in
    green (resp. red).

    Unless `args.all`, benches whose valid scores are all equal are
    not reported.
    '''
    invalid = ['FAIL', 'N/A']
    # For each bench-case, compare all the recorded scores.
    for k in keys:
        # All the benches.
        bs = [bench[k].get(f, {'value': 'N/A', 'num': 0})
              for f in args.file]
        # All the valid benches.
        bfs = [b['value'] for b in bs if b['value'] not in invalid]

        # If all the valid scores are the same, there is nothing to
        # say.
        if not args.all and len(set(bfs)) == 1:
            continue

        # Average of the valid scores, and the distance from it beyond
        # which a score is highlighted.  There might be no valid score
        # at all (only N/A and FAIL): do not divide by zero then, they
        # are not needed anyway.
        avg = margin = None
        if bfs:
            avg = sum(bfs) / len(bfs)
            margin = args.threshold / 100 * avg

        for b in bs:
            score = b['value']
            if score in invalid:
                print('{:>5}'.format(score), end=' ')
                continue
            color = ''
            if score <= avg - margin:
                color = green
            elif margin + avg <= score:
                color = red
            print(color + '{:5.2f}'.format(score) + std, end=' ')

        title = k
        if number[k] != 1:
            title += ', {}x'.format(number[k])
        print(title)


# Main.

# Load all the score files given on the command line.
read_files(args.file)

# Keep only the benches selected by --only, sorted by title.
keys = [title for title in sorted(bench) if args.only.search(title)]

# Display the scores: as a text table, unless CSV was requested.
if args.csv is None:
    text(keys)
else:
    csv(keys, args.csv)
