#!/usr/local/bin/python
#
# Copyright (c) 2003 Neil Blakey-Milner
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#

import cgi
import re
import sys

#sibling imports
import agentparsing
import config
import excludes
import geographic
import graphing
import refsearch
import util

# One access-log entry: client ip, a literal "-", the auth user, the
# bracketed date, the quoted request line, the numeric status code, the
# response size (digits or "-"), the quoted referer and the quoted user agent.
# Raw strings keep the regex escapes (\[ \] \d) out of Python's own string
# escape processing; the resulting pattern text is unchanged.
accessPattern = re.compile(
    r'^(?P<ip>[^ ]*) - (?P<auth>[^ ]+) \[(?P<date>[^\]]*)\] '
    r'"(?P<url>.+)" (?P<code>\d+) (?P<size>\d+|-) '
    r'"(?P<referer>.*)" "(?P<browser>.*)"$')
# Pulls the requested URI out of a request line; only GET requests match.
command = re.compile(r'GET (?P<uri>[^ ]+) HTTP.*')

def logline(line):
    """Create a dictionary of useful information based on a log entry.

    line is one raw access-log line; it is matched against accessPattern.

    Returns a dictionary with these string values:
      "ip", "date", "referer", "code", "size" -- the corresponding fields,
          unconverted ("size" may be "-").
      "url"   -- the whole quoted request line.
      "uri"   -- the URI extracted from a GET request line by the command
                 pattern, or None if the request line does not match it.
      "agent" -- the user-agent field.

    Raises ValueError if the line does not match accessPattern; the line
    is echoed to standard output first.
    """
    match = accessPattern.match(line)
    if match is None:
        # Keep echoing the entry to stdout as before, and also carry it in
        # the exception so the traceback identifies the offending line.
        print(line)
        raise ValueError("unparsable log entry: %r" % (line,))

    # groupdict() builds a new dictionary on every call, so fetch it once.
    fields = match.groupdict()
    request = fields["url"]

    uri = None
    uriMatch = command.search(request)
    if uriMatch:
        uri = uriMatch.group("uri")

    return {
        "ip": fields["ip"],
        "date": fields["date"],  # left as the raw text between the brackets
        "url": request,
        "uri": uri,
        "referer": fields["referer"],
        "agent": fields["browser"],
        "code": fields["code"],
        "size": fields["size"],
    }

def getURL(aLine):
    """Return the "url" entry (the full request line) of a parsed log entry."""
    requestLine = aLine["url"]
    return requestLine

def getReferer(aLine):
    """Return the "referer" entry of a parsed log entry."""
    refererField = aLine["referer"]
    return refererField

def isSearch(aReferer):
    """Return 1 if aReferer is listed in config.SEARCHENGINES, else 0."""
    if aReferer not in config.SEARCHENGINES:
        return 0
    return 1

def isPage(aLine):
    """Determine if a log entry is an HTML page, qualified for page
    hits.

    Returns 0 when the request line matches config.HTMLEXCLUDEPATTERNS;
    otherwise returns whatever util.any_of reports for
    config.HTMLPATTERNS.
    """
    request = getURL(aLine)
    # The exclusion list takes precedence over the inclusion list.
    excluded = util.any_of(request, config.HTMLEXCLUDEPATTERNS)
    if not excluded:
        return util.any_of(request, config.HTMLPATTERNS)
    return 0

def isDownload(aLine):
    """Determine if a log entry is a download.

    Returns 0 when the request line matches
    config.DOWNLOADEXCLUDEPATTERNS; otherwise returns whatever
    util.any_of reports for the single pattern '/files/'.
    """
    request = getURL(aLine)
    excluded = util.any_of(request, config.DOWNLOADEXCLUDEPATTERNS)
    if not excluded:
        return util.any_of(request, ('/files/',))
    return 0

def parseLog(stream):
    """Gather all the necessary information from a log file.

    stream is iterated line by line and every line is parsed with
    logline(), so an unparsable line raises ValueError.

    Returns a dictionary with these keys:
      hits, transferred, saved   -- page-hit count, summed response sizes,
                                    and 16000 per 304 response.
      refbysite, refbypage       -- referral counts per simplified referer
                                    and per full referer.
      searches                   -- result of refsearch.getSearches().
      searchesbyengine           -- search referral counts per engine.
      searchreferer              -- {engine: [full referers]}.
      visitors                   -- number of distinct (ip, agent) pairs.
      browsers, unknownbrowsers, mozbrowsers, os, unknownos
                                 -- results of agentparsing.buildDictionary().
      countries                  -- result of geographic.buildDictionary().
      downloadurls, downloads    -- per-URI download counts and their total.
      urls, entry_pages          -- per-URI page hits and entry-page hits.
      visitsbybot                -- hit counts per bot name.
      ips                        -- hit counts per ip, relabelled through
                                    config.IP_TO_NAME where listed.
    """
    hitCount = 0
    bytesTransferred = 0
    bytesSaved = 0
    downloadTotal = 0
    downloadCounts = {}
    pageCounts = {}
    refererURLs = {}
    searchURLs = {}
    ipCounts = {}
    botCounts = {}
    entryCounts = {}
    visitorPairs = {}

    for rawline in stream:
        entry = logline(rawline)

        if (excludes.excludeFailures(entry)
                or excludes.excludedIP(entry)
                or excludes.excludedURL(entry)
                or config.EXCLUDEDBYFUNC(entry)):
            continue

        # Bots are tallied by name and kept out of every other statistic.
        if excludes.excludedBot(entry):
            botName = agentparsing.getBotName(entry["agent"])
            botCounts[botName] = botCounts.get(botName, 0) + 1
            continue

        uri = entry["uri"]
        if not isPage(entry):
            # Non-page requests only matter when they are downloads.
            if uri and isDownload(entry):
                downloadTotal += 1
                downloadCounts[uri] = downloadCounts.get(uri, 0) + 1
            continue

        if entry["agent"] == "-" and not uri:
            # For weird things like CONNECT
            continue

        if uri:
            pageCounts[uri] = pageCounts.get(uri, 0) + 1

        hitCount += 1

        if entry['code'] == "304":
            # XXX: Todo: something better?
            bytesSaved += 16000
        elif entry['size'] != "-":
            bytesTransferred += int(entry['size'])
        else:
            # No size recorded: echo the status code to stdout.
            print(entry['code'])

        referer = getReferer(entry)
        refererSite = util.simplifyReferer(referer)
        if not excludes.excludeReferer(refererSite):
            refererURLs.setdefault(refererSite, []).append(referer)

        if isSearch(refererSite):
            searchURLs.setdefault(refererSite, []).append(referer)

        # A hit arriving from anywhere but our own site is an entry page.
        if refererSite not in config.MYREFERER and uri:
            entryCounts[uri] = entryCounts.get(uri, 0) + 1

        ip = entry["ip"]
        ipCounts[ip] = ipCounts.get(ip, 0) + 1

        visitorPairs[(ip, entry["agent"])] = 1

    pageReferrals = {}
    for fullReferers in refererURLs.values():
        for fullReferer in fullReferers:
            pageReferrals[fullReferer] = pageReferrals.get(fullReferer, 0) + 1

    siteReferrals = {}
    for refererSite, fullReferers in refererURLs.items():
        siteReferrals[refererSite] = len(fullReferers)

    data = {
        'hits': hitCount,
        'transferred': bytesTransferred,
        'saved': bytesSaved,
    }
    data['refbysite'] = siteReferrals
    data['refbypage'] = pageReferrals
    data['searches'] = refsearch.getSearches(searchURLs)

    engineCounts = {}
    for engine, fullReferers in searchURLs.items():
        engineCounts[engine] = len(fullReferers)
    data['searchesbyengine'] = engineCounts
    data['searchreferer'] = searchURLs

    data['visitors'] = len(visitorPairs)
    visitors = list(visitorPairs.keys())

    browsers, browserExtra = agentparsing.buildDictionary(
        [agent for _ip, agent in visitors],
        config.BROWSERS)
    data['browsers'] = browsers
    data['countries'] = geographic.buildDictionary(
        [ip for ip, _agent in visitors])
    data['unknownbrowsers'] = browserExtra['unknown']
    data['mozbrowsers'] = browserExtra['mozbrowsers']

    platforms, platformExtra = agentparsing.buildDictionary(
        [agent for _ip, agent in visitors],
        config.PLATFORMS)
    data['os'] = platforms
    data['unknownos'] = platformExtra['unknown']

    data['downloadurls'] = downloadCounts
    data['downloads'] = downloadTotal
    data['urls'] = pageCounts
    data['visitsbybot'] = botCounts
    data['entry_pages'] = entryCounts

    # Several ips may map to one name, so their counts are added together.
    namedCounts = {}
    for ip, count in ipCounts.items():
        label = config.IP_TO_NAME.get(ip, ip)
        namedCounts[label] = namedCounts.get(label, 0) + count
    data['ips'] = namedCounts
    return data

def _printTop(heading, counts, skip=(), limit=10):
    """Print heading, then the limit largest entries of a {name: count}
    dictionary, largest count first. Names listed in skip are left out."""
    print(heading)
    ranked = [(count, name) for name, count in counts.items()
              if name not in skip]
    ranked.sort()
    ranked.reverse()
    for count, name in ranked[:limit]:
        print("%4d   %s" % (count, name))

def summaryText(data):
    """Print summary information to the screen.

    data is a dictionary as returned by parseLog(); its 'refbysite',
    'refbypage', 'searches', 'browsers', 'os' and 'urls' entries are each
    printed as a top-ten list. The 'None' entry of 'browsers' and 'os' is
    not shown.

    data is left untouched, so the function can be called repeatedly and
    does not fail when a 'None' entry is absent.
    """
    _printTop("Refering sites:\n--------\n", data['refbysite'])
    _printTop("\n\nRefering pages:\n--------\n", data['refbypage'])
    _printTop("\n\nSearches:\n--------\n", data['searches'])
    # Skip the 'None' bucket by filtering rather than deleting it from the
    # caller's dictionaries.
    _printTop("\n\nBrowsers:\n--------\n", data['browsers'], skip=('None',))
    _printTop("\n\nOperating Systems:\n--------\n", data['os'],
              skip=('None',))
    _printTop("\n\nURLs:\n--------\n", data['urls'])

def page(data, filename, title, image, key, value):
    """Write one ranked report as an HTML page.

    data       -- {name: count} dictionary; nothing is written if None.
    filename   -- output file name inside config.STATSOUTPUTDIR.
    title      -- used for the template title, the <h1> heading and the
                  table summary.
    image      -- graph location; the <img> is emitted only when this is
                  true and graphing.charting() is true.
    key, value -- headings of the name and count columns.

    At most the 100 entries with the highest counts are listed, highest
    first. Names and titles are written verbatim, so they must already be
    safe to embed in HTML (summaryHTML passes names through escapeKeys).
    """
    if data is None:
        return
    ranked = [(count, name) for name, count in data.items()]
    ranked.sort()
    ranked.reverse()

    lines = [
        config.TEMPLATE_START % (title),
        """<h1>%s</h1>""" % (title),
    ]
    if image and graphing.charting():
        lines.append("""<div class="refgraph">""")
        lines.append("""<img src="%s" title="%s" alt="" />""" % (image, title))
        lines.append("""</div>""")
    lines.append("""<table class="referers" summary="%s" cellspacing="0" cellpadding="3" border="0" width="100%%">""" % (title))
    lines.append("""<tr class="even"><th class="nameheader">%s</th><th class="countheader">%s</th></tr>""" % (key, value))
    # The header row is "even", so data rows start at "odd" and alternate.
    rowClasses = ("odd", "even")
    for index, (count, name) in enumerate(ranked[:100]):
        lines.append("""<tr class="%s">""" % (rowClasses[index % 2]))
        lines.append("""<td class="name">%s</td><td class="count">%d</td>""" % (name, count))
        lines.append("""</tr>""")
    lines.append("""</table>""")
    lines.append(config.TEMPLATE_END)

    output = open("%s/%s"% (config.STATSOUTPUTDIR, filename), "w")
    try:
        for line in lines:
            output.write("%s\n" % (line,))
    finally:
        # Always release the handle, even if a write fails.
        output.close()

def markUpReferer(aReferer):
    """Wrap a referer in an HTML link.

    aReferer is inserted unchanged as both the href and the title of the
    link, so it must already be escaped by the caller. The link text is
    util.concat(aReferer, config.MAXREFLENGTH) -- presumably a shortened
    form of the referer; confirm in util.
    """
    linkText = util.concat(aReferer, config.MAXREFLENGTH)
    return """<a href="%s" title="%s">%s</a>""" % (aReferer, aReferer, linkText)

def escapeKeys(aDict):
    """Escape all referer, searches, and URL information so that they're
    not interpreted as HTML (ie, Cross Site Scripting).

    Returns a new dictionary with the same values whose keys have &, <, >
    and the double quote replaced by HTML entities; aDict itself is not
    modified. The double quote must be escaped too because markUpReferer
    places these keys inside double-quoted href/title attributes, where an
    unescaped quote would end the attribute.
    """
    # Function-scope import keeps this fix self-contained. escape() handles
    # &, < and >; the extra mapping adds the double quote.
    from xml.sax.saxutils import escape

    retDict = {}
    for key, value in aDict.items():
        retDict[escape(key, {'"': "&quot;"})] = value
    return retDict

def summaryHTML(data):
    """Create summary information in HTML pages and graphs.

    data is a dictionary as returned by parseLog(). A copy is made in
    which the keys of every dictionary-valued entry are passed through
    escapeKeys(), and the keys of "refbypage" are additionally turned into
    links by markUpReferer(); data itself is not modified.

    One page is written per entry of config.REPORTS, followed by
    index.html in config.STATSOUTPUTDIR. If graphing.charting() is true,
    graphing.makegraphs() is then called with the escaped copy.
    """
    escaped = {}
    for name, value in data.items():
        if not isinstance(value, dict):
            # Plain totals need no escaping.
            escaped[name] = value
            continue
        safe = escapeKeys(value)
        if name == "refbypage":
            linked = {}
            for referer, count in safe.items():
                linked[markUpReferer(referer)] = count
            safe = linked
        escaped[name] = safe

    for report in config.REPORTS:
        page(
            escaped[report["variable"]],
            report["filename"],
            report["title"],
            report["image"],
            report["key"],
            report["value"]
        )

    directReferrals = 0
    for count in escaped["refbysite"].values():
        directReferrals += count

    searchReferrals = 0
    for referers in escaped['searchreferer'].values():
        searchReferrals += len(referers)

    title = "Statistics"
    lines = [
        config.TEMPLATE_START % (title),
        """<h1>%s</h1>""" % (title),
        """<h2>Summary</h2>""",
        """<ul>""",
        """<li>Page hits: %d""" % (escaped["hits"]),
        """<li>Different pages served: %d""" % (len(escaped["urls"])),
        """<li>Downloads: %d""" % (escaped["downloads"]),
        """<li>Different files downloaded: %d""" % (len(escaped["downloadurls"])),
        """<li>Unique visitors: %d""" % (escaped["visitors"]),
        """<li>Bandwidth used: %d""" % (escaped["transferred"]),
        """<li>Bandwidth saved by 304: %d""" % (escaped["saved"]),
        """<li>Direct referals: %d""" % (directReferrals),
        """<li>Search engine referals: %d""" % (searchReferrals),
        """</ul>""",
        """<h2>Reports</h2>""",
        """<ul>""",
    ]
    for report in config.REPORTS:
        lines.append("""<li><a href="%s">%s</a></li>""" % (
            report["filename"], report["title"]))
    lines.append("""</ul>""")
    lines.append(config.TEMPLATE_END)

    output = open("%s/index.html" % (config.STATSOUTPUTDIR), "w")
    try:
        for line in lines:
            output.write("%s\n" % (line,))
    finally:
        # Always release the handle, even if a write fails.
        output.close()

    if graphing.charting():
        graphing.makegraphs(escaped)

def main(argv = None):
    """Parse an access log and write the HTML statistics.

    argv defaults to sys.argv when omitted or empty. If argv[1] is given
    it names the log file to read; otherwise the log is read from
    standard input. Returns None.
    """
    if not argv:
        argv = sys.argv
    if len(argv) > 1:
        stream = open(argv[1])
        try:
            data = parseLog(stream)
        finally:
            # Close only the file opened here; sys.stdin is left alone.
            stream.close()
    else:
        data = parseLog(sys.stdin)
    summaryHTML(data)
    # summaryText(data) is the plain-text alternative to the HTML output.

# Script entry point. main() has no return statement, so sys.exit() receives
# None and the exit status is success unless an exception propagates.
if __name__ == "__main__":
    sys.exit(main())
