#!/bin/bash
# nlp-awk.sh
# Reads an Apache log file and summarizes it
# 0.8.5
# Written by: Ryan Cunningham (ryanc@inmotionhosting.com)
# 0.8.3: Fix for URLs that have spaces in them which breaks absolutely everything
# 0.8.4: Works properly with mawk or gawk
# 0.8.5: Changed FS to a regex in order to compensate for requests that
#        contain quotation marks - 10M lines in 1:46.59

# Output is as follows:
# 0 - Number of hits by hour
# 1 - HTTP response codes
# 2 - IPs
# 3 - User Agents
# 4 - Requests
# 5 - Requests for non-static content, query strings stripped off

# Pick the log source from the first argument: the literal word "stdin"
# maps to /dev/stdin, any other value is used verbatim as the file path.
case "$1" in
    stdin) FILE=/dev/stdin ;;
    *)     FILE=$1 ;;
esac

# Optional second argument: a pattern handed to awk as "day" and matched
# against the leading part of each log line. Empty when not supplied, which
# makes every line match.
DAY=${2:-}
# Choose the awk interpreter: use mawk when it is installed, otherwise fall
# back to whatever "awk" resolves to.
# "command -v" only looks the name up. Running mawk with no program text
# (the previous check) prints a usage error and exits non-zero, so the mawk
# branch could never be selected.
if command -v mawk >/dev/null 2>&1; then
    INT=mawk
else
    INT=awk
fi

#######################################
# Summarize an Apache access log as pipe-delimited records.
#
# Record layouts written to stdout (first column is the record type):
#   0|10000000|<date:hour>|<hits>          hits per hour
#   1|20000000|<status>|<hits>             HTTP response codes
#   2|<hits>|<ip>                          client addresses
#   3|<hits>|<user agent>                  user agents (last field, as-is)
#   4|<hits>|<status>|<method and URL>     requests, protocol stripped
#   5|<hits>|<status>|<method and path>    non-static requests, query
#                                          string stripped
# Records are sorted on the second column, numerically, largest first.
#
# Arguments:
#   $1 - awk interpreter to run (e.g. awk, mawk)
#   $2 - regex matched against the leading part of each line (everything
#        before the quoted request); empty matches every line
#   $3 - log file path; empty means read standard input
# Outputs:
#   The records described above. Diagnostics from awk and sort are
#   discarded on purpose so they never reach the controlling terminal.
#######################################
summarize_log() {
    local awk_bin=$1 day=$2 file=$3

    # "${file:--}" hands awk "-" (standard input) when no file was given and
    # keeps paths containing spaces intact as a single operand.
    "$awk_bin" -v "day=$day" '
    BEGIN {
        # Split on the quotes surrounding the request, referrer and user
        # agent: a separator is either space+quote or quote+space.
        FS = " \"|\" "
        OFS = "|"
    }

    # Blank or garbage lines have too few fields for $(NF-2) to be a valid
    # field reference; evaluating it would abort the whole run, so skip them.
    NF < 3 { next }

    $1 ~ day {
        # The first field and the last three should always be usable.
        split($1, head, " ")
        split($(NF-2), status, " ")

        request = $2
        if (NF != 5) {
            # The request itself contained a separator, so the fields did
            # not split cleanly. Recover it as the text between the quote
            # that follows "]" and the quote that precedes the status field.
            # This is ugly but should almost never have to run.
            start = index($0, "]") + 3
            request = substr($0, start, index($0, $(NF-2)) - start - 2)
        }

        if (request != "-") {
            # Drop the trailing protocol token (e.g. HTTP/1.1).
            nparts = split(request, parts, " ")
            mth_res = substr(request, 1, index(request, parts[nparts]) - 2)
            # Keep everything before the first "?". The start index must be
            # 1: awk implementations disagree on how a start of 0 is handled
            # and some of them lose the last character before the "?".
            qpos = index(request, "?")
            uri = (qpos > 1) ? substr(request, 1, qpos - 1) : mth_res
        } else {
            mth_res = request
            uri = request
        }

        ip[head[1]]++
        hits[substr(head[4], 2, 14)]++     # date plus hour, "[" removed
        rcode[status[1]]++
        ua[$NF]++
        req[status[1] OFS mth_res]++
        nstatic[status[1] OFS uri]++
    }

    END {
        # The constants in column two keep the hourly and status records
        # at the top of the numerically sorted output.
        for (k in hits)  print 0, 10000000, k, hits[k]
        for (k in rcode) print 1, 20000000, k, rcode[k]
        for (k in ip)    print 2, ip[k], k
        for (k in ua)    print 3, ua[k], k
        for (k in req)   print 4, req[k], k
        # Static assets are recognised by file extension; the leading dot
        # keeps paths such as /api/js from being treated as static.
        for (k in nstatic)
            if (tolower(k) !~ /\.(jpg|jpeg|gif|png|ico|txt|pdf|swf|xml|css|js)$/)
                print 5, nstatic[k], k
    }
    ' "${file:--}" 2>/dev/null | sort -t'|' -k2,2rn 2>/dev/null
}

summarize_log "$INT" "$DAY" "$FILE"
