#!/usr/bin/env python
"""MELPA log file parser.

Scans nginx access logs for successful (HTTP 200) package downloads and
maintains two files in the current working directory:

* ``download_log.json.gz`` -- the accumulated raw data, shaped as
  ``{package: {ip: [timestamp, ...]}}``.
* ``download_counts.json`` -- ``{package: number_of_distinct_ips}``.

The code is written to run unchanged on Python 2.7 and Python 3.
"""

import argparse
from datetime import datetime
import gzip
import json
import os
import re
import sys
import time

LOGFILE = "/var/log/nginx/melpa/melpa.access.log"

# Named groups: ``ip`` (client address), ``date`` (timestamp plus UTC offset)
# and ``package`` (package name without the trailing ``-<version>``).
# The date class includes ``+`` so positive UTC offsets match as well, and
# the literal dots in the file extension and HTTP version are escaped.
LOGREGEX = r'(?P<ip>[\d.]+) [ -]+ \[(?P<date>[\w/:+ -]+)\] ' \
           r'"GET /packages/(?P<package>[^ ]+)-[0-9.]+\.(?:el|tar) ' \
           r'HTTP/\d\.\d" 200'


def json_handler(obj):
    """Serialize objects the ``json`` module cannot handle natively.

    ``datetime`` instances become ISO-8601 strings and sets become lists.

    Raises:
        TypeError: for any other type.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    elif isinstance(obj, set):
        return list(obj)
    raise TypeError(
        'Object of type {0} with value {1} is not JSON serializable'.format(
            type(obj), repr(obj)))


def json_dump(data, jsonfile, indent=None):
    """Write `data` as JSON to the text file object `jsonfile`.

    Sets and datetimes are converted by :func:`json_handler`.
    """
    return json.dump(data, jsonfile, default=json_handler, indent=indent)


def datetime_parser(dct):
    """JSON ``object_hook`` that turns every list value of `dct` into a set.

    Despite its name this function does not parse datetimes; it undoes the
    set -> list conversion performed by :func:`json_handler`.  `dct` is
    modified in place and returned.
    """
    for key, val in dct.items():
        if isinstance(val, list):
            dct[key] = set(val)
    return dct


def json_load(jsonfile):
    """Read JSON from `jsonfile`, restoring lists to sets.

    `jsonfile` may be opened in text or binary mode (for example a handle
    returned by ``gzip.open``); binary content is decoded as UTF-8.
    """
    raw = jsonfile.read()
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    return json.loads(raw, object_hook=datetime_parser)


def parse_val(val):
    """Return `val` parsed as a ``%Y-%m-%dT%H:%M:%S`` datetime.

    If `val` is a string that does not match that format it is returned
    unchanged.
    """
    try:
        return datetime.strptime(val, "%Y-%m-%dT%H:%M:%S")
    except ValueError:
        return val


def parse_logfile(logfilename, pkg_ip_time):
    """Add the package downloads found in `logfilename` to `pkg_ip_time`.

    Args:
        logfilename: path of an access log; a name ending in ``gz`` is read
            through ``gzip``.
        pkg_ip_time: ``{package: {ip: set(timestamps)}}`` mapping, updated
            in place.

    Returns:
        The number of log lines that were recorded.
    """
    logre = re.compile(LOGREGEX)
    opener = gzip.open if logfilename.endswith("gz") else open
    count = 0

    # Read bytes and decode per line so plain and gzipped logs behave the
    # same on Python 2 and 3; undecodable bytes must not abort the run.
    with opener(logfilename, 'rb') as logfile:
        for raw_line in logfile:
            match = logre.match(raw_line.decode('utf-8', 'replace'))
            if match is None:
                continue

            # Only the date/time part is used: the UTC offset is dropped and
            # time.mktime() interprets the value as local time.
            try:
                when = datetime.strptime(match.group('date').split()[0],
                                         "%d/%b/%Y:%H:%M:%S")
            except (ValueError, IndexError):
                # Malformed timestamp: treat like any other unparsable line.
                continue

            dtstamp = int(time.mktime(when.timetuple()))
            ips = pkg_ip_time.setdefault(match.group('package'), {})
            ips.setdefault(match.group('ip'), set()).add(dtstamp)
            count += 1

    return count


def main():
    """Parse the given log files and rewrite both output files.

    Existing data in ``download_log.json.gz`` is loaded first, so repeated
    runs accumulate.  Log files default to ``LOGFILE`` when none are given
    on the command line.
    """
    parser = argparse.ArgumentParser(description='MELPA Log File Parser')
    parser.add_argument('logs', metavar="logs", type=str, nargs="*",
                        help="Log files to parse.", default=[LOGFILE])
    args = parser.parse_args()

    # load old data file
    if os.path.exists("download_log.json.gz"):
        with gzip.open("download_log.json.gz", 'rb') as datafile:
            pkg_ip_time = json_load(datafile)
    else:
        pkg_ip_time = {}

    # parse each parameter
    for logfile in args.logs:
        # No newline here: the count is appended to the same output line.
        sys.stdout.write("processing logfile {0}... ".format(logfile))
        sys.stdout.flush()

        count = parse_logfile(logfile, pkg_ip_time)
        sys.stdout.write("{0}\n".format(count))

    # dump new data file; the gzip handle is binary, so serialize to a
    # string first and encode it (json.dump cannot write str to it on py3).
    serialized = json.dumps(pkg_ip_time, default=json_handler)
    with gzip.open("download_log.json.gz", 'wb') as datafile:
        datafile.write(serialized.encode('utf-8'))

    # calculate current package totals (distinct IPs per package)
    pkgcount = {pkg: len(ips) for pkg, ips in pkg_ip_time.items()}

    with open("download_counts.json", 'w') as countfile:
        json_dump(pkgcount, countfile, indent=1)


if __name__ == '__main__':
    main()