use sqlite3 for processing log files

Donald Curtis 2014-10-23 17:52:28 -04:00
parent 71ccdd7959
commit 8acf077daf
2 changed files with 14 additions and 15 deletions

.gitignore

@@ -17,3 +17,4 @@
 /.ecukes-failing-scenarios
 /sandbox
 *~
+/download_log.db


@@ -10,6 +10,7 @@ import re
 import sys
 import time
 import tempfile
+import sqlite3
 from operator import or_

 LOGFILE = "/home/melpa/log/melpa.access.log"
@@ -57,7 +58,7 @@ def ip_to_number(ip):
     return reduce(or_, ((int(n) << (i*8)) for i, n in enumerate(
         reversed(ip.split('.')))), 0)

-def parse_logfile(logfilename, pkg_ip_time):
+def parse_logfile(logfilename, curs):
     """
     """
     if logfilename.endswith("gz"):
@@ -66,7 +67,6 @@ def parse_logfile(logfilename, pkg_ip_time):
         logfile = open(logfilename, 'r')

     logre = re.compile(LOGREGEX)
-
     count = 0
     for line in logfile:
@@ -82,8 +82,7 @@ def parse_logfile(logfilename, pkg_ip_time):
                 "%d/%b/%Y:%H:%M:%S").timetuple()))
             pkg = match.group('package')
-            pkg_ip_time.setdefault(pkg, set()).add(ip)
+            curs.execute("INSERT OR IGNORE INTO pkg_ip VALUES (?, ?)", (pkg, ip))
             count += 1
     return count
@@ -113,26 +112,25 @@ def main():
     file(pidfile, 'w').write(pid)

-    # load old data file
-    if os.path.exists("download_log.json.gz"):
-        pkg_ip_time = json_load(gzip.open("download_log.json.gz"))
-    else:
-        pkg_ip_time = {}
+    new_db = not os.path.exists("download_log.db")
+    conn = sqlite3.connect("download_log.db")
+    curs = conn.cursor()
+    if new_db:
+        sys.stdout.write("creating database...\n")
+        curs.execute('''CREATE TABLE pkg_ip (package, ip, PRIMARY KEY (package, ip))''')
+        conn.commit()

     # parse each parameter
     for logfile in args.logs:
         sys.stdout.write("processing logfile {0}... ".format(logfile))
         sys.stdout.flush()
-        count = parse_logfile(logfile, pkg_ip_time)
+        count = parse_logfile(logfile, curs)
         sys.stdout.write("{0}\n".format(count))
+        conn.commit()

-    # dump new data file
-    json_dump(pkg_ip_time, gzip.open("download_log.json.gz", 'w'))
-
     # calculate current package totals
-    pkgcount = {p: len(i) for p, i in pkg_ip_time.iteritems()}
+    pkgcount = {p: c for p,c in curs.execute("SELECT package, count(ip) FROM pkg_ip GROUP BY 1")}
     json_dump(pkgcount, open("html/download_counts.json", 'w'), indent=1)

     os.unlink(pidfile)
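
For context, the patch swaps the pkg_ip_time dict of sets (persisted to download_log.json.gz) for a SQLite table whose composite primary key deduplicates (package, ip) pairs; the per-package totals then come from a single GROUP BY query instead of len() over each set. Below is a minimal, self-contained sketch of that pattern, not the script itself: it assumes an in-memory database and made-up sample rows ("magit", "evil", integer-encoded IPs) in place of download_log.db and the real MELPA access logs.

import sqlite3

# Same schema as the commit: a composite primary key on (package, ip)
# makes INSERT OR IGNORE silently drop duplicate pairs.
conn = sqlite3.connect(":memory:")   # the real script opens download_log.db
curs = conn.cursor()
curs.execute("CREATE TABLE pkg_ip (package, ip, PRIMARY KEY (package, ip))")

# Hypothetical rows: (package name, integer-encoded client IP).
sample = [("magit", 16909060), ("magit", 16909060),
          ("magit", 84281096), ("evil", 16909060)]
for pkg, ip in sample:
    curs.execute("INSERT OR IGNORE INTO pkg_ip VALUES (?, ?)", (pkg, ip))
conn.commit()

# Same aggregate as the patched main(): count of distinct IPs per package.
pkgcount = {p: c for p, c in
            curs.execute("SELECT package, count(ip) FROM pkg_ip GROUP BY 1")}
print(pkgcount)  # magit -> 2, evil -> 1 (the duplicate pair was ignored)

Because duplicates are resolved by the primary key at insert time, re-processing a logfile should not inflate the counts, and the per-logfile conn.commit() in the patched loop keeps progress on disk between files.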