From 6a6ae94e3d2c5aee5b6cf0f60d26c4f0b4276173 Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Mon, 16 Dec 2013 23:12:50 +0000
Subject: [PATCH 01/27] Refactored the fingerprint module

---
 dejavu/fingerprint.py | 221 ++++++++++++++++++++++--------------------
 1 file changed, 116 insertions(+), 105 deletions(-)
 mode change 100644 => 100755 dejavu/fingerprint.py

diff --git a/dejavu/fingerprint.py b/dejavu/fingerprint.py
old mode 100644
new mode 100755
index d1f78cc..41451f3
--- a/dejavu/fingerprint.py
+++ b/dejavu/fingerprint.py
@@ -13,19 +13,117 @@ import time
 import hashlib
 import pickle
 
+IDX_FREQ_I = 0
+IDX_TIME_J = 1
+
+DEFAULT_FS = 44100
+DEFAULT_WINDOW_SIZE = 4096
+DEFAULT_OVERLAP_RATIO = 0.5
+DEFAULT_FAN_VALUE = 15
+
+DEFAULT_AMP_MIN = 10
+PEAK_NEIGHBORHOOD_SIZE = 20
+MIN_HASH_TIME_DELTA = 0
+
+def fingerprint(channel_samples,
+            Fs=DEFAULT_FS, 
+            wsize=DEFAULT_WINDOW_SIZE, 
+            wratio=DEFAULT_OVERLAP_RATIO, 
+            fan_value=DEFAULT_FAN_VALUE, 
+            amp_min=DEFAULT_AMP_MIN):
+    """
+        FFT the channel, log transform output, find local maxima, then return
+        locally sensitive hashes. 
+    """
+    # FFT the signal and extract frequency components
+    arr2D = mlab.specgram(
+        channel_samples, 
+        NFFT=wsize, 
+        Fs=Fs,
+        window=mlab.window_hanning,
+        noverlap=int(wsize * wratio))[0]
+
+    # apply log transform since specgram() returns linear array
+    arr2D = 10 * np.log10(arr2D)
+    arr2D[arr2D == -np.inf] = 0 # replace infs with zeros
+    
+    # find local maxima
+    local_maxima = get_2D_peaks(arr2D, plot=False, amp_min=amp_min)
+
+    # return hashes
+    return generate_hashes(local_maxima, fan_value=fan_value)
+
+def get_2D_peaks(arr2D, plot=False, amp_min=DEFAULT_AMP_MIN):
+
+    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.morphology.iterate_structure.html#scipy.ndimage.morphology.iterate_structure
+    struct = generate_binary_structure(2, 1)
+    neighborhood = iterate_structure(struct, PEAK_NEIGHBORHOOD_SIZE)
+
+    # find local maxima using our fliter shape
+    local_max = maximum_filter(arr2D, footprint=neighborhood) == arr2D 
+    background = (arr2D == 0)
+    eroded_background = binary_erosion(background, structure=neighborhood, border_value=1)
+    detected_peaks = local_max - eroded_background # this is a boolean mask of arr2D with True at peaks
+
+    # extract peaks
+    amps = arr2D[detected_peaks]
+    j, i = np.where(detected_peaks) 
+
+    # filter peaks
+    amps = amps.flatten()
+    peaks = zip(i, j, amps)
+    peaks_filtered = [x for x in peaks if x[2] > amp_min] # freq, time, amp
+    
+    # get indices for frequency and time
+    frequency_idx = [x[1] for x in peaks_filtered]
+    time_idx = [x[0] for x in peaks_filtered]
+
+    if plot:
+        # scatter of the peaks
+        fig, ax = plt.subplots()
+        ax.imshow(arr2D)
+        ax.scatter(time_idx, frequency_idx)
+        ax.set_xlabel('Time')
+        ax.set_ylabel('Frequency')
+        ax.set_title("Spectrogram of \"Blurred Lines\" by Robin Thicke");
+        plt.gca().invert_yaxis()
+        plt.show()
+
+    return zip(frequency_idx, time_idx)
+
+def generate_hashes(peaks, fan_value=DEFAULT_FAN_VALUE):
+    """
+    Hash list structure:
+       sha1-hash[0:20]    time_offset
+    [(e05b341a9b77a51fd26, 32), ... ]
+    """
+    fingerprinted = set() # to avoid rehashing same pairs
+    hashes = []
+
+    for i in range(len(peaks)):
+        for j in range(fan_value):
+            if i+j < len(peaks) and not (i, i+j) in fingerprinted:
+                
+                freq1 = peaks[i][IDX_FREQ_I]
+                freq2 = peaks[i+j][IDX_FREQ_I]
+                t1 = peaks[i][IDX_TIME_J]
+                t2 = peaks[i+j][IDX_TIME_J]
+                t_delta = t2 - t1
+                
+                if t_delta >= MIN_HASH_TIME_DELTA:
+                    h = hashlib.sha1("%s|%s|%s" % (str(freq1), str(freq2), str(t_delta)))
+                    hashes.append((h.hexdigest()[0:20], t1))
+                
+                # ensure we don't repeat hashing
+                fingerprinted.add((i, i+j))
+    return hashes
+
+# TODO: move all of the below to a class with DB access
+
+
 class Fingerprinter():
 
-    IDX_FREQ_I = 0
-    IDX_TIME_J = 1
-    
-    DEFAULT_FS = 44100
-    DEFAULT_WINDOW_SIZE = 4096
-    DEFAULT_OVERLAP_RATIO = 0.5
-    DEFAULT_FAN_VALUE = 15
-    
-    DEFAULT_AMP_MIN = 10
-    PEAK_NEIGHBORHOOD_SIZE = 20
-    MIN_HASH_TIME_DELTA = 0
+
 
     def __init__(self, config, 
             Fs=DEFAULT_FS, 
@@ -55,104 +153,15 @@ class Fingerprinter():
         hashes = self.process_channel(samples, song_id=sid)
         print "Generated %d hashes" % len(hashes)
         self.db.insert_hashes(hashes)
-
+    
+    # TODO: put this in another module
     def match(self, samples):
         """Used for matching unknown songs"""
         hashes = self.process_channel(samples)
         matches = self.db.return_matches(hashes)
         return matches
 
-    def process_channel(self, channel_samples, song_id=None):
-        """
-            FFT the channel, log transform output, find local maxima, then return
-            locally sensitive hashes. 
-        """
-        # FFT the signal and extract frequency components
-        arr2D = mlab.specgram(
-            channel_samples, 
-            NFFT=self.window_size, 
-            Fs=self.Fs,
-            window=mlab.window_hanning,
-            noverlap=self.noverlap)[0]
-
-        # apply log transform since specgram() returns linear array
-        arr2D = 10 * np.log10(arr2D)
-        arr2D[arr2D == -np.inf] = 0 # replace infs with zeros
-        
-        # find local maxima
-        local_maxima = self.get_2D_peaks(arr2D, plot=False)
-
-        # return hashes
-        return self.generate_hashes(local_maxima, song_id=song_id)
-
-    def get_2D_peaks(self, arr2D, plot=False):
-
-        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.morphology.iterate_structure.html#scipy.ndimage.morphology.iterate_structure
-        struct = generate_binary_structure(2, 1)
-        neighborhood = iterate_structure(struct, Fingerprinter.PEAK_NEIGHBORHOOD_SIZE) 
-
-        # find local maxima using our fliter shape
-        local_max = maximum_filter(arr2D, footprint=neighborhood) == arr2D 
-        background = (arr2D == 0)
-        eroded_background = binary_erosion(background, structure=neighborhood, border_value=1)
-        detected_peaks = local_max - eroded_background # this is a boolean mask of arr2D with True at peaks
-
-        # extract peaks
-        amps = arr2D[detected_peaks]
-        j, i = np.where(detected_peaks) 
-
-        # filter peaks
-        amps = amps.flatten()
-        peaks = zip(i, j, amps)
-        peaks_filtered = [x for x in peaks if x[2] > self.amp_min] # freq, time, amp
-        
-        # get indices for frequency and time
-        frequency_idx = [x[1] for x in peaks_filtered]
-        time_idx = [x[0] for x in peaks_filtered]
-
-        if plot:
-            # scatter of the peaks
-            fig, ax = plt.subplots()
-            ax.imshow(arr2D)
-            ax.scatter(time_idx, frequency_idx)
-            ax.set_xlabel('Time')
-            ax.set_ylabel('Frequency')
-            ax.set_title("Spectrogram of \"Blurred Lines\" by Robin Thicke");
-            plt.gca().invert_yaxis()
-            plt.show()
-
-        return zip(frequency_idx, time_idx)
-
-    def generate_hashes(self, peaks, song_id=None):
-        """
-        Hash list structure:
-           sha1-hash[0:20]    song_id, time_offset
-        [(e05b341a9b77a51fd26,   (3, 32)), ... ]
-        """
-        fingerprinted = set() # to avoid rehashing same pairs
-        hashes = []
-
-        for i in range(len(peaks)):
-            for j in range(self.fan_value):
-                if i+j < len(peaks) and not (i, i+j) in fingerprinted:
-
-                    freq1 = peaks[i][Fingerprinter.IDX_FREQ_I]
-                    freq2 = peaks[i+j][Fingerprinter.IDX_FREQ_I]
-                    t1 = peaks[i][Fingerprinter.IDX_TIME_J]
-                    t2 = peaks[i+j][Fingerprinter.IDX_TIME_J]
-                    t_delta = t2 - t1
-                    
-                    if t_delta >= Fingerprinter.MIN_HASH_TIME_DELTA:
-                        h = hashlib.sha1("%s|%s|%s" % (str(freq1), str(freq2), str(t_delta)))
-                        hashes.append((h.hexdigest()[0:20], (song_id, t1)))
-                    
-                    # ensure we don't repeat hashing
-                    fingerprinted.add((i, i+j))
-        return hashes
-
-    def insert_into_db(self, key, value):
-        self.db.insert(key, value)
-
+    # TODO: this function has nothing to do with fingerprinting. is it needed?
     def print_stats(self):
 
         iterable = self.db.get_iterable_kv_pairs()
@@ -168,10 +177,12 @@ class Fingerprinter():
         for song_id, count in counter.iteritems():
             song_name = self.song_names[song_id]
             print "%s has %d spectrogram peaks" % (song_name, count)
-
+    
+    # this does... what? this seems to only be used for the above function
     def set_song_names(self, wpaths):
         self.song_names = wpaths
-        
+    
+    # TODO: put this in another module
     def align_matches(self, matches, starttime, record_seconds=0, verbose=False):
         """
             Finds hash matches that align in time with other matches and finds

From a4ed61265889b63bd9bd76370f847dc50c5c341c Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Mon, 16 Dec 2013 23:38:58 +0000
Subject: [PATCH 02/27] Moved main Dejavu class from dejavu.control to dejavu

---
 dejavu/__init__.py    | 176 ++++++++++++++++++++++++++++++++++++++++++
 dejavu/control.py     | 107 -------------------------
 dejavu/fingerprint.py |  58 +-------------
 go.py                 |   2 +-
 4 files changed, 178 insertions(+), 165 deletions(-)
 mode change 100644 => 100755 dejavu/__init__.py
 delete mode 100644 dejavu/control.py
 mode change 100644 => 100755 go.py

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
old mode 100644
new mode 100755
index e69de29..b44897d
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -0,0 +1,176 @@
+from dejavu.database import SQLDatabase
+from dejavu.convert import Converter
+import dejavu.fingerprint as fingerprint
+from scipy.io import wavfile
+from multiprocessing import Process
+import wave, os
+import random
+
+DEBUG = False
+
+class Dejavu():
+
+    def __init__(self, config):
+    
+        self.config = config
+        
+        # initialize db
+        database = SQLDatabase(
+            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME),
+            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME),
+            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD),
+            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE))
+        self.db = database
+        
+        # create components
+        self.converter = Converter()
+        #self.fingerprinter = Fingerprinter(self.config)
+        self.db.setup()
+
+        # get songs previously indexed
+        self.songs = self.db.get_songs()
+        self.songnames_set = set() # to know which ones we've computed before
+        if self.songs:
+            for song in self.songs:
+                song_id = song[SQLDatabase.FIELD_SONG_ID]
+                song_name = song[SQLDatabase.FIELD_SONGNAME]
+                self.songnames_set.add(song_name)
+                print "Added: %s to the set of fingerprinted songs..." % song_name
+
+    def chunkify(self, lst, n):
+        """
+            Splits a list into roughly n equal parts. 
+            http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts
+        """
+        return [lst[i::n] for i in xrange(n)]
+
+    def fingerprint(self, path, output, extensions, nprocesses):
+
+        # convert files, shuffle order
+        files = self.converter.find_files(path, extensions)
+        random.shuffle(files)
+        files_split = self.chunkify(files, nprocesses)
+
+        # split into processes here
+        processes = []
+        for i in range(nprocesses):
+
+            # need database instance since mysql connections shouldn't be shared across processes
+            sql_connection = SQLDatabase( 
+                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME),
+                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME),
+                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD),
+                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE))
+
+            # create process and start it
+            p = Process(target=self.fingerprint_worker, args=(files_split[i], sql_connection, output))
+            p.start()
+            processes.append(p)
+
+        # wait for all processes to complete
+        for p in processes:
+            p.join()
+            
+        # delete orphans
+        # print "Done fingerprinting. Deleting orphaned fingerprints..."
+        # TODO: need a more performant query in database.py for the 
+        #self.fingerprinter.db.delete_orphans()
+
+    def fingerprint_worker(self, files, sql_connection, output):
+
+        for filename, extension in files:
+
+            # if there are already fingerprints in database, don't re-fingerprint or convert
+            song_name = os.path.basename(filename).split(".")[0]
+            if DEBUG and song_name in self.songnames_set: 
+                print("-> Already fingerprinted, continuing...")
+                continue
+
+            # convert to WAV
+            wavout_path = self.converter.convert(filename, extension, Converter.WAV, output, song_name)
+
+            # insert song name into database
+            song_id = sql_connection.insert_song(song_name)
+
+            # for each channel perform FFT analysis and fingerprinting
+            channels, Fs = self.extract_channels(wavout_path)
+            for c in range(len(channels)):
+                channel = channels[c]
+                print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name)
+                hashes = fingerprint.fingerprint(channel, Fs=Fs)
+                sql_connection.insert_hashes(song_id, hashes)
+
+            # only after done fingerprinting do confirm
+            sql_connection.set_song_fingerprinted(song_id)
+
+    def extract_channels(self, path):
+        """
+            Reads channels from disk.
+            Returns a tuple with (channels, sample_rate)
+        """
+        channels = []
+        Fs, frames = wavfile.read(path)
+        wave_object = wave.open(path)
+        nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams()
+        #assert Fs == self.fingerprinter.Fs
+
+        for channel in range(nchannels):
+            channels.append(frames[:, channel])
+        return (channels, Fs)
+    
+    def match(self, samples, Fs=fingerprint.DEFAULT_FS):
+        hashes = fingerprint.fingerprint(samples, Fs=Fs)
+        return self.db.return_matches(hashes)
+    
+    def align_matches(self, matches, starttime, record_seconds=None):
+        """
+            Finds hash matches that align in time with other matches and finds
+            consensus about which hashes are "true" signal from the audio.
+            
+            Returns a dictionary with match information.
+        """
+        # align by diffs
+        diff_counter = {}
+        largest = 0
+        largest_count = 0
+        song_id = -1
+        for tup in matches:
+            sid, diff = tup
+            if not diff in diff_counter:
+                diff_counter[diff] = {}
+            if not sid in diff_counter[diff]:
+                diff_counter[diff][sid] = 0
+            diff_counter[diff][sid] += 1
+
+            if diff_counter[diff][sid] > largest_count:
+                largest = diff
+                largest_count = diff_counter[diff][sid]
+                song_id = sid
+
+        if DEBUG: 
+            print("Diff is %d with %d offset-aligned matches" % (largest, largest_count))
+        
+        # extract idenfication      
+        song = self.db.get_song_by_id(song_id)
+        if song:
+            songname = song.get(SQLDatabase.FIELD_SONGNAME, None)
+        else:
+            return None
+        songname = songname.replace("_", " ")
+        elapsed = time.time() - starttime
+        
+        if DEBUG: 
+            print("Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed))
+        
+        # return match info
+        song = {
+            "song_id" : song_id,
+            "song_name" : songname,
+            "match_time" : elapsed,
+            "confidence" : largest_count
+        }
+        
+        if record_seconds: 
+            song['record_time'] = record_seconds
+            
+        return song
\ No newline at end of file
diff --git a/dejavu/control.py b/dejavu/control.py
deleted file mode 100644
index 606e366..0000000
--- a/dejavu/control.py
+++ /dev/null
@@ -1,107 +0,0 @@
-from dejavu.database import SQLDatabase
-from dejavu.convert import Converter
-from dejavu.fingerprint import Fingerprinter
-from scipy.io import wavfile
-from multiprocessing import Process
-import wave, os
-import random
-
-class Dejavu():
-
-    def __init__(self, config):
-    
-        self.config = config
-
-        # create components
-        self.converter = Converter()
-        self.fingerprinter = Fingerprinter(self.config)
-        self.fingerprinter.db.setup()
-
-        # get songs previously indexed
-        self.songs = self.fingerprinter.db.get_songs()
-        self.songnames_set = set() # to know which ones we've computed before
-        if self.songs:
-            for song in self.songs:
-                song_id = song[SQLDatabase.FIELD_SONG_ID]
-                song_name = song[SQLDatabase.FIELD_SONGNAME]
-                self.songnames_set.add(song_name)
-                print "Added: %s to the set of fingerprinted songs..." % song_name
-
-    def chunkify(self, lst, n):
-        """
-            Splits a list into roughly n equal parts. 
-            http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts
-        """
-        return [lst[i::n] for i in xrange(n)]
-
-    def fingerprint(self, path, output, extensions, nprocesses):
-
-        # convert files, shuffle order
-        files = self.converter.find_files(path, extensions)
-        random.shuffle(files)
-        files_split = self.chunkify(files, nprocesses)
-
-        # split into processes here
-        processes = []
-        for i in range(nprocesses):
-
-            # need database instance since mysql connections shouldn't be shared across processes
-            sql_connection = SQLDatabase( 
-                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME),
-                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME),
-                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD),
-                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE))
-
-            # create process and start it
-            p = Process(target=self.fingerprint_worker, args=(files_split[i], sql_connection, output))
-            p.start()
-            processes.append(p)
-
-        # wait for all processes to complete
-        for p in processes:
-            p.join()
-            
-        # delete orphans
-        # print "Done fingerprinting. Deleting orphaned fingerprints..."
-        # TODO: need a more performant query in database.py for the 
-        #self.fingerprinter.db.delete_orphans()
-
-    def fingerprint_worker(self, files, sql_connection, output):
-
-        for filename, extension in files:
-
-            # if there are already fingerprints in database, don't re-fingerprint or convert
-            song_name = os.path.basename(filename).split(".")[0]
-            if song_name in self.songnames_set: 
-                print "-> Already fingerprinted, continuing..."
-                continue
-
-            # convert to WAV
-            wavout_path = self.converter.convert(filename, extension, Converter.WAV, output, song_name)
-
-            # insert song name into database
-            song_id = sql_connection.insert_song(song_name)
-
-            # for each channel perform FFT analysis and fingerprinting
-            channels = self.extract_channels(wavout_path)
-            for c in range(len(channels)):
-                channel = channels[c]
-                print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name)
-                self.fingerprinter.fingerprint(channel, wavout_path, song_id, c+1)
-
-            # only after done fingerprinting do confirm
-            sql_connection.set_song_fingerprinted(song_id)
-
-    def extract_channels(self, path):
-        """
-            Reads channels from disk.
-        """
-        channels = []
-        Fs, frames = wavfile.read(path)
-        wave_object = wave.open(path)
-        nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams()
-        assert Fs == self.fingerprinter.Fs
-
-        for channel in range(nchannels):
-            channels.append(frames[:, channel])
-        return channels
\ No newline at end of file
diff --git a/dejavu/fingerprint.py b/dejavu/fingerprint.py
index 41451f3..6108799 100755
--- a/dejavu/fingerprint.py
+++ b/dejavu/fingerprint.py
@@ -133,12 +133,7 @@ class Fingerprinter():
             amp_min=DEFAULT_AMP_MIN):
 
         self.config = config
-        database = SQLDatabase(
-            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME),
-            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME),
-            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD),
-            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE))
-        self.db = database
+        
 
         self.Fs = Fs
         self.dt = 1.0 / self.Fs
@@ -183,55 +178,4 @@ class Fingerprinter():
         self.song_names = wpaths
     
     # TODO: put this in another module
-    def align_matches(self, matches, starttime, record_seconds=0, verbose=False):
-        """
-            Finds hash matches that align in time with other matches and finds
-            consensus about which hashes are "true" signal from the audio.
-            
-            Returns a dictionary with match information.
-        """
-        # align by diffs
-        diff_counter = {}
-        largest = 0
-        largest_count = 0
-        song_id = -1
-        for tup in matches:
-            sid, diff = tup
-            if not diff in diff_counter:
-                diff_counter[diff] = {}
-            if not sid in diff_counter[diff]:
-                diff_counter[diff][sid] = 0
-            diff_counter[diff][sid] += 1
 
-            if diff_counter[diff][sid] > largest_count:
-                largest = diff
-                largest_count = diff_counter[diff][sid]
-                song_id = sid
-
-        if verbose: 
-            print "Diff is %d with %d offset-aligned matches" % (largest, largest_count)
-        
-        # extract idenfication      
-        song = self.db.get_song_by_id(song_id)
-        if song:
-            songname = song.get(SQLDatabase.FIELD_SONGNAME, None)
-        else:
-            return None
-        songname = songname.replace("_", " ")
-        elapsed = time.time() - starttime
-        
-        if verbose: 
-            print "Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed)
-        
-        # return match info
-        song = {
-            "song_id" : song_id,
-            "song_name" : songname,
-            "match_time" : elapsed,
-            "confidence" : largest_count
-        }
-        
-        if record_seconds: 
-            song['record_time'] = record_seconds
-            
-        return song
diff --git a/go.py b/go.py
old mode 100644
new mode 100755
index 2aaee89..9108f4e
--- a/go.py
+++ b/go.py
@@ -1,4 +1,4 @@
-from dejavu.control import Dejavu
+from dejavu import Dejavu
 from ConfigParser import ConfigParser
 import warnings
 warnings.filterwarnings("ignore")

From 0bd7219b872aa85dbab030d3ff21d4185e26f901 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Tue, 17 Dec 2013 01:39:03 +0100
Subject: [PATCH 03/27] Cleaned up the database driver.

- The SQLDatabase class now uses a context manager for MySQL access.
- Most of the error handling is now done by the context manager.
- Changed several methods that returned a list to return a generator instead.
- Optimized return_matches to use an IN query instead.
- Other small fixes.
---
 dejavu/cursor.py   |  52 ++++++
 dejavu/database.py | 397 ++++++++++++++++++++++-----------------------
 2 files changed, 243 insertions(+), 206 deletions(-)
 create mode 100644 dejavu/cursor.py

diff --git a/dejavu/cursor.py b/dejavu/cursor.py
new file mode 100644
index 0000000..1b46b52
--- /dev/null
+++ b/dejavu/cursor.py
@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+from __future__ import absolute_import
+import Queue
+
+import pymysql
+import pymysql.cursors
+
+
+def cursor_factory(**factory_options):
+    def cursor(**options):
+        options.update(factory_options)
+        return Cursor(**options)
+    return cursor
+
+
+class Cursor(object):
+    """
+    Establishes a connection to the database and returns an open cursor.
+
+
+    ```python
+    # Use as context manager
+    with Cursor() as cur:
+        cur.execute(query)
+    ```
+    """
+    _cache = Queue.Queue(maxsize=5)
+
+    def __init__(self, cursor_type=pymysql.cursors.DictCursor, **options):
+        super(Cursor, self).__init__()
+
+        try:
+            conn = self._cache.get_nowait()
+        except Queue.Empty:
+            conn = pymysql.connect(**options)
+
+        self.conn = conn
+        self.cursor_type = cursor_type
+
+    def __enter__(self):
+        self.cursor = self.conn.cursor(self.cursor_type)
+        return self.cursor
+
+    def __exit__(self, type, value, traceback):
+        self.cursor.close()
+        self.conn.commit()
+
+        # Put it back on the queue
+        try:
+            self._cache.put_nowait(self.conn)
+        except Queue.Full:
+            self.conn.close()
diff --git a/dejavu/database.py b/dejavu/database.py
index f03e33e..018ddd4 100644
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -1,40 +1,44 @@
-import MySQLdb as mysql
-import MySQLdb.cursors as cursors
-import os
+from __future__ import absolute_import
+from binascii import unhexlify
 
-class SQLDatabase():
+class Database(object):
+    def __init__(self):
+        super(Database, self).__init__()
+
+
+class SQLDatabase(Database):
     """
     Queries:
 
     1) Find duplicates (shouldn't be any, though):
 
-        select `hash`, `song_id`, `offset`, count(*) cnt 
-        from fingerprints 
-        group by `hash`, `song_id`, `offset` 
+        select `hash`, `song_id`, `offset`, count(*) cnt
+        from fingerprints
+        group by `hash`, `song_id`, `offset`
         having cnt > 1
         order by cnt asc;
 
     2) Get number of hashes by song:
 
-        select song_id, song_name, count(song_id) as num 
-        from fingerprints 
+        select song_id, song_name, count(song_id) as num
+        from fingerprints
         natural join songs
-        group by song_id 
+        group by song_id
         order by count(song_id) desc;
 
     3) get hashes with highest number of collisions
 
-        select 
-            hash, 
-            count(distinct song_id) as n 
-        from fingerprints 
-        group by `hash` 
+        select
+            hash,
+            count(distinct song_id) as n
+        from fingerprints
+        group by `hash`
         order by n DESC;
 
     => 26 different songs with same fingerprint (392 times):
-    
-        select songs.song_name, fingerprints.offset 
-        from fingerprints natural join songs 
+
+        select songs.song_name, fingerprints.offset
+        from fingerprints natural join songs
         where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73";
     """
 
@@ -57,269 +61,250 @@ class SQLDatabase():
     FIELD_FINGERPRINTED = "fingerprinted"
 
     # creates
-    CREATE_FINGERPRINTS_TABLE = """ 
-    CREATE TABLE IF NOT EXISTS `%s` ( 
-         `%s` binary(10) not null,
-         `%s` mediumint unsigned not null, 
-         `%s` int unsigned not null, 
+    CREATE_FINGERPRINTS_TABLE = """
+        CREATE TABLE IF NOT EXISTS `%s` (
+             `%s` binary(10) not null,
+             `%s` mediumint unsigned not null,
+             `%s` int unsigned not null,
          INDEX(%s),
          UNIQUE(%s, %s, %s)
-    );""" % (FINGERPRINTS_TABLENAME, FIELD_HASH, 
-            FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
-            FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH)
-    
+    );""" % (
+        FINGERPRINTS_TABLENAME, FIELD_HASH,
+        FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
+        FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
+    )
+
     CREATE_SONGS_TABLE = """
-    CREATE TABLE IF NOT EXISTS `%s` (
-        `%s` mediumint unsigned not null auto_increment, 
-        `%s` varchar(250) not null,
-        `%s` tinyint default 0,
+        CREATE TABLE IF NOT EXISTS `%s` (
+            `%s` mediumint unsigned not null auto_increment,
+            `%s` varchar(250) not null,
+            `%s` tinyint default 0,
         PRIMARY KEY (`%s`),
         UNIQUE KEY `%s` (`%s`)
-    );""" % (SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED,
-            FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID)
+    );""" % (
+        SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED,
+        FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID,
+    )
+
+    # inserts (ignores duplicates)
+    INSERT_FINGERPRINT = """
+        INSERT IGNORE INTO %s (%s, %s, %s) VALUES
+            (UNHEX(%%s), %%s, %%s);
+    """ % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET)
 
-    # inserts
-    INSERT_FINGERPRINT = "INSERT IGNORE INTO %s (%s, %s, %s) VALUES (UNHEX(%%s), %%s, %%s)" % (
-        FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET) # ignore duplicates and don't insert them
     INSERT_SONG = "INSERT INTO %s (%s) VALUES (%%s);" % (
         SONGS_TABLENAME, FIELD_SONGNAME)
 
     # selects
-    SELECT = "SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH)
-    SELECT_ALL = "SELECT %s, %s FROM %s;" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME)
-    SELECT_SONG = "SELECT %s FROM %s WHERE %s = %%s" % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)
-    SELECT_NUM_FINGERPRINTS = "SELECT COUNT(*) as n FROM %s" % (FINGERPRINTS_TABLENAME)
-    
-    SELECT_UNIQUE_SONG_IDS = "SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;" % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
-    SELECT_SONGS = "SELECT %s, %s FROM %s WHERE %s = 1;" % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED)
+    SELECT = """
+        SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
+    """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH)
+
+    SELECT_MULTIPLE = """
+        SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
+    """ % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET,
+           FINGERPRINTS_TABLENAME, FIELD_HASH)
+
+    SELECT_ALL = """
+        SELECT %s, %s FROM %s;
+    """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME)
+
+    SELECT_SONG = """
+        SELECT %s FROM %s WHERE %s = %%s
+    """ % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)
+
+    SELECT_NUM_FINGERPRINTS = """
+        SELECT COUNT(*) as n FROM %s
+    """ % (FINGERPRINTS_TABLENAME)
+
+    SELECT_UNIQUE_SONG_IDS = """
+        SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
+    """ % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
+
+    SELECT_SONGS = """
+        SELECT %s, %s FROM %s WHERE %s = 1;
+    """ % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED)
 
     # drops
     DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME
     DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME
 
     # update
-    UPDATE_SONG_FINGERPRINTED = "UPDATE %s SET %s = 1 WHERE %s = %%s" % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID)
+    UPDATE_SONG_FINGERPRINTED = """
+        UPDATE %s SET %s = 1 WHERE %s = %%s
+    """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID)
 
     # delete
-    DELETE_UNFINGERPRINTED = "DELETE FROM %s WHERE %s = 0;" % (SONGS_TABLENAME, FIELD_FINGERPRINTED)
+    DELETE_UNFINGERPRINTED = """
+        DELETE FROM %s WHERE %s = 0;
+    """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED)
+
     DELETE_ORPHANS = """
-    delete from fingerprints 
-    where not exists (
-        select * from songs where fingerprints.song_id  = songs.song_id
-    )"""
-    
-    def __init__(self, hostname, username, password, database):
-        # connect
-        self.database = database
-        try:
-            # http://www.halfcooked.com/mt/archives/000969.html
-            self.connection = mysql.connect(
-                hostname, username, password, 
-                database, cursorclass=cursors.DictCursor)  
+        delete from fingerprints
+        where not exists (
+            select * from songs where fingerprints.song_id  = songs.song_id
+        );
+    """
 
-            self.connection.autocommit(False) # for fast bulk inserts
-            self.cursor = self.connection.cursor()
-
-        except mysql.Error, e:
-            print "Connection error %d: %s" % (e.args[0], e.args[1])
+    def __init__(self, cursor):
+        super(SQLDatabase, self).__init__()
+        self.cursor = cursor
 
     def setup(self):
-        try:
-            # create fingerprints table
-            self.cursor.execute("USE %s;" % self.database)
-            self.cursor.execute(SQLDatabase.CREATE_FINGERPRINTS_TABLE)
-            self.cursor.execute(SQLDatabase.CREATE_SONGS_TABLE)
-            self.delete_unfingerprinted_songs()
-            self.connection.commit()
-        except mysql.Error, e:
-            print "Connection error %d: %s" % (e.args[0], e.args[1])
-            self.connection.rollback()
+        with self.cursor() as cur:
+            cur.execute(self.CREATE_FINGERPRINTS_TABLE)
+            cur.execute(self.CREATE_SONGS_TABLE)
+            cur.execute(self.DELETE_UNFINGERPRINTED)
 
     def empty(self):
         """
             Drops all tables and re-adds them. Be carfeul with this!
         """
-        try:
-            self.cursor.execute("USE %s;" % self.database)
+        with self.cursor() as cur:
+            cur.execute(self.DROP_FINGERPRINTS)
+            cur.execute(self.DROP_SONGS)
 
-            # drop tables
-            self.cursor.execute(SQLDatabase.DROP_FINGERPRINTS)
-            self.cursor.execute(SQLDatabase.DROP_SONGS)
+        self.setup()
 
-            # recreate
-            self.cursor.execute(SQLDatabase.CREATE_FINGERPRINTS_TABLE)
-            self.cursor.execute(SQLDatabase.CREATE_SONGS_TABLE)
-            self.connection.commit()
-
-        except mysql.Error, e:
-            print "Error in empty(), %d: %s" % (e.args[0], e.args[1])
-            self.connection.rollback()
-            
     def delete_orphans(self):
-        try:
-            self.cursor = self.connection.cursor()
-            ### TODO: SQLDatabase.DELETE_ORPHANS is not performant enough, need better query
-            ###     to delete fingerprints for which no song is tied to.
-            #self.cursor.execute(SQLDatabase.DELETE_ORPHANS)
-            #self.connection.commit()
-        except mysql.Error, e:
-            print "Error in delete_orphans(), %d: %s" % (e.args[0], e.args[1])
-            self.connection.rollback()
-    
+        # TODO: SQLDatabase.DELETE_ORPHANS is not
+        # performant enough, need better query to
+        # delete fingerprints for which no song is tied to.
+
+        # with self.cursor() as cur:
+        #   cur.execute(self.DELETE_ORPHANS)
+        pass
+
     def delete_unfingerprinted_songs(self):
-        try:
-            self.cursor = self.connection.cursor()
-            self.cursor.execute(SQLDatabase.DELETE_UNFINGERPRINTED)
-            self.connection.commit()
-        except mysql.Error, e:
-            print "Error in delete_unfingerprinted_songs(), %d: %s" % (e.args[0], e.args[1])
-            self.connection.rollback()
+        with self.cursor() as cur:
+            cur.execute(self.DELETE_UNFINGERPRINTED)
 
     def get_num_songs(self):
         """
-            Returns number of songs the database has fingerprinted.
+        Returns number of songs the database has fingerprinted.
         """
-        try:
-            self.cursor = self.connection.cursor()
-            self.cursor.execute(SQLDatabase.SELECT_UNIQUE_SONG_IDS)
-            record = self.cursor.fetchone()
-            return int(record['n'])
-        except mysql.Error, e:
-            print "Error in get_num_songs(), %d: %s" % (e.args[0], e.args[1])
-            
+        with self.cursor() as cur:
+            cur.execute(self.SELECT_UNIQUE_SONG_IDS)
+
+            for row in cur:
+                return row['n']
+
     def get_num_fingerprints(self):
         """
-            Returns number of fingerprints the database has fingerprinted.
+        Returns number of fingerprints the database has fingerprinted.
         """
-        try:
-            self.cursor = self.connection.cursor()
-            self.cursor.execute(SQLDatabase.SELECT_NUM_FINGERPRINTS)
-            record = self.cursor.fetchone()
-            return int(record['n'])
-        except mysql.Error, e:
-            print "Error in get_num_songs(), %d: %s" % (e.args[0], e.args[1])
-    
+        with self.cursor() as cur:
+            cur.execute(self.SELECT_NUM_FINGERPRINTS)
 
-    def set_song_fingerprinted(self, song_id):
+            for row in cur:
+                return row['n']
+
+    def set_song_fingerprinted(self, sid):
         """
-            Set the fingerprinted flag to TRUE (1) once a song has been completely
-            fingerprinted in the database. 
+        Set the fingerprinted flag to TRUE (1) once a song has been completely
+        fingerprinted in the database.
         """
-        try:
-            self.cursor = self.connection.cursor()
-            self.cursor.execute(SQLDatabase.UPDATE_SONG_FINGERPRINTED, song_id)
-            self.connection.commit()
-        except mysql.Error, e:
-            print "Error in  set_song_fingerprinted(), %d: %s" % (e.args[0], e.args[1])
-            self.connection.rollback()
+        with self.cursor() as cur:
+            cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,))
 
     def get_songs(self):
         """
-            Return songs that have the fingerprinted flag set TRUE (1). 
+        Return songs that have the fingerprinted flag set TRUE (1).
         """
-        try:
-            self.cursor.execute(SQLDatabase.SELECT_SONGS)
-            return self.cursor.fetchall()
-        except mysql.Error, e:
-            print "Error in get_songs(), %d: %s" % (e.args[0], e.args[1])
-            return None
-            
+        with self.cursor() as cur:
+            cur.execute(self.SELECT_SONGS)
+            for row in cur:
+                yield row
+
     def get_song_by_id(self, sid):
         """
-            Returns song by its ID.
+        Returns song by its ID.
         """
-        try:
-            self.cursor.execute(SQLDatabase.SELECT_SONG, (sid,))
-            return self.cursor.fetchone()
-        except mysql.Error, e:
-            print "Error in get_songs(), %d: %s" % (e.args[0], e.args[1])
-            return None 
-    
+        with self.cursor() as cur:
+            cur.execute(self.SELECT_SONG, (sid,))
+            return cur.fetchone()
 
-    def insert(self, key, value):
+    def insert(self, hash, sid, offset):
         """
-            Insert a (sha1, song_id, offset) row into database. 
-
-            key is a sha1 hash, value = (song_id, offset)
+        Insert a (sha1, song_id, offset) row into database.
         """
-        try:
-            args = (key, value[0], value[1])
-            self.cursor.execute(SQLDatabase.INSERT_FINGERPRINT, args)
-        except mysql.Error, e:
-            print "Error in insert(), %d: %s" % (e.args[0], e.args[1])
-            self.connection.rollback()
+        with self.cursor() as cur:
+            cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
 
     def insert_song(self, songname):
         """
-            Inserts song in the database and returns the ID of the inserted record.
+        Inserts song in the database and returns the ID of the inserted record.
         """
-        try:
-            self.cursor.execute(SQLDatabase.INSERT_SONG, (songname,))
-            self.connection.commit()
-            return int(self.cursor.lastrowid)
-        except mysql.Error, e:
-            print "Error in insert_song(), %d: %s" % (e.args[0], e.args[1])
-            self.connection.rollback()
-            return None
+        with self.cursor() as cur:
+            cur.execute(self.INSERT_SONG, (songname,))
+            return cur.lastrowid
 
-    def query(self, key):
+    def query(self, hash):
         """
-            Return all tuples associated with hash. 
+        Return all tuples associated with hash.
 
-            If hash is None, returns all entries in the 
-            database (be careful with that one!).
+        If hash is None, returns all entries in the
+        database (be careful with that one!).
         """
         # select all if no key
-        if key is not None:
-            sql = SQLDatabase.SELECT
-        else:
-            sql = SQLDatabase.SELECT_ALL
+        query = self.SELECT_ALL if hash is None else self.SELECT
 
-        matches = []
-        try:
-            self.cursor.execute(sql, (key,))
-
-            # collect all matches
-            records = self.cursor.fetchall()
-            for record in records:
-                matches.append((record[SQLDatabase.FIELD_SONG_ID], record[SQLDatabase.FIELD_OFFSET]))
-
-        except mysql.Error, e:
-            print "Error in query(), %d: %s" % (e.args[0], e.args[1])
-
-        return matches
+        with self.cursor() as cur:
+            cur.execute(query)
+            for row in cur:
+                yield (row[self.FIELD_SONG_ID], row[self.FIELD_OFFSET])
 
     def get_iterable_kv_pairs(self):
         """
-            Returns all tuples in database. 
+        Returns all tuples in database.
         """
         return self.query(None)
 
     def insert_hashes(self, hashes):
         """
-            Insert series of hash => song_id, offset
-            values into the database. 
+        Insert series of hash => song_id, offset
+        values into the database.
         """
-        for h in hashes:
-            sha1, val = h
-            self.insert(sha1, val)
-        self.connection.commit()
+        # TODO: Fix this when hashes will be a new format.
+        values = []
+        for hash, (sid, offset) in hashes:
+            values.append((hash, sid, offset))
+
+        with self.cursor() as cur:
+            cur.executemany(self.INSERT_FINGERPRINT, values)
 
     def return_matches(self, hashes):
         """
-            Return the (song_id, offset_diff) tuples associated with 
-            a list of 
+            Return the (song_id, offset_diff) tuples associated with
+            a list of
 
                 sha1 => (None, sample_offset)
 
             values.
         """
-        matches = []
-        for h in hashes:
-            sha1, val = h
-            list_of_tups = self.query(sha1)
-            if list_of_tups:
-                for t in list_of_tups:
-                    # (song_id, db_offset, song_sampled_offset)
-                    matches.append((t[0], t[1] - val[1]))
-        return matches
+        from pymysql.cursors import Cursor
+        # Create a dictionary of hash => offset pairs for later lookups
+        mapper = {}
+        for hash, (_, offset) in hashes:
+            mapper[hash.upper()] = offset
+
+        # Get an iteratable of all the hashes we need
+        values = mapper.keys()
+
+        with self.cursor(cursor_type=Cursor) as cur:
+            for split_values in grouper(values, 1000):
+                # Create our IN part of the query
+                query = self.SELECT_MULTIPLE
+                query = query % ', '.join(['UNHEX(%s)'] * len(split_values))
+
+                cur.execute(query, split_values)
+
+                for hash, sid, offset in cur:
+                    # (sid, db_offset - song_sampled_offset)
+                    yield (sid, offset - mapper[hash])
+
+
+from itertools import izip_longest
+def grouper(iterable, n, fillvalue=None):
+    args = [iter(iterable)] * n
+    return izip_longest(fillvalue=fillvalue, *args)

From 3bc507d8e42f84e395209fbd6d7504e8ef799080 Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Tue, 17 Dec 2013 00:48:49 +0000
Subject: [PATCH 04/27] Changed insert_hashes to fit the new data format

---
 dejavu/database.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 dejavu/database.py

diff --git a/dejavu/database.py b/dejavu/database.py
old mode 100644
new mode 100755
index 018ddd4..78b1237
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -260,14 +260,13 @@ class SQLDatabase(Database):
         """
         return self.query(None)
 
-    def insert_hashes(self, hashes):
+    def insert_hashes(self, sid, hashes):
         """
         Insert series of hash => song_id, offset
         values into the database.
         """
-        # TODO: Fix this when hashes will be a new format.
         values = []
-        for hash, (sid, offset) in hashes:
+        for hash, offset in hashes:
             values.append((hash, sid, offset))
 
         with self.cursor() as cur:

From ab2cf9d58b1c668f0861bf076f6582c9a3141c3a Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Tue, 17 Dec 2013 00:48:49 +0000
Subject: [PATCH 05/27] Changed insert_hashes to fit the new data format

---
 dejavu/database.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 dejavu/database.py

diff --git a/dejavu/database.py b/dejavu/database.py
old mode 100644
new mode 100755
index 018ddd4..78b1237
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -260,14 +260,13 @@ class SQLDatabase(Database):
         """
         return self.query(None)
 
-    def insert_hashes(self, hashes):
+    def insert_hashes(self, sid, hashes):
         """
         Insert series of hash => song_id, offset
         values into the database.
         """
-        # TODO: Fix this when hashes will be a new format.
         values = []
-        for hash, (sid, offset) in hashes:
+        for hash, offset in hashes:
             values.append((hash, sid, offset))
 
         with self.cursor() as cur:

From bed11f3de7b0020f91fcf08458bff3987fdc9f81 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Tue, 17 Dec 2013 02:00:05 +0100
Subject: [PATCH 06/27] Switched back to MySQLdb for better support of
 executemany.

---
 dejavu/cursor.py   | 10 +++++-----
 dejavu/database.py | 11 ++++++-----
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/dejavu/cursor.py b/dejavu/cursor.py
index 1b46b52..07649c6 100644
--- a/dejavu/cursor.py
+++ b/dejavu/cursor.py
@@ -2,8 +2,8 @@ from __future__ import unicode_literals
 from __future__ import absolute_import
 import Queue
 
-import pymysql
-import pymysql.cursors
+import MySQLdb as mysql
+import MySQLdb.cursors
 
 
 def cursor_factory(**factory_options):
@@ -26,18 +26,18 @@ class Cursor(object):
     """
     _cache = Queue.Queue(maxsize=5)
 
-    def __init__(self, cursor_type=pymysql.cursors.DictCursor, **options):
+    def __init__(self, cursor_type=mysql.cursors.DictCursor, **options):
         super(Cursor, self).__init__()
 
         try:
             conn = self._cache.get_nowait()
         except Queue.Empty:
-            conn = pymysql.connect(**options)
+            conn = mysql.connect(**options)
 
         self.conn = conn
         self.cursor_type = cursor_type
 
-    def __enter__(self):
+    def __enter__(slf):
         self.cursor = self.conn.cursor(self.cursor_type)
         return self.cursor
 
diff --git a/dejavu/database.py b/dejavu/database.py
index 78b1237..a08bfcd 100755
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -1,6 +1,8 @@
 from __future__ import absolute_import
 from binascii import unhexlify
 
+from MySQLdb.cursors import Cursor
+
 class Database(object):
     def __init__(self):
         super(Database, self).__init__()
@@ -274,14 +276,13 @@ class SQLDatabase(Database):
 
     def return_matches(self, hashes):
         """
-            Return the (song_id, offset_diff) tuples associated with
-            a list of
+        Return the (song_id, offset_diff) tuples associated with
+        a list of
 
-                sha1 => (None, sample_offset)
+            sha1 => (None, sample_offset)
 
-            values.
+        values.
         """
-        from pymysql.cursors import Cursor
         # Create a dictionary of hash => offset pairs for later lookups
         mapper = {}
         for hash, (_, offset) in hashes:

From 25bf97e813fc28001e0a45d4768608f81d036bb2 Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Tue, 17 Dec 2013 16:37:06 +0000
Subject: [PATCH 07/27] Changed a method to fit the new hash tuples

---
 dejavu/database.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/dejavu/database.py b/dejavu/database.py
index a08bfcd..5d0222f 100755
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -277,15 +277,11 @@ class SQLDatabase(Database):
     def return_matches(self, hashes):
         """
         Return the (song_id, offset_diff) tuples associated with
-        a list of
-
-            sha1 => (None, sample_offset)
-
-        values.
+        a list of (sha1, sample_offset) values.
         """
         # Create a dictionary of hash => offset pairs for later lookups
         mapper = {}
-        for hash, (_, offset) in hashes:
+        for hash, offset in hashes:
             mapper[hash.upper()] = offset
 
         # Get an iteratable of all the hashes we need

From f02ab9419255b4d7dcf6705f0c77f1e303327d57 Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Tue, 17 Dec 2013 20:18:55 +0000
Subject: [PATCH 08/27] Cleaned up Dejavu class calls (find_matches,
 align_matches)

---
 dejavu/__init__.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index b44897d..33414f7 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -118,11 +118,22 @@ class Dejavu():
             channels.append(frames[:, channel])
         return (channels, Fs)
     
-    def match(self, samples, Fs=fingerprint.DEFAULT_FS):
+    def fingerprint(self, filepath, song_name=None):
+        # TODO: replace with something that handles all audio formats
+        channels, Fs = self.extract_channels(path)
+        if not song_name:
+            song_name = os.path.basename(filename).split(".")[0]
+        song_id = self.db.insert_song(song_name)
+        
+        for data in channels:
+            hashes = fingerprint.fingerprint(data, Fs=Fs)
+            self.db.insert_hashes(song_id, hashes)
+    
+    def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
         hashes = fingerprint.fingerprint(samples, Fs=Fs)
         return self.db.return_matches(hashes)
     
-    def align_matches(self, matches, starttime, record_seconds=None):
+    def align_matches(self, matches):
         """
             Finds hash matches that align in time with other matches and finds
             consensus about which hashes are "true" signal from the audio.
@@ -156,8 +167,6 @@ class Dejavu():
             songname = song.get(SQLDatabase.FIELD_SONGNAME, None)
         else:
             return None
-        songname = songname.replace("_", " ")
-        elapsed = time.time() - starttime
         
         if DEBUG: 
             print("Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed))
@@ -166,11 +175,7 @@ class Dejavu():
         song = {
             "song_id" : song_id,
             "song_name" : songname,
-            "match_time" : elapsed,
             "confidence" : largest_count
         }
-        
-        if record_seconds: 
-            song['record_time'] = record_seconds
             
         return song
\ No newline at end of file

From 6f4cadafbb4780eec6ee78527a38427a968e5be9 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Tue, 17 Dec 2013 21:55:05 +0100
Subject: [PATCH 09/27] Added a foreign key relationship to the create table
 statements.

- MySQL will also use the InnoDB engine now.
- Added a ping call in the MySQL Cursor cache mechanism
- Added a rollback call when a MySQLError occurs
- Removed 'delete_orphans' which is not needed anymore due to foreign key constraints and delete on cascade
- Changed SQLDatabase to accept options to create a cursor factory, instead of taking a pre-created cursor factory
---
 dejavu/cursor.py   | 11 ++++++++--
 dejavu/database.py | 52 ++++++++++++++++++++++++----------------------
 2 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/dejavu/cursor.py b/dejavu/cursor.py
index 07649c6..50146f2 100644
--- a/dejavu/cursor.py
+++ b/dejavu/cursor.py
@@ -33,15 +33,22 @@ class Cursor(object):
             conn = self._cache.get_nowait()
         except Queue.Empty:
             conn = mysql.connect(**options)
+        else:
+            # Ping the connection before using it from the cache.
+            conn.ping(True)
 
         self.conn = conn
         self.cursor_type = cursor_type
 
-    def __enter__(slf):
+    def __enter__(self):
         self.cursor = self.conn.cursor(self.cursor_type)
         return self.cursor
 
-    def __exit__(self, type, value, traceback):
+    def __exit__(self, extype, exvalue, traceback):
+        # if we had a MySQL related error we try to rollback the cursor.
+        if extype is mysql.MySQLError:
+            self.cursor.rollback()
+
         self.cursor.close()
         self.conn.commit()
 
diff --git a/dejavu/database.py b/dejavu/database.py
index a08bfcd..082484a 100755
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -1,8 +1,10 @@
 from __future__ import absolute_import
-from binascii import unhexlify
+from itertools import izip_longest
 
+from dejavu.cursor import cursor_factory
 from MySQLdb.cursors import Cursor
 
+
 class Database(object):
     def __init__(self):
         super(Database, self).__init__()
@@ -69,11 +71,13 @@ class SQLDatabase(Database):
              `%s` mediumint unsigned not null,
              `%s` int unsigned not null,
          INDEX(%s),
-         UNIQUE(%s, %s, %s)
-    );""" % (
+         UNIQUE(%s, %s, %s),
+         FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
+    ) ENGINE=INNODB;""" % (
         FINGERPRINTS_TABLENAME, FIELD_HASH,
         FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
         FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
+        FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID
     )
 
     CREATE_SONGS_TABLE = """
@@ -83,7 +87,7 @@ class SQLDatabase(Database):
             `%s` tinyint default 0,
         PRIMARY KEY (`%s`),
         UNIQUE KEY `%s` (`%s`)
-    );""" % (
+    ) ENGINE=INNODB;""" % (
         SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED,
         FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID,
     )
@@ -141,18 +145,17 @@ class SQLDatabase(Database):
         DELETE FROM %s WHERE %s = 0;
     """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED)
 
-    DELETE_ORPHANS = """
-        delete from fingerprints
-        where not exists (
-            select * from songs where fingerprints.song_id  = songs.song_id
-        );
-    """
-
-    def __init__(self, cursor):
+    def __init__(self, **options):
         super(SQLDatabase, self).__init__()
-        self.cursor = cursor
+        self.cursor = cursor_factory(**options)
 
     def setup(self):
+        """
+        Creates any non-existing tables required for dejavu to function.
+
+        This also removes all songs that have been added but have no
+        fingerprints associated with them.
+        """
         with self.cursor() as cur:
             cur.execute(self.CREATE_FINGERPRINTS_TABLE)
             cur.execute(self.CREATE_SONGS_TABLE)
@@ -160,7 +163,11 @@ class SQLDatabase(Database):
 
     def empty(self):
         """
-            Drops all tables and re-adds them. Be carfeul with this!
+        Drops tables created by dejavu and then creates them again
+        by calling `SQLDatabase.setup`.
+
+        .. warning:
+            This will result in a loss of data
         """
         with self.cursor() as cur:
             cur.execute(self.DROP_FINGERPRINTS)
@@ -168,16 +175,11 @@ class SQLDatabase(Database):
 
         self.setup()
 
-    def delete_orphans(self):
-        # TODO: SQLDatabase.DELETE_ORPHANS is not
-        # performant enough, need better query to
-        # delete fingerprints for which no song is tied to.
-
-        # with self.cursor() as cur:
-        #   cur.execute(self.DELETE_ORPHANS)
-        pass
 
     def delete_unfingerprinted_songs(self):
+        """
+        Removes all songs that have no fingerprints associated with them.
+        """
         with self.cursor() as cur:
             cur.execute(self.DELETE_UNFINGERPRINTED)
 
@@ -188,8 +190,9 @@ class SQLDatabase(Database):
         with self.cursor() as cur:
             cur.execute(self.SELECT_UNIQUE_SONG_IDS)
 
-            for row in cur:
-                return row['n']
+            for count, in cur:
+                return count
+            return 0
 
     def get_num_fingerprints(self):
         """
@@ -304,7 +307,6 @@ class SQLDatabase(Database):
                     yield (sid, offset - mapper[hash])
 
 
-from itertools import izip_longest
 def grouper(iterable, n, fillvalue=None):
     args = [iter(iterable)] * n
     return izip_longest(fillvalue=fillvalue, *args)

From 371742a3140aeda6cba66c92e0962ccef2e4568f Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Tue, 17 Dec 2013 20:55:20 +0000
Subject: [PATCH 10/27] Began moving recognizer functionality into separate
 classes

---
 dejavu/recognize.py | 66 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 18 deletions(-)
 mode change 100644 => 100755 dejavu/recognize.py

diff --git a/dejavu/recognize.py b/dejavu/recognize.py
old mode 100644
new mode 100755
index bcb5d3c..f6a5de5
--- a/dejavu/recognize.py
+++ b/dejavu/recognize.py
@@ -1,5 +1,7 @@
 from multiprocessing import Queue, Process
 from dejavu.database import SQLDatabase
+import dejavu.fingerprint
+from dejavu import Dejavu
 from scipy.io import wavfile
 import wave
 import numpy as np
@@ -8,6 +10,52 @@ import sys
 import time
 import array
 
+
+class BaseRecognizer(object):
+    
+    def __init__(self, dejavu):
+        self.dejavu = dejavu
+        self.Fs = dejavu.fingerprint.DEFAULT_FS
+    
+    def recognize(self, *data):
+        matches = []
+        for d in data:
+            matches.extend(self.dejavu.find_matches(data, Fs=self.Fs))
+        return self.dejavu.align_matches(matches)
+
+
+class WaveFileRecognizer(BaseRecognizer):
+    
+    def __init__(self, dejavu):
+        super(BaseRecognizer, self).__init__(dejavu)
+    
+    def recognize_file(self, filepath):
+        Fs, frames = wavfile.read(filename)
+        self.Fs = Fs
+        
+        wave_object = wave.open(filename)
+        nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams()
+        
+        channels = []
+        for channel in range(nchannels):
+            channels.append(frames[:, channel])
+        
+        t = time.time()
+        match = self.recognize(*channels)
+        t = time.time() - t
+        
+        if match:
+            match['match_time'] = t
+        
+        return match
+
+
+class MicrophoneRecognizer(BaseRecognizer):
+    pass
+    
+    
+
+
 class Recognizer(object):
 
     CHUNK = 8192 # 44100 is a multiple of 1225
@@ -20,24 +68,6 @@ class Recognizer(object):
         self.fingerprinter = fingerprinter
         self.config = config
         self.audio = pyaudio.PyAudio()
-        
-    def read(self, filename, verbose=False):
-        
-        # read file into channels            
-        channels = []
-        Fs, frames = wavfile.read(filename)
-        wave_object = wave.open(filename)
-        nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams()
-        for channel in range(nchannels):
-            channels.append(frames[:, channel])    
-        
-        # get matches
-        starttime = time.time()
-        matches = []
-        for channel in channels:
-            matches.extend(self.fingerprinter.match(channel))
-        
-        return self.fingerprinter.align_matches(matches, starttime, verbose=verbose)
     
     def listen(self, seconds=10, verbose=False):
 

From 05adf3bc318bcef7a5472ee3ac2131368d943d78 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Tue, 17 Dec 2013 22:02:59 +0100
Subject: [PATCH 11/27] Changed default cursor type for mysql from DictCursor
 to Cursor.

---
 dejavu/cursor.py   |  2 +-
 dejavu/database.py | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/dejavu/cursor.py b/dejavu/cursor.py
index 50146f2..55429a7 100644
--- a/dejavu/cursor.py
+++ b/dejavu/cursor.py
@@ -26,7 +26,7 @@ class Cursor(object):
     """
     _cache = Queue.Queue(maxsize=5)
 
-    def __init__(self, cursor_type=mysql.cursors.DictCursor, **options):
+    def __init__(self, cursor_type=mysql.cursors.Cursor, **options):
         super(Cursor, self).__init__()
 
         try:
diff --git a/dejavu/database.py b/dejavu/database.py
index 082484a..a43b68a 100755
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 from itertools import izip_longest
 
 from dejavu.cursor import cursor_factory
-from MySQLdb.cursors import Cursor
+from MySQLdb.cursors import DictCursor
 
 
 class Database(object):
@@ -175,7 +175,6 @@ class SQLDatabase(Database):
 
         self.setup()
 
-
     def delete_unfingerprinted_songs(self):
         """
         Removes all songs that have no fingerprints associated with them.
@@ -201,8 +200,9 @@ class SQLDatabase(Database):
         with self.cursor() as cur:
             cur.execute(self.SELECT_NUM_FINGERPRINTS)
 
-            for row in cur:
-                return row['n']
+            for count, in cur:
+                return count
+            return 0
 
     def set_song_fingerprinted(self, sid):
         """
@@ -216,7 +216,7 @@ class SQLDatabase(Database):
         """
         Return songs that have the fingerprinted flag set TRUE (1).
         """
-        with self.cursor() as cur:
+        with self.cursor(cursor_type=DictCursor) as cur:
             cur.execute(self.SELECT_SONGS)
             for row in cur:
                 yield row
@@ -225,7 +225,7 @@ class SQLDatabase(Database):
         """
         Returns song by its ID.
         """
-        with self.cursor() as cur:
+        with self.cursor(cursor_type=DictCursor) as cur:
             cur.execute(self.SELECT_SONG, (sid,))
             return cur.fetchone()
 
@@ -256,8 +256,8 @@ class SQLDatabase(Database):
 
         with self.cursor() as cur:
             cur.execute(query)
-            for row in cur:
-                yield (row[self.FIELD_SONG_ID], row[self.FIELD_OFFSET])
+            for sid, offset in cur:
+                yield (sid, offset)
 
     def get_iterable_kv_pairs(self):
         """
@@ -294,7 +294,7 @@ class SQLDatabase(Database):
         # Get an iteratable of all the hashes we need
         values = mapper.keys()
 
-        with self.cursor(cursor_type=Cursor) as cur:
+        with self.cursor() as cur:
             for split_values in grouper(values, 1000):
                 # Create our IN part of the query
                 query = self.SELECT_MULTIPLE

From d25c564d4f82f8e3c7c8fa023d05f3b9fd3ed791 Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Tue, 17 Dec 2013 21:57:59 +0000
Subject: [PATCH 12/27] Finished the changes to the recognize module  -
 Recognizers now use a class structure  - Generic code for matching is in
 BaseRecognizer  - Two recognizers are available:   - Wave file recognizer   -
 Recording recognizer

---
 dejavu/recognize.py | 121 ++++++++++++++++++++++++++------------------
 1 file changed, 72 insertions(+), 49 deletions(-)

diff --git a/dejavu/recognize.py b/dejavu/recognize.py
index f6a5de5..6cc1bd4 100755
--- a/dejavu/recognize.py
+++ b/dejavu/recognize.py
@@ -10,6 +10,10 @@ import sys
 import time
 import array
 
+CHUNK = 8192 # 44100 is a multiple of 1225
+FORMAT = pyaudio.paInt16
+CHANNELS = 2
+RATE = 44100
 
 class BaseRecognizer(object):
     
@@ -17,19 +21,25 @@ class BaseRecognizer(object):
         self.dejavu = dejavu
         self.Fs = dejavu.fingerprint.DEFAULT_FS
     
-    def recognize(self, *data):
+    def _recognize(self, *data):
         matches = []
         for d in data:
             matches.extend(self.dejavu.find_matches(data, Fs=self.Fs))
         return self.dejavu.align_matches(matches)
+    
+    def recognize(self):
+        pass # base class does nothing
+    
+    
 
 
 class WaveFileRecognizer(BaseRecognizer):
     
-    def __init__(self, dejavu):
+    def __init__(self, dejavu, filename=None):
         super(BaseRecognizer, self).__init__(dejavu)
+        self.filename = filename
     
-    def recognize_file(self, filepath):
+    def recognize_file(self, filename):
         Fs, frames = wavfile.read(filename)
         self.Fs = Fs
         
@@ -41,62 +51,75 @@ class WaveFileRecognizer(BaseRecognizer):
             channels.append(frames[:, channel])
         
         t = time.time()
-        match = self.recognize(*channels)
+        match = self._recognize(*channels)
         t = time.time() - t
         
         if match:
             match['match_time'] = t
         
         return match
+    
+    def recognize(self):
+        return self.recognize_file(self.filename)
 
 
 class MicrophoneRecognizer(BaseRecognizer):
-    pass
     
-    
-
-
-class Recognizer(object):
-
-    CHUNK = 8192 # 44100 is a multiple of 1225
-    FORMAT = pyaudio.paInt16
-    CHANNELS = 2
-    RATE = 44100
-
-    def __init__(self, fingerprinter, config):
-
-        self.fingerprinter = fingerprinter
-        self.config = config
+    def __init__(self, dejavu, seconds=None)
+        super(BaseRecognizer, self).__init__(dejavu)
         self.audio = pyaudio.PyAudio()
+        self.stream = None
+        self.data = []
+        self.channels = CHANNELS
+        self.chunk_size = CHUNK
+        self.rate = RATE
+        self.recorded = False
     
-    def listen(self, seconds=10, verbose=False):
+    def start_recording(self, channels=CHANNELS, rate=RATE, chunk=CHUNK):
+        self.chunk_size = chunk
+        self.channels = channels
+        self.recorded = False
+        self.rate = rate
+        
+        if self.stream:
+            self.stream.stop_stream()
+            self.stream.close()
+        
+        self.stream = self.audio.open(format=FORMAT,
+                                channels=channels,
+                                rate=rate,
+                                input=True,
+                                frames_per_buffer=chunk)
+        
+        self.data = [[] for i in range(channels)]
+    
+    def process_recording(self):
+        data = self.stream.read(self.chunk_size)
+        nums = np.fromstring(data, np.int16)
+        for c in range(self.channels):
+            self.data[c].extend(nums[c::c+1])
+    
+    def stop_recording(self):
+        self.stream.stop_stream()
+        self.stream.close()
+        self.stream = None
+        self.recorded = True
+        
+    def recognize_recording(self):
+        if not self.recorded:
+            raise NoRecordingError("Recording was not complete/begun")
+        return self._recognize(*data)
+    
+    def get_recorded_time(self):
+        return len(self.data[0]) / self.rate
+    
+    def recognize(self):
+        self.start_recording()
+        for i in range(0, int(self.rate / self.chunk * self.seconds)):
+            self.process_recording()
+        self.stop_recording()
+        return self.recognize_recording()
+    
+class NoRecordingError(Exception):
+    pass
 
-        # open stream
-        stream = self.audio.open(format=Recognizer.FORMAT,
-                        channels=Recognizer.CHANNELS,
-                        rate=Recognizer.RATE,
-                        input=True,
-                        frames_per_buffer=Recognizer.CHUNK)
-        
-        # record
-        if verbose: print("* recording")
-        left, right = [], []
-        for i in range(0, int(Recognizer.RATE / Recognizer.CHUNK * seconds)):
-            data = stream.read(Recognizer.CHUNK)
-            nums = np.fromstring(data, np.int16)
-            left.extend(nums[1::2])
-            right.extend(nums[0::2])
-        if verbose: print("* done recording")
-        
-        # close and stop the stream
-        stream.stop_stream()
-        stream.close()
-        
-        # match both channels
-        starttime = time.time()
-        matches = []
-        matches.extend(self.fingerprinter.match(left))
-        matches.extend(self.fingerprinter.match(right))
-        
-        # align and return
-        return self.fingerprinter.align_matches(matches, starttime, record_seconds=seconds, verbose=verbose)
\ No newline at end of file

From 788b3acebfd202b23d15bf4a1f984d0b680897d7 Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Tue, 17 Dec 2013 22:03:50 +0000
Subject: [PATCH 13/27] Added missing 'self'

---
 dejavu/recognize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dejavu/recognize.py b/dejavu/recognize.py
index 6cc1bd4..d743b16 100755
--- a/dejavu/recognize.py
+++ b/dejavu/recognize.py
@@ -108,7 +108,7 @@ class MicrophoneRecognizer(BaseRecognizer):
     def recognize_recording(self):
         if not self.recorded:
             raise NoRecordingError("Recording was not complete/begun")
-        return self._recognize(*data)
+        return self._recognize(*self.data)
     
     def get_recorded_time(self):
         return len(self.data[0]) / self.rate

From 614ad0d4cc1a2d92d3540aa73e39435e07638afc Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Tue, 17 Dec 2013 22:05:09 +0000
Subject: [PATCH 14/27] Moved mic recording constants inside
 MicrophoneRecognizer

---
 dejavu/recognize.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/dejavu/recognize.py b/dejavu/recognize.py
index d743b16..024e62f 100755
--- a/dejavu/recognize.py
+++ b/dejavu/recognize.py
@@ -10,10 +10,6 @@ import sys
 import time
 import array
 
-CHUNK = 8192 # 44100 is a multiple of 1225
-FORMAT = pyaudio.paInt16
-CHANNELS = 2
-RATE = 44100
 
 class BaseRecognizer(object):
     
@@ -65,6 +61,11 @@ class WaveFileRecognizer(BaseRecognizer):
 
 class MicrophoneRecognizer(BaseRecognizer):
     
+    CHUNK = 8192 # 44100 is a multiple of 1225
+    FORMAT = pyaudio.paInt16
+    CHANNELS = 2
+    RATE = 44100
+    
     def __init__(self, dejavu, seconds=None)
         super(BaseRecognizer, self).__init__(dejavu)
         self.audio = pyaudio.PyAudio()

From 1fcff7e2c5f68d0c672e1610ac9290459387d80e Mon Sep 17 00:00:00 2001
From: Vin <bevin@jabana.se>
Date: Tue, 17 Dec 2013 22:12:21 +0000
Subject: [PATCH 15/27] Fixed some import/class mistakes

---
 dejavu/__init__.py  | 6 +++---
 dejavu/recognize.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index 33414f7..340db94 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -1,6 +1,6 @@
 from dejavu.database import SQLDatabase
 from dejavu.convert import Converter
-import dejavu.fingerprint as fingerprint
+import fingerprint
 from scipy.io import wavfile
 from multiprocessing import Process
 import wave, os
@@ -44,7 +44,7 @@ class Dejavu():
         """
         return [lst[i::n] for i in xrange(n)]
 
-    def fingerprint(self, path, output, extensions, nprocesses):
+    def do_fingerprint(self, path, output, extensions, nprocesses):
 
         # convert files, shuffle order
         files = self.converter.find_files(path, extensions)
@@ -118,7 +118,7 @@ class Dejavu():
             channels.append(frames[:, channel])
         return (channels, Fs)
     
-    def fingerprint(self, filepath, song_name=None):
+    def fingerprint_file(self, filepath, song_name=None):
         # TODO: replace with something that handles all audio formats
         channels, Fs = self.extract_channels(path)
         if not song_name:
diff --git a/dejavu/recognize.py b/dejavu/recognize.py
index 024e62f..68700fd 100755
--- a/dejavu/recognize.py
+++ b/dejavu/recognize.py
@@ -32,7 +32,7 @@ class BaseRecognizer(object):
 class WaveFileRecognizer(BaseRecognizer):
     
     def __init__(self, dejavu, filename=None):
-        super(BaseRecognizer, self).__init__(dejavu)
+        super(WaveFileRecognizer, self).__init__(dejavu)
         self.filename = filename
     
     def recognize_file(self, filename):
@@ -66,8 +66,8 @@ class MicrophoneRecognizer(BaseRecognizer):
     CHANNELS = 2
     RATE = 44100
     
-    def __init__(self, dejavu, seconds=None)
-        super(BaseRecognizer, self).__init__(dejavu)
+    def __init__(self, dejavu, seconds=None):
+        super(MicrophoneRecognizer, self).__init__(dejavu)
         self.audio = pyaudio.PyAudio()
         self.stream = None
         self.data = []

From 3b72768f942b8cbd4659649e85bd50f6bc346ac3 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Wed, 18 Dec 2013 00:31:57 +0100
Subject: [PATCH 16/27] Fixed various small things that weren't caught before.

- Fixes SQL queries for table creations
- Table creation is now done in reverse order to accommodate the foreign key
- Fixed a typo in the BaseRecognizer that caused it to not work
- Changed configuration passed to Dejavu into a (nested) dictionary
---
 dejavu/__init__.py  | 72 +++++++++++++++++++--------------------------
 dejavu/database.py  |  4 +--
 dejavu/recognize.py | 57 +++++++++++++++++------------------
 3 files changed, 59 insertions(+), 74 deletions(-)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index 340db94..3222395 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -8,20 +8,14 @@ import random
 
 DEBUG = False
 
-class Dejavu():
-
+class Dejavu(object):
     def __init__(self, config):
-    
+
         self.config = config
-        
+
         # initialize db
-        database = SQLDatabase(
-            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME),
-            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME),
-            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD),
-            self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE))
-        self.db = database
-        
+        self.db = SQLDatabase(**config.get("database", {}))
+
         # create components
         self.converter = Converter()
         #self.fingerprinter = Fingerprinter(self.config)
@@ -30,16 +24,16 @@ class Dejavu():
         # get songs previously indexed
         self.songs = self.db.get_songs()
         self.songnames_set = set() # to know which ones we've computed before
-        if self.songs:
-            for song in self.songs:
-                song_id = song[SQLDatabase.FIELD_SONG_ID]
-                song_name = song[SQLDatabase.FIELD_SONGNAME]
-                self.songnames_set.add(song_name)
-                print "Added: %s to the set of fingerprinted songs..." % song_name
+
+        for song in self.songs:
+            song_name = song[self.db.FIELD_SONGNAME]
+
+            self.songnames_set.add(song_name)
+            print "Added: %s to the set of fingerprinted songs..." % song_name
 
     def chunkify(self, lst, n):
         """
-            Splits a list into roughly n equal parts. 
+            Splits a list into roughly n equal parts.
             http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts
         """
         return [lst[i::n] for i in xrange(n)]
@@ -55,25 +49,19 @@ class Dejavu():
         processes = []
         for i in range(nprocesses):
 
-            # need database instance since mysql connections shouldn't be shared across processes
-            sql_connection = SQLDatabase( 
-                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME),
-                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME),
-                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD),
-                self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE))
-
             # create process and start it
-            p = Process(target=self.fingerprint_worker, args=(files_split[i], sql_connection, output))
+            p = Process(target=self.fingerprint_worker,
+                        args=(files_split[i], self.db, output))
             p.start()
             processes.append(p)
 
         # wait for all processes to complete
         for p in processes:
             p.join()
-            
+
         # delete orphans
         # print "Done fingerprinting. Deleting orphaned fingerprints..."
-        # TODO: need a more performant query in database.py for the 
+        # TODO: need a more performant query in database.py for the
         #self.fingerprinter.db.delete_orphans()
 
     def fingerprint_worker(self, files, sql_connection, output):
@@ -82,7 +70,7 @@ class Dejavu():
 
             # if there are already fingerprints in database, don't re-fingerprint or convert
             song_name = os.path.basename(filename).split(".")[0]
-            if DEBUG and song_name in self.songnames_set: 
+            if DEBUG and song_name in self.songnames_set:
                 print("-> Already fingerprinted, continuing...")
                 continue
 
@@ -117,27 +105,27 @@ class Dejavu():
         for channel in range(nchannels):
             channels.append(frames[:, channel])
         return (channels, Fs)
-    
+
     def fingerprint_file(self, filepath, song_name=None):
         # TODO: replace with something that handles all audio formats
         channels, Fs = self.extract_channels(path)
         if not song_name:
             song_name = os.path.basename(filename).split(".")[0]
         song_id = self.db.insert_song(song_name)
-        
+
         for data in channels:
             hashes = fingerprint.fingerprint(data, Fs=Fs)
             self.db.insert_hashes(song_id, hashes)
-    
+
     def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
         hashes = fingerprint.fingerprint(samples, Fs=Fs)
         return self.db.return_matches(hashes)
-    
+
     def align_matches(self, matches):
         """
             Finds hash matches that align in time with other matches and finds
             consensus about which hashes are "true" signal from the audio.
-            
+
             Returns a dictionary with match information.
         """
         # align by diffs
@@ -158,24 +146,24 @@ class Dejavu():
                 largest_count = diff_counter[diff][sid]
                 song_id = sid
 
-        if DEBUG: 
+        if DEBUG:
             print("Diff is %d with %d offset-aligned matches" % (largest, largest_count))
-        
-        # extract idenfication      
+
+        # extract idenfication
         song = self.db.get_song_by_id(song_id)
         if song:
             songname = song.get(SQLDatabase.FIELD_SONGNAME, None)
         else:
             return None
-        
-        if DEBUG: 
+
+        if DEBUG:
             print("Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed))
-        
+
         # return match info
         song = {
             "song_id" : song_id,
             "song_name" : songname,
             "confidence" : largest_count
         }
-            
-        return song
\ No newline at end of file
+
+        return song
diff --git a/dejavu/database.py b/dejavu/database.py
index 929c4eb..75af1bf 100755
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -70,7 +70,7 @@ class SQLDatabase(Database):
              `%s` binary(10) not null,
              `%s` mediumint unsigned not null,
              `%s` int unsigned not null,
-         INDEX(%s),
+         PRIMARY KEY(%s),
          UNIQUE(%s, %s, %s),
          FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
     ) ENGINE=INNODB;""" % (
@@ -157,8 +157,8 @@ class SQLDatabase(Database):
         fingerprints associated with them.
         """
         with self.cursor() as cur:
-            cur.execute(self.CREATE_FINGERPRINTS_TABLE)
             cur.execute(self.CREATE_SONGS_TABLE)
+            cur.execute(self.CREATE_FINGERPRINTS_TABLE)
             cur.execute(self.DELETE_UNFINGERPRINTED)
 
     def empty(self):
diff --git a/dejavu/recognize.py b/dejavu/recognize.py
index 68700fd..9283c6a 100755
--- a/dejavu/recognize.py
+++ b/dejavu/recognize.py
@@ -1,6 +1,6 @@
 from multiprocessing import Queue, Process
 from dejavu.database import SQLDatabase
-import dejavu.fingerprint
+import dejavu.fingerprint as fingerprint
 from dejavu import Dejavu
 from scipy.io import wavfile
 import wave
@@ -12,60 +12,57 @@ import array
 
 
 class BaseRecognizer(object):
-    
+
     def __init__(self, dejavu):
         self.dejavu = dejavu
-        self.Fs = dejavu.fingerprint.DEFAULT_FS
-    
+        self.Fs = fingerprint.DEFAULT_FS
+
     def _recognize(self, *data):
         matches = []
         for d in data:
-            matches.extend(self.dejavu.find_matches(data, Fs=self.Fs))
+            matches.extend(self.dejavu.find_matches(d, Fs=self.Fs))
         return self.dejavu.align_matches(matches)
-    
-    def recognize(self):
-        pass # base class does nothing
-    
-    
 
+    def recognize(self):
+        pass  # base class does nothing
 
 class WaveFileRecognizer(BaseRecognizer):
-    
+
     def __init__(self, dejavu, filename=None):
         super(WaveFileRecognizer, self).__init__(dejavu)
         self.filename = filename
-    
+
     def recognize_file(self, filename):
         Fs, frames = wavfile.read(filename)
         self.Fs = Fs
-        
+
         wave_object = wave.open(filename)
         nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams()
-        
+
         channels = []
         for channel in range(nchannels):
             channels.append(frames[:, channel])
-        
+
         t = time.time()
         match = self._recognize(*channels)
         t = time.time() - t
-        
+
         if match:
             match['match_time'] = t
-        
+
         return match
-    
+
     def recognize(self):
         return self.recognize_file(self.filename)
 
 
 class MicrophoneRecognizer(BaseRecognizer):
-    
+
     CHUNK = 8192 # 44100 is a multiple of 1225
     FORMAT = pyaudio.paInt16
     CHANNELS = 2
     RATE = 44100
-    
+
     def __init__(self, dejavu, seconds=None):
         super(MicrophoneRecognizer, self).__init__(dejavu)
         self.audio = pyaudio.PyAudio()
@@ -75,52 +72,52 @@ class MicrophoneRecognizer(BaseRecognizer):
         self.chunk_size = CHUNK
         self.rate = RATE
         self.recorded = False
-    
+
     def start_recording(self, channels=CHANNELS, rate=RATE, chunk=CHUNK):
         self.chunk_size = chunk
         self.channels = channels
         self.recorded = False
         self.rate = rate
-        
+
         if self.stream:
             self.stream.stop_stream()
             self.stream.close()
-        
+
         self.stream = self.audio.open(format=FORMAT,
                                 channels=channels,
                                 rate=rate,
                                 input=True,
                                 frames_per_buffer=chunk)
-        
+
         self.data = [[] for i in range(channels)]
-    
+
     def process_recording(self):
         data = self.stream.read(self.chunk_size)
         nums = np.fromstring(data, np.int16)
         for c in range(self.channels):
             self.data[c].extend(nums[c::c+1])
-    
+
     def stop_recording(self):
         self.stream.stop_stream()
         self.stream.close()
         self.stream = None
         self.recorded = True
-        
+
     def recognize_recording(self):
         if not self.recorded:
             raise NoRecordingError("Recording was not complete/begun")
         return self._recognize(*self.data)
-    
+
     def get_recorded_time(self):
         return len(self.data[0]) / self.rate
-    
+
     def recognize(self):
         self.start_recording()
         for i in range(0, int(self.rate / self.chunk * self.seconds)):
             self.process_recording()
         self.stop_recording()
         return self.recognize_recording()
-    
+
 class NoRecordingError(Exception):
     pass
 

From 7122e110d1451fa74d235eae3cc60ebad522fcee Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Wed, 18 Dec 2013 18:02:07 +0100
Subject: [PATCH 17/27] Cleaned up convert.py and renamed it to decode.py

- 'Converter' class removed
- 'ensure_folder' removed
- 'find_files' changed into a generator and made it work with spaces
- 'convert' renamed to 'read'
- 'read' now handles any file supported by pydub
- 'read' now returns the data instead of saving it to a file (same format as 'extract_channels')
- generic cleanup of formatting in the file
---
 dejavu/convert.py | 54 -----------------------------------------------
 dejavu/decode.py  | 27 ++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 54 deletions(-)
 delete mode 100644 dejavu/convert.py
 create mode 100644 dejavu/decode.py

diff --git a/dejavu/convert.py b/dejavu/convert.py
deleted file mode 100644
index 77afcef..0000000
--- a/dejavu/convert.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import os, fnmatch
-from pydub import AudioSegment
-
-class Converter():
-
-    WAV = "wav"
-    MP3 = "mp3"
-    FORMATS = [
-        WAV,
-        MP3]
-
-    def __init__(self):
-        pass
-
-    def ensure_folder(self, extension):
-        if not os.path.exists(extension):
-            os.makedirs(extension)
-
-    def find_files(self, path, extensions):
-        filepaths = []
-        extensions = [e.replace(".", "") for e in extensions if e.replace(".", "") in Converter.FORMATS]
-        print "Supported formats: %s" % extensions
-        for dirpath, dirnames, files in os.walk(path) :
-            for extension in extensions:
-                for f in fnmatch.filter(files, "*.%s" % extension):
-                    p = os.path.join(dirpath, f)
-                    renamed = p.replace(" ", "_")
-                    os.rename(p, renamed)
-                    #print "Found file: %s with extension %s" % (renamed, extension)
-                    filepaths.append((renamed, extension))
-        return filepaths
-
-    def convert(self, orig_path, from_format, to_format, output_folder, song_name):
-
-        # start conversion
-        self.ensure_folder(output_folder)
-        print "-> Now converting: %s from %s format to %s format..." % (song_name, from_format, to_format)
-
-        # MP3 --> WAV
-        if from_format == Converter.MP3 and to_format == Converter.WAV:
-
-            newpath = os.path.join(output_folder, "%s.%s" % (song_name, Converter.WAV))
-            if os.path.isfile(newpath):
-                print "-> Already converted, skipping..."
-            else:
-                mp3file = AudioSegment.from_mp3(orig_path)
-                mp3file.export(newpath, format=Converter.WAV)
-
-        # unsupported
-        else:
-            print "CONVERSION ERROR:\nThe conversion from %s to %s is not supported!" % (from_format, to_format)
-
-        print "-> Conversion complete."
-        return newpath
diff --git a/dejavu/decode.py b/dejavu/decode.py
new file mode 100644
index 0000000..0304b08
--- /dev/null
+++ b/dejavu/decode.py
@@ -0,0 +1,27 @@
+import os
+import fnmatch
+import numpy as np
+from pydub import AudioSegment
+
+
+def find_files(path, extensions):
+    # Allow both with ".mp3" and without "mp3" to be used for extensions
+    extensions = [e.replace(".", "") for e in extensions]
+
+    for dirpath, dirnames, files in os.walk(path):
+        for extension in extensions:
+            for f in fnmatch.filter(files, "*.%s" % extension):
+                p = os.path.join(dirpath, f)
+                yield (p, extension)
+
+
+def read(filename):
+    audiofile = AudioSegment.from_file(filename)
+
+    data = np.fromstring(audiofile._data, np.int16)
+
+    channels = []
+    for chn in xrange(audiofile.channels):
+        channels.append(data[chn::audiofile.channels])
+
+    return audiofile.frame_rate, channels

From 8a7358d426fb8830d7197ef5922bcd8196952f80 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Wed, 18 Dec 2013 18:11:23 +0100
Subject: [PATCH 18/27] Added the fix for issue #13 in the original repository.

---
 dejavu/decode.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/dejavu/decode.py b/dejavu/decode.py
index 0304b08..f578c0e 100644
--- a/dejavu/decode.py
+++ b/dejavu/decode.py
@@ -15,9 +15,22 @@ def find_files(path, extensions):
                 yield (p, extension)
 
 
-def read(filename):
+def read(filename, limit=None):
+    """
+    Reads any file supported by pydub (ffmpeg) and returns the data contained
+    within.
+
+    Can be optionally limited to a certain amount of seconds from the start
+    of the file by specifying the `limit` parameter. This is the amount of
+    seconds from the start of the file.
+
+    returns: (samplerate, channels)
+    """
     audiofile = AudioSegment.from_file(filename)
 
+    if limit:
+        audiofile = audiofile[:limit * 1000]
+
     data = np.fromstring(audiofile._data, np.int16)
 
     channels = []

From 7895bae23eefd4dbe19e6be20a4f8e1f403c337d Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Wed, 18 Dec 2013 18:15:57 +0100
Subject: [PATCH 19/27] Fixed any references to old converter to use the new
 functions.

- Reversed return values in decode.read
---
 dejavu/__init__.py | 11 +++--------
 dejavu/decode.py   |  4 ++--
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index 3222395..f9be8bd 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -1,5 +1,5 @@
 from dejavu.database import SQLDatabase
-from dejavu.convert import Converter
+import dejavu.decode as decoder
 import fingerprint
 from scipy.io import wavfile
 from multiprocessing import Process
@@ -16,8 +16,6 @@ class Dejavu(object):
         # initialize db
         self.db = SQLDatabase(**config.get("database", {}))
 
-        # create components
-        self.converter = Converter()
         #self.fingerprinter = Fingerprinter(self.config)
         self.db.setup()
 
@@ -41,7 +39,7 @@ class Dejavu(object):
     def do_fingerprint(self, path, output, extensions, nprocesses):
 
         # convert files, shuffle order
-        files = self.converter.find_files(path, extensions)
+        files = decoder.find_files(path, extensions)
         random.shuffle(files)
         files_split = self.chunkify(files, nprocesses)
 
@@ -74,14 +72,11 @@ class Dejavu(object):
                 print("-> Already fingerprinted, continuing...")
                 continue
 
-            # convert to WAV
-            wavout_path = self.converter.convert(filename, extension, Converter.WAV, output, song_name)
+            channels, Fs = decoder.read(filename)
 
             # insert song name into database
             song_id = sql_connection.insert_song(song_name)
 
-            # for each channel perform FFT analysis and fingerprinting
-            channels, Fs = self.extract_channels(wavout_path)
             for c in range(len(channels)):
                 channel = channels[c]
                 print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name)
diff --git a/dejavu/decode.py b/dejavu/decode.py
index f578c0e..47193ca 100644
--- a/dejavu/decode.py
+++ b/dejavu/decode.py
@@ -24,7 +24,7 @@ def read(filename, limit=None):
     of the file by specifying the `limit` parameter. This is the amount of
     seconds from the start of the file.
 
-    returns: (samplerate, channels)
+    returns: (channels, samplerate)
     """
     audiofile = AudioSegment.from_file(filename)
 
@@ -37,4 +37,4 @@ def read(filename, limit=None):
     for chn in xrange(audiofile.channels):
         channels.append(data[chn::audiofile.channels])
 
-    return audiofile.frame_rate, channels
+    return channels, audiofile.frame_rate

From 29029238fcfc7048d86b8774ac455eac12deea5d Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Thu, 19 Dec 2013 00:54:17 +0100
Subject: [PATCH 20/27] A fairly big batch of changes, blame me for forgetting
 to commit

Changes in no particular order:
- Replaced all use cases of wavfile/wave and extract_channels with the new decoder.read function
- Added a 'recognize' method to the Dejavu class. This is a shortcut for recognizing songs.
- Renamed 'do_fingerprint' into 'fingerprint_directory'
- Removed parameters not required anymore from fingerprint_directory
- Cleaned up fingerprint.py
- Made fingerprint.generate_hashes a generator
- WaveFileRecognizer is now FileRecognizer and can take any format supported by pydub
- Fixed MicrophoneRecognizer to actually run; the previous version had many small mistakes
- Renamed 'fingerprint_worker' to '_fingerprint_worker' to signify it is not to be used publicly
- Moved 'chunkify' outside the Dejavu class
- Cleaned up pep8 styling mistakes in all edited files.
---
 dejavu/__init__.py               | 102 ++++++++++------------
 dejavu/{decode.py => decoder.py} |   0
 dejavu/fingerprint.py            | 145 +++++++++----------------------
 dejavu/recognize.py              |  78 ++++++++---------
 4 files changed, 120 insertions(+), 205 deletions(-)
 rename dejavu/{decode.py => decoder.py} (100%)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index f9be8bd..c10e54d 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -1,27 +1,25 @@
 from dejavu.database import SQLDatabase
-import dejavu.decode as decoder
+import dejavu.decoder as decoder
 import fingerprint
-from scipy.io import wavfile
-from multiprocessing import Process
-import wave, os
+from multiprocessing import Process, cpu_count
+import os
 import random
 
-DEBUG = False
 
 class Dejavu(object):
     def __init__(self, config):
+        super(Dejavu, self).__init__()
 
         self.config = config
 
         # initialize db
         self.db = SQLDatabase(**config.get("database", {}))
 
-        #self.fingerprinter = Fingerprinter(self.config)
         self.db.setup()
 
         # get songs previously indexed
         self.songs = self.db.get_songs()
-        self.songnames_set = set() # to know which ones we've computed before
+        self.songnames_set = set()  # to know which ones we've computed before
 
         for song in self.songs:
             song_name = song[self.db.FIELD_SONGNAME]
@@ -29,27 +27,27 @@ class Dejavu(object):
             self.songnames_set.add(song_name)
             print "Added: %s to the set of fingerprinted songs..." % song_name
 
-    def chunkify(self, lst, n):
-        """
-            Splits a list into roughly n equal parts.
-            http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts
-        """
-        return [lst[i::n] for i in xrange(n)]
-
-    def do_fingerprint(self, path, output, extensions, nprocesses):
+    def fingerprint_directory(self, path, extensions, nprocesses=None):
+        # Try to use the maximum amount of processes if not given.
+        if nprocesses is None:
+            try:
+                nprocesses = cpu_count()
+            except NotImplementedError:
+                nprocesses = 1
 
         # convert files, shuffle order
-        files = decoder.find_files(path, extensions)
+        files = list(decoder.find_files(path, extensions))
         random.shuffle(files)
-        files_split = self.chunkify(files, nprocesses)
+
+        files_split = chunkify(files, nprocesses)
 
         # split into processes here
         processes = []
         for i in range(nprocesses):
 
             # create process and start it
-            p = Process(target=self.fingerprint_worker,
-                        args=(files_split[i], self.db, output))
+            p = Process(target=self._fingerprint_worker,
+                        args=(files_split[i], self.db))
             p.start()
             processes.append(p)
 
@@ -57,55 +55,37 @@ class Dejavu(object):
         for p in processes:
             p.join()
 
-        # delete orphans
-        # print "Done fingerprinting. Deleting orphaned fingerprints..."
-        # TODO: need a more performant query in database.py for the
-        #self.fingerprinter.db.delete_orphans()
-
-    def fingerprint_worker(self, files, sql_connection, output):
-
+    def _fingerprint_worker(self, files, db):
         for filename, extension in files:
 
-            # if there are already fingerprints in database, don't re-fingerprint or convert
+            # if there are already fingerprints in database,
+            # don't re-fingerprint
             song_name = os.path.basename(filename).split(".")[0]
-            if DEBUG and song_name in self.songnames_set:
+            if song_name in self.songnames_set:
                 print("-> Already fingerprinted, continuing...")
                 continue
 
             channels, Fs = decoder.read(filename)
 
             # insert song name into database
-            song_id = sql_connection.insert_song(song_name)
+            song_id = db.insert_song(song_name)
 
             for c in range(len(channels)):
                 channel = channels[c]
                 print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name)
+
                 hashes = fingerprint.fingerprint(channel, Fs=Fs)
-                sql_connection.insert_hashes(song_id, hashes)
+
+                db.insert_hashes(song_id, hashes)
 
             # only after done fingerprinting do confirm
-            sql_connection.set_song_fingerprinted(song_id)
-
-    def extract_channels(self, path):
-        """
-            Reads channels from disk.
-            Returns a tuple with (channels, sample_rate)
-        """
-        channels = []
-        Fs, frames = wavfile.read(path)
-        wave_object = wave.open(path)
-        nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams()
-        #assert Fs == self.fingerprinter.Fs
-
-        for channel in range(nchannels):
-            channels.append(frames[:, channel])
-        return (channels, Fs)
+            db.set_song_fingerprinted(song_id)
 
     def fingerprint_file(self, filepath, song_name=None):
-        # TODO: replace with something that handles all audio formats
-        channels, Fs = self.extract_channels(path)
+        channels, Fs = decoder.read(filepath)
+
         if not song_name:
-            song_name = os.path.basename(filename).split(".")[0]
+            song_name = os.path.basename(filepath).split(".")[0]
         song_id = self.db.insert_song(song_name)
 
         for data in channels:
@@ -141,8 +121,7 @@ class Dejavu(object):
                 largest_count = diff_counter[diff][sid]
                 song_id = sid
 
-        if DEBUG:
-            print("Diff is %d with %d offset-aligned matches" % (largest, largest_count))
+        print("Diff is %d with %d offset-aligned matches" % (largest, largest_count))
 
         # extract idenfication
         song = self.db.get_song_by_id(song_id)
@@ -151,14 +130,23 @@ class Dejavu(object):
         else:
             return None
 
-        if DEBUG:
-            print("Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed))
-
         # return match info
         song = {
-            "song_id" : song_id,
-            "song_name" : songname,
-            "confidence" : largest_count
+            "song_id": song_id,
+            "song_name": songname,
+            "confidence": largest_count
         }
 
         return song
+
+    def recognize(self, recognizer, *options, **kwoptions):
+        r = recognizer(self)
+        return r.recognize(*options, **kwoptions)
+
+
+def chunkify(lst, n):
+    """
+    Splits a list into roughly n equal parts.
+    http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts
+    """
+    return [lst[i::n] for i in xrange(n)]
diff --git a/dejavu/decode.py b/dejavu/decoder.py
similarity index 100%
rename from dejavu/decode.py
rename to dejavu/decoder.py
diff --git a/dejavu/fingerprint.py b/dejavu/fingerprint.py
index 6108799..caf252e 100755
--- a/dejavu/fingerprint.py
+++ b/dejavu/fingerprint.py
@@ -1,17 +1,11 @@
 import numpy as np
 import matplotlib.mlab as mlab
 import matplotlib.pyplot as plt
-import matplotlib.image as mpimg
-from scipy.io import wavfile
 from scipy.ndimage.filters import maximum_filter
-from scipy.ndimage.morphology import generate_binary_structure, iterate_structure, binary_erosion
-from dejavu.database import SQLDatabase
-import os
-import wave
-import sys
-import time
+from scipy.ndimage.morphology import (generate_binary_structure,
+                                      iterate_structure, binary_erosion)
 import hashlib
-import pickle
+
 
 IDX_FREQ_I = 0
 IDX_TIME_J = 1
@@ -25,55 +19,58 @@ DEFAULT_AMP_MIN = 10
 PEAK_NEIGHBORHOOD_SIZE = 20
 MIN_HASH_TIME_DELTA = 0
 
-def fingerprint(channel_samples,
-            Fs=DEFAULT_FS, 
-            wsize=DEFAULT_WINDOW_SIZE, 
-            wratio=DEFAULT_OVERLAP_RATIO, 
-            fan_value=DEFAULT_FAN_VALUE, 
-            amp_min=DEFAULT_AMP_MIN):
+
+def fingerprint(channel_samples, Fs=DEFAULT_FS,
+                wsize=DEFAULT_WINDOW_SIZE,
+                wratio=DEFAULT_OVERLAP_RATIO,
+                fan_value=DEFAULT_FAN_VALUE,
+                amp_min=DEFAULT_AMP_MIN):
     """
-        FFT the channel, log transform output, find local maxima, then return
-        locally sensitive hashes. 
+    FFT the channel, log transform output, find local maxima, then return
+    locally sensitive hashes.
     """
     # FFT the signal and extract frequency components
     arr2D = mlab.specgram(
-        channel_samples, 
-        NFFT=wsize, 
+        channel_samples,
+        NFFT=wsize,
         Fs=Fs,
         window=mlab.window_hanning,
         noverlap=int(wsize * wratio))[0]
 
     # apply log transform since specgram() returns linear array
     arr2D = 10 * np.log10(arr2D)
-    arr2D[arr2D == -np.inf] = 0 # replace infs with zeros
-    
+    arr2D[arr2D == -np.inf] = 0  # replace infs with zeros
+
     # find local maxima
     local_maxima = get_2D_peaks(arr2D, plot=False, amp_min=amp_min)
 
     # return hashes
     return generate_hashes(local_maxima, fan_value=fan_value)
 
-def get_2D_peaks(arr2D, plot=False, amp_min=DEFAULT_AMP_MIN):
 
+def get_2D_peaks(arr2D, plot=False, amp_min=DEFAULT_AMP_MIN):
     # http://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.morphology.iterate_structure.html#scipy.ndimage.morphology.iterate_structure
     struct = generate_binary_structure(2, 1)
     neighborhood = iterate_structure(struct, PEAK_NEIGHBORHOOD_SIZE)
 
     # find local maxima using our fliter shape
-    local_max = maximum_filter(arr2D, footprint=neighborhood) == arr2D 
+    local_max = maximum_filter(arr2D, footprint=neighborhood) == arr2D
     background = (arr2D == 0)
-    eroded_background = binary_erosion(background, structure=neighborhood, border_value=1)
-    detected_peaks = local_max - eroded_background # this is a boolean mask of arr2D with True at peaks
+    eroded_background = binary_erosion(background, structure=neighborhood,
+                                       border_value=1)
+
+    # Boolean mask of arr2D with True at peaks
+    detected_peaks = local_max - eroded_background
 
     # extract peaks
     amps = arr2D[detected_peaks]
-    j, i = np.where(detected_peaks) 
+    j, i = np.where(detected_peaks)
 
     # filter peaks
     amps = amps.flatten()
     peaks = zip(i, j, amps)
-    peaks_filtered = [x for x in peaks if x[2] > amp_min] # freq, time, amp
-    
+    peaks_filtered = [x for x in peaks if x[2] > amp_min]  # freq, time, amp
+
     # get indices for frequency and time
     frequency_idx = [x[1] for x in peaks_filtered]
     time_idx = [x[0] for x in peaks_filtered]
@@ -85,97 +82,37 @@ def get_2D_peaks(arr2D, plot=False, amp_min=DEFAULT_AMP_MIN):
         ax.scatter(time_idx, frequency_idx)
         ax.set_xlabel('Time')
         ax.set_ylabel('Frequency')
-        ax.set_title("Spectrogram of \"Blurred Lines\" by Robin Thicke");
+        ax.set_title("Spectrogram of \"Blurred Lines\" by Robin Thicke")
         plt.gca().invert_yaxis()
         plt.show()
 
     return zip(frequency_idx, time_idx)
 
+
 def generate_hashes(peaks, fan_value=DEFAULT_FAN_VALUE):
     """
     Hash list structure:
-       sha1-hash[0:20]    time_offset
+       sha1_hash[0:20]    time_offset
     [(e05b341a9b77a51fd26, 32), ... ]
     """
-    fingerprinted = set() # to avoid rehashing same pairs
-    hashes = []
+    fingerprinted = set()  # to avoid rehashing same pairs
 
     for i in range(len(peaks)):
         for j in range(fan_value):
-            if i+j < len(peaks) and not (i, i+j) in fingerprinted:
-                
+            if (i + j) < len(peaks) and not (i, i + j) in fingerprinted:
                 freq1 = peaks[i][IDX_FREQ_I]
-                freq2 = peaks[i+j][IDX_FREQ_I]
+                freq2 = peaks[i + j][IDX_FREQ_I]
+
                 t1 = peaks[i][IDX_TIME_J]
-                t2 = peaks[i+j][IDX_TIME_J]
+                t2 = peaks[i + j][IDX_TIME_J]
+
                 t_delta = t2 - t1
-                
+
                 if t_delta >= MIN_HASH_TIME_DELTA:
-                    h = hashlib.sha1("%s|%s|%s" % (str(freq1), str(freq2), str(t_delta)))
-                    hashes.append((h.hexdigest()[0:20], t1))
-                
+                    h = hashlib.sha1(
+                        "%s|%s|%s" % (str(freq1), str(freq2), str(t_delta))
+                    )
+                    yield (h.hexdigest()[0:20], t1)
+
                 # ensure we don't repeat hashing
-                fingerprinted.add((i, i+j))
-    return hashes
-
-# TODO: move all of the below to a class with DB access
-
-
-class Fingerprinter():
-
-
-
-    def __init__(self, config, 
-            Fs=DEFAULT_FS, 
-            wsize=DEFAULT_WINDOW_SIZE, 
-            wratio=DEFAULT_OVERLAP_RATIO, 
-            fan_value=DEFAULT_FAN_VALUE, 
-            amp_min=DEFAULT_AMP_MIN):
-
-        self.config = config
-        
-
-        self.Fs = Fs
-        self.dt = 1.0 / self.Fs
-        self.window_size = wsize
-        self.window_overlap_ratio = wratio
-        self.fan_value = fan_value
-        self.noverlap = int(self.window_size * self.window_overlap_ratio)
-        self.amp_min = amp_min
-
-    def fingerprint(self, samples, path, sid, cid):
-        """Used for learning known songs"""
-        hashes = self.process_channel(samples, song_id=sid)
-        print "Generated %d hashes" % len(hashes)
-        self.db.insert_hashes(hashes)
-    
-    # TODO: put this in another module
-    def match(self, samples):
-        """Used for matching unknown songs"""
-        hashes = self.process_channel(samples)
-        matches = self.db.return_matches(hashes)
-        return matches
-
-    # TODO: this function has nothing to do with fingerprinting. is it needed?
-    def print_stats(self):
-
-        iterable = self.db.get_iterable_kv_pairs()
-
-        counter = {}
-        for t in iterable:
-            sid, toff = t
-            if not sid in counter:
-                counter[sid] = 1
-            else:
-                counter[sid] += 1
-
-        for song_id, count in counter.iteritems():
-            song_name = self.song_names[song_id]
-            print "%s has %d spectrogram peaks" % (song_name, count)
-    
-    # this does... what? this seems to only be used for the above function
-    def set_song_names(self, wpaths):
-        self.song_names = wpaths
-    
-    # TODO: put this in another module
-
+                fingerprinted.add((i, i + j))
diff --git a/dejavu/recognize.py b/dejavu/recognize.py
index 9283c6a..a723197 100755
--- a/dejavu/recognize.py
+++ b/dejavu/recognize.py
@@ -1,14 +1,8 @@
-from multiprocessing import Queue, Process
-from dejavu.database import SQLDatabase
 import dejavu.fingerprint as fingerprint
-from dejavu import Dejavu
-from scipy.io import wavfile
-import wave
+import dejavu.decoder as decoder
 import numpy as np
 import pyaudio
-import sys
 import time
-import array
 
 
 class BaseRecognizer(object):
@@ -26,25 +20,17 @@ class BaseRecognizer(object):
     def recognize(self):
         pass  # base class does nothing
 
-class WaveFileRecognizer(BaseRecognizer):
 
-    def __init__(self, dejavu, filename=None):
-        super(WaveFileRecognizer, self).__init__(dejavu)
-        self.filename = filename
+class FileRecognizer(BaseRecognizer):
+    def __init__(self, dejavu):
+        super(FileRecognizer, self).__init__(dejavu)
 
     def recognize_file(self, filename):
-        Fs, frames = wavfile.read(filename)
+        Fs, frames = decoder.read(filename)
         self.Fs = Fs
 
-        wave_object = wave.open(filename)
-        nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams()
-
-        channels = []
-        for channel in range(nchannels):
-            channels.append(frames[:, channel])
-
         t = time.time()
-        match = self._recognize(*channels)
+        match = self._recognize(*frames)
         t = time.time() - t
 
         if match:
@@ -52,50 +38,53 @@ class WaveFileRecognizer(BaseRecognizer):
 
         return match
 
-    def recognize(self):
-        return self.recognize_file(self.filename)
+    def recognize(self, filename):
+        return self.recognize_file(filename)
 
 
 class MicrophoneRecognizer(BaseRecognizer):
+    default_chunksize   = 8192
+    default_format      = pyaudio.paInt16
+    default_channels    = 2
+    default_samplerate  = 44100
 
-    CHUNK = 8192 # 44100 is a multiple of 1225
-    FORMAT = pyaudio.paInt16
-    CHANNELS = 2
-    RATE = 44100
-
-    def __init__(self, dejavu, seconds=None):
+    def __init__(self, dejavu):
         super(MicrophoneRecognizer, self).__init__(dejavu)
         self.audio = pyaudio.PyAudio()
         self.stream = None
         self.data = []
-        self.channels = CHANNELS
-        self.chunk_size = CHUNK
-        self.rate = RATE
+        self.channels = self.default_channels
+        self.chunksize = self.default_chunk
+        self.samplerate = self.default_samplerate
         self.recorded = False
 
-    def start_recording(self, channels=CHANNELS, rate=RATE, chunk=CHUNK):
-        self.chunk_size = chunk
+    def start_recording(self, channels=default_channels,
+                        samplerate=default_samplerate,
+                        chunksize=default_chunksize):
+        self.chunksize = chunksize
         self.channels = channels
         self.recorded = False
-        self.rate = rate
+        self.samplerate = samplerate
 
         if self.stream:
             self.stream.stop_stream()
             self.stream.close()
 
-        self.stream = self.audio.open(format=FORMAT,
-                                channels=channels,
-                                rate=rate,
-                                input=True,
-                                frames_per_buffer=chunk)
+        self.stream = self.audio.open(
+            format=self.default_format,
+            channels=channels,
+            rate=samplerate,
+            input=True,
+            frames_per_buffer=chunksize,
+        )
 
         self.data = [[] for i in range(channels)]
 
     def process_recording(self):
-        data = self.stream.read(self.chunk_size)
+        data = self.stream.read(self.chunksize)
         nums = np.fromstring(data, np.int16)
         for c in range(self.channels):
-            self.data[c].extend(nums[c::c+1])
+            self.data[c].extend(nums[c::len(self.channels)])
 
     def stop_recording(self):
         self.stream.stop_stream()
@@ -111,13 +100,14 @@ class MicrophoneRecognizer(BaseRecognizer):
     def get_recorded_time(self):
         return len(self.data[0]) / self.rate
 
-    def recognize(self):
+    def recognize(self, seconds=None):
         self.start_recording()
-        for i in range(0, int(self.rate / self.chunk * self.seconds)):
+        for i in range(0, int(self.samplerate / self.chunksize
+                              * seconds)):
             self.process_recording()
         self.stop_recording()
         return self.recognize_recording()
 
+
 class NoRecordingError(Exception):
     pass
-

From 292ddf029d5017477cfcc0574cd4e47e355109a8 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Thu, 19 Dec 2013 01:22:27 +0100
Subject: [PATCH 21/27] Reversed expected variables to the correct position.

---
 dejavu/recognize.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dejavu/recognize.py b/dejavu/recognize.py
index a723197..ec9e824 100755
--- a/dejavu/recognize.py
+++ b/dejavu/recognize.py
@@ -26,8 +26,7 @@ class FileRecognizer(BaseRecognizer):
         super(FileRecognizer, self).__init__(dejavu)
 
     def recognize_file(self, filename):
-        Fs, frames = decoder.read(filename)
-        self.Fs = Fs
+        frames, self.Fs = decoder.read(filename)
 
         t = time.time()
         match = self._recognize(*frames)

From 94c4cc0d95ad19f30f74d5d3c1efff484c6bf10b Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Thu, 19 Dec 2013 01:34:04 +0100
Subject: [PATCH 22/27] Updated README for the changes to the interface.

Changed the default value of `seconds` from None to 10 for MicrophoneRecognizer.recognize
---
 README.md           | Bin 10435 -> 10150 bytes
 dejavu/recognize.py |   2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 76699f33cb13c2ec3a9b8bf6533c2d001ebd1e6a..2fc4490b46ee1a3c704e33f8c95a916ddaf54f4a 100644
GIT binary patch
delta 521
zcmZvZO-{l<7>1*Uzb5FKgqV-I0AkZnLlg*fMNC{6qZfcFowh?@rkR=6C?Oo8cVK5E
z;RtTLf>&_q&Yf)&P>3^|$@f0*_cw3%%e&QUCi1h(sGVyrfJ}YHCG_zag@Iz4CX5($
zhzmWKDY~vha#R~hWFdjo&Yx(icudGRsu~wpdLzDus%>p45D3f@fHNldHNTC7c(h}=
z>nAV9Z9o<av8lms7TZc7il!Kt1Jc7$H(M_KmYS#dJRQIuhtXHg%t()895wT;XqCS?
zg&OlEVH8d3iZUDy{U_-$x;2~>D(ik>dQcvRx^3w$mIE402M(bw<~}D>idudBt(+e&
zZpnx<KQ`kBJYp^-&zPrqQVinqI^PJj|6mP7tW#U`j7A;K;^jLim`DMVfzjBAx-wIe
s4ORuC?ts|U4j7l30?|wHJvu5@TGwQ<-$8=LKeyY$E~mlz^Z29m1DBScVgLXD

delta 795
zcmb7CO>5Oa6xCw;nxTp=EC>qMw`m?qY(=R&+Cb5Ti-I4xD2OEUlAHIYBy+>eB(=JD
zKO+CYg&+tDZ{^yx8+ZN*e}j|cl|<Bq69|*J=bn4cnY@4U_1Wi7t8;v9ZbPJy8w3s!
z$wW*G#f=~diAd3dLo{0&BA%uo41_Tr^Rn<FnS^8JFz2Q1kr>qtub5M4RFY>n21R6;
zlXI<4NX28=JKF-EP>DWsVs7}Q3D*|9SN8xV)xqHURp!h{o0etg?yk3OpPlOC>W5)U
zJQi{aW`=)}3S7q3i?#c6&q^|itllgqW#CaGnfbmCiJ}Z1HZGrXWyPLh`ezN&a@Z(f
z|2MQd>eW1(TI>$HK!$B~tv#vAfqx;PLRDV|4-Tj>d0`+H%4S9C6bwP0@(95@;bjxG
z1~(yPjb#>D+f3{WRj!0IdQ|-!9BwiehGBkcW+WRS%SlJ;H?Is!9uzt-ln3{=?^w=7
zxt^bY+}>7?36&R5yeXPM7Fp0MXm@n(-)phLZmrCpV?<La-k@s6TQAE`n^RYFmyz_y
zQQ5z&Nwm#Kll8X`9f6k$ZBQ-)ZahN`Vs9DPwN>wf>fOfo>hi{oqiz=KDE>#opYxl`
KTztFu<JvC<ISv>A

diff --git a/dejavu/recognize.py b/dejavu/recognize.py
index ec9e824..dfc06d9 100755
--- a/dejavu/recognize.py
+++ b/dejavu/recognize.py
@@ -99,7 +99,7 @@ class MicrophoneRecognizer(BaseRecognizer):
     def get_recorded_time(self):
         return len(self.data[0]) / self.rate
 
-    def recognize(self, seconds=None):
+    def recognize(self, seconds=10):
         self.start_recording()
         for i in range(0, int(self.samplerate / self.chunksize
                               * seconds)):

From 2f19fcaa51331f9d389f899c5d366c94283217c1 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Thu, 19 Dec 2013 01:39:36 +0100
Subject: [PATCH 23/27] Fixed some mistakes in my hasty edit

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 2fc4490..9c53b41 100644
--- a/README.md
+++ b/README.md
@@ -50,13 +50,13 @@ Start by creating a Dejavu object.
 >>> djv = Dejavu(config)
 ```
 
-Next, give the `fingerprint()` command three arguments:
+Next, give the `fingerprint_directory` method three arguments:
 * input directory to look for audio files
 * audio extensions to look for in the input directory
 * number of processes (optional)
 
 ```python
->>> djv.fingerprint("va_us_top_40/mp3", [".mp3"], 3)
+>>> djv.fingerprint_directory("va_us_top_40/mp3", [".mp3"], 3)
 ```
 
 For a large amount of files, this will take a while. However, Dejavu is robust enough you can kill and restart without affecting progress: Dejavu remembers which songs it fingerprinted and converted and which it didn't, and so won't repeat itself. 
@@ -67,7 +67,7 @@ You'll have a lot of fingerprints once it completes a large folder of mp3s:
 5442376
 ```
 
-Also, any subsequent calls to `fingerprint()` will fingerprint and add those songs to the database as well. It's meant to simulate a system where as new songs are released, they are fingerprinted and added to the database seemlessly without stopping the system. 
+Also, any subsequent calls to `fingerprint_file` or `fingerprint_directory` will fingerprint and add those songs to the database as well. It's meant to simulate a system where as new songs are released, they are fingerprinted and added to the database seemlessly without stopping the system. 
 
 ## Recognizing
 
@@ -85,7 +85,7 @@ There are two ways to recognize audio using Dejavu. You can use Dejavu interacti
 }
 ```
 
-Or by reading .wav files via scripting functions:
+Or by reading files via scripting functions:
 
 ```python
 >>> from dejavu.recognize import FileRecognizer

From ec823f56e410114fceec1d3be1d3144463c65e42 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Thu, 19 Dec 2013 17:15:11 +0100
Subject: [PATCH 24/27] Updated go.py

Added a clarification to the README that the configuration is now an
ordinary Python dictionary.
---
 README.md |  3 ++-
 go.py     | 35 +++++++++++++++++++++--------------
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 9c53b41..fc9c3bd 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,8 @@ Now you're ready to start fingerprinting your audio collection!
 
 Let's say we want to fingerprint all of July 2013's VA US Top 40 hits. 
 
-Start by creating a Dejavu object. 
+Start by creating a Dejavu object with your configurations settings (Dejavu takes an ordinary Python dictionary for the settings).
+
 ```python
 >>> from dejavu import Dejavu
 >>> config = {
diff --git a/go.py b/go.py
index 9108f4e..7472019 100755
--- a/go.py
+++ b/go.py
@@ -1,22 +1,29 @@
 from dejavu import Dejavu
-from ConfigParser import ConfigParser
 import warnings
+import json
 warnings.filterwarnings("ignore")
 
-# load config
-config = ConfigParser()
-config.read("dejavu.cnf")
+# load config from a JSON file (or anything outputting a python dictionary)
+with open("dejavu.cnf") as f:
+    config = json.load(f)
 
-# create Dejavu object
-dejavu = Dejavu(config)
-dejavu.fingerprint("va_us_top_40/mp3", "va_us_top_40/wav", [".mp3"], 5)
+# create a Dejavu instance
+djv = Dejavu(config)
+# Fingerprint all the mp3's in the directory we give it
+djv.fingerprint_directory("va_us_top_40/mp3", [".mp3"], 5)
 
-# recognize microphone audio
-from dejavu.recognize import Recognizer
-recognizer = Recognizer(dejavu.fingerprinter, config)
 
-song = recognizer.read("va_us_top_40/wav/17_-_#Beautiful_-_Mariah_Carey_ft.wav")
+# Recognize audio from a file
+from dejavu.recognize import FileRecognizer
+song = djv.recognize(FileRecognizer, "va_us_top_40/wav/17_-_#Beautiful_-_Mariah_Carey_ft.wav")
 
-# recognize song playing over microphone for 10 seconds
-#song = recognizer.listen(seconds=1, verbose=True)
-#print song
\ No newline at end of file
+
+# Or recognize audio from your microphone for 10 seconds
+from dejavu.recognize import MicrophoneRecognizer
+song = djv.recognize(MicrophoneRecognizer, seconds=10)
+
+
+# Or use a recognizer without the shortcut, in anyway you would like
+from dejavu.recognize import FileRecognizer
+recognizer = FileRecognizer(djv)
+song = recognizer.recognize_file("va_us_top_40/wav/17_-_#Beautiful_-_Mariah_Carey_ft.wav")

From f276efdf324c24836d8c3bfc128f9558f7245e23 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Fri, 20 Dec 2013 18:16:35 +0100
Subject: [PATCH 25/27] Cleaned up database.py

- Moved SQLDatabase to a SQL specific file
- Database class is now an abstract base class
- Cursor moved into SQL specific file
- Allowed for multi-database support in the future
---
 dejavu/__init__.py     |   5 +-
 dejavu/cursor.py       |  59 -------
 dejavu/database.py     | 348 ++++++++++++---------------------------
 dejavu/database_sql.py | 366 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 472 insertions(+), 306 deletions(-)
 delete mode 100644 dejavu/cursor.py
 create mode 100644 dejavu/database_sql.py

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index c10e54d..e93edac 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -1,4 +1,4 @@
-from dejavu.database import SQLDatabase
+from dejavu.database import get_database
 import dejavu.decoder as decoder
 import fingerprint
 from multiprocessing import Process, cpu_count
@@ -13,8 +13,9 @@ class Dejavu(object):
         self.config = config
 
         # initialize db
-        self.db = SQLDatabase(**config.get("database", {}))
+        db_cls = get_database(config.get("database_type", None))
 
+        self.db = db_cls(**config.get("database", {}))
         self.db.setup()
 
         # get songs previously indexed
diff --git a/dejavu/cursor.py b/dejavu/cursor.py
deleted file mode 100644
index 55429a7..0000000
--- a/dejavu/cursor.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from __future__ import unicode_literals
-from __future__ import absolute_import
-import Queue
-
-import MySQLdb as mysql
-import MySQLdb.cursors
-
-
-def cursor_factory(**factory_options):
-    def cursor(**options):
-        options.update(factory_options)
-        return Cursor(**options)
-    return cursor
-
-
-class Cursor(object):
-    """
-    Establishes a connection to the database and returns an open cursor.
-
-
-    ```python
-    # Use as context manager
-    with Cursor() as cur:
-        cur.execute(query)
-    ```
-    """
-    _cache = Queue.Queue(maxsize=5)
-
-    def __init__(self, cursor_type=mysql.cursors.Cursor, **options):
-        super(Cursor, self).__init__()
-
-        try:
-            conn = self._cache.get_nowait()
-        except Queue.Empty:
-            conn = mysql.connect(**options)
-        else:
-            # Ping the connection before using it from the cache.
-            conn.ping(True)
-
-        self.conn = conn
-        self.cursor_type = cursor_type
-
-    def __enter__(self):
-        self.cursor = self.conn.cursor(self.cursor_type)
-        return self.cursor
-
-    def __exit__(self, extype, exvalue, traceback):
-        # if we had a MySQL related error we try to rollback the cursor.
-        if extype is mysql.MySQLError:
-            self.cursor.rollback()
-
-        self.cursor.close()
-        self.conn.commit()
-
-        # Put it back on the queue
-        try:
-            self._cache.put_nowait(self.conn)
-        except Queue.Full:
-            self.conn.close()
diff --git a/dejavu/database.py b/dejavu/database.py
index 75af1bf..b0bfc4a 100755
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -1,308 +1,166 @@
 from __future__ import absolute_import
-from itertools import izip_longest
-
-from dejavu.cursor import cursor_factory
-from MySQLdb.cursors import DictCursor
+import abc
 
 
 class Database(object):
+    __metaclass__ = abc.ABCMeta
+
+    # Name of your Database subclass, this is used in configuration
+    # to refer to your class
+    type = None
+
     def __init__(self):
         super(Database, self).__init__()
 
+    def before_fork(self):
+        """
+        Called before the database instance is given to the new process
+        """
+        pass
 
-class SQLDatabase(Database):
-    """
-    Queries:
+    def after_fork(self):
+        """
+        Called after the database instance has been given to the new process
 
-    1) Find duplicates (shouldn't be any, though):
-
-        select `hash`, `song_id`, `offset`, count(*) cnt
-        from fingerprints
-        group by `hash`, `song_id`, `offset`
-        having cnt > 1
-        order by cnt asc;
-
-    2) Get number of hashes by song:
-
-        select song_id, song_name, count(song_id) as num
-        from fingerprints
-        natural join songs
-        group by song_id
-        order by count(song_id) desc;
-
-    3) get hashes with highest number of collisions
-
-        select
-            hash,
-            count(distinct song_id) as n
-        from fingerprints
-        group by `hash`
-        order by n DESC;
-
-    => 26 different songs with same fingerprint (392 times):
-
-        select songs.song_name, fingerprints.offset
-        from fingerprints natural join songs
-        where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73";
-    """
-
-    # config keys
-    CONNECTION = "connection"
-    KEY_USERNAME = "username"
-    KEY_DATABASE = "database"
-    KEY_PASSWORD = "password"
-    KEY_HOSTNAME = "hostname"
-
-    # tables
-    FINGERPRINTS_TABLENAME = "fingerprints"
-    SONGS_TABLENAME = "songs"
-
-    # fields
-    FIELD_HASH = "hash"
-    FIELD_SONG_ID = "song_id"
-    FIELD_OFFSET = "offset"
-    FIELD_SONGNAME = "song_name"
-    FIELD_FINGERPRINTED = "fingerprinted"
-
-    # creates
-    CREATE_FINGERPRINTS_TABLE = """
-        CREATE TABLE IF NOT EXISTS `%s` (
-             `%s` binary(10) not null,
-             `%s` mediumint unsigned not null,
-             `%s` int unsigned not null,
-         PRIMARY KEY(%s),
-         UNIQUE(%s, %s, %s),
-         FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
-    ) ENGINE=INNODB;""" % (
-        FINGERPRINTS_TABLENAME, FIELD_HASH,
-        FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
-        FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
-        FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID
-    )
-
-    CREATE_SONGS_TABLE = """
-        CREATE TABLE IF NOT EXISTS `%s` (
-            `%s` mediumint unsigned not null auto_increment,
-            `%s` varchar(250) not null,
-            `%s` tinyint default 0,
-        PRIMARY KEY (`%s`),
-        UNIQUE KEY `%s` (`%s`)
-    ) ENGINE=INNODB;""" % (
-        SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED,
-        FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID,
-    )
-
-    # inserts (ignores duplicates)
-    INSERT_FINGERPRINT = """
-        INSERT IGNORE INTO %s (%s, %s, %s) VALUES
-            (UNHEX(%%s), %%s, %%s);
-    """ % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET)
-
-    INSERT_SONG = "INSERT INTO %s (%s) VALUES (%%s);" % (
-        SONGS_TABLENAME, FIELD_SONGNAME)
-
-    # selects
-    SELECT = """
-        SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
-    """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH)
-
-    SELECT_MULTIPLE = """
-        SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
-    """ % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET,
-           FINGERPRINTS_TABLENAME, FIELD_HASH)
-
-    SELECT_ALL = """
-        SELECT %s, %s FROM %s;
-    """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME)
-
-    SELECT_SONG = """
-        SELECT %s FROM %s WHERE %s = %%s
-    """ % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)
-
-    SELECT_NUM_FINGERPRINTS = """
-        SELECT COUNT(*) as n FROM %s
-    """ % (FINGERPRINTS_TABLENAME)
-
-    SELECT_UNIQUE_SONG_IDS = """
-        SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
-    """ % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
-
-    SELECT_SONGS = """
-        SELECT %s, %s FROM %s WHERE %s = 1;
-    """ % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED)
-
-    # drops
-    DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME
-    DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME
-
-    # update
-    UPDATE_SONG_FINGERPRINTED = """
-        UPDATE %s SET %s = 1 WHERE %s = %%s
-    """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID)
-
-    # delete
-    DELETE_UNFINGERPRINTED = """
-        DELETE FROM %s WHERE %s = 0;
-    """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED)
-
-    def __init__(self, **options):
-        super(SQLDatabase, self).__init__()
-        self.cursor = cursor_factory(**options)
+        This will be called in the new process.
+        """
+        pass
 
     def setup(self):
         """
-        Creates any non-existing tables required for dejavu to function.
-
-        This also removes all songs that have been added but have no
-        fingerprints associated with them.
+        Called on creation or shortly afterwards.
         """
-        with self.cursor() as cur:
-            cur.execute(self.CREATE_SONGS_TABLE)
-            cur.execute(self.CREATE_FINGERPRINTS_TABLE)
-            cur.execute(self.DELETE_UNFINGERPRINTED)
+        pass
 
+    @abc.abstractmethod
     def empty(self):
         """
-        Drops tables created by dejavu and then creates them again
-        by calling `SQLDatabase.setup`.
-
-        .. warning:
-            This will result in a loss of data
+        Called when the database should be cleared of all data.
         """
-        with self.cursor() as cur:
-            cur.execute(self.DROP_FINGERPRINTS)
-            cur.execute(self.DROP_SONGS)
-
-        self.setup()
+        pass
 
+    @abc.abstractmethod
     def delete_unfingerprinted_songs(self):
         """
-        Removes all songs that have no fingerprints associated with them.
+        Called to remove any song entries that do not have any fingerprints
+        associated with them.
         """
-        with self.cursor() as cur:
-            cur.execute(self.DELETE_UNFINGERPRINTED)
+        pass
 
+    @abc.abstractmethod
     def get_num_songs(self):
         """
-        Returns number of songs the database has fingerprinted.
+        Returns the number of songs in the database.
         """
-        with self.cursor() as cur:
-            cur.execute(self.SELECT_UNIQUE_SONG_IDS)
-
-            for count, in cur:
-                return count
-            return 0
+        pass
 
+    @abc.abstractmethod
     def get_num_fingerprints(self):
         """
-        Returns number of fingerprints the database has fingerprinted.
+        Returns the number of fingerprints in the database.
         """
-        with self.cursor() as cur:
-            cur.execute(self.SELECT_NUM_FINGERPRINTS)
-
-            for count, in cur:
-                return count
-            return 0
+        pass
 
+    @abc.abstractmethod
     def set_song_fingerprinted(self, sid):
         """
-        Set the fingerprinted flag to TRUE (1) once a song has been completely
-        fingerprinted in the database.
-        """
-        with self.cursor() as cur:
-            cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,))
+        Sets a specific song as having all fingerprints in the database.
 
+        sid: Song identifier
+        """
+        pass
+
+    @abc.abstractmethod
     def get_songs(self):
         """
-        Return songs that have the fingerprinted flag set TRUE (1).
+        Returns all fully fingerprinted songs in the database.
         """
-        with self.cursor(cursor_type=DictCursor) as cur:
-            cur.execute(self.SELECT_SONGS)
-            for row in cur:
-                yield row
+        pass
 
+    @abc.abstractmethod
     def get_song_by_id(self, sid):
         """
-        Returns song by its ID.
-        """
-        with self.cursor(cursor_type=DictCursor) as cur:
-            cur.execute(self.SELECT_SONG, (sid,))
-            return cur.fetchone()
+        Return a song by its identifier
 
+        sid: Song identifier
+        """
+        pass
+
+    @abc.abstractmethod
     def insert(self, hash, sid, offset):
         """
-        Insert a (sha1, song_id, offset) row into database.
-        """
-        with self.cursor() as cur:
-            cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
+        Inserts a single fingerprint into the database.
 
-    def insert_song(self, songname):
+          hash: Part of a sha1 hash, in hexadecimal format
+           sid: Song identifier this fingerprint belongs to
+        offset: The offset this hash is from
         """
-        Inserts song in the database and returns the ID of the inserted record.
-        """
-        with self.cursor() as cur:
-            cur.execute(self.INSERT_SONG, (songname,))
-            return cur.lastrowid
+        pass
 
+    @abc.abstractmethod
+    def insert_song(self, song_name):
+        """
+        Inserts a song name into the database, returns the new
+        identifier of the song.
+
+        song_name: The name of the song.
+        """
+        pass
+
+    @abc.abstractmethod
     def query(self, hash):
         """
-        Return all tuples associated with hash.
+        Returns all matching fingerprint entries associated with
+        the given hash as parameter.
 
-        If hash is None, returns all entries in the
-        database (be careful with that one!).
+        hash: Part of a sha1 hash, in hexadecimal format
         """
-        # select all if no key
-        query = self.SELECT_ALL if hash is None else self.SELECT
-
-        with self.cursor() as cur:
-            cur.execute(query)
-            for sid, offset in cur:
-                yield (sid, offset)
+        pass
 
+    @abc.abstractmethod
     def get_iterable_kv_pairs(self):
         """
-        Returns all tuples in database.
+        Returns all fingerprints in the database.
         """
-        return self.query(None)
+        pass
 
+    @abc.abstractmethod
     def insert_hashes(self, sid, hashes):
         """
-        Insert series of hash => song_id, offset
-        values into the database.
+        Insert a multitude of fingerprints.
+
+           sid: Song identifier the fingerprints belong to
+        hashes: A sequence of tuples in the format (hash, offset)
+        -   hash: Part of a sha1 hash, in hexadecimal format
+        - offset: Offset this hash was created from/at.
         """
-        values = []
-        for hash, offset in hashes:
-            values.append((hash, sid, offset))
-
-        with self.cursor() as cur:
-            cur.executemany(self.INSERT_FINGERPRINT, values)
+        pass
 
+    @abc.abstractmethod
     def return_matches(self, hashes):
         """
-        Return the (song_id, offset_diff) tuples associated with
-        a list of (sha1, sample_offset) values.
+        Searches the database for pairs of (hash, offset) values.
+
+        hashes: A sequence of tuples in the format (hash, offset)
+        -   hash: Part of a sha1 hash, in hexadecimal format
+        - offset: Offset this hash was created from/at.
+
+        Returns a sequence of (sid, offset_difference) tuples.
+
+                      sid: Song identifier
+        offset_difference: (offset - database_offset)
         """
-        # Create a dictionary of hash => offset pairs for later lookups
-        mapper = {}
-        for hash, offset in hashes:
-            mapper[hash.upper()] = offset
-
-        # Get an iteratable of all the hashes we need
-        values = mapper.keys()
-
-        with self.cursor() as cur:
-            for split_values in grouper(values, 1000):
-                # Create our IN part of the query
-                query = self.SELECT_MULTIPLE
-                query = query % ', '.join(['UNHEX(%s)'] * len(split_values))
-
-                cur.execute(query, split_values)
-
-                for hash, sid, offset in cur:
-                    # (sid, db_offset - song_sampled_offset)
-                    yield (sid, offset - mapper[hash])
+        pass
 
 
-def grouper(iterable, n, fillvalue=None):
-    args = [iter(iterable)] * n
-    return izip_longest(fillvalue=fillvalue, *args)
+def get_database(database_type=None):
+    # Default to using the mysql database
+    database_type = database_type or "mysql"
+    # Lower all the input.
+    database_type = database_type.lower()
+
+    for db_cls in Database.__subclasses__():
+        if db_cls.type == database_type:
+            return db_cls
+
+    raise TypeError("Unsupported database type supplied.")
diff --git a/dejavu/database_sql.py b/dejavu/database_sql.py
new file mode 100644
index 0000000..03c8ad6
--- /dev/null
+++ b/dejavu/database_sql.py
@@ -0,0 +1,366 @@
+from __future__ import absolute_import
+from itertools import izip_longest
+import Queue
+
+import MySQLdb as mysql
+from MySQLdb.cursors import DictCursor
+
+from dejavu.database import Database
+
+
+class SQLDatabase(Database):
+    """
+    Queries:
+
+    1) Find duplicates (shouldn't be any, though):
+
+        select `hash`, `song_id`, `offset`, count(*) cnt
+        from fingerprints
+        group by `hash`, `song_id`, `offset`
+        having cnt > 1
+        order by cnt asc;
+
+    2) Get number of hashes by song:
+
+        select song_id, song_name, count(song_id) as num
+        from fingerprints
+        natural join songs
+        group by song_id
+        order by count(song_id) desc;
+
+    3) get hashes with highest number of collisions
+
+        select
+            hash,
+            count(distinct song_id) as n
+        from fingerprints
+        group by `hash`
+        order by n DESC;
+
+    => 26 different songs with same fingerprint (392 times):
+
+        select songs.song_name, fingerprints.offset
+        from fingerprints natural join songs
+        where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73";
+    """
+
+    type = "mysql"
+
+    # tables
+    FINGERPRINTS_TABLENAME = "fingerprints"
+    SONGS_TABLENAME = "songs"
+
+    # fields
+    FIELD_HASH = "hash"
+    FIELD_SONG_ID = "song_id"
+    FIELD_OFFSET = "offset"
+    FIELD_SONGNAME = "song_name"
+    FIELD_FINGERPRINTED = "fingerprinted"
+
+    # creates
+    CREATE_FINGERPRINTS_TABLE = """
+        CREATE TABLE IF NOT EXISTS `%s` (
+             `%s` binary(10) not null,
+             `%s` mediumint unsigned not null,
+             `%s` int unsigned not null,
+         PRIMARY KEY(%s),
+         UNIQUE(%s, %s, %s),
+         FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
+    ) ENGINE=INNODB;""" % (
+        FINGERPRINTS_TABLENAME, FIELD_HASH,
+        FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
+        FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
+        FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID
+    )
+
+    CREATE_SONGS_TABLE = """
+        CREATE TABLE IF NOT EXISTS `%s` (
+            `%s` mediumint unsigned not null auto_increment,
+            `%s` varchar(250) not null,
+            `%s` tinyint default 0,
+        PRIMARY KEY (`%s`),
+        UNIQUE KEY `%s` (`%s`)
+    ) ENGINE=INNODB;""" % (
+        SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED,
+        FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID,
+    )
+
+    # inserts (ignores duplicates)
+    INSERT_FINGERPRINT = """
+        INSERT IGNORE INTO %s (%s, %s, %s) values
+            (UNHEX(%%s), %%s, %%s);
+    """ % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET)
+
+    INSERT_SONG = "INSERT INTO %s (%s) values (%%s);" % (
+        SONGS_TABLENAME, FIELD_SONGNAME)
+
+    # selects
+    SELECT = """
+        SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
+    """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH)
+
+    SELECT_MULTIPLE = """
+        SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
+    """ % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET,
+           FINGERPRINTS_TABLENAME, FIELD_HASH)
+
+    SELECT_ALL = """
+        SELECT %s, %s FROM %s;
+    """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME)
+
+    SELECT_SONG = """
+        SELECT %s FROM %s WHERE %s = %%s
+    """ % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)
+
+    SELECT_NUM_FINGERPRINTS = """
+        SELECT COUNT(*) as n FROM %s
+    """ % (FINGERPRINTS_TABLENAME)
+
+    SELECT_UNIQUE_SONG_IDS = """
+        SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
+    """ % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
+
+    SELECT_SONGS = """
+        SELECT %s, %s FROM %s WHERE %s = 1;
+    """ % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED)
+
+    # drops
+    DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME
+    DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME
+
+    # update
+    UPDATE_SONG_FINGERPRINTED = """
+        UPDATE %s SET %s = 1 WHERE %s = %%s
+    """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID)
+
+    # delete
+    DELETE_UNFINGERPRINTED = """
+        DELETE FROM %s WHERE %s = 0;
+    """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED)
+
+    def __init__(self, **options):
+        super(SQLDatabase, self).__init__()
+        self.cursor = cursor_factory(**options)
+        self._options = options
+
+    def after_fork(self):
+        # Clear the cursor cache, we don't want any stale connections from
+        # the previous process.
+        Cursor.clear_cache()
+
+    def setup(self):
+        """
+        Creates any non-existing tables required for dejavu to function.
+
+        This also removes all songs that have been added but have no
+        fingerprints associated with them.
+        """
+        with self.cursor() as cur:
+            cur.execute(self.CREATE_SONGS_TABLE)
+            cur.execute(self.CREATE_FINGERPRINTS_TABLE)
+            cur.execute(self.DELETE_UNFINGERPRINTED)
+
+    def empty(self):
+        """
+        Drops tables created by dejavu and then creates them again
+        by calling `SQLDatabase.setup`.
+
+        .. warning:
+            This will result in a loss of data
+        """
+        with self.cursor() as cur:
+            cur.execute(self.DROP_FINGERPRINTS)
+            cur.execute(self.DROP_SONGS)
+
+        self.setup()
+
+    def delete_unfingerprinted_songs(self):
+        """
+        Removes all songs that have no fingerprints associated with them.
+        """
+        with self.cursor() as cur:
+            cur.execute(self.DELETE_UNFINGERPRINTED)
+
+    def get_num_songs(self):
+        """
+        Returns number of songs the database has fingerprinted.
+        """
+        with self.cursor() as cur:
+            cur.execute(self.SELECT_UNIQUE_SONG_IDS)
+
+            for count, in cur:
+                return count
+            return 0
+
+    def get_num_fingerprints(self):
+        """
+        Returns number of fingerprints the database has fingerprinted.
+        """
+        with self.cursor() as cur:
+            cur.execute(self.SELECT_NUM_FINGERPRINTS)
+
+            for count, in cur:
+                return count
+            return 0
+
+    def set_song_fingerprinted(self, sid):
+        """
+        Set the fingerprinted flag to TRUE (1) once a song has been completely
+        fingerprinted in the database.
+        """
+        with self.cursor() as cur:
+            cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,))
+
+    def get_songs(self):
+        """
+        Return songs that have the fingerprinted flag set TRUE (1).
+        """
+        with self.cursor(cursor_type=DictCursor) as cur:
+            cur.execute(self.SELECT_SONGS)
+            for row in cur:
+                yield row
+
+    def get_song_by_id(self, sid):
+        """
+        Returns song by its ID.
+        """
+        with self.cursor(cursor_type=DictCursor) as cur:
+            cur.execute(self.SELECT_SONG, (sid,))
+            return cur.fetchone()
+
+    def insert(self, hash, sid, offset):
+        """
+        Insert a (sha1, song_id, offset) row into database.
+        """
+        with self.cursor() as cur:
+            cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
+
+    def insert_song(self, songname):
+        """
+        Inserts song in the database and returns the ID of the inserted record.
+        """
+        with self.cursor() as cur:
+            cur.execute(self.INSERT_SONG, (songname,))
+            return cur.lastrowid
+
+    def query(self, hash):
+        """
+        Return all tuples associated with hash.
+
+        If hash is None, returns all entries in the
+        database (be careful with that one!).
+        """
+        # select all if no key
+        query = self.SELECT_ALL if hash is None else self.SELECT
+
+        with self.cursor() as cur:
+            cur.execute(query)
+            for sid, offset in cur:
+                yield (sid, offset)
+
+    def get_iterable_kv_pairs(self):
+        """
+        Returns all tuples in database.
+        """
+        return self.query(None)
+
+    def insert_hashes(self, sid, hashes):
+        """
+        Insert series of hash => song_id, offset
+        values into the database.
+        """
+        values = []
+        for hash, offset in hashes:
+            values.append((hash, sid, offset))
+
+        with self.cursor() as cur:
+            for split_values in grouper(values, 1000):
+                cur.executemany(self.INSERT_FINGERPRINT, split_values)
+
+    def return_matches(self, hashes):
+        """
+        Return the (song_id, offset_diff) tuples associated with
+        a list of (sha1, sample_offset) values.
+        """
+        # Create a dictionary of hash => offset pairs for later lookups
+        mapper = {}
+        for hash, offset in hashes:
+            mapper[hash.upper()] = offset
+
+        # Get an iterable of all the hashes we need
+        values = mapper.keys()
+
+        with self.cursor() as cur:
+            for split_values in grouper(values, 1000):
+                # Create our IN part of the query
+                query = self.SELECT_MULTIPLE
+                query = query % ', '.join(['UNHEX(%s)'] * len(split_values))
+
+                cur.execute(query, split_values)
+
+                for hash, sid, offset in cur:
+                    # (sid, db_offset - song_sampled_offset)
+                    yield (sid, offset - mapper[hash])
+
+
+def grouper(iterable, n, fillvalue=None):
+    args = [iter(iterable)] * n
+    return izip_longest(fillvalue=fillvalue, *args)
+
+
+def cursor_factory(**factory_options):
+    def cursor(**options):
+        options.update(factory_options)
+        return Cursor(**options)
+    return cursor
+
+
+class Cursor(object):
+    """
+    Establishes a connection to the database and returns an open cursor.
+
+
+    ```python
+    # Use as context manager
+    with Cursor() as cur:
+        cur.execute(query)
+    ```
+    """
+    _cache = Queue.Queue(maxsize=5)
+
+    def __init__(self, cursor_type=mysql.cursors.Cursor, **options):
+        super(Cursor, self).__init__()
+
+        try:
+            conn = self._cache.get_nowait()
+        except Queue.Empty:
+            conn = mysql.connect(**options)
+        else:
+            # Ping the connection before using it from the cache.
+            conn.ping(True)
+
+        self.conn = conn
+        self.conn.autocommit(False)
+        self.cursor_type = cursor_type
+
+    @classmethod
+    def clear_cache(cls):
+        cls._cache = Queue.Queue(maxsize=5)
+
+    def __enter__(self):
+        self.cursor = self.conn.cursor(self.cursor_type)
+        return self.cursor
+
+    def __exit__(self, extype, exvalue, traceback):
+        # if we had a MySQL related error we try to rollback the cursor.
+        if extype is mysql.MySQLError:
+            self.cursor.rollback()
+
+        self.cursor.close()
+        self.conn.commit()
+
+        # Put it back on the queue
+        try:
+            self._cache.put_nowait(self.conn)
+        except Queue.Full:
+            self.conn.close()

From e071804ea5550bfee19a55332345b887744bbdae Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Sat, 21 Dec 2013 12:01:05 +0100
Subject: [PATCH 26/27] Fixed the issue of the default database not being
 imported.

Fixed a bug in the SQL database pertaining to the use of grouper.
Made SQLDatabase pickleable, for better multiprocessing support.
---
 dejavu/database.py     |  4 ++++
 dejavu/database_sql.py | 11 +++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/dejavu/database.py b/dejavu/database.py
index b0bfc4a..5903541 100755
--- a/dejavu/database.py
+++ b/dejavu/database.py
@@ -164,3 +164,7 @@ def get_database(database_type=None):
             return db_cls
 
     raise TypeError("Unsupported database type supplied.")
+
+
+# Import our default database handler
+import dejavu.database_sql
diff --git a/dejavu/database_sql.py b/dejavu/database_sql.py
index 03c8ad6..a93b1f9 100644
--- a/dejavu/database_sql.py
+++ b/dejavu/database_sql.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import
-from itertools import izip_longest
+from itertools import izip_longest, ifilter
 import Queue
 
 import MySQLdb as mysql
@@ -302,10 +302,17 @@ class SQLDatabase(Database):
                     # (sid, db_offset - song_sampled_offset)
                     yield (sid, offset - mapper[hash])
 
+    def __getstate__(self):
+        return (self._options,)
+
+    def __setstate__(self, state):
+        self._options, = state
+        self.cursor = cursor_factory(**self._options)
+
 
 def grouper(iterable, n, fillvalue=None):
     args = [iter(iterable)] * n
-    return izip_longest(fillvalue=fillvalue, *args)
+    return (ifilter(None, values) for values in izip_longest(fillvalue=fillvalue, *args))
 
 
 def cursor_factory(**factory_options):

From 7d14e0734aece6cfaec445f18c79641e12032dd3 Mon Sep 17 00:00:00 2001
From: Wessie <wessie@wessie.info>
Date: Mon, 23 Dec 2013 14:59:08 +0100
Subject: [PATCH 27/27] Switched fingerprint_directory to using
 multiprocessing.Pool

Fixed an issue of 'grouper' items being generators due to ifilter usage.
Temporary fix applied for the need of referencing SQLDatabase.FIELD_SONGNAME in __init__
Cleaned up some pep8 style issues
---
 dejavu/__init__.py     | 114 +++++++++++++++++++++++------------------
 dejavu/database_sql.py |   5 +-
 2 files changed, 66 insertions(+), 53 deletions(-)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index e93edac..bb52f1e 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -1,9 +1,8 @@
 from dejavu.database import get_database
 import dejavu.decoder as decoder
 import fingerprint
-from multiprocessing import Process, cpu_count
+import multiprocessing
 import os
-import random
 
 
 class Dejavu(object):
@@ -30,57 +29,39 @@ class Dejavu(object):
 
     def fingerprint_directory(self, path, extensions, nprocesses=None):
         # Try to use the maximum amount of processes if not given.
-        if nprocesses is None:
-            try:
-                nprocesses = cpu_count()
-            except NotImplementedError:
-                nprocesses = 1
+        try:
+            nprocesses = nprocesses or multiprocessing.cpu_count()
+        except NotImplementedError:
+            nprocesses = 1
+        else:
+            nprocesses = 1 if nprocesses <= 0 else nprocesses
 
-        # convert files, shuffle order
-        files = list(decoder.find_files(path, extensions))
-        random.shuffle(files)
+        pool = multiprocessing.Pool(nprocesses)
 
-        files_split = chunkify(files, nprocesses)
+        results = []
+        for filename, _ in decoder.find_files(path, extensions):
+            # TODO: Don't queue up files that have already been fingerprinted.
+            result = pool.apply_async(_fingerprint_worker,
+                                      (filename, self.db))
+            results.append(result)
 
-        # split into processes here
-        processes = []
-        for i in range(nprocesses):
+        while len(results):
+            for result in results[:]:
+                # TODO: Handle errors gracefully and return them to the callee
+                # in some way.
+                try:
+                    result.get(timeout=2)
+                except multiprocessing.TimeoutError:
+                    continue
+                except:
+                    import traceback, sys
+                    traceback.print_exc(file=sys.stdout)
+                    results.remove(result)
+                else:
+                    results.remove(result)
 
-            # create process and start it
-            p = Process(target=self._fingerprint_worker,
-                        args=(files_split[i], self.db))
-            p.start()
-            processes.append(p)
-
-        # wait for all processes to complete
-        for p in processes:
-            p.join()
-
-    def _fingerprint_worker(self, files, db):
-        for filename, extension in files:
-
-            # if there are already fingerprints in database,
-            # don't re-fingerprint
-            song_name = os.path.basename(filename).split(".")[0]
-            if song_name in self.songnames_set:
-                print("-> Already fingerprinted, continuing...")
-                continue
-
-            channels, Fs = decoder.read(filename)
-
-            # insert song name into database
-            song_id = db.insert_song(song_name)
-
-            for c in range(len(channels)):
-                channel = channels[c]
-                print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name)
-
-                hashes = fingerprint.fingerprint(channel, Fs=Fs)
-
-                db.insert_hashes(song_id, hashes)
-
-            # only after done fingerprinting do confirm
-            db.set_song_fingerprinted(song_id)
+        pool.close()
+        pool.join()
 
     def fingerprint_file(self, filepath, song_name=None):
         channels, Fs = decoder.read(filepath)
@@ -122,12 +103,14 @@ class Dejavu(object):
                 largest_count = diff_counter[diff][sid]
                 song_id = sid
 
-        print("Diff is %d with %d offset-aligned matches" % (largest, largest_count))
+        print("Diff is %d with %d offset-aligned matches" % (largest,
+                                                             largest_count))
 
         # extract idenfication
         song = self.db.get_song_by_id(song_id)
         if song:
-            songname = song.get(SQLDatabase.FIELD_SONGNAME, None)
+            # TODO: Clarify what `get_song_by_id` should return.
+            songname = song.get("song_name", None)
         else:
             return None
 
@@ -145,6 +128,35 @@ class Dejavu(object):
         return r.recognize(*options, **kwoptions)
 
 
+def _fingerprint_worker(filename, db):
+    song_name, extension = os.path.splitext(os.path.basename(filename))
+
+    channels, Fs = decoder.read(filename)
+
+    # insert song into database
+    sid = db.insert_song(song_name)
+
+    channel_amount = len(channels)
+    for channeln, channel in enumerate(channels):
+        # TODO: Remove prints or change them into optional logging.
+        print("Fingerprinting channel %d/%d for %s" % (channeln + 1,
+                                                       channel_amount,
+                                                       filename))
+        hashes = fingerprint.fingerprint(channel, Fs=Fs)
+        print("Finished channel %d/%d for %s" % (channeln + 1, channel_amount,
+                                                 filename))
+
+        print("Inserting fingerprints for channel %d/%d for %s" %
+              (channeln + 1, channel_amount, filename))
+        db.insert_hashes(sid, hashes)
+        print("Finished inserting for channel %d/%d for  %s" %
+              (channeln + 1, channel_amount, filename))
+
+    print("Marking %s finished" % (filename,))
+    db.set_song_fingerprinted(sid)
+    print("%s finished" % (filename,))
+
+
 def chunkify(lst, n):
     """
     Splits a list into roughly n equal parts.
diff --git a/dejavu/database_sql.py b/dejavu/database_sql.py
index a93b1f9..565d83f 100644
--- a/dejavu/database_sql.py
+++ b/dejavu/database_sql.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import
-from itertools import izip_longest, ifilter
+from itertools import izip_longest
 import Queue
 
 import MySQLdb as mysql
@@ -312,7 +312,8 @@ class SQLDatabase(Database):
 
 def grouper(iterable, n, fillvalue=None):
     args = [iter(iterable)] * n
-    return (ifilter(None, values) for values in izip_longest(fillvalue=fillvalue, *args))
+    return (filter(None, values) for values
+            in izip_longest(fillvalue=fillvalue, *args))
 
 
 def cursor_factory(**factory_options):