From 6a6ae94e3d2c5aee5b6cf0f60d26c4f0b4276173 Mon Sep 17 00:00:00 2001 From: Vin Date: Mon, 16 Dec 2013 23:12:50 +0000 Subject: [PATCH 01/27] Refactored the fingerprint module --- dejavu/fingerprint.py | 221 ++++++++++++++++++++++-------------------- 1 file changed, 116 insertions(+), 105 deletions(-) mode change 100644 => 100755 dejavu/fingerprint.py diff --git a/dejavu/fingerprint.py b/dejavu/fingerprint.py old mode 100644 new mode 100755 index d1f78cc..41451f3 --- a/dejavu/fingerprint.py +++ b/dejavu/fingerprint.py @@ -13,19 +13,117 @@ import time import hashlib import pickle +IDX_FREQ_I = 0 +IDX_TIME_J = 1 + +DEFAULT_FS = 44100 +DEFAULT_WINDOW_SIZE = 4096 +DEFAULT_OVERLAP_RATIO = 0.5 +DEFAULT_FAN_VALUE = 15 + +DEFAULT_AMP_MIN = 10 +PEAK_NEIGHBORHOOD_SIZE = 20 +MIN_HASH_TIME_DELTA = 0 + +def fingerprint(channel_samples, + Fs=DEFAULT_FS, + wsize=DEFAULT_WINDOW_SIZE, + wratio=DEFAULT_OVERLAP_RATIO, + fan_value=DEFAULT_FAN_VALUE, + amp_min=DEFAULT_AMP_MIN): + """ + FFT the channel, log transform output, find local maxima, then return + locally sensitive hashes. + """ + # FFT the signal and extract frequency components + arr2D = mlab.specgram( + channel_samples, + NFFT=wsize, + Fs=Fs, + window=mlab.window_hanning, + noverlap=int(wsize * wratio))[0] + + # apply log transform since specgram() returns linear array + arr2D = 10 * np.log10(arr2D) + arr2D[arr2D == -np.inf] = 0 # replace infs with zeros + + # find local maxima + local_maxima = get_2D_peaks(arr2D, plot=False, amp_min=amp_min) + + # return hashes + return generate_hashes(local_maxima, fan_value=fan_value) + +def get_2D_peaks(arr2D, plot=False, amp_min=DEFAULT_AMP_MIN): + + # http://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.morphology.iterate_structure.html#scipy.ndimage.morphology.iterate_structure + struct = generate_binary_structure(2, 1) + neighborhood = iterate_structure(struct, PEAK_NEIGHBORHOOD_SIZE) + + # find local maxima using our fliter shape + local_max = maximum_filter(arr2D, footprint=neighborhood) == arr2D + background = (arr2D == 0) + eroded_background = binary_erosion(background, structure=neighborhood, border_value=1) + detected_peaks = local_max - eroded_background # this is a boolean mask of arr2D with True at peaks + + # extract peaks + amps = arr2D[detected_peaks] + j, i = np.where(detected_peaks) + + # filter peaks + amps = amps.flatten() + peaks = zip(i, j, amps) + peaks_filtered = [x for x in peaks if x[2] > amp_min] # freq, time, amp + + # get indices for frequency and time + frequency_idx = [x[1] for x in peaks_filtered] + time_idx = [x[0] for x in peaks_filtered] + + if plot: + # scatter of the peaks + fig, ax = plt.subplots() + ax.imshow(arr2D) + ax.scatter(time_idx, frequency_idx) + ax.set_xlabel('Time') + ax.set_ylabel('Frequency') + ax.set_title("Spectrogram of \"Blurred Lines\" by Robin Thicke"); + plt.gca().invert_yaxis() + plt.show() + + return zip(frequency_idx, time_idx) + +def generate_hashes(peaks, fan_value=DEFAULT_FAN_VALUE): + """ + Hash list structure: + sha1-hash[0:20] time_offset + [(e05b341a9b77a51fd26, 32), ... ] + """ + fingerprinted = set() # to avoid rehashing same pairs + hashes = [] + + for i in range(len(peaks)): + for j in range(fan_value): + if i+j < len(peaks) and not (i, i+j) in fingerprinted: + + freq1 = peaks[i][IDX_FREQ_I] + freq2 = peaks[i+j][IDX_FREQ_I] + t1 = peaks[i][IDX_TIME_J] + t2 = peaks[i+j][IDX_TIME_J] + t_delta = t2 - t1 + + if t_delta >= MIN_HASH_TIME_DELTA: + h = hashlib.sha1("%s|%s|%s" % (str(freq1), str(freq2), str(t_delta))) + hashes.append((h.hexdigest()[0:20], t1)) + + # ensure we don't repeat hashing + fingerprinted.add((i, i+j)) + return hashes + +# TODO: move all of the below to a class with DB access + + class Fingerprinter(): - IDX_FREQ_I = 0 - IDX_TIME_J = 1 - - DEFAULT_FS = 44100 - DEFAULT_WINDOW_SIZE = 4096 - DEFAULT_OVERLAP_RATIO = 0.5 - DEFAULT_FAN_VALUE = 15 - - DEFAULT_AMP_MIN = 10 - PEAK_NEIGHBORHOOD_SIZE = 20 - MIN_HASH_TIME_DELTA = 0 + def __init__(self, config, Fs=DEFAULT_FS, @@ -55,104 +153,15 @@ class Fingerprinter(): hashes = self.process_channel(samples, song_id=sid) print "Generated %d hashes" % len(hashes) self.db.insert_hashes(hashes) - + + # TODO: put this in another module def match(self, samples): """Used for matching unknown songs""" hashes = self.process_channel(samples) matches = self.db.return_matches(hashes) return matches - def process_channel(self, channel_samples, song_id=None): - """ - FFT the channel, log transform output, find local maxima, then return - locally sensitive hashes. - """ - # FFT the signal and extract frequency components - arr2D = mlab.specgram( - channel_samples, - NFFT=self.window_size, - Fs=self.Fs, - window=mlab.window_hanning, - noverlap=self.noverlap)[0] - - # apply log transform since specgram() returns linear array - arr2D = 10 * np.log10(arr2D) - arr2D[arr2D == -np.inf] = 0 # replace infs with zeros - - # find local maxima - local_maxima = self.get_2D_peaks(arr2D, plot=False) - - # return hashes - return self.generate_hashes(local_maxima, song_id=song_id) - - def get_2D_peaks(self, arr2D, plot=False): - - # http://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.morphology.iterate_structure.html#scipy.ndimage.morphology.iterate_structure - struct = generate_binary_structure(2, 1) - neighborhood = iterate_structure(struct, Fingerprinter.PEAK_NEIGHBORHOOD_SIZE) - - # find local maxima using our fliter shape - local_max = maximum_filter(arr2D, footprint=neighborhood) == arr2D - background = (arr2D == 0) - eroded_background = binary_erosion(background, structure=neighborhood, border_value=1) - detected_peaks = local_max - eroded_background # this is a boolean mask of arr2D with True at peaks - - # extract peaks - amps = arr2D[detected_peaks] - j, i = np.where(detected_peaks) - - # filter peaks - amps = amps.flatten() - peaks = zip(i, j, amps) - peaks_filtered = [x for x in peaks if x[2] > self.amp_min] # freq, time, amp - - # get indices for frequency and time - frequency_idx = [x[1] for x in peaks_filtered] - time_idx = [x[0] for x in peaks_filtered] - - if plot: - # scatter of the peaks - fig, ax = plt.subplots() - ax.imshow(arr2D) - ax.scatter(time_idx, frequency_idx) - ax.set_xlabel('Time') - ax.set_ylabel('Frequency') - ax.set_title("Spectrogram of \"Blurred Lines\" by Robin Thicke"); - plt.gca().invert_yaxis() - plt.show() - - return zip(frequency_idx, time_idx) - - def generate_hashes(self, peaks, song_id=None): - """ - Hash list structure: - sha1-hash[0:20] song_id, time_offset - [(e05b341a9b77a51fd26, (3, 32)), ... ] - """ - fingerprinted = set() # to avoid rehashing same pairs - hashes = [] - - for i in range(len(peaks)): - for j in range(self.fan_value): - if i+j < len(peaks) and not (i, i+j) in fingerprinted: - - freq1 = peaks[i][Fingerprinter.IDX_FREQ_I] - freq2 = peaks[i+j][Fingerprinter.IDX_FREQ_I] - t1 = peaks[i][Fingerprinter.IDX_TIME_J] - t2 = peaks[i+j][Fingerprinter.IDX_TIME_J] - t_delta = t2 - t1 - - if t_delta >= Fingerprinter.MIN_HASH_TIME_DELTA: - h = hashlib.sha1("%s|%s|%s" % (str(freq1), str(freq2), str(t_delta))) - hashes.append((h.hexdigest()[0:20], (song_id, t1))) - - # ensure we don't repeat hashing - fingerprinted.add((i, i+j)) - return hashes - - def insert_into_db(self, key, value): - self.db.insert(key, value) - + # TODO: this function has nothing to do with fingerprinting. is it needed? def print_stats(self): iterable = self.db.get_iterable_kv_pairs() @@ -168,10 +177,12 @@ class Fingerprinter(): for song_id, count in counter.iteritems(): song_name = self.song_names[song_id] print "%s has %d spectrogram peaks" % (song_name, count) - + + # this does... what? this seems to only be used for the above function def set_song_names(self, wpaths): self.song_names = wpaths - + + # TODO: put this in another module def align_matches(self, matches, starttime, record_seconds=0, verbose=False): """ Finds hash matches that align in time with other matches and finds From a4ed61265889b63bd9bd76370f847dc50c5c341c Mon Sep 17 00:00:00 2001 From: Vin Date: Mon, 16 Dec 2013 23:38:58 +0000 Subject: [PATCH 02/27] Moved main Dejavu class from dejavu.control to dejavu --- dejavu/__init__.py | 176 ++++++++++++++++++++++++++++++++++++++++++ dejavu/control.py | 107 ------------------------- dejavu/fingerprint.py | 58 +------------- go.py | 2 +- 4 files changed, 178 insertions(+), 165 deletions(-) mode change 100644 => 100755 dejavu/__init__.py delete mode 100644 dejavu/control.py mode change 100644 => 100755 go.py diff --git a/dejavu/__init__.py b/dejavu/__init__.py old mode 100644 new mode 100755 index e69de29..b44897d --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -0,0 +1,176 @@ +from dejavu.database import SQLDatabase +from dejavu.convert import Converter +import dejavu.fingerprint as fingerprint +from scipy.io import wavfile +from multiprocessing import Process +import wave, os +import random + +DEBUG = False + +class Dejavu(): + + def __init__(self, config): + + self.config = config + + # initialize db + database = SQLDatabase( + self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME), + self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME), + self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD), + self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE)) + self.db = database + + # create components + self.converter = Converter() + #self.fingerprinter = Fingerprinter(self.config) + self.db.setup() + + # get songs previously indexed + self.songs = self.db.get_songs() + self.songnames_set = set() # to know which ones we've computed before + if self.songs: + for song in self.songs: + song_id = song[SQLDatabase.FIELD_SONG_ID] + song_name = song[SQLDatabase.FIELD_SONGNAME] + self.songnames_set.add(song_name) + print "Added: %s to the set of fingerprinted songs..." % song_name + + def chunkify(self, lst, n): + """ + Splits a list into roughly n equal parts. + http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts + """ + return [lst[i::n] for i in xrange(n)] + + def fingerprint(self, path, output, extensions, nprocesses): + + # convert files, shuffle order + files = self.converter.find_files(path, extensions) + random.shuffle(files) + files_split = self.chunkify(files, nprocesses) + + # split into processes here + processes = [] + for i in range(nprocesses): + + # need database instance since mysql connections shouldn't be shared across processes + sql_connection = SQLDatabase( + self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME), + self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME), + self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD), + self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE)) + + # create process and start it + p = Process(target=self.fingerprint_worker, args=(files_split[i], sql_connection, output)) + p.start() + processes.append(p) + + # wait for all processes to complete + for p in processes: + p.join() + + # delete orphans + # print "Done fingerprinting. Deleting orphaned fingerprints..." + # TODO: need a more performant query in database.py for the + #self.fingerprinter.db.delete_orphans() + + def fingerprint_worker(self, files, sql_connection, output): + + for filename, extension in files: + + # if there are already fingerprints in database, don't re-fingerprint or convert + song_name = os.path.basename(filename).split(".")[0] + if DEBUG and song_name in self.songnames_set: + print("-> Already fingerprinted, continuing...") + continue + + # convert to WAV + wavout_path = self.converter.convert(filename, extension, Converter.WAV, output, song_name) + + # insert song name into database + song_id = sql_connection.insert_song(song_name) + + # for each channel perform FFT analysis and fingerprinting + channels, Fs = self.extract_channels(wavout_path) + for c in range(len(channels)): + channel = channels[c] + print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name) + hashes = fingerprint.fingerprint(channel, Fs=Fs) + sql_connection.insert_hashes(song_id, hashes) + + # only after done fingerprinting do confirm + sql_connection.set_song_fingerprinted(song_id) + + def extract_channels(self, path): + """ + Reads channels from disk. + Returns a tuple with (channels, sample_rate) + """ + channels = [] + Fs, frames = wavfile.read(path) + wave_object = wave.open(path) + nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams() + #assert Fs == self.fingerprinter.Fs + + for channel in range(nchannels): + channels.append(frames[:, channel]) + return (channels, Fs) + + def match(self, samples, Fs=fingerprint.DEFAULT_FS): + hashes = fingerprint.fingerprint(samples, Fs=Fs) + return self.db.return_matches(hashes) + + def align_matches(self, matches, starttime, record_seconds=None): + """ + Finds hash matches that align in time with other matches and finds + consensus about which hashes are "true" signal from the audio. + + Returns a dictionary with match information. + """ + # align by diffs + diff_counter = {} + largest = 0 + largest_count = 0 + song_id = -1 + for tup in matches: + sid, diff = tup + if not diff in diff_counter: + diff_counter[diff] = {} + if not sid in diff_counter[diff]: + diff_counter[diff][sid] = 0 + diff_counter[diff][sid] += 1 + + if diff_counter[diff][sid] > largest_count: + largest = diff + largest_count = diff_counter[diff][sid] + song_id = sid + + if DEBUG: + print("Diff is %d with %d offset-aligned matches" % (largest, largest_count)) + + # extract idenfication + song = self.db.get_song_by_id(song_id) + if song: + songname = song.get(SQLDatabase.FIELD_SONGNAME, None) + else: + return None + songname = songname.replace("_", " ") + elapsed = time.time() - starttime + + if DEBUG: + print("Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed)) + + # return match info + song = { + "song_id" : song_id, + "song_name" : songname, + "match_time" : elapsed, + "confidence" : largest_count + } + + if record_seconds: + song['record_time'] = record_seconds + + return song \ No newline at end of file diff --git a/dejavu/control.py b/dejavu/control.py deleted file mode 100644 index 606e366..0000000 --- a/dejavu/control.py +++ /dev/null @@ -1,107 +0,0 @@ -from dejavu.database import SQLDatabase -from dejavu.convert import Converter -from dejavu.fingerprint import Fingerprinter -from scipy.io import wavfile -from multiprocessing import Process -import wave, os -import random - -class Dejavu(): - - def __init__(self, config): - - self.config = config - - # create components - self.converter = Converter() - self.fingerprinter = Fingerprinter(self.config) - self.fingerprinter.db.setup() - - # get songs previously indexed - self.songs = self.fingerprinter.db.get_songs() - self.songnames_set = set() # to know which ones we've computed before - if self.songs: - for song in self.songs: - song_id = song[SQLDatabase.FIELD_SONG_ID] - song_name = song[SQLDatabase.FIELD_SONGNAME] - self.songnames_set.add(song_name) - print "Added: %s to the set of fingerprinted songs..." % song_name - - def chunkify(self, lst, n): - """ - Splits a list into roughly n equal parts. - http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts - """ - return [lst[i::n] for i in xrange(n)] - - def fingerprint(self, path, output, extensions, nprocesses): - - # convert files, shuffle order - files = self.converter.find_files(path, extensions) - random.shuffle(files) - files_split = self.chunkify(files, nprocesses) - - # split into processes here - processes = [] - for i in range(nprocesses): - - # need database instance since mysql connections shouldn't be shared across processes - sql_connection = SQLDatabase( - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE)) - - # create process and start it - p = Process(target=self.fingerprint_worker, args=(files_split[i], sql_connection, output)) - p.start() - processes.append(p) - - # wait for all processes to complete - for p in processes: - p.join() - - # delete orphans - # print "Done fingerprinting. Deleting orphaned fingerprints..." - # TODO: need a more performant query in database.py for the - #self.fingerprinter.db.delete_orphans() - - def fingerprint_worker(self, files, sql_connection, output): - - for filename, extension in files: - - # if there are already fingerprints in database, don't re-fingerprint or convert - song_name = os.path.basename(filename).split(".")[0] - if song_name in self.songnames_set: - print "-> Already fingerprinted, continuing..." - continue - - # convert to WAV - wavout_path = self.converter.convert(filename, extension, Converter.WAV, output, song_name) - - # insert song name into database - song_id = sql_connection.insert_song(song_name) - - # for each channel perform FFT analysis and fingerprinting - channels = self.extract_channels(wavout_path) - for c in range(len(channels)): - channel = channels[c] - print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name) - self.fingerprinter.fingerprint(channel, wavout_path, song_id, c+1) - - # only after done fingerprinting do confirm - sql_connection.set_song_fingerprinted(song_id) - - def extract_channels(self, path): - """ - Reads channels from disk. - """ - channels = [] - Fs, frames = wavfile.read(path) - wave_object = wave.open(path) - nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams() - assert Fs == self.fingerprinter.Fs - - for channel in range(nchannels): - channels.append(frames[:, channel]) - return channels \ No newline at end of file diff --git a/dejavu/fingerprint.py b/dejavu/fingerprint.py index 41451f3..6108799 100755 --- a/dejavu/fingerprint.py +++ b/dejavu/fingerprint.py @@ -133,12 +133,7 @@ class Fingerprinter(): amp_min=DEFAULT_AMP_MIN): self.config = config - database = SQLDatabase( - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE)) - self.db = database + self.Fs = Fs self.dt = 1.0 / self.Fs @@ -183,55 +178,4 @@ class Fingerprinter(): self.song_names = wpaths # TODO: put this in another module - def align_matches(self, matches, starttime, record_seconds=0, verbose=False): - """ - Finds hash matches that align in time with other matches and finds - consensus about which hashes are "true" signal from the audio. - - Returns a dictionary with match information. - """ - # align by diffs - diff_counter = {} - largest = 0 - largest_count = 0 - song_id = -1 - for tup in matches: - sid, diff = tup - if not diff in diff_counter: - diff_counter[diff] = {} - if not sid in diff_counter[diff]: - diff_counter[diff][sid] = 0 - diff_counter[diff][sid] += 1 - if diff_counter[diff][sid] > largest_count: - largest = diff - largest_count = diff_counter[diff][sid] - song_id = sid - - if verbose: - print "Diff is %d with %d offset-aligned matches" % (largest, largest_count) - - # extract idenfication - song = self.db.get_song_by_id(song_id) - if song: - songname = song.get(SQLDatabase.FIELD_SONGNAME, None) - else: - return None - songname = songname.replace("_", " ") - elapsed = time.time() - starttime - - if verbose: - print "Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed) - - # return match info - song = { - "song_id" : song_id, - "song_name" : songname, - "match_time" : elapsed, - "confidence" : largest_count - } - - if record_seconds: - song['record_time'] = record_seconds - - return song diff --git a/go.py b/go.py old mode 100644 new mode 100755 index 2aaee89..9108f4e --- a/go.py +++ b/go.py @@ -1,4 +1,4 @@ -from dejavu.control import Dejavu +from dejavu import Dejavu from ConfigParser import ConfigParser import warnings warnings.filterwarnings("ignore") From 0bd7219b872aa85dbab030d3ff21d4185e26f901 Mon Sep 17 00:00:00 2001 From: Wessie Date: Tue, 17 Dec 2013 01:39:03 +0100 Subject: [PATCH 03/27] Cleaned up the database driver. - The SQLDatabase class now uses a context manager for mysql access. - Most of the error handling is done by the context manager now - Optimized several methods that returned a list into returning a generator - Optimized return_matches to use an IN query instead. - Other small fixes. --- dejavu/cursor.py | 52 ++++++ dejavu/database.py | 397 ++++++++++++++++++++++----------------------- 2 files changed, 243 insertions(+), 206 deletions(-) create mode 100644 dejavu/cursor.py diff --git a/dejavu/cursor.py b/dejavu/cursor.py new file mode 100644 index 0000000..1b46b52 --- /dev/null +++ b/dejavu/cursor.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals +from __future__ import absolute_import +import Queue + +import pymysql +import pymysql.cursors + + +def cursor_factory(**factory_options): + def cursor(**options): + options.update(factory_options) + return Cursor(**options) + return cursor + + +class Cursor(object): + """ + Establishes a connection to the database and returns an open cursor. + + + ```python + # Use as context manager + with Cursor() as cur: + cur.execute(query) + ``` + """ + _cache = Queue.Queue(maxsize=5) + + def __init__(self, cursor_type=pymysql.cursors.DictCursor, **options): + super(Cursor, self).__init__() + + try: + conn = self._cache.get_nowait() + except Queue.Empty: + conn = pymysql.connect(**options) + + self.conn = conn + self.cursor_type = cursor_type + + def __enter__(self): + self.cursor = self.conn.cursor(self.cursor_type) + return self.cursor + + def __exit__(self, type, value, traceback): + self.cursor.close() + self.conn.commit() + + # Put it back on the queue + try: + self._cache.put_nowait(self.conn) + except Queue.Full: + self.conn.close() diff --git a/dejavu/database.py b/dejavu/database.py index f03e33e..018ddd4 100644 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -1,40 +1,44 @@ -import MySQLdb as mysql -import MySQLdb.cursors as cursors -import os +from __future__ import absolute_import +from binascii import unhexlify -class SQLDatabase(): +class Database(object): + def __init__(self): + super(Database, self).__init__() + + +class SQLDatabase(Database): """ Queries: 1) Find duplicates (shouldn't be any, though): - select `hash`, `song_id`, `offset`, count(*) cnt - from fingerprints - group by `hash`, `song_id`, `offset` + select `hash`, `song_id`, `offset`, count(*) cnt + from fingerprints + group by `hash`, `song_id`, `offset` having cnt > 1 order by cnt asc; 2) Get number of hashes by song: - select song_id, song_name, count(song_id) as num - from fingerprints + select song_id, song_name, count(song_id) as num + from fingerprints natural join songs - group by song_id + group by song_id order by count(song_id) desc; 3) get hashes with highest number of collisions - select - hash, - count(distinct song_id) as n - from fingerprints - group by `hash` + select + hash, + count(distinct song_id) as n + from fingerprints + group by `hash` order by n DESC; => 26 different songs with same fingerprint (392 times): - - select songs.song_name, fingerprints.offset - from fingerprints natural join songs + + select songs.song_name, fingerprints.offset + from fingerprints natural join songs where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73"; """ @@ -57,269 +61,250 @@ class SQLDatabase(): FIELD_FINGERPRINTED = "fingerprinted" # creates - CREATE_FINGERPRINTS_TABLE = """ - CREATE TABLE IF NOT EXISTS `%s` ( - `%s` binary(10) not null, - `%s` mediumint unsigned not null, - `%s` int unsigned not null, + CREATE_FINGERPRINTS_TABLE = """ + CREATE TABLE IF NOT EXISTS `%s` ( + `%s` binary(10) not null, + `%s` mediumint unsigned not null, + `%s` int unsigned not null, INDEX(%s), UNIQUE(%s, %s, %s) - );""" % (FINGERPRINTS_TABLENAME, FIELD_HASH, - FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, - FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH) - + );""" % ( + FINGERPRINTS_TABLENAME, FIELD_HASH, + FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, + FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, + ) + CREATE_SONGS_TABLE = """ - CREATE TABLE IF NOT EXISTS `%s` ( - `%s` mediumint unsigned not null auto_increment, - `%s` varchar(250) not null, - `%s` tinyint default 0, + CREATE TABLE IF NOT EXISTS `%s` ( + `%s` mediumint unsigned not null auto_increment, + `%s` varchar(250) not null, + `%s` tinyint default 0, PRIMARY KEY (`%s`), UNIQUE KEY `%s` (`%s`) - );""" % (SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED, - FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID) + );""" % ( + SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED, + FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID, + ) + + # inserts (ignores duplicates) + INSERT_FINGERPRINT = """ + INSERT IGNORE INTO %s (%s, %s, %s) VALUES + (UNHEX(%%s), %%s, %%s); + """ % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET) - # inserts - INSERT_FINGERPRINT = "INSERT IGNORE INTO %s (%s, %s, %s) VALUES (UNHEX(%%s), %%s, %%s)" % ( - FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET) # ignore duplicates and don't insert them INSERT_SONG = "INSERT INTO %s (%s) VALUES (%%s);" % ( SONGS_TABLENAME, FIELD_SONGNAME) # selects - SELECT = "SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH) - SELECT_ALL = "SELECT %s, %s FROM %s;" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME) - SELECT_SONG = "SELECT %s FROM %s WHERE %s = %%s" % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID) - SELECT_NUM_FINGERPRINTS = "SELECT COUNT(*) as n FROM %s" % (FINGERPRINTS_TABLENAME) - - SELECT_UNIQUE_SONG_IDS = "SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;" % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED) - SELECT_SONGS = "SELECT %s, %s FROM %s WHERE %s = 1;" % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED) + SELECT = """ + SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s); + """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH) + + SELECT_MULTIPLE = """ + SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s); + """ % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET, + FINGERPRINTS_TABLENAME, FIELD_HASH) + + SELECT_ALL = """ + SELECT %s, %s FROM %s; + """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME) + + SELECT_SONG = """ + SELECT %s FROM %s WHERE %s = %%s + """ % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID) + + SELECT_NUM_FINGERPRINTS = """ + SELECT COUNT(*) as n FROM %s + """ % (FINGERPRINTS_TABLENAME) + + SELECT_UNIQUE_SONG_IDS = """ + SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1; + """ % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED) + + SELECT_SONGS = """ + SELECT %s, %s FROM %s WHERE %s = 1; + """ % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED) # drops DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME # update - UPDATE_SONG_FINGERPRINTED = "UPDATE %s SET %s = 1 WHERE %s = %%s" % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID) + UPDATE_SONG_FINGERPRINTED = """ + UPDATE %s SET %s = 1 WHERE %s = %%s + """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID) # delete - DELETE_UNFINGERPRINTED = "DELETE FROM %s WHERE %s = 0;" % (SONGS_TABLENAME, FIELD_FINGERPRINTED) + DELETE_UNFINGERPRINTED = """ + DELETE FROM %s WHERE %s = 0; + """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED) + DELETE_ORPHANS = """ - delete from fingerprints - where not exists ( - select * from songs where fingerprints.song_id = songs.song_id - )""" - - def __init__(self, hostname, username, password, database): - # connect - self.database = database - try: - # http://www.halfcooked.com/mt/archives/000969.html - self.connection = mysql.connect( - hostname, username, password, - database, cursorclass=cursors.DictCursor) + delete from fingerprints + where not exists ( + select * from songs where fingerprints.song_id = songs.song_id + ); + """ - self.connection.autocommit(False) # for fast bulk inserts - self.cursor = self.connection.cursor() - - except mysql.Error, e: - print "Connection error %d: %s" % (e.args[0], e.args[1]) + def __init__(self, cursor): + super(SQLDatabase, self).__init__() + self.cursor = cursor def setup(self): - try: - # create fingerprints table - self.cursor.execute("USE %s;" % self.database) - self.cursor.execute(SQLDatabase.CREATE_FINGERPRINTS_TABLE) - self.cursor.execute(SQLDatabase.CREATE_SONGS_TABLE) - self.delete_unfingerprinted_songs() - self.connection.commit() - except mysql.Error, e: - print "Connection error %d: %s" % (e.args[0], e.args[1]) - self.connection.rollback() + with self.cursor() as cur: + cur.execute(self.CREATE_FINGERPRINTS_TABLE) + cur.execute(self.CREATE_SONGS_TABLE) + cur.execute(self.DELETE_UNFINGERPRINTED) def empty(self): """ Drops all tables and re-adds them. Be carfeul with this! """ - try: - self.cursor.execute("USE %s;" % self.database) + with self.cursor() as cur: + cur.execute(self.DROP_FINGERPRINTS) + cur.execute(self.DROP_SONGS) - # drop tables - self.cursor.execute(SQLDatabase.DROP_FINGERPRINTS) - self.cursor.execute(SQLDatabase.DROP_SONGS) + self.setup() - # recreate - self.cursor.execute(SQLDatabase.CREATE_FINGERPRINTS_TABLE) - self.cursor.execute(SQLDatabase.CREATE_SONGS_TABLE) - self.connection.commit() - - except mysql.Error, e: - print "Error in empty(), %d: %s" % (e.args[0], e.args[1]) - self.connection.rollback() - def delete_orphans(self): - try: - self.cursor = self.connection.cursor() - ### TODO: SQLDatabase.DELETE_ORPHANS is not performant enough, need better query - ### to delete fingerprints for which no song is tied to. - #self.cursor.execute(SQLDatabase.DELETE_ORPHANS) - #self.connection.commit() - except mysql.Error, e: - print "Error in delete_orphans(), %d: %s" % (e.args[0], e.args[1]) - self.connection.rollback() - + # TODO: SQLDatabase.DELETE_ORPHANS is not + # performant enough, need better query to + # delete fingerprints for which no song is tied to. + + # with self.cursor() as cur: + # cur.execute(self.DELETE_ORPHANS) + pass + def delete_unfingerprinted_songs(self): - try: - self.cursor = self.connection.cursor() - self.cursor.execute(SQLDatabase.DELETE_UNFINGERPRINTED) - self.connection.commit() - except mysql.Error, e: - print "Error in delete_unfingerprinted_songs(), %d: %s" % (e.args[0], e.args[1]) - self.connection.rollback() + with self.cursor() as cur: + cur.execute(self.DELETE_UNFINGERPRINTED) def get_num_songs(self): """ - Returns number of songs the database has fingerprinted. + Returns number of songs the database has fingerprinted. """ - try: - self.cursor = self.connection.cursor() - self.cursor.execute(SQLDatabase.SELECT_UNIQUE_SONG_IDS) - record = self.cursor.fetchone() - return int(record['n']) - except mysql.Error, e: - print "Error in get_num_songs(), %d: %s" % (e.args[0], e.args[1]) - + with self.cursor() as cur: + cur.execute(self.SELECT_UNIQUE_SONG_IDS) + + for row in cur: + return row['n'] + def get_num_fingerprints(self): """ - Returns number of fingerprints the database has fingerprinted. + Returns number of fingerprints the database has fingerprinted. """ - try: - self.cursor = self.connection.cursor() - self.cursor.execute(SQLDatabase.SELECT_NUM_FINGERPRINTS) - record = self.cursor.fetchone() - return int(record['n']) - except mysql.Error, e: - print "Error in get_num_songs(), %d: %s" % (e.args[0], e.args[1]) - + with self.cursor() as cur: + cur.execute(self.SELECT_NUM_FINGERPRINTS) - def set_song_fingerprinted(self, song_id): + for row in cur: + return row['n'] + + def set_song_fingerprinted(self, sid): """ - Set the fingerprinted flag to TRUE (1) once a song has been completely - fingerprinted in the database. + Set the fingerprinted flag to TRUE (1) once a song has been completely + fingerprinted in the database. """ - try: - self.cursor = self.connection.cursor() - self.cursor.execute(SQLDatabase.UPDATE_SONG_FINGERPRINTED, song_id) - self.connection.commit() - except mysql.Error, e: - print "Error in set_song_fingerprinted(), %d: %s" % (e.args[0], e.args[1]) - self.connection.rollback() + with self.cursor() as cur: + cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,)) def get_songs(self): """ - Return songs that have the fingerprinted flag set TRUE (1). + Return songs that have the fingerprinted flag set TRUE (1). """ - try: - self.cursor.execute(SQLDatabase.SELECT_SONGS) - return self.cursor.fetchall() - except mysql.Error, e: - print "Error in get_songs(), %d: %s" % (e.args[0], e.args[1]) - return None - + with self.cursor() as cur: + cur.execute(self.SELECT_SONGS) + for row in cur: + yield row + def get_song_by_id(self, sid): """ - Returns song by its ID. + Returns song by its ID. """ - try: - self.cursor.execute(SQLDatabase.SELECT_SONG, (sid,)) - return self.cursor.fetchone() - except mysql.Error, e: - print "Error in get_songs(), %d: %s" % (e.args[0], e.args[1]) - return None - + with self.cursor() as cur: + cur.execute(self.SELECT_SONG, (sid,)) + return cur.fetchone() - def insert(self, key, value): + def insert(self, hash, sid, offset): """ - Insert a (sha1, song_id, offset) row into database. - - key is a sha1 hash, value = (song_id, offset) + Insert a (sha1, song_id, offset) row into database. """ - try: - args = (key, value[0], value[1]) - self.cursor.execute(SQLDatabase.INSERT_FINGERPRINT, args) - except mysql.Error, e: - print "Error in insert(), %d: %s" % (e.args[0], e.args[1]) - self.connection.rollback() + with self.cursor() as cur: + cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset)) def insert_song(self, songname): """ - Inserts song in the database and returns the ID of the inserted record. + Inserts song in the database and returns the ID of the inserted record. """ - try: - self.cursor.execute(SQLDatabase.INSERT_SONG, (songname,)) - self.connection.commit() - return int(self.cursor.lastrowid) - except mysql.Error, e: - print "Error in insert_song(), %d: %s" % (e.args[0], e.args[1]) - self.connection.rollback() - return None + with self.cursor() as cur: + cur.execute(self.INSERT_SONG, (songname,)) + return cur.lastrowid - def query(self, key): + def query(self, hash): """ - Return all tuples associated with hash. + Return all tuples associated with hash. - If hash is None, returns all entries in the - database (be careful with that one!). + If hash is None, returns all entries in the + database (be careful with that one!). """ # select all if no key - if key is not None: - sql = SQLDatabase.SELECT - else: - sql = SQLDatabase.SELECT_ALL + query = self.SELECT_ALL if hash is None else self.SELECT - matches = [] - try: - self.cursor.execute(sql, (key,)) - - # collect all matches - records = self.cursor.fetchall() - for record in records: - matches.append((record[SQLDatabase.FIELD_SONG_ID], record[SQLDatabase.FIELD_OFFSET])) - - except mysql.Error, e: - print "Error in query(), %d: %s" % (e.args[0], e.args[1]) - - return matches + with self.cursor() as cur: + cur.execute(query) + for row in cur: + yield (row[self.FIELD_SONG_ID], row[self.FIELD_OFFSET]) def get_iterable_kv_pairs(self): """ - Returns all tuples in database. + Returns all tuples in database. """ return self.query(None) def insert_hashes(self, hashes): """ - Insert series of hash => song_id, offset - values into the database. + Insert series of hash => song_id, offset + values into the database. """ - for h in hashes: - sha1, val = h - self.insert(sha1, val) - self.connection.commit() + # TODO: Fix this when hashes will be a new format. + values = [] + for hash, (sid, offset) in hashes: + values.append((hash, sid, offset)) + + with self.cursor() as cur: + cur.executemany(self.INSERT_FINGERPRINT, values) def return_matches(self, hashes): """ - Return the (song_id, offset_diff) tuples associated with - a list of + Return the (song_id, offset_diff) tuples associated with + a list of sha1 => (None, sample_offset) values. """ - matches = [] - for h in hashes: - sha1, val = h - list_of_tups = self.query(sha1) - if list_of_tups: - for t in list_of_tups: - # (song_id, db_offset, song_sampled_offset) - matches.append((t[0], t[1] - val[1])) - return matches + from pymysql.cursors import Cursor + # Create a dictionary of hash => offset pairs for later lookups + mapper = {} + for hash, (_, offset) in hashes: + mapper[hash.upper()] = offset + + # Get an iteratable of all the hashes we need + values = mapper.keys() + + with self.cursor(cursor_type=Cursor) as cur: + for split_values in grouper(values, 1000): + # Create our IN part of the query + query = self.SELECT_MULTIPLE + query = query % ', '.join(['UNHEX(%s)'] * len(split_values)) + + cur.execute(query, split_values) + + for hash, sid, offset in cur: + # (sid, db_offset - song_sampled_offset) + yield (sid, offset - mapper[hash]) + + +from itertools import izip_longest +def grouper(iterable, n, fillvalue=None): + args = [iter(iterable)] * n + return izip_longest(fillvalue=fillvalue, *args) From 3bc507d8e42f84e395209fbd6d7504e8ef799080 Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 17 Dec 2013 00:48:49 +0000 Subject: [PATCH 04/27] Changed insert_hashes to fit the new data format --- dejavu/database.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) mode change 100644 => 100755 dejavu/database.py diff --git a/dejavu/database.py b/dejavu/database.py old mode 100644 new mode 100755 index 018ddd4..78b1237 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -260,14 +260,13 @@ class SQLDatabase(Database): """ return self.query(None) - def insert_hashes(self, hashes): + def insert_hashes(self, sid, hashes): """ Insert series of hash => song_id, offset values into the database. """ - # TODO: Fix this when hashes will be a new format. values = [] - for hash, (sid, offset) in hashes: + for hash, offset in hashes: values.append((hash, sid, offset)) with self.cursor() as cur: From ab2cf9d58b1c668f0861bf076f6582c9a3141c3a Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 17 Dec 2013 00:48:49 +0000 Subject: [PATCH 05/27] Changed insert_hashes to fit the new data format --- dejavu/database.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) mode change 100644 => 100755 dejavu/database.py diff --git a/dejavu/database.py b/dejavu/database.py old mode 100644 new mode 100755 index 018ddd4..78b1237 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -260,14 +260,13 @@ class SQLDatabase(Database): """ return self.query(None) - def insert_hashes(self, hashes): + def insert_hashes(self, sid, hashes): """ Insert series of hash => song_id, offset values into the database. """ - # TODO: Fix this when hashes will be a new format. values = [] - for hash, (sid, offset) in hashes: + for hash, offset in hashes: values.append((hash, sid, offset)) with self.cursor() as cur: From bed11f3de7b0020f91fcf08458bff3987fdc9f81 Mon Sep 17 00:00:00 2001 From: Wessie Date: Tue, 17 Dec 2013 02:00:05 +0100 Subject: [PATCH 06/27] Switched back to MySQLdb for better support of executemany. --- dejavu/cursor.py | 10 +++++----- dejavu/database.py | 11 ++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/dejavu/cursor.py b/dejavu/cursor.py index 1b46b52..07649c6 100644 --- a/dejavu/cursor.py +++ b/dejavu/cursor.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals from __future__ import absolute_import import Queue -import pymysql -import pymysql.cursors +import MySQLdb as mysql +import MySQLdb.cursors def cursor_factory(**factory_options): @@ -26,18 +26,18 @@ class Cursor(object): """ _cache = Queue.Queue(maxsize=5) - def __init__(self, cursor_type=pymysql.cursors.DictCursor, **options): + def __init__(self, cursor_type=mysql.cursors.DictCursor, **options): super(Cursor, self).__init__() try: conn = self._cache.get_nowait() except Queue.Empty: - conn = pymysql.connect(**options) + conn = mysql.connect(**options) self.conn = conn self.cursor_type = cursor_type - def __enter__(self): + def __enter__(slf): self.cursor = self.conn.cursor(self.cursor_type) return self.cursor diff --git a/dejavu/database.py b/dejavu/database.py index 78b1237..a08bfcd 100755 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -1,6 +1,8 @@ from __future__ import absolute_import from binascii import unhexlify +from MySQLdb.cursors import Cursor + class Database(object): def __init__(self): super(Database, self).__init__() @@ -274,14 +276,13 @@ class SQLDatabase(Database): def return_matches(self, hashes): """ - Return the (song_id, offset_diff) tuples associated with - a list of + Return the (song_id, offset_diff) tuples associated with + a list of - sha1 => (None, sample_offset) + sha1 => (None, sample_offset) - values. + values. """ - from pymysql.cursors import Cursor # Create a dictionary of hash => offset pairs for later lookups mapper = {} for hash, (_, offset) in hashes: From 25bf97e813fc28001e0a45d4768608f81d036bb2 Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 17 Dec 2013 16:37:06 +0000 Subject: [PATCH 07/27] Changed a method to fit the new hash tuples --- dejavu/database.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/dejavu/database.py b/dejavu/database.py index a08bfcd..5d0222f 100755 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -277,15 +277,11 @@ class SQLDatabase(Database): def return_matches(self, hashes): """ Return the (song_id, offset_diff) tuples associated with - a list of - - sha1 => (None, sample_offset) - - values. + a list of (sha1, sample_offset) values. """ # Create a dictionary of hash => offset pairs for later lookups mapper = {} - for hash, (_, offset) in hashes: + for hash, offset in hashes: mapper[hash.upper()] = offset # Get an iteratable of all the hashes we need From f02ab9419255b4d7dcf6705f0c77f1e303327d57 Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 17 Dec 2013 20:18:55 +0000 Subject: [PATCH 08/27] Cleaned up Dejavu class calls (find_matches, align_matches) --- dejavu/__init__.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/dejavu/__init__.py b/dejavu/__init__.py index b44897d..33414f7 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -118,11 +118,22 @@ class Dejavu(): channels.append(frames[:, channel]) return (channels, Fs) - def match(self, samples, Fs=fingerprint.DEFAULT_FS): + def fingerprint(self, filepath, song_name=None): + # TODO: replace with something that handles all audio formats + channels, Fs = self.extract_channels(path) + if not song_name: + song_name = os.path.basename(filename).split(".")[0] + song_id = self.db.insert_song(song_name) + + for data in channels: + hashes = fingerprint.fingerprint(data, Fs=Fs) + self.db.insert_hashes(song_id, hashes) + + def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS): hashes = fingerprint.fingerprint(samples, Fs=Fs) return self.db.return_matches(hashes) - def align_matches(self, matches, starttime, record_seconds=None): + def align_matches(self, matches): """ Finds hash matches that align in time with other matches and finds consensus about which hashes are "true" signal from the audio. @@ -156,8 +167,6 @@ class Dejavu(): songname = song.get(SQLDatabase.FIELD_SONGNAME, None) else: return None - songname = songname.replace("_", " ") - elapsed = time.time() - starttime if DEBUG: print("Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed)) @@ -166,11 +175,7 @@ class Dejavu(): song = { "song_id" : song_id, "song_name" : songname, - "match_time" : elapsed, "confidence" : largest_count } - - if record_seconds: - song['record_time'] = record_seconds return song \ No newline at end of file From 6f4cadafbb4780eec6ee78527a38427a968e5be9 Mon Sep 17 00:00:00 2001 From: Wessie Date: Tue, 17 Dec 2013 21:55:05 +0100 Subject: [PATCH 09/27] Added a foreign key relationship to the create table statements. - MySQL will also use the InnoDB engine now. - Added a ping call in the MySQL Cursor cache mechanism - Added a rollback call when a MySQLError occurs - Removed 'delete_orphans' which is not needed anymore due to foreign key constraints and delete on cascade - Changed SQLDatabase to accept options to create a cursor factory, instead of taking a pre-created cursor factory --- dejavu/cursor.py | 11 ++++++++-- dejavu/database.py | 52 ++++++++++++++++++++++++---------------------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/dejavu/cursor.py b/dejavu/cursor.py index 07649c6..50146f2 100644 --- a/dejavu/cursor.py +++ b/dejavu/cursor.py @@ -33,15 +33,22 @@ class Cursor(object): conn = self._cache.get_nowait() except Queue.Empty: conn = mysql.connect(**options) + else: + # Ping the connection before using it from the cache. + conn.ping(True) self.conn = conn self.cursor_type = cursor_type - def __enter__(slf): + def __enter__(self): self.cursor = self.conn.cursor(self.cursor_type) return self.cursor - def __exit__(self, type, value, traceback): + def __exit__(self, extype, exvalue, traceback): + # if we had a MySQL related error we try to rollback the cursor. + if extype is mysql.MySQLError: + self.cursor.rollback() + self.cursor.close() self.conn.commit() diff --git a/dejavu/database.py b/dejavu/database.py index a08bfcd..082484a 100755 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -1,8 +1,10 @@ from __future__ import absolute_import -from binascii import unhexlify +from itertools import izip_longest +from dejavu.cursor import cursor_factory from MySQLdb.cursors import Cursor + class Database(object): def __init__(self): super(Database, self).__init__() @@ -69,11 +71,13 @@ class SQLDatabase(Database): `%s` mediumint unsigned not null, `%s` int unsigned not null, INDEX(%s), - UNIQUE(%s, %s, %s) - );""" % ( + UNIQUE(%s, %s, %s), + FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE + ) ENGINE=INNODB;""" % ( FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, + FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID ) CREATE_SONGS_TABLE = """ @@ -83,7 +87,7 @@ class SQLDatabase(Database): `%s` tinyint default 0, PRIMARY KEY (`%s`), UNIQUE KEY `%s` (`%s`) - );""" % ( + ) ENGINE=INNODB;""" % ( SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED, FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID, ) @@ -141,18 +145,17 @@ class SQLDatabase(Database): DELETE FROM %s WHERE %s = 0; """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED) - DELETE_ORPHANS = """ - delete from fingerprints - where not exists ( - select * from songs where fingerprints.song_id = songs.song_id - ); - """ - - def __init__(self, cursor): + def __init__(self, **options): super(SQLDatabase, self).__init__() - self.cursor = cursor + self.cursor = cursor_factory(**options) def setup(self): + """ + Creates any non-existing tables required for dejavu to function. + + This also removes all songs that have been added but have no + fingerprints associated with them. + """ with self.cursor() as cur: cur.execute(self.CREATE_FINGERPRINTS_TABLE) cur.execute(self.CREATE_SONGS_TABLE) @@ -160,7 +163,11 @@ class SQLDatabase(Database): def empty(self): """ - Drops all tables and re-adds them. Be carfeul with this! + Drops tables created by dejavu and then creates them again + by calling `SQLDatabase.setup`. + + .. warning: + This will result in a loss of data """ with self.cursor() as cur: cur.execute(self.DROP_FINGERPRINTS) @@ -168,16 +175,11 @@ class SQLDatabase(Database): self.setup() - def delete_orphans(self): - # TODO: SQLDatabase.DELETE_ORPHANS is not - # performant enough, need better query to - # delete fingerprints for which no song is tied to. - - # with self.cursor() as cur: - # cur.execute(self.DELETE_ORPHANS) - pass def delete_unfingerprinted_songs(self): + """ + Removes all songs that have no fingerprints associated with them. + """ with self.cursor() as cur: cur.execute(self.DELETE_UNFINGERPRINTED) @@ -188,8 +190,9 @@ class SQLDatabase(Database): with self.cursor() as cur: cur.execute(self.SELECT_UNIQUE_SONG_IDS) - for row in cur: - return row['n'] + for count, in cur: + return count + return 0 def get_num_fingerprints(self): """ @@ -304,7 +307,6 @@ class SQLDatabase(Database): yield (sid, offset - mapper[hash]) -from itertools import izip_longest def grouper(iterable, n, fillvalue=None): args = [iter(iterable)] * n return izip_longest(fillvalue=fillvalue, *args) From 371742a3140aeda6cba66c92e0962ccef2e4568f Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 17 Dec 2013 20:55:20 +0000 Subject: [PATCH 10/27] Began moving recognizer functionality into separate classes --- dejavu/recognize.py | 66 ++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 18 deletions(-) mode change 100644 => 100755 dejavu/recognize.py diff --git a/dejavu/recognize.py b/dejavu/recognize.py old mode 100644 new mode 100755 index bcb5d3c..f6a5de5 --- a/dejavu/recognize.py +++ b/dejavu/recognize.py @@ -1,5 +1,7 @@ from multiprocessing import Queue, Process from dejavu.database import SQLDatabase +import dejavu.fingerprint +from dejavu import Dejavu from scipy.io import wavfile import wave import numpy as np @@ -8,6 +10,52 @@ import sys import time import array + +class BaseRecognizer(object): + + def __init__(self, dejavu): + self.dejavu = dejavu + self.Fs = dejavu.fingerprint.DEFAULT_FS + + def recognize(self, *data): + matches = [] + for d in data: + matches.extend(self.dejavu.find_matches(data, Fs=self.Fs)) + return self.dejavu.align_matches(matches) + + +class WaveFileRecognizer(BaseRecognizer): + + def __init__(self, dejavu): + super(BaseRecognizer, self).__init__(dejavu) + + def recognize_file(self, filepath): + Fs, frames = wavfile.read(filename) + self.Fs = Fs + + wave_object = wave.open(filename) + nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams() + + channels = [] + for channel in range(nchannels): + channels.append(frames[:, channel]) + + t = time.time() + match = self.recognize(*channels) + t = time.time() - t + + if match: + match['match_time'] = t + + return match + + +class MicrophoneRecognizer(BaseRecognizer): + pass + + + + class Recognizer(object): CHUNK = 8192 # 44100 is a multiple of 1225 @@ -20,24 +68,6 @@ class Recognizer(object): self.fingerprinter = fingerprinter self.config = config self.audio = pyaudio.PyAudio() - - def read(self, filename, verbose=False): - - # read file into channels - channels = [] - Fs, frames = wavfile.read(filename) - wave_object = wave.open(filename) - nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams() - for channel in range(nchannels): - channels.append(frames[:, channel]) - - # get matches - starttime = time.time() - matches = [] - for channel in channels: - matches.extend(self.fingerprinter.match(channel)) - - return self.fingerprinter.align_matches(matches, starttime, verbose=verbose) def listen(self, seconds=10, verbose=False): From 05adf3bc318bcef7a5472ee3ac2131368d943d78 Mon Sep 17 00:00:00 2001 From: Wessie Date: Tue, 17 Dec 2013 22:02:59 +0100 Subject: [PATCH 11/27] Changed default cursor type for mysql from DictCursor to Cursor. --- dejavu/cursor.py | 2 +- dejavu/database.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dejavu/cursor.py b/dejavu/cursor.py index 50146f2..55429a7 100644 --- a/dejavu/cursor.py +++ b/dejavu/cursor.py @@ -26,7 +26,7 @@ class Cursor(object): """ _cache = Queue.Queue(maxsize=5) - def __init__(self, cursor_type=mysql.cursors.DictCursor, **options): + def __init__(self, cursor_type=mysql.cursors.Cursor, **options): super(Cursor, self).__init__() try: diff --git a/dejavu/database.py b/dejavu/database.py index 082484a..a43b68a 100755 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -2,7 +2,7 @@ from __future__ import absolute_import from itertools import izip_longest from dejavu.cursor import cursor_factory -from MySQLdb.cursors import Cursor +from MySQLdb.cursors import DictCursor class Database(object): @@ -175,7 +175,6 @@ class SQLDatabase(Database): self.setup() - def delete_unfingerprinted_songs(self): """ Removes all songs that have no fingerprints associated with them. @@ -201,8 +200,9 @@ class SQLDatabase(Database): with self.cursor() as cur: cur.execute(self.SELECT_NUM_FINGERPRINTS) - for row in cur: - return row['n'] + for count, in cur: + return count + return 0 def set_song_fingerprinted(self, sid): """ @@ -216,7 +216,7 @@ class SQLDatabase(Database): """ Return songs that have the fingerprinted flag set TRUE (1). """ - with self.cursor() as cur: + with self.cursor(cursor_type=DictCursor) as cur: cur.execute(self.SELECT_SONGS) for row in cur: yield row @@ -225,7 +225,7 @@ class SQLDatabase(Database): """ Returns song by its ID. """ - with self.cursor() as cur: + with self.cursor(cursor_type=DictCursor) as cur: cur.execute(self.SELECT_SONG, (sid,)) return cur.fetchone() @@ -256,8 +256,8 @@ class SQLDatabase(Database): with self.cursor() as cur: cur.execute(query) - for row in cur: - yield (row[self.FIELD_SONG_ID], row[self.FIELD_OFFSET]) + for sid, offset in cur: + yield (sid, offset) def get_iterable_kv_pairs(self): """ @@ -294,7 +294,7 @@ class SQLDatabase(Database): # Get an iteratable of all the hashes we need values = mapper.keys() - with self.cursor(cursor_type=Cursor) as cur: + with self.cursor() as cur: for split_values in grouper(values, 1000): # Create our IN part of the query query = self.SELECT_MULTIPLE From d25c564d4f82f8e3c7c8fa023d05f3b9fd3ed791 Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 17 Dec 2013 21:57:59 +0000 Subject: [PATCH 12/27] Finished the changes to the recognize module - Recognizers now use a class structure - Generic code for matching is in BaseRecognizer - Two recognizers are available: - Wave file recognizer - Recording recognizer --- dejavu/recognize.py | 121 ++++++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 49 deletions(-) diff --git a/dejavu/recognize.py b/dejavu/recognize.py index f6a5de5..6cc1bd4 100755 --- a/dejavu/recognize.py +++ b/dejavu/recognize.py @@ -10,6 +10,10 @@ import sys import time import array +CHUNK = 8192 # 44100 is a multiple of 1225 +FORMAT = pyaudio.paInt16 +CHANNELS = 2 +RATE = 44100 class BaseRecognizer(object): @@ -17,19 +21,25 @@ class BaseRecognizer(object): self.dejavu = dejavu self.Fs = dejavu.fingerprint.DEFAULT_FS - def recognize(self, *data): + def _recognize(self, *data): matches = [] for d in data: matches.extend(self.dejavu.find_matches(data, Fs=self.Fs)) return self.dejavu.align_matches(matches) + + def recognize(self): + pass # base class does nothing + + class WaveFileRecognizer(BaseRecognizer): - def __init__(self, dejavu): + def __init__(self, dejavu, filename=None): super(BaseRecognizer, self).__init__(dejavu) + self.filename = filename - def recognize_file(self, filepath): + def recognize_file(self, filename): Fs, frames = wavfile.read(filename) self.Fs = Fs @@ -41,62 +51,75 @@ class WaveFileRecognizer(BaseRecognizer): channels.append(frames[:, channel]) t = time.time() - match = self.recognize(*channels) + match = self._recognize(*channels) t = time.time() - t if match: match['match_time'] = t return match + + def recognize(self): + return self.recognize_file(self.filename) class MicrophoneRecognizer(BaseRecognizer): - pass - - - -class Recognizer(object): - - CHUNK = 8192 # 44100 is a multiple of 1225 - FORMAT = pyaudio.paInt16 - CHANNELS = 2 - RATE = 44100 - - def __init__(self, fingerprinter, config): - - self.fingerprinter = fingerprinter - self.config = config + def __init__(self, dejavu, seconds=None) + super(BaseRecognizer, self).__init__(dejavu) self.audio = pyaudio.PyAudio() + self.stream = None + self.data = [] + self.channels = CHANNELS + self.chunk_size = CHUNK + self.rate = RATE + self.recorded = False - def listen(self, seconds=10, verbose=False): + def start_recording(self, channels=CHANNELS, rate=RATE, chunk=CHUNK): + self.chunk_size = chunk + self.channels = channels + self.recorded = False + self.rate = rate + + if self.stream: + self.stream.stop_stream() + self.stream.close() + + self.stream = self.audio.open(format=FORMAT, + channels=channels, + rate=rate, + input=True, + frames_per_buffer=chunk) + + self.data = [[] for i in range(channels)] + + def process_recording(self): + data = self.stream.read(self.chunk_size) + nums = np.fromstring(data, np.int16) + for c in range(self.channels): + self.data[c].extend(nums[c::c+1]) + + def stop_recording(self): + self.stream.stop_stream() + self.stream.close() + self.stream = None + self.recorded = True + + def recognize_recording(self): + if not self.recorded: + raise NoRecordingError("Recording was not complete/begun") + return self._recognize(*data) + + def get_recorded_time(self): + return len(self.data[0]) / self.rate + + def recognize(self): + self.start_recording() + for i in range(0, int(self.rate / self.chunk * self.seconds)): + self.process_recording() + self.stop_recording() + return self.recognize_recording() + +class NoRecordingError(Exception): + pass - # open stream - stream = self.audio.open(format=Recognizer.FORMAT, - channels=Recognizer.CHANNELS, - rate=Recognizer.RATE, - input=True, - frames_per_buffer=Recognizer.CHUNK) - - # record - if verbose: print("* recording") - left, right = [], [] - for i in range(0, int(Recognizer.RATE / Recognizer.CHUNK * seconds)): - data = stream.read(Recognizer.CHUNK) - nums = np.fromstring(data, np.int16) - left.extend(nums[1::2]) - right.extend(nums[0::2]) - if verbose: print("* done recording") - - # close and stop the stream - stream.stop_stream() - stream.close() - - # match both channels - starttime = time.time() - matches = [] - matches.extend(self.fingerprinter.match(left)) - matches.extend(self.fingerprinter.match(right)) - - # align and return - return self.fingerprinter.align_matches(matches, starttime, record_seconds=seconds, verbose=verbose) \ No newline at end of file From 788b3acebfd202b23d15bf4a1f984d0b680897d7 Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 17 Dec 2013 22:03:50 +0000 Subject: [PATCH 13/27] Added missing 'self' --- dejavu/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dejavu/recognize.py b/dejavu/recognize.py index 6cc1bd4..d743b16 100755 --- a/dejavu/recognize.py +++ b/dejavu/recognize.py @@ -108,7 +108,7 @@ class MicrophoneRecognizer(BaseRecognizer): def recognize_recording(self): if not self.recorded: raise NoRecordingError("Recording was not complete/begun") - return self._recognize(*data) + return self._recognize(*self.data) def get_recorded_time(self): return len(self.data[0]) / self.rate From 614ad0d4cc1a2d92d3540aa73e39435e07638afc Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 17 Dec 2013 22:05:09 +0000 Subject: [PATCH 14/27] Moved mic recording constants inside MicrophoneRecognizer --- dejavu/recognize.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dejavu/recognize.py b/dejavu/recognize.py index d743b16..024e62f 100755 --- a/dejavu/recognize.py +++ b/dejavu/recognize.py @@ -10,10 +10,6 @@ import sys import time import array -CHUNK = 8192 # 44100 is a multiple of 1225 -FORMAT = pyaudio.paInt16 -CHANNELS = 2 -RATE = 44100 class BaseRecognizer(object): @@ -65,6 +61,11 @@ class WaveFileRecognizer(BaseRecognizer): class MicrophoneRecognizer(BaseRecognizer): + CHUNK = 8192 # 44100 is a multiple of 1225 + FORMAT = pyaudio.paInt16 + CHANNELS = 2 + RATE = 44100 + def __init__(self, dejavu, seconds=None) super(BaseRecognizer, self).__init__(dejavu) self.audio = pyaudio.PyAudio() From 1fcff7e2c5f68d0c672e1610ac9290459387d80e Mon Sep 17 00:00:00 2001 From: Vin Date: Tue, 17 Dec 2013 22:12:21 +0000 Subject: [PATCH 15/27] Fixed some import/class mistakes --- dejavu/__init__.py | 6 +++--- dejavu/recognize.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dejavu/__init__.py b/dejavu/__init__.py index 33414f7..340db94 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -1,6 +1,6 @@ from dejavu.database import SQLDatabase from dejavu.convert import Converter -import dejavu.fingerprint as fingerprint +import fingerprint from scipy.io import wavfile from multiprocessing import Process import wave, os @@ -44,7 +44,7 @@ class Dejavu(): """ return [lst[i::n] for i in xrange(n)] - def fingerprint(self, path, output, extensions, nprocesses): + def do_fingerprint(self, path, output, extensions, nprocesses): # convert files, shuffle order files = self.converter.find_files(path, extensions) @@ -118,7 +118,7 @@ class Dejavu(): channels.append(frames[:, channel]) return (channels, Fs) - def fingerprint(self, filepath, song_name=None): + def fingerprint_file(self, filepath, song_name=None): # TODO: replace with something that handles all audio formats channels, Fs = self.extract_channels(path) if not song_name: diff --git a/dejavu/recognize.py b/dejavu/recognize.py index 024e62f..68700fd 100755 --- a/dejavu/recognize.py +++ b/dejavu/recognize.py @@ -32,7 +32,7 @@ class BaseRecognizer(object): class WaveFileRecognizer(BaseRecognizer): def __init__(self, dejavu, filename=None): - super(BaseRecognizer, self).__init__(dejavu) + super(WaveFileRecognizer, self).__init__(dejavu) self.filename = filename def recognize_file(self, filename): @@ -66,8 +66,8 @@ class MicrophoneRecognizer(BaseRecognizer): CHANNELS = 2 RATE = 44100 - def __init__(self, dejavu, seconds=None) - super(BaseRecognizer, self).__init__(dejavu) + def __init__(self, dejavu, seconds=None): + super(MicrophoneRecognizer, self).__init__(dejavu) self.audio = pyaudio.PyAudio() self.stream = None self.data = [] From 3b72768f942b8cbd4659649e85bd50f6bc346ac3 Mon Sep 17 00:00:00 2001 From: Wessie Date: Wed, 18 Dec 2013 00:31:57 +0100 Subject: [PATCH 16/27] Fixed various small things that weren't caught before. - Fixes SQL queries for table creations - Table creation is now down in reverse order to accompany the foreign key - Fixed a typo in the BaseRecognizer that caused it to not work - Changed configuration passed to Dejavu into a (nested) dictionary --- dejavu/__init__.py | 72 +++++++++++++++++++-------------------------- dejavu/database.py | 4 +-- dejavu/recognize.py | 57 +++++++++++++++++------------------ 3 files changed, 59 insertions(+), 74 deletions(-) diff --git a/dejavu/__init__.py b/dejavu/__init__.py index 340db94..3222395 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -8,20 +8,14 @@ import random DEBUG = False -class Dejavu(): - +class Dejavu(object): def __init__(self, config): - + self.config = config - + # initialize db - database = SQLDatabase( - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE)) - self.db = database - + self.db = SQLDatabase(**config.get("database", {})) + # create components self.converter = Converter() #self.fingerprinter = Fingerprinter(self.config) @@ -30,16 +24,16 @@ class Dejavu(): # get songs previously indexed self.songs = self.db.get_songs() self.songnames_set = set() # to know which ones we've computed before - if self.songs: - for song in self.songs: - song_id = song[SQLDatabase.FIELD_SONG_ID] - song_name = song[SQLDatabase.FIELD_SONGNAME] - self.songnames_set.add(song_name) - print "Added: %s to the set of fingerprinted songs..." % song_name + + for song in self.songs: + song_name = song[self.db.FIELD_SONGNAME] + + self.songnames_set.add(song_name) + print "Added: %s to the set of fingerprinted songs..." % song_name def chunkify(self, lst, n): """ - Splits a list into roughly n equal parts. + Splits a list into roughly n equal parts. http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts """ return [lst[i::n] for i in xrange(n)] @@ -55,25 +49,19 @@ class Dejavu(): processes = [] for i in range(nprocesses): - # need database instance since mysql connections shouldn't be shared across processes - sql_connection = SQLDatabase( - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_HOSTNAME), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_USERNAME), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_PASSWORD), - self.config.get(SQLDatabase.CONNECTION, SQLDatabase.KEY_DATABASE)) - # create process and start it - p = Process(target=self.fingerprint_worker, args=(files_split[i], sql_connection, output)) + p = Process(target=self.fingerprint_worker, + args=(files_split[i], self.db, output)) p.start() processes.append(p) # wait for all processes to complete for p in processes: p.join() - + # delete orphans # print "Done fingerprinting. Deleting orphaned fingerprints..." - # TODO: need a more performant query in database.py for the + # TODO: need a more performant query in database.py for the #self.fingerprinter.db.delete_orphans() def fingerprint_worker(self, files, sql_connection, output): @@ -82,7 +70,7 @@ class Dejavu(): # if there are already fingerprints in database, don't re-fingerprint or convert song_name = os.path.basename(filename).split(".")[0] - if DEBUG and song_name in self.songnames_set: + if DEBUG and song_name in self.songnames_set: print("-> Already fingerprinted, continuing...") continue @@ -117,27 +105,27 @@ class Dejavu(): for channel in range(nchannels): channels.append(frames[:, channel]) return (channels, Fs) - + def fingerprint_file(self, filepath, song_name=None): # TODO: replace with something that handles all audio formats channels, Fs = self.extract_channels(path) if not song_name: song_name = os.path.basename(filename).split(".")[0] song_id = self.db.insert_song(song_name) - + for data in channels: hashes = fingerprint.fingerprint(data, Fs=Fs) self.db.insert_hashes(song_id, hashes) - + def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS): hashes = fingerprint.fingerprint(samples, Fs=Fs) return self.db.return_matches(hashes) - + def align_matches(self, matches): """ Finds hash matches that align in time with other matches and finds consensus about which hashes are "true" signal from the audio. - + Returns a dictionary with match information. """ # align by diffs @@ -158,24 +146,24 @@ class Dejavu(): largest_count = diff_counter[diff][sid] song_id = sid - if DEBUG: + if DEBUG: print("Diff is %d with %d offset-aligned matches" % (largest, largest_count)) - - # extract idenfication + + # extract idenfication song = self.db.get_song_by_id(song_id) if song: songname = song.get(SQLDatabase.FIELD_SONGNAME, None) else: return None - - if DEBUG: + + if DEBUG: print("Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed)) - + # return match info song = { "song_id" : song_id, "song_name" : songname, "confidence" : largest_count } - - return song \ No newline at end of file + + return song diff --git a/dejavu/database.py b/dejavu/database.py index 929c4eb..75af1bf 100755 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -70,7 +70,7 @@ class SQLDatabase(Database): `%s` binary(10) not null, `%s` mediumint unsigned not null, `%s` int unsigned not null, - INDEX(%s), + PRIMARY KEY(%s), UNIQUE(%s, %s, %s), FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE ) ENGINE=INNODB;""" % ( @@ -157,8 +157,8 @@ class SQLDatabase(Database): fingerprints associated with them. """ with self.cursor() as cur: - cur.execute(self.CREATE_FINGERPRINTS_TABLE) cur.execute(self.CREATE_SONGS_TABLE) + cur.execute(self.CREATE_FINGERPRINTS_TABLE) cur.execute(self.DELETE_UNFINGERPRINTED) def empty(self): diff --git a/dejavu/recognize.py b/dejavu/recognize.py index 68700fd..9283c6a 100755 --- a/dejavu/recognize.py +++ b/dejavu/recognize.py @@ -1,6 +1,6 @@ from multiprocessing import Queue, Process from dejavu.database import SQLDatabase -import dejavu.fingerprint +import dejavu.fingerprint as fingerprint from dejavu import Dejavu from scipy.io import wavfile import wave @@ -12,60 +12,57 @@ import array class BaseRecognizer(object): - + def __init__(self, dejavu): self.dejavu = dejavu - self.Fs = dejavu.fingerprint.DEFAULT_FS - + self.Fs = fingerprint.DEFAULT_FS + def _recognize(self, *data): matches = [] for d in data: - matches.extend(self.dejavu.find_matches(data, Fs=self.Fs)) + matches.extend(self.dejavu.find_matches(d, Fs=self.Fs)) return self.dejavu.align_matches(matches) - - def recognize(self): - pass # base class does nothing - - + def recognize(self): + pass # base class does nothing class WaveFileRecognizer(BaseRecognizer): - + def __init__(self, dejavu, filename=None): super(WaveFileRecognizer, self).__init__(dejavu) self.filename = filename - + def recognize_file(self, filename): Fs, frames = wavfile.read(filename) self.Fs = Fs - + wave_object = wave.open(filename) nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams() - + channels = [] for channel in range(nchannels): channels.append(frames[:, channel]) - + t = time.time() match = self._recognize(*channels) t = time.time() - t - + if match: match['match_time'] = t - + return match - + def recognize(self): return self.recognize_file(self.filename) class MicrophoneRecognizer(BaseRecognizer): - + CHUNK = 8192 # 44100 is a multiple of 1225 FORMAT = pyaudio.paInt16 CHANNELS = 2 RATE = 44100 - + def __init__(self, dejavu, seconds=None): super(MicrophoneRecognizer, self).__init__(dejavu) self.audio = pyaudio.PyAudio() @@ -75,52 +72,52 @@ class MicrophoneRecognizer(BaseRecognizer): self.chunk_size = CHUNK self.rate = RATE self.recorded = False - + def start_recording(self, channels=CHANNELS, rate=RATE, chunk=CHUNK): self.chunk_size = chunk self.channels = channels self.recorded = False self.rate = rate - + if self.stream: self.stream.stop_stream() self.stream.close() - + self.stream = self.audio.open(format=FORMAT, channels=channels, rate=rate, input=True, frames_per_buffer=chunk) - + self.data = [[] for i in range(channels)] - + def process_recording(self): data = self.stream.read(self.chunk_size) nums = np.fromstring(data, np.int16) for c in range(self.channels): self.data[c].extend(nums[c::c+1]) - + def stop_recording(self): self.stream.stop_stream() self.stream.close() self.stream = None self.recorded = True - + def recognize_recording(self): if not self.recorded: raise NoRecordingError("Recording was not complete/begun") return self._recognize(*self.data) - + def get_recorded_time(self): return len(self.data[0]) / self.rate - + def recognize(self): self.start_recording() for i in range(0, int(self.rate / self.chunk * self.seconds)): self.process_recording() self.stop_recording() return self.recognize_recording() - + class NoRecordingError(Exception): pass From 7122e110d1451fa74d235eae3cc60ebad522fcee Mon Sep 17 00:00:00 2001 From: Wessie Date: Wed, 18 Dec 2013 18:02:07 +0100 Subject: [PATCH 17/27] Cleaned up convert.py and renamed it to decode.py - 'Converter' class removed - 'ensure_folder' removed - 'find_files' changed into a generator and made it work with spaces - 'convert' renamed into 'read' - 'convert' now handles any supported file by pydub - 'convert' now returns the data instead of saving it to a file (same format as 'extract_channels') - generic cleanup of formatting in the file --- dejavu/convert.py | 54 ----------------------------------------------- dejavu/decode.py | 27 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 54 deletions(-) delete mode 100644 dejavu/convert.py create mode 100644 dejavu/decode.py diff --git a/dejavu/convert.py b/dejavu/convert.py deleted file mode 100644 index 77afcef..0000000 --- a/dejavu/convert.py +++ /dev/null @@ -1,54 +0,0 @@ -import os, fnmatch -from pydub import AudioSegment - -class Converter(): - - WAV = "wav" - MP3 = "mp3" - FORMATS = [ - WAV, - MP3] - - def __init__(self): - pass - - def ensure_folder(self, extension): - if not os.path.exists(extension): - os.makedirs(extension) - - def find_files(self, path, extensions): - filepaths = [] - extensions = [e.replace(".", "") for e in extensions if e.replace(".", "") in Converter.FORMATS] - print "Supported formats: %s" % extensions - for dirpath, dirnames, files in os.walk(path) : - for extension in extensions: - for f in fnmatch.filter(files, "*.%s" % extension): - p = os.path.join(dirpath, f) - renamed = p.replace(" ", "_") - os.rename(p, renamed) - #print "Found file: %s with extension %s" % (renamed, extension) - filepaths.append((renamed, extension)) - return filepaths - - def convert(self, orig_path, from_format, to_format, output_folder, song_name): - - # start conversion - self.ensure_folder(output_folder) - print "-> Now converting: %s from %s format to %s format..." % (song_name, from_format, to_format) - - # MP3 --> WAV - if from_format == Converter.MP3 and to_format == Converter.WAV: - - newpath = os.path.join(output_folder, "%s.%s" % (song_name, Converter.WAV)) - if os.path.isfile(newpath): - print "-> Already converted, skipping..." - else: - mp3file = AudioSegment.from_mp3(orig_path) - mp3file.export(newpath, format=Converter.WAV) - - # unsupported - else: - print "CONVERSION ERROR:\nThe conversion from %s to %s is not supported!" % (from_format, to_format) - - print "-> Conversion complete." - return newpath diff --git a/dejavu/decode.py b/dejavu/decode.py new file mode 100644 index 0000000..0304b08 --- /dev/null +++ b/dejavu/decode.py @@ -0,0 +1,27 @@ +import os +import fnmatch +import numpy as np +from pydub import AudioSegment + + +def find_files(path, extensions): + # Allow both with ".mp3" and without "mp3" to be used for extensions + extensions = [e.replace(".", "") for e in extensions] + + for dirpath, dirnames, files in os.walk(path): + for extension in extensions: + for f in fnmatch.filter(files, "*.%s" % extension): + p = os.path.join(dirpath, f) + yield (p, extension) + + +def read(filename): + audiofile = AudioSegment.from_file(filename) + + data = np.fromstring(audiofile._data, np.int16) + + channels = [] + for chn in xrange(audiofile.channels): + channels.append(data[chn::audiofile.channels]) + + return audiofile.frame_rate, channels From 8a7358d426fb8830d7197ef5922bcd8196952f80 Mon Sep 17 00:00:00 2001 From: Wessie Date: Wed, 18 Dec 2013 18:11:23 +0100 Subject: [PATCH 18/27] Added the fix for issue #13 in the original repository. --- dejavu/decode.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/dejavu/decode.py b/dejavu/decode.py index 0304b08..f578c0e 100644 --- a/dejavu/decode.py +++ b/dejavu/decode.py @@ -15,9 +15,22 @@ def find_files(path, extensions): yield (p, extension) -def read(filename): +def read(filename, limit=None): + """ + Reads any file supported by pydub (ffmpeg) and returns the data contained + within. + + Can be optionally limited to a certain amount of seconds from the start + of the file by specifying the `limit` parameter. This is the amount of + seconds from the start of the file. + + returns: (samplerate, channels) + """ audiofile = AudioSegment.from_file(filename) + if limit: + audiofile = audiofile[:limit * 1000] + data = np.fromstring(audiofile._data, np.int16) channels = [] From 7895bae23eefd4dbe19e6be20a4f8e1f403c337d Mon Sep 17 00:00:00 2001 From: Wessie Date: Wed, 18 Dec 2013 18:15:57 +0100 Subject: [PATCH 19/27] Fixed any references to old converter to use the new functions. - Reversed return values in decode.read --- dejavu/__init__.py | 11 +++-------- dejavu/decode.py | 4 ++-- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/dejavu/__init__.py b/dejavu/__init__.py index 3222395..f9be8bd 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -1,5 +1,5 @@ from dejavu.database import SQLDatabase -from dejavu.convert import Converter +import dejavu.decode as decoder import fingerprint from scipy.io import wavfile from multiprocessing import Process @@ -16,8 +16,6 @@ class Dejavu(object): # initialize db self.db = SQLDatabase(**config.get("database", {})) - # create components - self.converter = Converter() #self.fingerprinter = Fingerprinter(self.config) self.db.setup() @@ -41,7 +39,7 @@ class Dejavu(object): def do_fingerprint(self, path, output, extensions, nprocesses): # convert files, shuffle order - files = self.converter.find_files(path, extensions) + files = decoder.find_files(path, extensions) random.shuffle(files) files_split = self.chunkify(files, nprocesses) @@ -74,14 +72,11 @@ class Dejavu(object): print("-> Already fingerprinted, continuing...") continue - # convert to WAV - wavout_path = self.converter.convert(filename, extension, Converter.WAV, output, song_name) + channels, Fs = decoder.read(filename) # insert song name into database song_id = sql_connection.insert_song(song_name) - # for each channel perform FFT analysis and fingerprinting - channels, Fs = self.extract_channels(wavout_path) for c in range(len(channels)): channel = channels[c] print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name) diff --git a/dejavu/decode.py b/dejavu/decode.py index f578c0e..47193ca 100644 --- a/dejavu/decode.py +++ b/dejavu/decode.py @@ -24,7 +24,7 @@ def read(filename, limit=None): of the file by specifying the `limit` parameter. This is the amount of seconds from the start of the file. - returns: (samplerate, channels) + returns: (channels, samplerate) """ audiofile = AudioSegment.from_file(filename) @@ -37,4 +37,4 @@ def read(filename, limit=None): for chn in xrange(audiofile.channels): channels.append(data[chn::audiofile.channels]) - return audiofile.frame_rate, channels + return channels, audiofile.frame_rate From 29029238fcfc7048d86b8774ac455eac12deea5d Mon Sep 17 00:00:00 2001 From: Wessie Date: Thu, 19 Dec 2013 00:54:17 +0100 Subject: [PATCH 20/27] A fairly big batch of changes, blame me to forget committing Changes in no particular order: - Replaced all use cases of wavfile/wave and extract_channels with the new decoder.read function - Added a 'recognize' method to the Dejavu class. This is a shortcut for recognizing songs. - Renamed 'do_fingerprint' into 'fingerprint_directory' - Removed parameters not required anymore from fingerprint_directory - Cleaned up fingerprint.py - Made fingerprint.generate_hashes a generator - WaveFileRecognizer is now FileRecognizer and can take any formats supported by pydub - Fixed MicrophoneRecognizer to actually run, previous version had many small mistakes - Renamed 'fingerprint_worker' to '_fingerprint_worker' to signify it is not to be used publicly - Moved 'chunkify' outside the Dejavu class - Cleaned up pep8 styling mistakes in all edited files. --- dejavu/__init__.py | 102 ++++++++++------------ dejavu/{decode.py => decoder.py} | 0 dejavu/fingerprint.py | 145 +++++++++---------------------- dejavu/recognize.py | 78 ++++++++--------- 4 files changed, 120 insertions(+), 205 deletions(-) rename dejavu/{decode.py => decoder.py} (100%) diff --git a/dejavu/__init__.py b/dejavu/__init__.py index f9be8bd..c10e54d 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -1,27 +1,25 @@ from dejavu.database import SQLDatabase -import dejavu.decode as decoder +import dejavu.decoder as decoder import fingerprint -from scipy.io import wavfile -from multiprocessing import Process -import wave, os +from multiprocessing import Process, cpu_count +import os import random -DEBUG = False class Dejavu(object): def __init__(self, config): + super(Dejavu, self).__init__() self.config = config # initialize db self.db = SQLDatabase(**config.get("database", {})) - #self.fingerprinter = Fingerprinter(self.config) self.db.setup() # get songs previously indexed self.songs = self.db.get_songs() - self.songnames_set = set() # to know which ones we've computed before + self.songnames_set = set() # to know which ones we've computed before for song in self.songs: song_name = song[self.db.FIELD_SONGNAME] @@ -29,27 +27,27 @@ class Dejavu(object): self.songnames_set.add(song_name) print "Added: %s to the set of fingerprinted songs..." % song_name - def chunkify(self, lst, n): - """ - Splits a list into roughly n equal parts. - http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts - """ - return [lst[i::n] for i in xrange(n)] - - def do_fingerprint(self, path, output, extensions, nprocesses): + def fingerprint_directory(self, path, extensions, nprocesses=None): + # Try to use the maximum amount of processes if not given. + if nprocesses is None: + try: + nprocesses = cpu_count() + except NotImplementedError: + nprocesses = 1 # convert files, shuffle order - files = decoder.find_files(path, extensions) + files = list(decoder.find_files(path, extensions)) random.shuffle(files) - files_split = self.chunkify(files, nprocesses) + + files_split = chunkify(files, nprocesses) # split into processes here processes = [] for i in range(nprocesses): # create process and start it - p = Process(target=self.fingerprint_worker, - args=(files_split[i], self.db, output)) + p = Process(target=self._fingerprint_worker, + args=(files_split[i], self.db)) p.start() processes.append(p) @@ -57,55 +55,37 @@ class Dejavu(object): for p in processes: p.join() - # delete orphans - # print "Done fingerprinting. Deleting orphaned fingerprints..." - # TODO: need a more performant query in database.py for the - #self.fingerprinter.db.delete_orphans() - - def fingerprint_worker(self, files, sql_connection, output): - + def _fingerprint_worker(self, files, db): for filename, extension in files: - # if there are already fingerprints in database, don't re-fingerprint or convert + # if there are already fingerprints in database, + # don't re-fingerprint song_name = os.path.basename(filename).split(".")[0] - if DEBUG and song_name in self.songnames_set: + if song_name in self.songnames_set: print("-> Already fingerprinted, continuing...") continue channels, Fs = decoder.read(filename) # insert song name into database - song_id = sql_connection.insert_song(song_name) + song_id = db.insert_song(song_name) for c in range(len(channels)): channel = channels[c] print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name) + hashes = fingerprint.fingerprint(channel, Fs=Fs) - sql_connection.insert_hashes(song_id, hashes) + + db.insert_hashes(song_id, hashes) # only after done fingerprinting do confirm - sql_connection.set_song_fingerprinted(song_id) - - def extract_channels(self, path): - """ - Reads channels from disk. - Returns a tuple with (channels, sample_rate) - """ - channels = [] - Fs, frames = wavfile.read(path) - wave_object = wave.open(path) - nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams() - #assert Fs == self.fingerprinter.Fs - - for channel in range(nchannels): - channels.append(frames[:, channel]) - return (channels, Fs) + db.set_song_fingerprinted(song_id) def fingerprint_file(self, filepath, song_name=None): - # TODO: replace with something that handles all audio formats - channels, Fs = self.extract_channels(path) + channels, Fs = decoder.read(filepath) + if not song_name: - song_name = os.path.basename(filename).split(".")[0] + song_name = os.path.basename(filepath).split(".")[0] song_id = self.db.insert_song(song_name) for data in channels: @@ -141,8 +121,7 @@ class Dejavu(object): largest_count = diff_counter[diff][sid] song_id = sid - if DEBUG: - print("Diff is %d with %d offset-aligned matches" % (largest, largest_count)) + print("Diff is %d with %d offset-aligned matches" % (largest, largest_count)) # extract idenfication song = self.db.get_song_by_id(song_id) @@ -151,14 +130,23 @@ class Dejavu(object): else: return None - if DEBUG: - print("Song is %s (song ID = %d) identification took %f seconds" % (songname, song_id, elapsed)) - # return match info song = { - "song_id" : song_id, - "song_name" : songname, - "confidence" : largest_count + "song_id": song_id, + "song_name": songname, + "confidence": largest_count } return song + + def recognize(self, recognizer, *options, **kwoptions): + r = recognizer(self) + return r.recognize(*options, **kwoptions) + + +def chunkify(lst, n): + """ + Splits a list into roughly n equal parts. + http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts + """ + return [lst[i::n] for i in xrange(n)] diff --git a/dejavu/decode.py b/dejavu/decoder.py similarity index 100% rename from dejavu/decode.py rename to dejavu/decoder.py diff --git a/dejavu/fingerprint.py b/dejavu/fingerprint.py index 6108799..caf252e 100755 --- a/dejavu/fingerprint.py +++ b/dejavu/fingerprint.py @@ -1,17 +1,11 @@ import numpy as np import matplotlib.mlab as mlab import matplotlib.pyplot as plt -import matplotlib.image as mpimg -from scipy.io import wavfile from scipy.ndimage.filters import maximum_filter -from scipy.ndimage.morphology import generate_binary_structure, iterate_structure, binary_erosion -from dejavu.database import SQLDatabase -import os -import wave -import sys -import time +from scipy.ndimage.morphology import (generate_binary_structure, + iterate_structure, binary_erosion) import hashlib -import pickle + IDX_FREQ_I = 0 IDX_TIME_J = 1 @@ -25,55 +19,58 @@ DEFAULT_AMP_MIN = 10 PEAK_NEIGHBORHOOD_SIZE = 20 MIN_HASH_TIME_DELTA = 0 -def fingerprint(channel_samples, - Fs=DEFAULT_FS, - wsize=DEFAULT_WINDOW_SIZE, - wratio=DEFAULT_OVERLAP_RATIO, - fan_value=DEFAULT_FAN_VALUE, - amp_min=DEFAULT_AMP_MIN): + +def fingerprint(channel_samples, Fs=DEFAULT_FS, + wsize=DEFAULT_WINDOW_SIZE, + wratio=DEFAULT_OVERLAP_RATIO, + fan_value=DEFAULT_FAN_VALUE, + amp_min=DEFAULT_AMP_MIN): """ - FFT the channel, log transform output, find local maxima, then return - locally sensitive hashes. + FFT the channel, log transform output, find local maxima, then return + locally sensitive hashes. """ # FFT the signal and extract frequency components arr2D = mlab.specgram( - channel_samples, - NFFT=wsize, + channel_samples, + NFFT=wsize, Fs=Fs, window=mlab.window_hanning, noverlap=int(wsize * wratio))[0] # apply log transform since specgram() returns linear array arr2D = 10 * np.log10(arr2D) - arr2D[arr2D == -np.inf] = 0 # replace infs with zeros - + arr2D[arr2D == -np.inf] = 0 # replace infs with zeros + # find local maxima local_maxima = get_2D_peaks(arr2D, plot=False, amp_min=amp_min) # return hashes return generate_hashes(local_maxima, fan_value=fan_value) -def get_2D_peaks(arr2D, plot=False, amp_min=DEFAULT_AMP_MIN): +def get_2D_peaks(arr2D, plot=False, amp_min=DEFAULT_AMP_MIN): # http://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.morphology.iterate_structure.html#scipy.ndimage.morphology.iterate_structure struct = generate_binary_structure(2, 1) neighborhood = iterate_structure(struct, PEAK_NEIGHBORHOOD_SIZE) # find local maxima using our fliter shape - local_max = maximum_filter(arr2D, footprint=neighborhood) == arr2D + local_max = maximum_filter(arr2D, footprint=neighborhood) == arr2D background = (arr2D == 0) - eroded_background = binary_erosion(background, structure=neighborhood, border_value=1) - detected_peaks = local_max - eroded_background # this is a boolean mask of arr2D with True at peaks + eroded_background = binary_erosion(background, structure=neighborhood, + border_value=1) + + # Boolean mask of arr2D with True at peaks + detected_peaks = local_max - eroded_background # extract peaks amps = arr2D[detected_peaks] - j, i = np.where(detected_peaks) + j, i = np.where(detected_peaks) # filter peaks amps = amps.flatten() peaks = zip(i, j, amps) - peaks_filtered = [x for x in peaks if x[2] > amp_min] # freq, time, amp - + peaks_filtered = [x for x in peaks if x[2] > amp_min] # freq, time, amp + # get indices for frequency and time frequency_idx = [x[1] for x in peaks_filtered] time_idx = [x[0] for x in peaks_filtered] @@ -85,97 +82,37 @@ def get_2D_peaks(arr2D, plot=False, amp_min=DEFAULT_AMP_MIN): ax.scatter(time_idx, frequency_idx) ax.set_xlabel('Time') ax.set_ylabel('Frequency') - ax.set_title("Spectrogram of \"Blurred Lines\" by Robin Thicke"); + ax.set_title("Spectrogram of \"Blurred Lines\" by Robin Thicke") plt.gca().invert_yaxis() plt.show() return zip(frequency_idx, time_idx) + def generate_hashes(peaks, fan_value=DEFAULT_FAN_VALUE): """ Hash list structure: - sha1-hash[0:20] time_offset + sha1_hash[0:20] time_offset [(e05b341a9b77a51fd26, 32), ... ] """ - fingerprinted = set() # to avoid rehashing same pairs - hashes = [] + fingerprinted = set() # to avoid rehashing same pairs for i in range(len(peaks)): for j in range(fan_value): - if i+j < len(peaks) and not (i, i+j) in fingerprinted: - + if (i + j) < len(peaks) and not (i, i + j) in fingerprinted: freq1 = peaks[i][IDX_FREQ_I] - freq2 = peaks[i+j][IDX_FREQ_I] + freq2 = peaks[i + j][IDX_FREQ_I] + t1 = peaks[i][IDX_TIME_J] - t2 = peaks[i+j][IDX_TIME_J] + t2 = peaks[i + j][IDX_TIME_J] + t_delta = t2 - t1 - + if t_delta >= MIN_HASH_TIME_DELTA: - h = hashlib.sha1("%s|%s|%s" % (str(freq1), str(freq2), str(t_delta))) - hashes.append((h.hexdigest()[0:20], t1)) - + h = hashlib.sha1( + "%s|%s|%s" % (str(freq1), str(freq2), str(t_delta)) + ) + yield (h.hexdigest()[0:20], t1) + # ensure we don't repeat hashing - fingerprinted.add((i, i+j)) - return hashes - -# TODO: move all of the below to a class with DB access - - -class Fingerprinter(): - - - - def __init__(self, config, - Fs=DEFAULT_FS, - wsize=DEFAULT_WINDOW_SIZE, - wratio=DEFAULT_OVERLAP_RATIO, - fan_value=DEFAULT_FAN_VALUE, - amp_min=DEFAULT_AMP_MIN): - - self.config = config - - - self.Fs = Fs - self.dt = 1.0 / self.Fs - self.window_size = wsize - self.window_overlap_ratio = wratio - self.fan_value = fan_value - self.noverlap = int(self.window_size * self.window_overlap_ratio) - self.amp_min = amp_min - - def fingerprint(self, samples, path, sid, cid): - """Used for learning known songs""" - hashes = self.process_channel(samples, song_id=sid) - print "Generated %d hashes" % len(hashes) - self.db.insert_hashes(hashes) - - # TODO: put this in another module - def match(self, samples): - """Used for matching unknown songs""" - hashes = self.process_channel(samples) - matches = self.db.return_matches(hashes) - return matches - - # TODO: this function has nothing to do with fingerprinting. is it needed? - def print_stats(self): - - iterable = self.db.get_iterable_kv_pairs() - - counter = {} - for t in iterable: - sid, toff = t - if not sid in counter: - counter[sid] = 1 - else: - counter[sid] += 1 - - for song_id, count in counter.iteritems(): - song_name = self.song_names[song_id] - print "%s has %d spectrogram peaks" % (song_name, count) - - # this does... what? this seems to only be used for the above function - def set_song_names(self, wpaths): - self.song_names = wpaths - - # TODO: put this in another module - + fingerprinted.add((i, i + j)) diff --git a/dejavu/recognize.py b/dejavu/recognize.py index 9283c6a..a723197 100755 --- a/dejavu/recognize.py +++ b/dejavu/recognize.py @@ -1,14 +1,8 @@ -from multiprocessing import Queue, Process -from dejavu.database import SQLDatabase import dejavu.fingerprint as fingerprint -from dejavu import Dejavu -from scipy.io import wavfile -import wave +import dejavu.decoder as decoder import numpy as np import pyaudio -import sys import time -import array class BaseRecognizer(object): @@ -26,25 +20,17 @@ class BaseRecognizer(object): def recognize(self): pass # base class does nothing -class WaveFileRecognizer(BaseRecognizer): - def __init__(self, dejavu, filename=None): - super(WaveFileRecognizer, self).__init__(dejavu) - self.filename = filename +class FileRecognizer(BaseRecognizer): + def __init__(self, dejavu): + super(FileRecognizer, self).__init__(dejavu) def recognize_file(self, filename): - Fs, frames = wavfile.read(filename) + Fs, frames = decoder.read(filename) self.Fs = Fs - wave_object = wave.open(filename) - nchannels, sampwidth, framerate, num_frames, comptype, compname = wave_object.getparams() - - channels = [] - for channel in range(nchannels): - channels.append(frames[:, channel]) - t = time.time() - match = self._recognize(*channels) + match = self._recognize(*frames) t = time.time() - t if match: @@ -52,50 +38,53 @@ class WaveFileRecognizer(BaseRecognizer): return match - def recognize(self): - return self.recognize_file(self.filename) + def recognize(self, filename): + return self.recognize_file(filename) class MicrophoneRecognizer(BaseRecognizer): + default_chunksize = 8192 + default_format = pyaudio.paInt16 + default_channels = 2 + default_samplerate = 44100 - CHUNK = 8192 # 44100 is a multiple of 1225 - FORMAT = pyaudio.paInt16 - CHANNELS = 2 - RATE = 44100 - - def __init__(self, dejavu, seconds=None): + def __init__(self, dejavu): super(MicrophoneRecognizer, self).__init__(dejavu) self.audio = pyaudio.PyAudio() self.stream = None self.data = [] - self.channels = CHANNELS - self.chunk_size = CHUNK - self.rate = RATE + self.channels = self.default_channels + self.chunksize = self.default_chunk + self.samplerate = self.default_samplerate self.recorded = False - def start_recording(self, channels=CHANNELS, rate=RATE, chunk=CHUNK): - self.chunk_size = chunk + def start_recording(self, channels=default_channels, + samplerate=default_samplerate, + chunksize=default_chunksize): + self.chunksize = chunksize self.channels = channels self.recorded = False - self.rate = rate + self.samplerate = samplerate if self.stream: self.stream.stop_stream() self.stream.close() - self.stream = self.audio.open(format=FORMAT, - channels=channels, - rate=rate, - input=True, - frames_per_buffer=chunk) + self.stream = self.audio.open( + format=self.default_format, + channels=channels, + rate=samplerate, + input=True, + frames_per_buffer=chunksize, + ) self.data = [[] for i in range(channels)] def process_recording(self): - data = self.stream.read(self.chunk_size) + data = self.stream.read(self.chunksize) nums = np.fromstring(data, np.int16) for c in range(self.channels): - self.data[c].extend(nums[c::c+1]) + self.data[c].extend(nums[c::len(self.channels)]) def stop_recording(self): self.stream.stop_stream() @@ -111,13 +100,14 @@ class MicrophoneRecognizer(BaseRecognizer): def get_recorded_time(self): return len(self.data[0]) / self.rate - def recognize(self): + def recognize(self, seconds=None): self.start_recording() - for i in range(0, int(self.rate / self.chunk * self.seconds)): + for i in range(0, int(self.samplerate / self.chunksize + * seconds)): self.process_recording() self.stop_recording() return self.recognize_recording() + class NoRecordingError(Exception): pass - From 292ddf029d5017477cfcc0574cd4e47e355109a8 Mon Sep 17 00:00:00 2001 From: Wessie Date: Thu, 19 Dec 2013 01:22:27 +0100 Subject: [PATCH 21/27] Reversed expected variables to the correct position. --- dejavu/recognize.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dejavu/recognize.py b/dejavu/recognize.py index a723197..ec9e824 100755 --- a/dejavu/recognize.py +++ b/dejavu/recognize.py @@ -26,8 +26,7 @@ class FileRecognizer(BaseRecognizer): super(FileRecognizer, self).__init__(dejavu) def recognize_file(self, filename): - Fs, frames = decoder.read(filename) - self.Fs = Fs + frames, self.Fs = decoder.read(filename) t = time.time() match = self._recognize(*frames) From 94c4cc0d95ad19f30f74d5d3c1efff484c6bf10b Mon Sep 17 00:00:00 2001 From: Wessie Date: Thu, 19 Dec 2013 01:34:04 +0100 Subject: [PATCH 22/27] Updated README for the changes to the interface. Changed default value from None to 10 for the MicrophoneRecognizer.recognize --- README.md | Bin 10435 -> 10150 bytes dejavu/recognize.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 76699f33cb13c2ec3a9b8bf6533c2d001ebd1e6a..2fc4490b46ee1a3c704e33f8c95a916ddaf54f4a 100644 GIT binary patch delta 521 zcmZvZO-{l<7>1*Uzb5FKgqV-I0AkZnLlg*fMNC{6qZfcFowh?@rkR=6C?Oo8cVK5E z;RtTLf>&_q&Yf)&P>3^|$@f0*_cw3%%e&QUCi1h(sGVyrfJ}YHCG_zag@Iz4CX5($ zhzmWKDY~vha#R~hWFdjo&Yx(icudGRsu~wpdLzDus%>p45D3f@fHNldHNTC7c(h}= z>nAV9Z9oD(ik>dQcvRx^3w$mIE402M(bw<~}D>idudBt(+e& zZpnx5Oa6xCw;nxTp=EC>qMw`m?qY(=R&+Cb5Ti-I4xD2OEUlAHIYBy+>eB(=JD zKO+CYg&+tDZ{^yx8+ZN*e}j|cl|qtub5M4RFY>n21R6; zlXI<4NX28=JKF-EP>DWsVs7}Q3D*|9SN8xV)xqHURp!h{o0etg?yk3OpPlOC>W5)U zJQi{aW`=)}3S7q3i?#c6&q^|itllgqW#CaGnfbmCiJ}Z1HZGrXWyPLh`ezN&a@Z(f z|2MQd>eW1(TI>$HK!$B~tv#vAfqx;PLRDV|4-Tj>d0`+H%4S9C6bwP0@(95@;bjxG z1~(yPjb#>D+f3{WRj!0IdQ|-!9BwiehGBkcW+WRS%SlJ;H?Is!9uzt-ln3{=?^w=7 zxt^bY+}>7?36&R5yeXPM7Fp0MXm@n(-)phLZmrCpV?wf>fOfo>hi{oqiz=KDE>#opYxl` KTztFuA diff --git a/dejavu/recognize.py b/dejavu/recognize.py index ec9e824..dfc06d9 100755 --- a/dejavu/recognize.py +++ b/dejavu/recognize.py @@ -99,7 +99,7 @@ class MicrophoneRecognizer(BaseRecognizer): def get_recorded_time(self): return len(self.data[0]) / self.rate - def recognize(self, seconds=None): + def recognize(self, seconds=10): self.start_recording() for i in range(0, int(self.samplerate / self.chunksize * seconds)): From 2f19fcaa51331f9d389f899c5d366c94283217c1 Mon Sep 17 00:00:00 2001 From: Wessie Date: Thu, 19 Dec 2013 01:39:36 +0100 Subject: [PATCH 23/27] Fixed some mistakes in my hasty edit --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2fc4490..9c53b41 100644 --- a/README.md +++ b/README.md @@ -50,13 +50,13 @@ Start by creating a Dejavu object. >>> djv = Dejavu(config) ``` -Next, give the `fingerprint()` command three arguments: +Next, give the `fingerprint_directory` method three arguments: * input directory to look for audio files * audio extensions to look for in the input directory * number of processes (optional) ```python ->>> djv.fingerprint("va_us_top_40/mp3", [".mp3"], 3) +>>> djv.fingerprint_directory("va_us_top_40/mp3", [".mp3"], 3) ``` For a large amount of files, this will take a while. However, Dejavu is robust enough you can kill and restart without affecting progress: Dejavu remembers which songs it fingerprinted and converted and which it didn't, and so won't repeat itself. @@ -67,7 +67,7 @@ You'll have a lot of fingerprints once it completes a large folder of mp3s: 5442376 ``` -Also, any subsequent calls to `fingerprint()` will fingerprint and add those songs to the database as well. It's meant to simulate a system where as new songs are released, they are fingerprinted and added to the database seemlessly without stopping the system. +Also, any subsequent calls to `fingerprint_file` or `fingerprint_directory` will fingerprint and add those songs to the database as well. It's meant to simulate a system where as new songs are released, they are fingerprinted and added to the database seemlessly without stopping the system. ## Recognizing @@ -85,7 +85,7 @@ There are two ways to recognize audio using Dejavu. You can use Dejavu interacti } ``` -Or by reading .wav files via scripting functions: +Or by reading files via scripting functions: ```python >>> from dejavu.recognize import FileRecognizer From ec823f56e410114fceec1d3be1d3144463c65e42 Mon Sep 17 00:00:00 2001 From: Wessie Date: Thu, 19 Dec 2013 17:15:11 +0100 Subject: [PATCH 24/27] Updated go.py Added clarification that the configuration is now an ordinary python dictionary to the README. --- README.md | 3 ++- go.py | 35 +++++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 9c53b41..fc9c3bd 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,8 @@ Now you're ready to start fingerprinting your audio collection! Let's say we want to fingerprint all of July 2013's VA US Top 40 hits. -Start by creating a Dejavu object. +Start by creating a Dejavu object with your configurations settings (Dejavu takes an ordinary Python dictionary for the settings). + ```python >>> from dejavu import Dejavu >>> config = { diff --git a/go.py b/go.py index 9108f4e..7472019 100755 --- a/go.py +++ b/go.py @@ -1,22 +1,29 @@ from dejavu import Dejavu -from ConfigParser import ConfigParser import warnings +import json warnings.filterwarnings("ignore") -# load config -config = ConfigParser() -config.read("dejavu.cnf") +# load config from a JSON file (or anything outputting a python dictionary) +with open("dejavu.cnf") as f: + config = json.load(f) -# create Dejavu object -dejavu = Dejavu(config) -dejavu.fingerprint("va_us_top_40/mp3", "va_us_top_40/wav", [".mp3"], 5) +# create a Dejavu instance +djv = Dejavu(config) +# Fingerprint all the mp3's in the directory we give it +djv.fingerprint_directory("va_us_top_40/mp3", [".mp3"], 5) -# recognize microphone audio -from dejavu.recognize import Recognizer -recognizer = Recognizer(dejavu.fingerprinter, config) -song = recognizer.read("va_us_top_40/wav/17_-_#Beautiful_-_Mariah_Carey_ft.wav") +# Recognize audio from a file +from dejavu.recognize import FileRecognizer +song = djv.recognize(FileRecognizer, "va_us_top_40/wav/17_-_#Beautiful_-_Mariah_Carey_ft.wav") -# recognize song playing over microphone for 10 seconds -#song = recognizer.listen(seconds=1, verbose=True) -#print song \ No newline at end of file + +# Or recognize audio from your microphone for 10 seconds +from dejavu.recognize import MicrophoneRecognizer +song = djv.recognize(MicrophoneRecognizer, seconds=10) + + +# Or use a recognizer without the shortcut, in anyway you would like +from dejavu.recognize import FileRecognizer +recognizer = FileRecognizer(djv) +song = recognizer.recognize_file("va_us_top_40/wav/17_-_#Beautiful_-_Mariah_Carey_ft.wav") From f276efdf324c24836d8c3bfc128f9558f7245e23 Mon Sep 17 00:00:00 2001 From: Wessie Date: Fri, 20 Dec 2013 18:16:35 +0100 Subject: [PATCH 25/27] Cleaned up database.py - Moved SQLDatabase to a SQL specific file - Database class is now an abstract base class - Cursor moved into SQL specific file - Allowed for multi-database support in the future --- dejavu/__init__.py | 5 +- dejavu/cursor.py | 59 ------- dejavu/database.py | 348 ++++++++++++--------------------------- dejavu/database_sql.py | 366 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 472 insertions(+), 306 deletions(-) delete mode 100644 dejavu/cursor.py create mode 100644 dejavu/database_sql.py diff --git a/dejavu/__init__.py b/dejavu/__init__.py index c10e54d..e93edac 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -1,4 +1,4 @@ -from dejavu.database import SQLDatabase +from dejavu.database import get_database import dejavu.decoder as decoder import fingerprint from multiprocessing import Process, cpu_count @@ -13,8 +13,9 @@ class Dejavu(object): self.config = config # initialize db - self.db = SQLDatabase(**config.get("database", {})) + db_cls = get_database(config.get("database_type", None)) + self.db = db_cls(**config.get("database", {})) self.db.setup() # get songs previously indexed diff --git a/dejavu/cursor.py b/dejavu/cursor.py deleted file mode 100644 index 55429a7..0000000 --- a/dejavu/cursor.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -import Queue - -import MySQLdb as mysql -import MySQLdb.cursors - - -def cursor_factory(**factory_options): - def cursor(**options): - options.update(factory_options) - return Cursor(**options) - return cursor - - -class Cursor(object): - """ - Establishes a connection to the database and returns an open cursor. - - - ```python - # Use as context manager - with Cursor() as cur: - cur.execute(query) - ``` - """ - _cache = Queue.Queue(maxsize=5) - - def __init__(self, cursor_type=mysql.cursors.Cursor, **options): - super(Cursor, self).__init__() - - try: - conn = self._cache.get_nowait() - except Queue.Empty: - conn = mysql.connect(**options) - else: - # Ping the connection before using it from the cache. - conn.ping(True) - - self.conn = conn - self.cursor_type = cursor_type - - def __enter__(self): - self.cursor = self.conn.cursor(self.cursor_type) - return self.cursor - - def __exit__(self, extype, exvalue, traceback): - # if we had a MySQL related error we try to rollback the cursor. - if extype is mysql.MySQLError: - self.cursor.rollback() - - self.cursor.close() - self.conn.commit() - - # Put it back on the queue - try: - self._cache.put_nowait(self.conn) - except Queue.Full: - self.conn.close() diff --git a/dejavu/database.py b/dejavu/database.py index 75af1bf..b0bfc4a 100755 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -1,308 +1,166 @@ from __future__ import absolute_import -from itertools import izip_longest - -from dejavu.cursor import cursor_factory -from MySQLdb.cursors import DictCursor +import abc class Database(object): + __metaclass__ = abc.ABCMeta + + # Name of your Database subclass, this is used in configuration + # to refer to your class + type = None + def __init__(self): super(Database, self).__init__() + def before_fork(self): + """ + Called before the database instance is given to the new process + """ + pass -class SQLDatabase(Database): - """ - Queries: + def after_fork(self): + """ + Called after the database instance has been given to the new process - 1) Find duplicates (shouldn't be any, though): - - select `hash`, `song_id`, `offset`, count(*) cnt - from fingerprints - group by `hash`, `song_id`, `offset` - having cnt > 1 - order by cnt asc; - - 2) Get number of hashes by song: - - select song_id, song_name, count(song_id) as num - from fingerprints - natural join songs - group by song_id - order by count(song_id) desc; - - 3) get hashes with highest number of collisions - - select - hash, - count(distinct song_id) as n - from fingerprints - group by `hash` - order by n DESC; - - => 26 different songs with same fingerprint (392 times): - - select songs.song_name, fingerprints.offset - from fingerprints natural join songs - where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73"; - """ - - # config keys - CONNECTION = "connection" - KEY_USERNAME = "username" - KEY_DATABASE = "database" - KEY_PASSWORD = "password" - KEY_HOSTNAME = "hostname" - - # tables - FINGERPRINTS_TABLENAME = "fingerprints" - SONGS_TABLENAME = "songs" - - # fields - FIELD_HASH = "hash" - FIELD_SONG_ID = "song_id" - FIELD_OFFSET = "offset" - FIELD_SONGNAME = "song_name" - FIELD_FINGERPRINTED = "fingerprinted" - - # creates - CREATE_FINGERPRINTS_TABLE = """ - CREATE TABLE IF NOT EXISTS `%s` ( - `%s` binary(10) not null, - `%s` mediumint unsigned not null, - `%s` int unsigned not null, - PRIMARY KEY(%s), - UNIQUE(%s, %s, %s), - FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE - ) ENGINE=INNODB;""" % ( - FINGERPRINTS_TABLENAME, FIELD_HASH, - FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, - FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, - FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID - ) - - CREATE_SONGS_TABLE = """ - CREATE TABLE IF NOT EXISTS `%s` ( - `%s` mediumint unsigned not null auto_increment, - `%s` varchar(250) not null, - `%s` tinyint default 0, - PRIMARY KEY (`%s`), - UNIQUE KEY `%s` (`%s`) - ) ENGINE=INNODB;""" % ( - SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED, - FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID, - ) - - # inserts (ignores duplicates) - INSERT_FINGERPRINT = """ - INSERT IGNORE INTO %s (%s, %s, %s) VALUES - (UNHEX(%%s), %%s, %%s); - """ % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET) - - INSERT_SONG = "INSERT INTO %s (%s) VALUES (%%s);" % ( - SONGS_TABLENAME, FIELD_SONGNAME) - - # selects - SELECT = """ - SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s); - """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH) - - SELECT_MULTIPLE = """ - SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s); - """ % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET, - FINGERPRINTS_TABLENAME, FIELD_HASH) - - SELECT_ALL = """ - SELECT %s, %s FROM %s; - """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME) - - SELECT_SONG = """ - SELECT %s FROM %s WHERE %s = %%s - """ % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID) - - SELECT_NUM_FINGERPRINTS = """ - SELECT COUNT(*) as n FROM %s - """ % (FINGERPRINTS_TABLENAME) - - SELECT_UNIQUE_SONG_IDS = """ - SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1; - """ % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED) - - SELECT_SONGS = """ - SELECT %s, %s FROM %s WHERE %s = 1; - """ % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED) - - # drops - DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME - DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME - - # update - UPDATE_SONG_FINGERPRINTED = """ - UPDATE %s SET %s = 1 WHERE %s = %%s - """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID) - - # delete - DELETE_UNFINGERPRINTED = """ - DELETE FROM %s WHERE %s = 0; - """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED) - - def __init__(self, **options): - super(SQLDatabase, self).__init__() - self.cursor = cursor_factory(**options) + This will be called in the new process. + """ + pass def setup(self): """ - Creates any non-existing tables required for dejavu to function. - - This also removes all songs that have been added but have no - fingerprints associated with them. + Called on creation or shortly afterwards. """ - with self.cursor() as cur: - cur.execute(self.CREATE_SONGS_TABLE) - cur.execute(self.CREATE_FINGERPRINTS_TABLE) - cur.execute(self.DELETE_UNFINGERPRINTED) + pass + @abc.abstractmethod def empty(self): """ - Drops tables created by dejavu and then creates them again - by calling `SQLDatabase.setup`. - - .. warning: - This will result in a loss of data + Called when the database should be cleared of all data. """ - with self.cursor() as cur: - cur.execute(self.DROP_FINGERPRINTS) - cur.execute(self.DROP_SONGS) - - self.setup() + pass + @abc.abstractmethod def delete_unfingerprinted_songs(self): """ - Removes all songs that have no fingerprints associated with them. + Called to remove any song entries that do not have any fingerprints + associated with them. """ - with self.cursor() as cur: - cur.execute(self.DELETE_UNFINGERPRINTED) + pass + @abc.abstractmethod def get_num_songs(self): """ - Returns number of songs the database has fingerprinted. + Returns the amount of songs in the database. """ - with self.cursor() as cur: - cur.execute(self.SELECT_UNIQUE_SONG_IDS) - - for count, in cur: - return count - return 0 + pass + @abc.abstractmethod def get_num_fingerprints(self): """ - Returns number of fingerprints the database has fingerprinted. + Returns the number of fingerprints in the database. """ - with self.cursor() as cur: - cur.execute(self.SELECT_NUM_FINGERPRINTS) - - for count, in cur: - return count - return 0 + pass + @abc.abstractmethod def set_song_fingerprinted(self, sid): """ - Set the fingerprinted flag to TRUE (1) once a song has been completely - fingerprinted in the database. - """ - with self.cursor() as cur: - cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,)) + Sets a specific song as having all fingerprints in the database. + sid: Song identifier + """ + pass + + @abc.abstractmethod def get_songs(self): """ - Return songs that have the fingerprinted flag set TRUE (1). + Returns all fully fingerprinted songs in the database. """ - with self.cursor(cursor_type=DictCursor) as cur: - cur.execute(self.SELECT_SONGS) - for row in cur: - yield row + pass + @abc.abstractmethod def get_song_by_id(self, sid): """ - Returns song by its ID. - """ - with self.cursor(cursor_type=DictCursor) as cur: - cur.execute(self.SELECT_SONG, (sid,)) - return cur.fetchone() + Return a song by its identifier + sid: Song identifier + """ + pass + + @abc.abstractmethod def insert(self, hash, sid, offset): """ - Insert a (sha1, song_id, offset) row into database. - """ - with self.cursor() as cur: - cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset)) + Inserts a single fingerprint into the database. - def insert_song(self, songname): + hash: Part of a sha1 hash, in hexadecimal format + sid: Song identifier this fingerprint is off + offset: The offset this hash is from """ - Inserts song in the database and returns the ID of the inserted record. - """ - with self.cursor() as cur: - cur.execute(self.INSERT_SONG, (songname,)) - return cur.lastrowid + pass + @abc.abstractmethod + def insert_song(self, song_name): + """ + Inserts a song name into the database, returns the new + identifier of the song. + + song_name: The name of the song. + """ + pass + + @abc.abstractmethod def query(self, hash): """ - Return all tuples associated with hash. + Returns all matching fingerprint entries associated with + the given hash as parameter. - If hash is None, returns all entries in the - database (be careful with that one!). + hash: Part of a sha1 hash, in hexadecimal format """ - # select all if no key - query = self.SELECT_ALL if hash is None else self.SELECT - - with self.cursor() as cur: - cur.execute(query) - for sid, offset in cur: - yield (sid, offset) + pass + @abc.abstractmethod def get_iterable_kv_pairs(self): """ - Returns all tuples in database. + Returns all fingerprints in the database. """ - return self.query(None) + pass + @abc.abstractmethod def insert_hashes(self, sid, hashes): """ - Insert series of hash => song_id, offset - values into the database. + Insert a multitude of fingerprints. + + sid: Song identifier the fingerprints belong to + hashes: A sequence of tuples in the format (hash, offset) + - hash: Part of a sha1 hash, in hexadecimal format + - offset: Offset this hash was created from/at. """ - values = [] - for hash, offset in hashes: - values.append((hash, sid, offset)) - - with self.cursor() as cur: - cur.executemany(self.INSERT_FINGERPRINT, values) + pass + @abc.abstractmethod def return_matches(self, hashes): """ - Return the (song_id, offset_diff) tuples associated with - a list of (sha1, sample_offset) values. + Searches the database for pairs of (hash, offset) values. + + hashes: A sequence of tuples in the format (hash, offset) + - hash: Part of a sha1 hash, in hexadecimal format + - offset: Offset this hash was created from/at. + + Returns a sequence of (sid, offset_difference) tuples. + + sid: Song identifier + offset_difference: (offset - database_offset) """ - # Create a dictionary of hash => offset pairs for later lookups - mapper = {} - for hash, offset in hashes: - mapper[hash.upper()] = offset - - # Get an iteratable of all the hashes we need - values = mapper.keys() - - with self.cursor() as cur: - for split_values in grouper(values, 1000): - # Create our IN part of the query - query = self.SELECT_MULTIPLE - query = query % ', '.join(['UNHEX(%s)'] * len(split_values)) - - cur.execute(query, split_values) - - for hash, sid, offset in cur: - # (sid, db_offset - song_sampled_offset) - yield (sid, offset - mapper[hash]) + pass -def grouper(iterable, n, fillvalue=None): - args = [iter(iterable)] * n - return izip_longest(fillvalue=fillvalue, *args) +def get_database(database_type=None): + # Default to using the mysql database + database_type = database_type or "mysql" + # Lower all the input. + database_type = database_type.lower() + + for db_cls in Database.__subclasses__(): + if db_cls.type == database_type: + return db_cls + + raise TypeError("Unsupported database type supplied.") diff --git a/dejavu/database_sql.py b/dejavu/database_sql.py new file mode 100644 index 0000000..03c8ad6 --- /dev/null +++ b/dejavu/database_sql.py @@ -0,0 +1,366 @@ +from __future__ import absolute_import +from itertools import izip_longest +import Queue + +import MySQLdb as mysql +from MySQLdb.cursors import DictCursor + +from dejavu.database import Database + + +class SQLDatabase(Database): + """ + Queries: + + 1) Find duplicates (shouldn't be any, though): + + select `hash`, `song_id`, `offset`, count(*) cnt + from fingerprints + group by `hash`, `song_id`, `offset` + having cnt > 1 + order by cnt asc; + + 2) Get number of hashes by song: + + select song_id, song_name, count(song_id) as num + from fingerprints + natural join songs + group by song_id + order by count(song_id) desc; + + 3) get hashes with highest number of collisions + + select + hash, + count(distinct song_id) as n + from fingerprints + group by `hash` + order by n DESC; + + => 26 different songs with same fingerprint (392 times): + + select songs.song_name, fingerprints.offset + from fingerprints natural join songs + where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73"; + """ + + type = "mysql" + + # tables + FINGERPRINTS_TABLENAME = "fingerprints" + SONGS_TABLENAME = "songs" + + # fields + FIELD_HASH = "hash" + FIELD_SONG_ID = "song_id" + FIELD_OFFSET = "offset" + FIELD_SONGNAME = "song_name" + FIELD_FINGERPRINTED = "fingerprinted" + + # creates + CREATE_FINGERPRINTS_TABLE = """ + CREATE TABLE IF NOT EXISTS `%s` ( + `%s` binary(10) not null, + `%s` mediumint unsigned not null, + `%s` int unsigned not null, + PRIMARY KEY(%s), + UNIQUE(%s, %s, %s), + FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE + ) ENGINE=INNODB;""" % ( + FINGERPRINTS_TABLENAME, FIELD_HASH, + FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, + FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, + FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID + ) + + CREATE_SONGS_TABLE = """ + CREATE TABLE IF NOT EXISTS `%s` ( + `%s` mediumint unsigned not null auto_increment, + `%s` varchar(250) not null, + `%s` tinyint default 0, + PRIMARY KEY (`%s`), + UNIQUE KEY `%s` (`%s`) + ) ENGINE=INNODB;""" % ( + SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED, + FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID, + ) + + # inserts (ignores duplicates) + INSERT_FINGERPRINT = """ + INSERT IGNORE INTO %s (%s, %s, %s) values + (UNHEX(%%s), %%s, %%s); + """ % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET) + + INSERT_SONG = "INSERT INTO %s (%s) values (%%s);" % ( + SONGS_TABLENAME, FIELD_SONGNAME) + + # selects + SELECT = """ + SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s); + """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH) + + SELECT_MULTIPLE = """ + SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s); + """ % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET, + FINGERPRINTS_TABLENAME, FIELD_HASH) + + SELECT_ALL = """ + SELECT %s, %s FROM %s; + """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME) + + SELECT_SONG = """ + SELECT %s FROM %s WHERE %s = %%s + """ % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID) + + SELECT_NUM_FINGERPRINTS = """ + SELECT COUNT(*) as n FROM %s + """ % (FINGERPRINTS_TABLENAME) + + SELECT_UNIQUE_SONG_IDS = """ + SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1; + """ % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED) + + SELECT_SONGS = """ + SELECT %s, %s FROM %s WHERE %s = 1; + """ % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED) + + # drops + DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME + DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME + + # update + UPDATE_SONG_FINGERPRINTED = """ + UPDATE %s SET %s = 1 WHERE %s = %%s + """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID) + + # delete + DELETE_UNFINGERPRINTED = """ + DELETE FROM %s WHERE %s = 0; + """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED) + + def __init__(self, **options): + super(SQLDatabase, self).__init__() + self.cursor = cursor_factory(**options) + self._options = options + + def after_fork(self): + # Clear the cursor cache, we don't want any stale connections from + # the previous process. + Cursor.clear_cache() + + def setup(self): + """ + Creates any non-existing tables required for dejavu to function. + + This also removes all songs that have been added but have no + fingerprints associated with them. + """ + with self.cursor() as cur: + cur.execute(self.CREATE_SONGS_TABLE) + cur.execute(self.CREATE_FINGERPRINTS_TABLE) + cur.execute(self.DELETE_UNFINGERPRINTED) + + def empty(self): + """ + Drops tables created by dejavu and then creates them again + by calling `SQLDatabase.setup`. + + .. warning: + This will result in a loss of data + """ + with self.cursor() as cur: + cur.execute(self.DROP_FINGERPRINTS) + cur.execute(self.DROP_SONGS) + + self.setup() + + def delete_unfingerprinted_songs(self): + """ + Removes all songs that have no fingerprints associated with them. + """ + with self.cursor() as cur: + cur.execute(self.DELETE_UNFINGERPRINTED) + + def get_num_songs(self): + """ + Returns number of songs the database has fingerprinted. + """ + with self.cursor() as cur: + cur.execute(self.SELECT_UNIQUE_SONG_IDS) + + for count, in cur: + return count + return 0 + + def get_num_fingerprints(self): + """ + Returns number of fingerprints the database has fingerprinted. + """ + with self.cursor() as cur: + cur.execute(self.SELECT_NUM_FINGERPRINTS) + + for count, in cur: + return count + return 0 + + def set_song_fingerprinted(self, sid): + """ + Set the fingerprinted flag to TRUE (1) once a song has been completely + fingerprinted in the database. + """ + with self.cursor() as cur: + cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,)) + + def get_songs(self): + """ + Return songs that have the fingerprinted flag set TRUE (1). + """ + with self.cursor(cursor_type=DictCursor) as cur: + cur.execute(self.SELECT_SONGS) + for row in cur: + yield row + + def get_song_by_id(self, sid): + """ + Returns song by its ID. + """ + with self.cursor(cursor_type=DictCursor) as cur: + cur.execute(self.SELECT_SONG, (sid,)) + return cur.fetchone() + + def insert(self, hash, sid, offset): + """ + Insert a (sha1, song_id, offset) row into database. + """ + with self.cursor() as cur: + cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset)) + + def insert_song(self, songname): + """ + Inserts song in the database and returns the ID of the inserted record. + """ + with self.cursor() as cur: + cur.execute(self.INSERT_SONG, (songname,)) + return cur.lastrowid + + def query(self, hash): + """ + Return all tuples associated with hash. + + If hash is None, returns all entries in the + database (be careful with that one!). + """ + # select all if no key + query = self.SELECT_ALL if hash is None else self.SELECT + + with self.cursor() as cur: + cur.execute(query) + for sid, offset in cur: + yield (sid, offset) + + def get_iterable_kv_pairs(self): + """ + Returns all tuples in database. + """ + return self.query(None) + + def insert_hashes(self, sid, hashes): + """ + Insert series of hash => song_id, offset + values into the database. + """ + values = [] + for hash, offset in hashes: + values.append((hash, sid, offset)) + + with self.cursor() as cur: + for split_values in grouper(values, 1000): + cur.executemany(self.INSERT_FINGERPRINT, split_values) + + def return_matches(self, hashes): + """ + Return the (song_id, offset_diff) tuples associated with + a list of (sha1, sample_offset) values. + """ + # Create a dictionary of hash => offset pairs for later lookups + mapper = {} + for hash, offset in hashes: + mapper[hash.upper()] = offset + + # Get an iteratable of all the hashes we need + values = mapper.keys() + + with self.cursor() as cur: + for split_values in grouper(values, 1000): + # Create our IN part of the query + query = self.SELECT_MULTIPLE + query = query % ', '.join(['UNHEX(%s)'] * len(split_values)) + + cur.execute(query, split_values) + + for hash, sid, offset in cur: + # (sid, db_offset - song_sampled_offset) + yield (sid, offset - mapper[hash]) + + +def grouper(iterable, n, fillvalue=None): + args = [iter(iterable)] * n + return izip_longest(fillvalue=fillvalue, *args) + + +def cursor_factory(**factory_options): + def cursor(**options): + options.update(factory_options) + return Cursor(**options) + return cursor + + +class Cursor(object): + """ + Establishes a connection to the database and returns an open cursor. + + + ```python + # Use as context manager + with Cursor() as cur: + cur.execute(query) + ``` + """ + _cache = Queue.Queue(maxsize=5) + + def __init__(self, cursor_type=mysql.cursors.Cursor, **options): + super(Cursor, self).__init__() + + try: + conn = self._cache.get_nowait() + except Queue.Empty: + conn = mysql.connect(**options) + else: + # Ping the connection before using it from the cache. + conn.ping(True) + + self.conn = conn + self.conn.autocommit(False) + self.cursor_type = cursor_type + + @classmethod + def clear_cache(cls): + cls._cache = Queue.Queue(maxsize=5) + + def __enter__(self): + self.cursor = self.conn.cursor(self.cursor_type) + return self.cursor + + def __exit__(self, extype, exvalue, traceback): + # if we had a MySQL related error we try to rollback the cursor. + if extype is mysql.MySQLError: + self.cursor.rollback() + + self.cursor.close() + self.conn.commit() + + # Put it back on the queue + try: + self._cache.put_nowait(self.conn) + except Queue.Full: + self.conn.close() From e071804ea5550bfee19a55332345b887744bbdae Mon Sep 17 00:00:00 2001 From: Wessie Date: Sat, 21 Dec 2013 12:01:05 +0100 Subject: [PATCH 26/27] Fixed the issue of the default database not being imported. Fixed a bug in the SQL database pertaining to the use of grouper. Made SQLDatabase pickleable, for better multiprocessing support. --- dejavu/database.py | 4 ++++ dejavu/database_sql.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/dejavu/database.py b/dejavu/database.py index b0bfc4a..5903541 100755 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -164,3 +164,7 @@ def get_database(database_type=None): return db_cls raise TypeError("Unsupported database type supplied.") + + +# Import our default database handler +import dejavu.database_sql diff --git a/dejavu/database_sql.py b/dejavu/database_sql.py index 03c8ad6..a93b1f9 100644 --- a/dejavu/database_sql.py +++ b/dejavu/database_sql.py @@ -1,5 +1,5 @@ from __future__ import absolute_import -from itertools import izip_longest +from itertools import izip_longest, ifilter import Queue import MySQLdb as mysql @@ -302,10 +302,17 @@ class SQLDatabase(Database): # (sid, db_offset - song_sampled_offset) yield (sid, offset - mapper[hash]) + def __getstate__(self): + return (self._options,) + + def __setstate__(self, state): + self._options, = state + self.cursor = cursor_factory(**self._options) + def grouper(iterable, n, fillvalue=None): args = [iter(iterable)] * n - return izip_longest(fillvalue=fillvalue, *args) + return (ifilter(None, values) for values in izip_longest(fillvalue=fillvalue, *args)) def cursor_factory(**factory_options): From 7d14e0734aece6cfaec445f18c79641e12032dd3 Mon Sep 17 00:00:00 2001 From: Wessie Date: Mon, 23 Dec 2013 14:59:08 +0100 Subject: [PATCH 27/27] Switched fingerprint_directory to using multiprocessing.Pool Fixed an issue of 'grouper' items being generators due to ifilter usage. Temporary fix applied for the need of referencing SQLDatabase.FIELD_SONGNAME in __init__ Cleaned up some pep8 style issues --- dejavu/__init__.py | 114 +++++++++++++++++++++++------------------ dejavu/database_sql.py | 5 +- 2 files changed, 66 insertions(+), 53 deletions(-) diff --git a/dejavu/__init__.py b/dejavu/__init__.py index e93edac..bb52f1e 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -1,9 +1,8 @@ from dejavu.database import get_database import dejavu.decoder as decoder import fingerprint -from multiprocessing import Process, cpu_count +import multiprocessing import os -import random class Dejavu(object): @@ -30,57 +29,39 @@ class Dejavu(object): def fingerprint_directory(self, path, extensions, nprocesses=None): # Try to use the maximum amount of processes if not given. - if nprocesses is None: - try: - nprocesses = cpu_count() - except NotImplementedError: - nprocesses = 1 + try: + nprocesses = nprocesses or multiprocessing.cpu_count() + except NotImplementedError: + nprocesses = 1 + else: + nprocesses = 1 if nprocesses <= 0 else nprocesses - # convert files, shuffle order - files = list(decoder.find_files(path, extensions)) - random.shuffle(files) + pool = multiprocessing.Pool(nprocesses) - files_split = chunkify(files, nprocesses) + results = [] + for filename, _ in decoder.find_files(path, extensions): + # TODO: Don't queue up files that have already been fingerprinted. + result = pool.apply_async(_fingerprint_worker, + (filename, self.db)) + results.append(result) - # split into processes here - processes = [] - for i in range(nprocesses): + while len(results): + for result in results[:]: + # TODO: Handle errors gracefully and return them to the callee + # in some way. + try: + result.get(timeout=2) + except multiprocessing.TimeoutError: + continue + except: + import traceback, sys + traceback.print_exc(file=sys.stdout) + results.remove(result) + else: + results.remove(result) - # create process and start it - p = Process(target=self._fingerprint_worker, - args=(files_split[i], self.db)) - p.start() - processes.append(p) - - # wait for all processes to complete - for p in processes: - p.join() - - def _fingerprint_worker(self, files, db): - for filename, extension in files: - - # if there are already fingerprints in database, - # don't re-fingerprint - song_name = os.path.basename(filename).split(".")[0] - if song_name in self.songnames_set: - print("-> Already fingerprinted, continuing...") - continue - - channels, Fs = decoder.read(filename) - - # insert song name into database - song_id = db.insert_song(song_name) - - for c in range(len(channels)): - channel = channels[c] - print "-> Fingerprinting channel %d of song %s..." % (c+1, song_name) - - hashes = fingerprint.fingerprint(channel, Fs=Fs) - - db.insert_hashes(song_id, hashes) - - # only after done fingerprinting do confirm - db.set_song_fingerprinted(song_id) + pool.close() + pool.join() def fingerprint_file(self, filepath, song_name=None): channels, Fs = decoder.read(filepath) @@ -122,12 +103,14 @@ class Dejavu(object): largest_count = diff_counter[diff][sid] song_id = sid - print("Diff is %d with %d offset-aligned matches" % (largest, largest_count)) + print("Diff is %d with %d offset-aligned matches" % (largest, + largest_count)) # extract idenfication song = self.db.get_song_by_id(song_id) if song: - songname = song.get(SQLDatabase.FIELD_SONGNAME, None) + # TODO: Clarifey what `get_song_by_id` should return. + songname = song.get("song_name", None) else: return None @@ -145,6 +128,35 @@ class Dejavu(object): return r.recognize(*options, **kwoptions) +def _fingerprint_worker(filename, db): + song_name, extension = os.path.splitext(os.path.basename(filename)) + + channels, Fs = decoder.read(filename) + + # insert song into database + sid = db.insert_song(song_name) + + channel_amount = len(channels) + for channeln, channel in enumerate(channels): + # TODO: Remove prints or change them into optional logging. + print("Fingerprinting channel %d/%d for %s" % (channeln + 1, + channel_amount, + filename)) + hashes = fingerprint.fingerprint(channel, Fs=Fs) + print("Finished channel %d/%d for %s" % (channeln + 1, channel_amount, + filename)) + + print("Inserting fingerprints for channel %d/%d for %s" % + (channeln + 1, channel_amount, filename)) + db.insert_hashes(sid, hashes) + print("Finished inserting for channel %d/%d for %s" % + (channeln + 1, channel_amount, filename)) + + print("Marking %s finished" % (filename,)) + db.set_song_fingerprinted(sid) + print("%s finished" % (filename,)) + + def chunkify(lst, n): """ Splits a list into roughly n equal parts. diff --git a/dejavu/database_sql.py b/dejavu/database_sql.py index a93b1f9..565d83f 100644 --- a/dejavu/database_sql.py +++ b/dejavu/database_sql.py @@ -1,5 +1,5 @@ from __future__ import absolute_import -from itertools import izip_longest, ifilter +from itertools import izip_longest import Queue import MySQLdb as mysql @@ -312,7 +312,8 @@ class SQLDatabase(Database): def grouper(iterable, n, fillvalue=None): args = [iter(iterable)] * n - return (ifilter(None, values) for values in izip_longest(fillvalue=fillvalue, *args)) + return (filter(None, values) for values + in izip_longest(fillvalue=fillvalue, *args)) def cursor_factory(**factory_options):