Merge branch 'thomascirca-use_sha1'

This commit is contained in:
Will Drevo 2015-04-19 15:12:37 -04:00
commit 8efd97564a
5 changed files with 73 additions and 50 deletions

View file

@ -1,4 +1,4 @@
from dejavu.database import get_database from dejavu.database import get_database, Database
import dejavu.decoder as decoder import dejavu.decoder as decoder
import fingerprint import fingerprint
import multiprocessing import multiprocessing
@ -36,12 +36,11 @@ class Dejavu(object):
def get_fingerprinted_songs(self): def get_fingerprinted_songs(self):
# get songs previously indexed # get songs previously indexed
# TODO: should probably use a checksum of the file instead of filename
self.songs = self.db.get_songs() self.songs = self.db.get_songs()
self.songnames_set = set() # to know which ones we've computed before self.songhashes_set = set() # to know which ones we've computed before
for song in self.songs: for song in self.songs:
song_name = song[self.db.FIELD_SONGNAME] song_hash = song[Database.FIELD_FILE_SHA1]
self.songnames_set.add(song_name) self.songhashes_set.add(song_hash)
def fingerprint_directory(self, path, extensions, nprocesses=None): def fingerprint_directory(self, path, extensions, nprocesses=None):
# Try to use the maximum amount of processes if not given. # Try to use the maximum amount of processes if not given.
@ -58,7 +57,7 @@ class Dejavu(object):
for filename, _ in decoder.find_files(path, extensions): for filename, _ in decoder.find_files(path, extensions):
# don't refingerprint already fingerprinted files # don't refingerprint already fingerprinted files
if decoder.path_to_songname(filename) in self.songnames_set: if decoder.unique_hash(filename) in self.songhashes_set:
print "%s already fingerprinted, continuing..." % filename print "%s already fingerprinted, continuing..." % filename
continue continue
@ -75,7 +74,7 @@ class Dejavu(object):
# Loop till we have all of them # Loop till we have all of them
while True: while True:
try: try:
song_name, hashes = iterator.next() song_name, hashes, file_hash = iterator.next()
except multiprocessing.TimeoutError: except multiprocessing.TimeoutError:
continue continue
except StopIteration: except StopIteration:
@ -85,7 +84,7 @@ class Dejavu(object):
# Print traceback because we can't reraise it here # Print traceback because we can't reraise it here
traceback.print_exc(file=sys.stdout) traceback.print_exc(file=sys.stdout)
else: else:
sid = self.db.insert_song(song_name) sid = self.db.insert_song(song_name, file_hash)
self.db.insert_hashes(sid, hashes) self.db.insert_hashes(sid, hashes)
self.db.set_song_fingerprinted(sid) self.db.set_song_fingerprinted(sid)
@ -96,16 +95,18 @@ class Dejavu(object):
def fingerprint_file(self, filepath, song_name=None): def fingerprint_file(self, filepath, song_name=None):
songname = decoder.path_to_songname(filepath) songname = decoder.path_to_songname(filepath)
song_hash = decoder.unique_hash(filepath)
song_name = song_name or songname song_name = song_name or songname
# don't refingerprint already fingerprinted files # don't refingerprint already fingerprinted files
if song_name in self.songnames_set: if song_hash in self.songhashes_set:
print "%s already fingerprinted, continuing..." % song_name print "%s already fingerprinted, continuing..." % song_name
else: else:
song_name, hashes = _fingerprint_worker(filepath, song_name, hashes, file_hash = _fingerprint_worker(
self.limit, filepath,
song_name=song_name) self.limit,
song_name=song_name
sid = self.db.insert_song(song_name) )
sid = self.db.insert_song(song_name, file_hash)
self.db.insert_hashes(sid, hashes) self.db.insert_hashes(sid, hashes)
self.db.set_song_fingerprinted(sid) self.db.set_song_fingerprinted(sid)
@ -153,13 +154,12 @@ class Dejavu(object):
fingerprint.DEFAULT_WINDOW_SIZE * fingerprint.DEFAULT_WINDOW_SIZE *
fingerprint.DEFAULT_OVERLAP_RATIO, 5) fingerprint.DEFAULT_OVERLAP_RATIO, 5)
song = { song = {
Dejavu.SONG_ID: song_id, Dejavu.SONG_ID : song_id,
Dejavu.SONG_NAME: songname, Dejavu.SONG_NAME : songname,
Dejavu.CONFIDENCE: largest_count, Dejavu.CONFIDENCE : largest_count,
Dejavu.OFFSET: int(largest), Dejavu.OFFSET : int(largest),
Dejavu.OFFSET_SECS: nseconds Dejavu.OFFSET_SECS : nseconds,
} Database.FIELD_FILE_SHA1 : song.get(Database.FIELD_FILE_SHA1, None),}
return song return song
def recognize(self, recognizer, *options, **kwoptions): def recognize(self, recognizer, *options, **kwoptions):
@ -177,7 +177,7 @@ def _fingerprint_worker(filename, limit=None, song_name=None):
songname, extension = os.path.splitext(os.path.basename(filename)) songname, extension = os.path.splitext(os.path.basename(filename))
song_name = song_name or songname song_name = song_name or songname
channels, Fs = decoder.read(filename, limit) channels, Fs, file_hash = decoder.read(filename, limit)
result = set() result = set()
channel_amount = len(channels) channel_amount = len(channels)
@ -191,7 +191,7 @@ def _fingerprint_worker(filename, limit=None, song_name=None):
filename)) filename))
result |= set(hashes) result |= set(hashes)
return song_name, result return song_name, result, file_hash
def chunkify(lst, n): def chunkify(lst, n):

View file

@ -5,6 +5,12 @@ import abc
class Database(object): class Database(object):
__metaclass__ = abc.ABCMeta __metaclass__ = abc.ABCMeta
FIELD_FILE_SHA1 = 'file_sha1'
FIELD_SONG_ID = 'song_id'
FIELD_SONGNAME = 'song_name'
FIELD_OFFSET = 'offset'
FIELD_HASH = 'hash'
# Name of your Database subclass, this is used in configuration # Name of your Database subclass, this is used in configuration
# to refer to your class # to refer to your class
type = None type = None

View file

@ -51,10 +51,6 @@ class SQLDatabase(Database):
SONGS_TABLENAME = "songs" SONGS_TABLENAME = "songs"
# fields # fields
FIELD_HASH = "hash"
FIELD_SONG_ID = "song_id"
FIELD_OFFSET = "offset"
FIELD_SONGNAME = "song_name"
FIELD_FINGERPRINTED = "fingerprinted" FIELD_FINGERPRINTED = "fingerprinted"
# creates # creates
@ -67,10 +63,10 @@ class SQLDatabase(Database):
UNIQUE KEY `unique_constraint` (%s, %s, %s), UNIQUE KEY `unique_constraint` (%s, %s, %s),
FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
) ENGINE=INNODB;""" % ( ) ENGINE=INNODB;""" % (
FINGERPRINTS_TABLENAME, FIELD_HASH, FINGERPRINTS_TABLENAME, Database.FIELD_HASH,
FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID
) )
CREATE_SONGS_TABLE = """ CREATE_SONGS_TABLE = """
@ -78,39 +74,41 @@ class SQLDatabase(Database):
`%s` mediumint unsigned not null auto_increment, `%s` mediumint unsigned not null auto_increment,
`%s` varchar(250) not null, `%s` varchar(250) not null,
`%s` tinyint default 0, `%s` tinyint default 0,
`%s` binary(20) not null,
PRIMARY KEY (`%s`), PRIMARY KEY (`%s`),
UNIQUE KEY `%s` (`%s`) UNIQUE KEY `%s` (`%s`)
) ENGINE=INNODB;""" % ( ) ENGINE=INNODB;""" % (
SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED, SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, FIELD_FINGERPRINTED,
FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID, Database.FIELD_FILE_SHA1,
Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID,
) )
# inserts (ignores duplicates) # inserts (ignores duplicates)
INSERT_FINGERPRINT = """ INSERT_FINGERPRINT = """
INSERT IGNORE INTO %s (%s, %s, %s) values INSERT IGNORE INTO %s (%s, %s, %s) values
(UNHEX(%%s), %%s, %%s); (UNHEX(%%s), %%s, %%s);
""" % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET) """ % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET)
INSERT_SONG = "INSERT INTO %s (%s) values (%%s);" % ( INSERT_SONG = "INSERT INTO %s (%s, %s) values (%%s, UNHEX(%%s));" % (
SONGS_TABLENAME, FIELD_SONGNAME) SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1)
# selects # selects
SELECT = """ SELECT = """
SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s); SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
""" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH) """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
SELECT_MULTIPLE = """ SELECT_MULTIPLE = """
SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s); SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
""" % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET, """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
FINGERPRINTS_TABLENAME, FIELD_HASH) FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
SELECT_ALL = """ SELECT_ALL = """
SELECT %s, %s FROM %s; SELECT %s, %s FROM %s;
""" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME) """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME)
SELECT_SONG = """ SELECT_SONG = """
SELECT %s FROM %s WHERE %s = %%s SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s;
""" % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID) """ % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID)
SELECT_NUM_FINGERPRINTS = """ SELECT_NUM_FINGERPRINTS = """
SELECT COUNT(*) as n FROM %s SELECT COUNT(*) as n FROM %s
@ -118,11 +116,12 @@ class SQLDatabase(Database):
SELECT_UNIQUE_SONG_IDS = """ SELECT_UNIQUE_SONG_IDS = """
SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1; SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
""" % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED) """ % (Database.FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
SELECT_SONGS = """ SELECT_SONGS = """
SELECT %s, %s FROM %s WHERE %s = 1; SELECT %s, %s, HEX(%s) as %s FROM %s WHERE %s = 1;
""" % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED) """ % (Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1,
SONGS_TABLENAME, FIELD_FINGERPRINTED)
# drops # drops
DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME
@ -131,7 +130,7 @@ class SQLDatabase(Database):
# update # update
UPDATE_SONG_FINGERPRINTED = """ UPDATE_SONG_FINGERPRINTED = """
UPDATE %s SET %s = 1 WHERE %s = %%s UPDATE %s SET %s = 1 WHERE %s = %%s
""" % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID) """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, Database.FIELD_SONG_ID)
# delete # delete
DELETE_UNFINGERPRINTED = """ DELETE_UNFINGERPRINTED = """
@ -235,12 +234,12 @@ class SQLDatabase(Database):
with self.cursor() as cur: with self.cursor() as cur:
cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset)) cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
def insert_song(self, songname): def insert_song(self, songname, file_hash):
""" """
Inserts song in the database and returns the ID of the inserted record. Inserts song in the database and returns the ID of the inserted record.
""" """
with self.cursor() as cur: with self.cursor() as cur:
cur.execute(self.INSERT_SONG, (songname,)) cur.execute(self.INSERT_SONG, (songname, file_hash))
return cur.lastrowid return cur.lastrowid
def query(self, hash): def query(self, hash):

View file

@ -4,6 +4,24 @@ import numpy as np
from pydub import AudioSegment from pydub import AudioSegment
from pydub.utils import audioop from pydub.utils import audioop
import wavio import wavio
from hashlib import sha1
def unique_hash(filepath, blocksize=2**20):
    """Return the SHA-1 digest of a file's contents as an uppercase hex string.

    The digest is used to uniquely identify a file by content rather than
    by name. Inspired by the MD5 version here:
    http://stackoverflow.com/a/1131255/712997

    The file is read in chunks of ``blocksize`` bytes, so large files are
    hashed without being loaded into memory all at once.

    :param filepath: path of the file to hash.
    :param blocksize: number of bytes to read per chunk (default 1 MiB).
    :returns: 40-character uppercase hexadecimal SHA-1 digest.
    :raises IOError: if the file cannot be opened or read.
    """
    digest = sha1()
    with open(filepath, "rb") as f:
        # iter(callable, sentinel) keeps calling f.read(blocksize) until it
        # returns the empty byte string, i.e. end of file.
        for buf in iter(lambda: f.read(blocksize), b""):
            digest.update(buf)
    # Upper-case so the value compares equal to what SQL HEX() returns for
    # the stored binary column.
    return digest.hexdigest().upper()
def find_files(path, extensions): def find_files(path, extensions):
# Allow both with ".mp3" and without "mp3" to be used for extensions # Allow both with ".mp3" and without "mp3" to be used for extensions
@ -55,7 +73,7 @@ def read(filename, limit=None):
for chn in audiofile: for chn in audiofile:
channels.append(chn) channels.append(chn)
return channels, fs return channels, audiofile.frame_rate, unique_hash(filename)
def path_to_songname(path): def path_to_songname(path):

View file

@ -26,7 +26,7 @@ class FileRecognizer(BaseRecognizer):
super(FileRecognizer, self).__init__(dejavu) super(FileRecognizer, self).__init__(dejavu)
def recognize_file(self, filename): def recognize_file(self, filename):
frames, self.Fs = decoder.read(filename, self.dejavu.limit) frames, self.Fs, file_hash = decoder.read(filename, self.dejavu.limit)
t = time.time() t = time.time()
match = self._recognize(*frames) match = self._recognize(*frames)