mirror of
https://github.com/correl/dejavu.git
synced 2024-11-23 11:09:52 +00:00
Merge branch 'thomascirca-use_sha1'
This commit is contained in:
commit
8efd97564a
5 changed files with 73 additions and 50 deletions
|
@ -1,4 +1,4 @@
|
||||||
from dejavu.database import get_database
|
from dejavu.database import get_database, Database
|
||||||
import dejavu.decoder as decoder
|
import dejavu.decoder as decoder
|
||||||
import fingerprint
|
import fingerprint
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
@ -36,12 +36,11 @@ class Dejavu(object):
|
||||||
|
|
||||||
def get_fingerprinted_songs(self):
|
def get_fingerprinted_songs(self):
|
||||||
# get songs previously indexed
|
# get songs previously indexed
|
||||||
# TODO: should probably use a checksum of the file instead of filename
|
|
||||||
self.songs = self.db.get_songs()
|
self.songs = self.db.get_songs()
|
||||||
self.songnames_set = set() # to know which ones we've computed before
|
self.songhashes_set = set() # to know which ones we've computed before
|
||||||
for song in self.songs:
|
for song in self.songs:
|
||||||
song_name = song[self.db.FIELD_SONGNAME]
|
song_hash = song[Database.FIELD_FILE_SHA1]
|
||||||
self.songnames_set.add(song_name)
|
self.songhashes_set.add(song_hash)
|
||||||
|
|
||||||
def fingerprint_directory(self, path, extensions, nprocesses=None):
|
def fingerprint_directory(self, path, extensions, nprocesses=None):
|
||||||
# Try to use the maximum amount of processes if not given.
|
# Try to use the maximum amount of processes if not given.
|
||||||
|
@ -58,7 +57,7 @@ class Dejavu(object):
|
||||||
for filename, _ in decoder.find_files(path, extensions):
|
for filename, _ in decoder.find_files(path, extensions):
|
||||||
|
|
||||||
# don't refingerprint already fingerprinted files
|
# don't refingerprint already fingerprinted files
|
||||||
if decoder.path_to_songname(filename) in self.songnames_set:
|
if decoder.unique_hash(filename) in self.songhashes_set:
|
||||||
print "%s already fingerprinted, continuing..." % filename
|
print "%s already fingerprinted, continuing..." % filename
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -75,7 +74,7 @@ class Dejavu(object):
|
||||||
# Loop till we have all of them
|
# Loop till we have all of them
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
song_name, hashes = iterator.next()
|
song_name, hashes, file_hash = iterator.next()
|
||||||
except multiprocessing.TimeoutError:
|
except multiprocessing.TimeoutError:
|
||||||
continue
|
continue
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
|
@ -85,7 +84,7 @@ class Dejavu(object):
|
||||||
# Print traceback because we can't reraise it here
|
# Print traceback because we can't reraise it here
|
||||||
traceback.print_exc(file=sys.stdout)
|
traceback.print_exc(file=sys.stdout)
|
||||||
else:
|
else:
|
||||||
sid = self.db.insert_song(song_name)
|
sid = self.db.insert_song(song_name, file_hash)
|
||||||
|
|
||||||
self.db.insert_hashes(sid, hashes)
|
self.db.insert_hashes(sid, hashes)
|
||||||
self.db.set_song_fingerprinted(sid)
|
self.db.set_song_fingerprinted(sid)
|
||||||
|
@ -96,16 +95,18 @@ class Dejavu(object):
|
||||||
|
|
||||||
def fingerprint_file(self, filepath, song_name=None):
|
def fingerprint_file(self, filepath, song_name=None):
|
||||||
songname = decoder.path_to_songname(filepath)
|
songname = decoder.path_to_songname(filepath)
|
||||||
|
song_hash = decoder.unique_hash(filepath)
|
||||||
song_name = song_name or songname
|
song_name = song_name or songname
|
||||||
# don't refingerprint already fingerprinted files
|
# don't refingerprint already fingerprinted files
|
||||||
if song_name in self.songnames_set:
|
if song_hash in self.songhashes_set:
|
||||||
print "%s already fingerprinted, continuing..." % song_name
|
print "%s already fingerprinted, continuing..." % song_name
|
||||||
else:
|
else:
|
||||||
song_name, hashes = _fingerprint_worker(filepath,
|
song_name, hashes, file_hash = _fingerprint_worker(
|
||||||
self.limit,
|
filepath,
|
||||||
song_name=song_name)
|
self.limit,
|
||||||
|
song_name=song_name
|
||||||
sid = self.db.insert_song(song_name)
|
)
|
||||||
|
sid = self.db.insert_song(song_name, file_hash)
|
||||||
|
|
||||||
self.db.insert_hashes(sid, hashes)
|
self.db.insert_hashes(sid, hashes)
|
||||||
self.db.set_song_fingerprinted(sid)
|
self.db.set_song_fingerprinted(sid)
|
||||||
|
@ -153,13 +154,12 @@ class Dejavu(object):
|
||||||
fingerprint.DEFAULT_WINDOW_SIZE *
|
fingerprint.DEFAULT_WINDOW_SIZE *
|
||||||
fingerprint.DEFAULT_OVERLAP_RATIO, 5)
|
fingerprint.DEFAULT_OVERLAP_RATIO, 5)
|
||||||
song = {
|
song = {
|
||||||
Dejavu.SONG_ID: song_id,
|
Dejavu.SONG_ID : song_id,
|
||||||
Dejavu.SONG_NAME: songname,
|
Dejavu.SONG_NAME : songname,
|
||||||
Dejavu.CONFIDENCE: largest_count,
|
Dejavu.CONFIDENCE : largest_count,
|
||||||
Dejavu.OFFSET: int(largest),
|
Dejavu.OFFSET : int(largest),
|
||||||
Dejavu.OFFSET_SECS: nseconds
|
Dejavu.OFFSET_SECS : nseconds,
|
||||||
}
|
Database.FIELD_FILE_SHA1 : song.get(Database.FIELD_FILE_SHA1, None),}
|
||||||
|
|
||||||
return song
|
return song
|
||||||
|
|
||||||
def recognize(self, recognizer, *options, **kwoptions):
|
def recognize(self, recognizer, *options, **kwoptions):
|
||||||
|
@ -177,7 +177,7 @@ def _fingerprint_worker(filename, limit=None, song_name=None):
|
||||||
|
|
||||||
songname, extension = os.path.splitext(os.path.basename(filename))
|
songname, extension = os.path.splitext(os.path.basename(filename))
|
||||||
song_name = song_name or songname
|
song_name = song_name or songname
|
||||||
channels, Fs = decoder.read(filename, limit)
|
channels, Fs, file_hash = decoder.read(filename, limit)
|
||||||
result = set()
|
result = set()
|
||||||
channel_amount = len(channels)
|
channel_amount = len(channels)
|
||||||
|
|
||||||
|
@ -191,7 +191,7 @@ def _fingerprint_worker(filename, limit=None, song_name=None):
|
||||||
filename))
|
filename))
|
||||||
result |= set(hashes)
|
result |= set(hashes)
|
||||||
|
|
||||||
return song_name, result
|
return song_name, result, file_hash
|
||||||
|
|
||||||
|
|
||||||
def chunkify(lst, n):
|
def chunkify(lst, n):
|
||||||
|
|
|
@ -5,6 +5,12 @@ import abc
|
||||||
class Database(object):
|
class Database(object):
|
||||||
__metaclass__ = abc.ABCMeta
|
__metaclass__ = abc.ABCMeta
|
||||||
|
|
||||||
|
FIELD_FILE_SHA1 = 'file_sha1'
|
||||||
|
FIELD_SONG_ID = 'song_id'
|
||||||
|
FIELD_SONGNAME = 'song_name'
|
||||||
|
FIELD_OFFSET = 'offset'
|
||||||
|
FIELD_HASH = 'hash'
|
||||||
|
|
||||||
# Name of your Database subclass, this is used in configuration
|
# Name of your Database subclass, this is used in configuration
|
||||||
# to refer to your class
|
# to refer to your class
|
||||||
type = None
|
type = None
|
||||||
|
|
|
@ -51,10 +51,6 @@ class SQLDatabase(Database):
|
||||||
SONGS_TABLENAME = "songs"
|
SONGS_TABLENAME = "songs"
|
||||||
|
|
||||||
# fields
|
# fields
|
||||||
FIELD_HASH = "hash"
|
|
||||||
FIELD_SONG_ID = "song_id"
|
|
||||||
FIELD_OFFSET = "offset"
|
|
||||||
FIELD_SONGNAME = "song_name"
|
|
||||||
FIELD_FINGERPRINTED = "fingerprinted"
|
FIELD_FINGERPRINTED = "fingerprinted"
|
||||||
|
|
||||||
# creates
|
# creates
|
||||||
|
@ -67,10 +63,10 @@ class SQLDatabase(Database):
|
||||||
UNIQUE KEY `unique_constraint` (%s, %s, %s),
|
UNIQUE KEY `unique_constraint` (%s, %s, %s),
|
||||||
FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
|
FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
|
||||||
) ENGINE=INNODB;""" % (
|
) ENGINE=INNODB;""" % (
|
||||||
FINGERPRINTS_TABLENAME, FIELD_HASH,
|
FINGERPRINTS_TABLENAME, Database.FIELD_HASH,
|
||||||
FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
|
Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
|
||||||
FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
|
Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
|
||||||
FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID
|
Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID
|
||||||
)
|
)
|
||||||
|
|
||||||
CREATE_SONGS_TABLE = """
|
CREATE_SONGS_TABLE = """
|
||||||
|
@ -78,39 +74,41 @@ class SQLDatabase(Database):
|
||||||
`%s` mediumint unsigned not null auto_increment,
|
`%s` mediumint unsigned not null auto_increment,
|
||||||
`%s` varchar(250) not null,
|
`%s` varchar(250) not null,
|
||||||
`%s` tinyint default 0,
|
`%s` tinyint default 0,
|
||||||
|
`%s` binary(20) not null,
|
||||||
PRIMARY KEY (`%s`),
|
PRIMARY KEY (`%s`),
|
||||||
UNIQUE KEY `%s` (`%s`)
|
UNIQUE KEY `%s` (`%s`)
|
||||||
) ENGINE=INNODB;""" % (
|
) ENGINE=INNODB;""" % (
|
||||||
SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED,
|
SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, FIELD_FINGERPRINTED,
|
||||||
FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID,
|
Database.FIELD_FILE_SHA1,
|
||||||
|
Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID,
|
||||||
)
|
)
|
||||||
|
|
||||||
# inserts (ignores duplicates)
|
# inserts (ignores duplicates)
|
||||||
INSERT_FINGERPRINT = """
|
INSERT_FINGERPRINT = """
|
||||||
INSERT IGNORE INTO %s (%s, %s, %s) values
|
INSERT IGNORE INTO %s (%s, %s, %s) values
|
||||||
(UNHEX(%%s), %%s, %%s);
|
(UNHEX(%%s), %%s, %%s);
|
||||||
""" % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET)
|
""" % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET)
|
||||||
|
|
||||||
INSERT_SONG = "INSERT INTO %s (%s) values (%%s);" % (
|
INSERT_SONG = "INSERT INTO %s (%s, %s) values (%%s, UNHEX(%%s));" % (
|
||||||
SONGS_TABLENAME, FIELD_SONGNAME)
|
SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1)
|
||||||
|
|
||||||
# selects
|
# selects
|
||||||
SELECT = """
|
SELECT = """
|
||||||
SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
|
SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
|
||||||
""" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH)
|
""" % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
|
||||||
|
|
||||||
SELECT_MULTIPLE = """
|
SELECT_MULTIPLE = """
|
||||||
SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
|
SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
|
||||||
""" % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET,
|
""" % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
|
||||||
FINGERPRINTS_TABLENAME, FIELD_HASH)
|
FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
|
||||||
|
|
||||||
SELECT_ALL = """
|
SELECT_ALL = """
|
||||||
SELECT %s, %s FROM %s;
|
SELECT %s, %s FROM %s;
|
||||||
""" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME)
|
""" % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME)
|
||||||
|
|
||||||
SELECT_SONG = """
|
SELECT_SONG = """
|
||||||
SELECT %s FROM %s WHERE %s = %%s
|
SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s;
|
||||||
""" % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)
|
""" % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID)
|
||||||
|
|
||||||
SELECT_NUM_FINGERPRINTS = """
|
SELECT_NUM_FINGERPRINTS = """
|
||||||
SELECT COUNT(*) as n FROM %s
|
SELECT COUNT(*) as n FROM %s
|
||||||
|
@ -118,11 +116,12 @@ class SQLDatabase(Database):
|
||||||
|
|
||||||
SELECT_UNIQUE_SONG_IDS = """
|
SELECT_UNIQUE_SONG_IDS = """
|
||||||
SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
|
SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
|
||||||
""" % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
""" % (Database.FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
||||||
|
|
||||||
SELECT_SONGS = """
|
SELECT_SONGS = """
|
||||||
SELECT %s, %s FROM %s WHERE %s = 1;
|
SELECT %s, %s, HEX(%s) as %s FROM %s WHERE %s = 1;
|
||||||
""" % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
""" % (Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1,
|
||||||
|
SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
||||||
|
|
||||||
# drops
|
# drops
|
||||||
DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME
|
DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME
|
||||||
|
@ -131,7 +130,7 @@ class SQLDatabase(Database):
|
||||||
# update
|
# update
|
||||||
UPDATE_SONG_FINGERPRINTED = """
|
UPDATE_SONG_FINGERPRINTED = """
|
||||||
UPDATE %s SET %s = 1 WHERE %s = %%s
|
UPDATE %s SET %s = 1 WHERE %s = %%s
|
||||||
""" % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID)
|
""" % (SONGS_TABLENAME, FIELD_FINGERPRINTED, Database.FIELD_SONG_ID)
|
||||||
|
|
||||||
# delete
|
# delete
|
||||||
DELETE_UNFINGERPRINTED = """
|
DELETE_UNFINGERPRINTED = """
|
||||||
|
@ -235,12 +234,12 @@ class SQLDatabase(Database):
|
||||||
with self.cursor() as cur:
|
with self.cursor() as cur:
|
||||||
cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
|
cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
|
||||||
|
|
||||||
def insert_song(self, songname):
|
def insert_song(self, songname, file_hash):
|
||||||
"""
|
"""
|
||||||
Inserts song in the database and returns the ID of the inserted record.
|
Inserts song in the database and returns the ID of the inserted record.
|
||||||
"""
|
"""
|
||||||
with self.cursor() as cur:
|
with self.cursor() as cur:
|
||||||
cur.execute(self.INSERT_SONG, (songname,))
|
cur.execute(self.INSERT_SONG, (songname, file_hash))
|
||||||
return cur.lastrowid
|
return cur.lastrowid
|
||||||
|
|
||||||
def query(self, hash):
|
def query(self, hash):
|
||||||
|
|
|
@ -4,6 +4,24 @@ import numpy as np
|
||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
from pydub.utils import audioop
|
from pydub.utils import audioop
|
||||||
import wavio
|
import wavio
|
||||||
|
from hashlib import sha1
|
||||||
|
|
||||||
|
def unique_hash(filepath, blocksize=2**20):
    """Return the uppercase hexadecimal SHA-1 digest of a file's contents.

    The digest identifies a file by its content rather than by its name.
    Inspired by the MD5 version here:
    http://stackoverflow.com/a/1131255/712997

    Works with large files: the file is read in chunks of ``blocksize``
    bytes, so it is never held in memory in one piece.

    :param filepath: path of the file to hash.
    :param blocksize: number of bytes to read per chunk; must be positive.
    :returns: the SHA-1 digest as a 40-character uppercase hex string.
    :raises ValueError: if ``blocksize`` is zero or negative.

    Errors raised while opening or reading the file propagate unchanged.
    """
    # A zero blocksize would make the first read() return an empty string,
    # so every file would hash to the SHA-1 of empty input; a negative one
    # would read the whole file at once. Reject both up front.
    if blocksize <= 0:
        raise ValueError("blocksize must be a positive number of bytes")

    digest = sha1()
    with open(filepath, "rb") as f:
        # iter() with a sentinel keeps calling read() until it returns the
        # empty byte string, which signals end of file.
        for chunk in iter(lambda: f.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest().upper()
|
||||||
|
|
||||||
|
|
||||||
def find_files(path, extensions):
|
def find_files(path, extensions):
|
||||||
# Allow both with ".mp3" and without "mp3" to be used for extensions
|
# Allow both with ".mp3" and without "mp3" to be used for extensions
|
||||||
|
@ -55,7 +73,7 @@ def read(filename, limit=None):
|
||||||
for chn in audiofile:
|
for chn in audiofile:
|
||||||
channels.append(chn)
|
channels.append(chn)
|
||||||
|
|
||||||
return channels, fs
|
return channels, audiofile.frame_rate, unique_hash(filename)
|
||||||
|
|
||||||
|
|
||||||
def path_to_songname(path):
|
def path_to_songname(path):
|
||||||
|
|
|
@ -26,7 +26,7 @@ class FileRecognizer(BaseRecognizer):
|
||||||
super(FileRecognizer, self).__init__(dejavu)
|
super(FileRecognizer, self).__init__(dejavu)
|
||||||
|
|
||||||
def recognize_file(self, filename):
|
def recognize_file(self, filename):
|
||||||
frames, self.Fs = decoder.read(filename, self.dejavu.limit)
|
frames, self.Fs, file_hash = decoder.read(filename, self.dejavu.limit)
|
||||||
|
|
||||||
t = time.time()
|
t = time.time()
|
||||||
match = self._recognize(*frames)
|
match = self._recognize(*frames)
|
||||||
|
|
Loading…
Reference in a new issue