From 254a79cdf529c1283aaa3e0afb2c6f617a9937b0 Mon Sep 17 00:00:00 2001 From: Will Drevo Date: Sun, 19 Apr 2015 15:12:16 -0400 Subject: [PATCH] Completed integration for SHA1 file hashing to avoid duplicates --- dejavu/__init__.py | 17 ++++++++-------- dejavu/database.py | 6 ++++++ dejavu/database_sql.py | 45 +++++++++++++++++++----------------------- dejavu/decoder.py | 20 +++++++++++-------- 4 files changed, 46 insertions(+), 42 deletions(-) diff --git a/dejavu/__init__.py b/dejavu/__init__.py index 9d4c8a1..4f6e6e8 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -1,4 +1,4 @@ -from dejavu.database import get_database +from dejavu.database import get_database, Database import dejavu.decoder as decoder import fingerprint import multiprocessing @@ -39,7 +39,7 @@ class Dejavu(object): self.songs = self.db.get_songs() self.songhashes_set = set() # to know which ones we've computed before for song in self.songs: - song_hash = song[self.db.FIELD_SHA1] + song_hash = song[Database.FIELD_FILE_SHA1] self.songhashes_set.add(song_hash) def fingerprint_directory(self, path, extensions, nprocesses=None): @@ -154,13 +154,12 @@ class Dejavu(object): fingerprint.DEFAULT_WINDOW_SIZE * fingerprint.DEFAULT_OVERLAP_RATIO, 5) song = { - Dejavu.SONG_ID: song_id, - Dejavu.SONG_NAME: songname, - Dejavu.CONFIDENCE: largest_count, - Dejavu.OFFSET: int(largest), - Dejavu.OFFSET_SECS: nseconds - } - + Dejavu.SONG_ID : song_id, + Dejavu.SONG_NAME : songname, + Dejavu.CONFIDENCE : largest_count, + Dejavu.OFFSET : int(largest), + Dejavu.OFFSET_SECS : nseconds, + Database.FIELD_FILE_SHA1 : song.get(Database.FIELD_FILE_SHA1, None),} return song def recognize(self, recognizer, *options, **kwoptions): diff --git a/dejavu/database.py b/dejavu/database.py index 5903541..e5732ff 100755 --- a/dejavu/database.py +++ b/dejavu/database.py @@ -5,6 +5,12 @@ import abc class Database(object): __metaclass__ = abc.ABCMeta + FIELD_FILE_SHA1 = 'file_sha1' + FIELD_SONG_ID = 'song_id' + FIELD_SONGNAME = 'song_name' + FIELD_OFFSET = 'offset' + FIELD_HASH = 'hash' + # Name of your Database subclass, this is used in configuration # to refer to your class type = None diff --git a/dejavu/database_sql.py b/dejavu/database_sql.py index d7c0dc9..031bdcb 100755 --- a/dejavu/database_sql.py +++ b/dejavu/database_sql.py @@ -51,11 +51,6 @@ class SQLDatabase(Database): SONGS_TABLENAME = "songs" # fields - FIELD_HASH = "hash" - FIELD_SONG_ID = "song_id" - FIELD_OFFSET = "offset" - FIELD_SHA1 = 'file_sha1' - FIELD_SONGNAME = "song_name" FIELD_FINGERPRINTED = "fingerprinted" # creates @@ -68,10 +63,10 @@ class SQLDatabase(Database): UNIQUE KEY `unique_constraint` (%s, %s, %s), FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE ) ENGINE=INNODB;""" % ( - FINGERPRINTS_TABLENAME, FIELD_HASH, - FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, - FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, - FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID + FINGERPRINTS_TABLENAME, Database.FIELD_HASH, + Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH, + Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH, + Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID ) CREATE_SONGS_TABLE = """ @@ -79,41 +74,41 @@ class SQLDatabase(Database): `%s` mediumint unsigned not null auto_increment, `%s` varchar(250) not null, `%s` tinyint default 0, - `%s` binary(10) not null, + `%s` binary(20) not null, PRIMARY KEY (`%s`), UNIQUE KEY `%s` (`%s`) ) ENGINE=INNODB;""" % ( - SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED, - FIELD_SHA1, - FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID, + SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, FIELD_FINGERPRINTED, + Database.FIELD_FILE_SHA1, + Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, ) # inserts (ignores duplicates) INSERT_FINGERPRINT = """ INSERT IGNORE INTO %s (%s, %s, %s) values (UNHEX(%%s), %%s, %%s); - """ % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET) + """ % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET) INSERT_SONG = "INSERT INTO %s (%s, %s) values (%%s, UNHEX(%%s));" % ( - SONGS_TABLENAME, FIELD_SONGNAME, FIELD_SHA1) + SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1) # selects SELECT = """ SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s); - """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH) + """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH) SELECT_MULTIPLE = """ SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s); - """ % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET, - FINGERPRINTS_TABLENAME, FIELD_HASH) + """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET, + FINGERPRINTS_TABLENAME, Database.FIELD_HASH) SELECT_ALL = """ SELECT %s, %s FROM %s; - """ % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME) + """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME) SELECT_SONG = """ - SELECT %s, HEX(%s) FROM %s WHERE %s = %%s - """ % (FIELD_SONGNAME, FIELD_SHA1, SONGS_TABLENAME, FIELD_SONG_ID) + SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s; + """ % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID) SELECT_NUM_FINGERPRINTS = """ SELECT COUNT(*) as n FROM %s @@ -121,11 +116,11 @@ class SQLDatabase(Database): SELECT_UNIQUE_SONG_IDS = """ SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1; - """ % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED) + """ % (Database.FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED) SELECT_SONGS = """ - SELECT %s, %s, HEX(%s) FROM %s WHERE %s = 1; - """ % (FIELD_SONG_ID, FIELD_SONGNAME, FIELD_SHA1, + SELECT %s, %s, HEX(%s) as %s FROM %s WHERE %s = 1; + """ % (Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, FIELD_FINGERPRINTED) # drops @@ -135,7 +130,7 @@ class SQLDatabase(Database): # update UPDATE_SONG_FINGERPRINTED = """ UPDATE %s SET %s = 1 WHERE %s = %%s - """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID) + """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, Database.FIELD_SONG_ID) # delete DELETE_UNFINGERPRINTED = """ diff --git a/dejavu/decoder.py b/dejavu/decoder.py index dde6b6b..04aa39f 100755 --- a/dejavu/decoder.py +++ b/dejavu/decoder.py @@ -6,17 +6,21 @@ from pydub.utils import audioop import wavio from hashlib import sha1 -def unique_hash(filepath): +def unique_hash(filepath, blocksize=2**20): """ Small function to generate a hash to uniquely generate - a file. Taken / inspired from git's way via stackoverflow: - http://stackoverflow.com/questions/552659 + a file. Inspired by MD5 version here: + http://stackoverflow.com/a/1131255/712997 + + Works with large files. """ - filesize_bytes = os.path.getsize(filepath) s = sha1() - s.update(("blob %u\0" % filesize_bytes).encode('ascii')) - with open(filepath, 'rb') as f: - s.update(f.read()) - return s.hexdigest() + with open(filepath , "rb") as f: + while True: + buf = f.read(blocksize) + if not buf: + break + s.update(buf) + return s.hexdigest().upper() def find_files(path, extensions):