Completed integration for SHA1 file hashing to avoid duplicates

This commit is contained in:
Will Drevo 2015-04-19 15:12:16 -04:00
parent 27ed19b0f0
commit 254a79cdf5
4 changed files with 46 additions and 42 deletions

View file

@ -1,4 +1,4 @@
from dejavu.database import get_database from dejavu.database import get_database, Database
import dejavu.decoder as decoder import dejavu.decoder as decoder
import fingerprint import fingerprint
import multiprocessing import multiprocessing
@ -39,7 +39,7 @@ class Dejavu(object):
self.songs = self.db.get_songs() self.songs = self.db.get_songs()
self.songhashes_set = set() # to know which ones we've computed before self.songhashes_set = set() # to know which ones we've computed before
for song in self.songs: for song in self.songs:
song_hash = song[self.db.FIELD_SHA1] song_hash = song[Database.FIELD_FILE_SHA1]
self.songhashes_set.add(song_hash) self.songhashes_set.add(song_hash)
def fingerprint_directory(self, path, extensions, nprocesses=None): def fingerprint_directory(self, path, extensions, nprocesses=None):
@ -158,9 +158,8 @@ class Dejavu(object):
Dejavu.SONG_NAME : songname, Dejavu.SONG_NAME : songname,
Dejavu.CONFIDENCE : largest_count, Dejavu.CONFIDENCE : largest_count,
Dejavu.OFFSET : int(largest), Dejavu.OFFSET : int(largest),
Dejavu.OFFSET_SECS: nseconds Dejavu.OFFSET_SECS : nseconds,
} Database.FIELD_FILE_SHA1 : song.get(Database.FIELD_FILE_SHA1, None),}
return song return song
def recognize(self, recognizer, *options, **kwoptions): def recognize(self, recognizer, *options, **kwoptions):

View file

@ -5,6 +5,12 @@ import abc
class Database(object): class Database(object):
__metaclass__ = abc.ABCMeta __metaclass__ = abc.ABCMeta
FIELD_FILE_SHA1 = 'file_sha1'
FIELD_SONG_ID = 'song_id'
FIELD_SONGNAME = 'song_name'
FIELD_OFFSET = 'offset'
FIELD_HASH = 'hash'
# Name of your Database subclass, this is used in configuration # Name of your Database subclass, this is used in configuration
# to refer to your class # to refer to your class
type = None type = None

View file

@ -51,11 +51,6 @@ class SQLDatabase(Database):
SONGS_TABLENAME = "songs" SONGS_TABLENAME = "songs"
# fields # fields
FIELD_HASH = "hash"
FIELD_SONG_ID = "song_id"
FIELD_OFFSET = "offset"
FIELD_SHA1 = 'file_sha1'
FIELD_SONGNAME = "song_name"
FIELD_FINGERPRINTED = "fingerprinted" FIELD_FINGERPRINTED = "fingerprinted"
# creates # creates
@ -68,10 +63,10 @@ class SQLDatabase(Database):
UNIQUE KEY `unique_constraint` (%s, %s, %s), UNIQUE KEY `unique_constraint` (%s, %s, %s),
FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
) ENGINE=INNODB;""" % ( ) ENGINE=INNODB;""" % (
FINGERPRINTS_TABLENAME, FIELD_HASH, FINGERPRINTS_TABLENAME, Database.FIELD_HASH,
FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID
) )
CREATE_SONGS_TABLE = """ CREATE_SONGS_TABLE = """
@ -79,41 +74,41 @@ class SQLDatabase(Database):
`%s` mediumint unsigned not null auto_increment, `%s` mediumint unsigned not null auto_increment,
`%s` varchar(250) not null, `%s` varchar(250) not null,
`%s` tinyint default 0, `%s` tinyint default 0,
`%s` binary(10) not null, `%s` binary(20) not null,
PRIMARY KEY (`%s`), PRIMARY KEY (`%s`),
UNIQUE KEY `%s` (`%s`) UNIQUE KEY `%s` (`%s`)
) ENGINE=INNODB;""" % ( ) ENGINE=INNODB;""" % (
SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED, SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, FIELD_FINGERPRINTED,
FIELD_SHA1, Database.FIELD_FILE_SHA1,
FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID,
) )
# inserts (ignores duplicates) # inserts (ignores duplicates)
INSERT_FINGERPRINT = """ INSERT_FINGERPRINT = """
INSERT IGNORE INTO %s (%s, %s, %s) values INSERT IGNORE INTO %s (%s, %s, %s) values
(UNHEX(%%s), %%s, %%s); (UNHEX(%%s), %%s, %%s);
""" % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET) """ % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET)
INSERT_SONG = "INSERT INTO %s (%s, %s) values (%%s, UNHEX(%%s));" % ( INSERT_SONG = "INSERT INTO %s (%s, %s) values (%%s, UNHEX(%%s));" % (
SONGS_TABLENAME, FIELD_SONGNAME, FIELD_SHA1) SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1)
# selects # selects
SELECT = """ SELECT = """
SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s); SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
""" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH) """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
SELECT_MULTIPLE = """ SELECT_MULTIPLE = """
SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s); SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
""" % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET, """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
FINGERPRINTS_TABLENAME, FIELD_HASH) FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
SELECT_ALL = """ SELECT_ALL = """
SELECT %s, %s FROM %s; SELECT %s, %s FROM %s;
""" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME) """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME)
SELECT_SONG = """ SELECT_SONG = """
SELECT %s, HEX(%s) FROM %s WHERE %s = %%s SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s;
""" % (FIELD_SONGNAME, FIELD_SHA1, SONGS_TABLENAME, FIELD_SONG_ID) """ % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID)
SELECT_NUM_FINGERPRINTS = """ SELECT_NUM_FINGERPRINTS = """
SELECT COUNT(*) as n FROM %s SELECT COUNT(*) as n FROM %s
@ -121,11 +116,11 @@ class SQLDatabase(Database):
SELECT_UNIQUE_SONG_IDS = """ SELECT_UNIQUE_SONG_IDS = """
SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1; SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
""" % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED) """ % (Database.FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
SELECT_SONGS = """ SELECT_SONGS = """
SELECT %s, %s, HEX(%s) FROM %s WHERE %s = 1; SELECT %s, %s, HEX(%s) as %s FROM %s WHERE %s = 1;
""" % (FIELD_SONG_ID, FIELD_SONGNAME, FIELD_SHA1, """ % (Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1,
SONGS_TABLENAME, FIELD_FINGERPRINTED) SONGS_TABLENAME, FIELD_FINGERPRINTED)
# drops # drops
@ -135,7 +130,7 @@ class SQLDatabase(Database):
# update # update
UPDATE_SONG_FINGERPRINTED = """ UPDATE_SONG_FINGERPRINTED = """
UPDATE %s SET %s = 1 WHERE %s = %%s UPDATE %s SET %s = 1 WHERE %s = %%s
""" % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID) """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, Database.FIELD_SONG_ID)
# delete # delete
DELETE_UNFINGERPRINTED = """ DELETE_UNFINGERPRINTED = """

View file

@ -6,17 +6,21 @@ from pydub.utils import audioop
import wavio import wavio
from hashlib import sha1 from hashlib import sha1
def unique_hash(filepath): def unique_hash(filepath, blocksize=2**20):
""" Small function to generate a hash to uniquely identify """ Small function to generate a hash to uniquely identify
a file. Taken / inspired from git's way via stackoverflow: a file. Inspired by MD5 version here:
http://stackoverflow.com/questions/552659 http://stackoverflow.com/a/1131255/712997
Works with large files.
""" """
filesize_bytes = os.path.getsize(filepath)
s = sha1() s = sha1()
s.update(("blob %u\0" % filesize_bytes).encode('ascii')) with open(filepath , "rb") as f:
with open(filepath, 'rb') as f: while True:
s.update(f.read()) buf = f.read(blocksize)
return s.hexdigest() if not buf:
break
s.update(buf)
return s.hexdigest().upper()
def find_files(path, extensions): def find_files(path, extensions):