mirror of
https://github.com/correl/dejavu.git
synced 2024-12-27 11:07:37 +00:00
Completed integration for SHA1 file hashing to avoid duplicates
This commit is contained in:
parent
27ed19b0f0
commit
254a79cdf5
4 changed files with 46 additions and 42 deletions
|
@ -1,4 +1,4 @@
|
|||
from dejavu.database import get_database
|
||||
from dejavu.database import get_database, Database
|
||||
import dejavu.decoder as decoder
|
||||
import fingerprint
|
||||
import multiprocessing
|
||||
|
@ -39,7 +39,7 @@ class Dejavu(object):
|
|||
self.songs = self.db.get_songs()
|
||||
self.songhashes_set = set() # to know which ones we've computed before
|
||||
for song in self.songs:
|
||||
song_hash = song[self.db.FIELD_SHA1]
|
||||
song_hash = song[Database.FIELD_FILE_SHA1]
|
||||
self.songhashes_set.add(song_hash)
|
||||
|
||||
def fingerprint_directory(self, path, extensions, nprocesses=None):
|
||||
|
@ -154,13 +154,12 @@ class Dejavu(object):
|
|||
fingerprint.DEFAULT_WINDOW_SIZE *
|
||||
fingerprint.DEFAULT_OVERLAP_RATIO, 5)
|
||||
song = {
|
||||
Dejavu.SONG_ID: song_id,
|
||||
Dejavu.SONG_NAME: songname,
|
||||
Dejavu.CONFIDENCE: largest_count,
|
||||
Dejavu.OFFSET: int(largest),
|
||||
Dejavu.OFFSET_SECS: nseconds
|
||||
}
|
||||
|
||||
Dejavu.SONG_ID : song_id,
|
||||
Dejavu.SONG_NAME : songname,
|
||||
Dejavu.CONFIDENCE : largest_count,
|
||||
Dejavu.OFFSET : int(largest),
|
||||
Dejavu.OFFSET_SECS : nseconds,
|
||||
Database.FIELD_FILE_SHA1 : song.get(Database.FIELD_FILE_SHA1, None),}
|
||||
return song
|
||||
|
||||
def recognize(self, recognizer, *options, **kwoptions):
|
||||
|
|
|
@ -5,6 +5,12 @@ import abc
|
|||
class Database(object):
|
||||
__metaclass__ = abc.ABCMeta
|
||||
|
||||
FIELD_FILE_SHA1 = 'file_sha1'
|
||||
FIELD_SONG_ID = 'song_id'
|
||||
FIELD_SONGNAME = 'song_name'
|
||||
FIELD_OFFSET = 'offset'
|
||||
FIELD_HASH = 'hash'
|
||||
|
||||
# Name of your Database subclass, this is used in configuration
|
||||
# to refer to your class
|
||||
type = None
|
||||
|
|
|
@ -51,11 +51,6 @@ class SQLDatabase(Database):
|
|||
SONGS_TABLENAME = "songs"
|
||||
|
||||
# fields
|
||||
FIELD_HASH = "hash"
|
||||
FIELD_SONG_ID = "song_id"
|
||||
FIELD_OFFSET = "offset"
|
||||
FIELD_SHA1 = 'file_sha1'
|
||||
FIELD_SONGNAME = "song_name"
|
||||
FIELD_FINGERPRINTED = "fingerprinted"
|
||||
|
||||
# creates
|
||||
|
@ -68,10 +63,10 @@ class SQLDatabase(Database):
|
|||
UNIQUE KEY `unique_constraint` (%s, %s, %s),
|
||||
FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
|
||||
) ENGINE=INNODB;""" % (
|
||||
FINGERPRINTS_TABLENAME, FIELD_HASH,
|
||||
FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
|
||||
FIELD_SONG_ID, FIELD_OFFSET, FIELD_HASH,
|
||||
FIELD_SONG_ID, SONGS_TABLENAME, FIELD_SONG_ID
|
||||
FINGERPRINTS_TABLENAME, Database.FIELD_HASH,
|
||||
Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
|
||||
Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
|
||||
Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID
|
||||
)
|
||||
|
||||
CREATE_SONGS_TABLE = """
|
||||
|
@ -79,41 +74,41 @@ class SQLDatabase(Database):
|
|||
`%s` mediumint unsigned not null auto_increment,
|
||||
`%s` varchar(250) not null,
|
||||
`%s` tinyint default 0,
|
||||
`%s` binary(10) not null,
|
||||
`%s` binary(20) not null,
|
||||
PRIMARY KEY (`%s`),
|
||||
UNIQUE KEY `%s` (`%s`)
|
||||
) ENGINE=INNODB;""" % (
|
||||
SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME, FIELD_FINGERPRINTED,
|
||||
FIELD_SHA1,
|
||||
FIELD_SONG_ID, FIELD_SONG_ID, FIELD_SONG_ID,
|
||||
SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, FIELD_FINGERPRINTED,
|
||||
Database.FIELD_FILE_SHA1,
|
||||
Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID,
|
||||
)
|
||||
|
||||
# inserts (ignores duplicates)
|
||||
INSERT_FINGERPRINT = """
|
||||
INSERT IGNORE INTO %s (%s, %s, %s) values
|
||||
(UNHEX(%%s), %%s, %%s);
|
||||
""" % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET)
|
||||
""" % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET)
|
||||
|
||||
INSERT_SONG = "INSERT INTO %s (%s, %s) values (%%s, UNHEX(%%s));" % (
|
||||
SONGS_TABLENAME, FIELD_SONGNAME, FIELD_SHA1)
|
||||
SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1)
|
||||
|
||||
# selects
|
||||
SELECT = """
|
||||
SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
|
||||
""" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH)
|
||||
""" % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
|
||||
|
||||
SELECT_MULTIPLE = """
|
||||
SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
|
||||
""" % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET,
|
||||
FINGERPRINTS_TABLENAME, FIELD_HASH)
|
||||
""" % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
|
||||
FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
|
||||
|
||||
SELECT_ALL = """
|
||||
SELECT %s, %s FROM %s;
|
||||
""" % (FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME)
|
||||
""" % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME)
|
||||
|
||||
SELECT_SONG = """
|
||||
SELECT %s, HEX(%s) FROM %s WHERE %s = %%s
|
||||
""" % (FIELD_SONGNAME, FIELD_SHA1, SONGS_TABLENAME, FIELD_SONG_ID)
|
||||
SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s;
|
||||
""" % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID)
|
||||
|
||||
SELECT_NUM_FINGERPRINTS = """
|
||||
SELECT COUNT(*) as n FROM %s
|
||||
|
@ -121,11 +116,11 @@ class SQLDatabase(Database):
|
|||
|
||||
SELECT_UNIQUE_SONG_IDS = """
|
||||
SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
|
||||
""" % (FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
||||
""" % (Database.FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
||||
|
||||
SELECT_SONGS = """
|
||||
SELECT %s, %s, HEX(%s) FROM %s WHERE %s = 1;
|
||||
""" % (FIELD_SONG_ID, FIELD_SONGNAME, FIELD_SHA1,
|
||||
SELECT %s, %s, HEX(%s) as %s FROM %s WHERE %s = 1;
|
||||
""" % (Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1,
|
||||
SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
||||
|
||||
# drops
|
||||
|
@ -135,7 +130,7 @@ class SQLDatabase(Database):
|
|||
# update
|
||||
UPDATE_SONG_FINGERPRINTED = """
|
||||
UPDATE %s SET %s = 1 WHERE %s = %%s
|
||||
""" % (SONGS_TABLENAME, FIELD_FINGERPRINTED, FIELD_SONG_ID)
|
||||
""" % (SONGS_TABLENAME, FIELD_FINGERPRINTED, Database.FIELD_SONG_ID)
|
||||
|
||||
# delete
|
||||
DELETE_UNFINGERPRINTED = """
|
||||
|
|
|
@ -6,17 +6,21 @@ from pydub.utils import audioop
|
|||
import wavio
|
||||
from hashlib import sha1
|
||||
|
||||
def unique_hash(filepath, blocksize=2**20):
    """Return the uppercase hexadecimal SHA-1 digest of a file's contents.

    Used to identify a file by its contents. The file is read in chunks of
    ``blocksize`` bytes, so large files are never loaded into memory in
    one piece. Inspired by the MD5 version here:
    http://stackoverflow.com/a/1131255/712997

    :param filepath: path of the file to hash.
    :param blocksize: number of bytes to read per chunk (default 1 MiB).
    :returns: the SHA-1 digest as an uppercase hex string.
    :raises IOError: if the file cannot be opened or read.
    """
    s = sha1()
    with open(filepath, "rb") as f:
        # Two-argument iter() keeps calling f.read() until it returns the
        # empty-bytes sentinel, i.e. until end of file.
        for buf in iter(lambda: f.read(blocksize), b""):
            s.update(buf)
    # NOTE(review): upper-cased presumably so the value compares equal to
    # what the database returns via HEX(file_sha1) -- confirm against caller.
    return s.hexdigest().upper()
|
||||
|
||||
|
||||
def find_files(path, extensions):
|
||||
|
|
Loading…
Reference in a new issue