import librosa
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Fixed genre vocabulary and lookup tables in both directions
uni_genres_list = ['House', 'Soundtrack', 'Composed Music', 'Drone', 'Instrumental',
                   'Ambient Electronic', 'Blues', 'Easy Listening', 'Classical', 'Jazz',
                   'Christmas', 'Electronic', 'Ambient', 'Lo-fi Instrumental', 'Lounge',
                   'Contemporary Classical', 'Indie-Rock', 'Dance', 'New Age', 'Halloween',
                   'Lo-fi Electronic', '20th Century Classical', 'Piano', 'Chill-out', 'Pop']

genres2idx = {genre: idx for idx, genre in enumerate(uni_genres_list)}
idx2genres = {idx: genre for genre, idx in genres2idx.items()}


def tokenize(genres):
    """Map genre names to integer indices, skipping unknown genres."""
    return [genres2idx[genre] for genre in genres if genre in genres2idx]


def detokenize_tolist(tokens):
    """Map integer indices back to genre names, skipping unknown indices."""
    return [idx2genres[token] for token in tokens if token in idx2genres]


def onehot_encode(tokens, max_genres):
    """Build a multi-hot vector of length max_genres from a list of token indices."""
    onehot = np.zeros(max_genres)
    onehot[tokens] = 1
    return onehot


def onehot_decode(onehot):
    """Recover the token indices from a multi-hot vector."""
    return [idx for idx, val in enumerate(onehot) if val == 1]


def load_and_resample_audio(file_path, target_sr=22050, max_duration=15):
    """Load an audio file, resample it to target_sr, and trim it to max_duration seconds."""
    audio, sr = librosa.load(file_path, sr=None)
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    if len(audio) > target_sr * max_duration:
        audio = audio[:target_sr * max_duration]
    return audio, target_sr


def audio_to_melspec(audio, sr, n_mels=256, n_fft=2048, hop_length=512, to_db=False):
    """Compute a power Mel spectrogram; optionally convert it to decibels."""
    spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length,
                                          win_length=None, window='hann', center=True,
                                          pad_mode='reflect', power=2.0, n_mels=n_mels)
    if to_db:
        spec = librosa.power_to_db(spec, ref=np.max)
    return spec


# Normalize the Mel spectrogram to norm_range, independently for each Mel bin
def normalize_melspec(melspec, norm_range=(0, 1)):
    scaler = MinMaxScaler(feature_range=norm_range)
    # MinMaxScaler scales per column, so transpose to (frames, mel bins) first
    melspec_normalized = scaler.fit_transform(melspec.T)
    return melspec_normalized.T


# Denormalize a Mel spectrogram using the statistics of the original spectrogram
def denormalize_melspec(melspec_normalized, original_melspec, norm_range=(0, 1)):
    scaler = MinMaxScaler(feature_range=norm_range)
    scaler.fit(original_melspec.T)
    melspec_denormalized = scaler.inverse_transform(melspec_normalized.T)
    return melspec_denormalized.T


# Convert a Mel spectrogram back to audio via Griffin-Lim phase reconstruction
def melspec_to_audio(melspec, sr=22050, n_fft=2048, hop_length=512, n_iter=64):
    # Negative values indicate a dB-scaled spectrogram; convert back to power first
    if np.any(melspec < 0):
        melspec = librosa.db_to_power(melspec)
    audio_reconstructed = librosa.feature.inverse.mel_to_audio(melspec, sr=sr, n_fft=n_fft,
                                                               hop_length=hop_length,
                                                               win_length=None, window='hann',
                                                               center=True, pad_mode='reflect',
                                                               power=2.0,  # matches the power spectrogram above
                                                               n_iter=n_iter)
    return audio_reconstructed
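

# Example usage (a minimal sketch: 'track.mp3' is a placeholder path, and the
# round trip through normalize/denormalize is shown only to illustrate the API).
if __name__ == "__main__":
    audio, sr = load_and_resample_audio("track.mp3", target_sr=22050, max_duration=15)
    melspec = audio_to_melspec(audio, sr, to_db=True)       # (n_mels, frames), in dB
    melspec_norm = normalize_melspec(melspec)               # scaled to [0, 1] per Mel bin
    melspec_denorm = denormalize_melspec(melspec_norm, melspec)
    audio_rec = melspec_to_audio(melspec_denorm, sr=sr)     # Griffin-Lim reconstruction

    tokens = tokenize(["Jazz", "Ambient"])
    onehot = onehot_encode(tokens, max_genres=len(uni_genres_list))
    print(detokenize_tolist(onehot_decode(onehot)))         # ['Jazz', 'Ambient']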