Upload 544 files
This view is limited to 50 files because it contains too many changes.
- .gitattributes +2 -0
- LICENSE +21 -0
- README.md +60 -14
- api.py +185 -0
- app.py +71 -4
- example.wav +3 -0
- example2.wav +3 -0
- requirements.txt +6 -0
- src/.ipynb_checkpoints/prepare_clap-checkpoint.py +39 -0
- src/.ipynb_checkpoints/test-checkpoint.py +140 -0
- src/.ipynb_checkpoints/train-checkpoint.py +208 -0
- src/.ipynb_checkpoints/val-checkpoint.py +141 -0
- src/clap_embedding/Accelerating, revving, vroom.pt +3 -0
- src/clap_embedding/Air brake.pt +3 -0
- src/clap_embedding/Air conditioning.pt +3 -0
- src/clap_embedding/Air horn, truck horn.pt +3 -0
- src/clap_embedding/Aircraft engine.pt +3 -0
- src/clap_embedding/Aircraft.pt +3 -0
- src/clap_embedding/Alarm clock.pt +3 -0
- src/clap_embedding/Alarm.pt +3 -0
- src/clap_embedding/Alert.pt +3 -0
- src/clap_embedding/Ambulance (siren).pt +3 -0
- src/clap_embedding/Animal.pt +3 -0
- src/clap_embedding/Applause.pt +3 -0
- src/clap_embedding/Arrow.pt +3 -0
- src/clap_embedding/Artillery fire.pt +3 -0
- src/clap_embedding/Audio logo.pt +3 -0
- src/clap_embedding/Babbling.pt +3 -0
- src/clap_embedding/Baby cry, infant cry.pt +3 -0
- src/clap_embedding/Baby laughter.pt +3 -0
- src/clap_embedding/Background noise.pt +3 -0
- src/clap_embedding/Bang.pt +3 -0
- src/clap_embedding/Bark.pt +3 -0
- src/clap_embedding/Basketball bounce.pt +3 -0
- src/clap_embedding/Bathroom sounds.pt +3 -0
- src/clap_embedding/Bathtub (filling or washing).pt +3 -0
- src/clap_embedding/Battle cry.pt +3 -0
- src/clap_embedding/Bee, wasp, etc..pt +3 -0
- src/clap_embedding/Beep, bleep.pt +3 -0
- src/clap_embedding/Bell.pt +3 -0
- src/clap_embedding/Bellow.pt +3 -0
- src/clap_embedding/Belly laugh.pt +3 -0
- src/clap_embedding/Bicycle bell.pt +3 -0
- src/clap_embedding/Bicycle, tricycle.pt +3 -0
- src/clap_embedding/Bird flight, flapping wings.pt +3 -0
- src/clap_embedding/Bird vocalization, bird call, bird song.pt +3 -0
- src/clap_embedding/Bird.pt +3 -0
- src/clap_embedding/Biting.pt +3 -0
- src/clap_embedding/Bleat.pt +3 -0
- src/clap_embedding/Blender, food processor.pt +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+example.wav filter=lfs diff=lfs merge=lfs -text
+example2.wav filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Jiarui Hai
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,14 +1,60 @@
+# FlexSED: Towards Open-Vocabulary Sound Event Detection
+
+[arXiv](https://arxiv.org/abs/2509.18606)
+[Hugging Face](https://huggingface.co/Higobeatz/FlexSED/tree/main)
+
+
+## News
+- Oct 2025: 📦 Released code and pretrained checkpoint
+- Sep 2025: 🎉 FlexSED spotlighted at WASPAA 2025
+
+
+## Installation
+
+Clone the repository:
+```
+git clone git@github.com:JHU-LCAP/FlexSED.git
+```
+Install the dependencies:
+```
+cd FlexSED
+pip install -r requirements.txt
+```
+
+## Usage
+```python
+from api import FlexSED
+import torch
+import soundfile as sf
+
+# load model
+flexsed = FlexSED(device='cuda')
+
+# run inference
+events = ["Dog"]
+preds = flexsed.run_inference("example.wav", events)
+
+# visualize prediction
+flexsed.to_multi_plot(preds, events, fname="example2")
+
+# (Optional) visualize prediction as a video
+# flexsed.to_multi_video(preds, events, audio_path="example2.wav", fname="example2")
+```
+
+## Training
+
+WIP
+
+
+## Reference
+
+If you find the code useful for your research, please consider citing:
+
+```bibtex
+@article{hai2025flexsed,
+  title={FlexSED: Towards Open-Vocabulary Sound Event Detection},
+  author={Hai, Jiarui and Wang, Helin and Guo, Weizhe and Elhilali, Mounya},
+  journal={arXiv preprint arXiv:2509.18606},
+  year={2025}
+}
+```
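Note: `run_inference` returns frame-level probabilities rather than event timestamps. A minimal sketch (not part of the repo) of inspecting the output, assuming the `[num_events, 1, T]` shape documented in `api.py` below; the stand-in tensor is only there to make the snippet self-contained:

```python
import torch

events = ["Dog"]
# preds = flexsed.run_inference("example.wav", events)
preds = torch.rand(len(events), 1, 250)  # stand-in for a real prediction

# Peak detection probability per queried event
for event, track in zip(events, preds.squeeze(1)):
    print(f"{event}: peak probability {track.max().item():.2f}")
```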
api.py
ADDED
@@ -0,0 +1,185 @@
+import torch
+import librosa
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+from transformers import AutoTokenizer, ClapTextModelWithProjection
+from src.models.transformer import Dasheng_Encoder
+from src.models.sed_decoder import Decoder, TSED_Wrapper
+from src.utils import load_yaml_with_includes
+
+
+class FlexSED:
+    def __init__(
+        self,
+        config_path='src/configs/model.yml',
+        ckpt_path='ckpts/flexsed_as.pt',
+        ckpt_url='https://huggingface.co/Higobeatz/FlexSED/resolve/main/ckpts/flexsed_as.pt',
+        device='cuda'
+    ):
+        """
+        Initialize FlexSED with model, CLAP, and tokenizer loaded once.
+        If the checkpoint is not available locally, it will be downloaded automatically.
+        """
+        self.device = device
+        params = load_yaml_with_includes(config_path)
+
+        # Ensure checkpoint exists
+        if not os.path.exists(ckpt_path):
+            print(f"[FlexSED] Downloading checkpoint from {ckpt_url} ...")
+            state_dict = torch.hub.load_state_dict_from_url(ckpt_url, map_location="cpu")
+        else:
+            state_dict = torch.load(ckpt_path, map_location="cpu")
+
+        # Encoder + Decoder
+        encoder = Dasheng_Encoder(**params['encoder']).to(self.device)
+        decoder = Decoder(**params['decoder']).to(self.device)
+        self.model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder'])
+        self.model.load_state_dict(state_dict['model'])
+        self.model.eval()
+
+        # CLAP text model
+        self.clap = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
+        self.clap.eval()
+        self.tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+
+    def run_inference(self, audio_path, events, norm_audio=True):
+        """
+        Run inference on audio for given events.
+        """
+        audio, sr = librosa.load(audio_path, sr=16000)
+        audio = torch.tensor([audio]).to(self.device)
+
+        if norm_audio:
+            eps = 1e-9
+            max_val = torch.max(torch.abs(audio))
+            audio = audio / (max_val + eps)
+
+        clap_embeds = []
+        with torch.no_grad():
+            for event in events:
+                text = f"The sound of {event.replace('_', ' ')}"
+                inputs = self.tokenizer([text], padding=True, return_tensors="pt")
+                outputs = self.clap(**inputs)
+                text_embeds = outputs.text_embeds.unsqueeze(1)
+                clap_embeds.append(text_embeds)
+
+            query = torch.cat(clap_embeds, dim=1).to(self.device)
+            mel = self.model.forward_to_spec(audio)
+            preds = self.model(mel, query)
+            preds = torch.sigmoid(preds).cpu()
+
+        return preds  # shape: [num_events, 1, T]
+
+    # ---------- Multi-event plotting ----------
+    @staticmethod
+    def plot_and_save_multi(preds, events, sr=25, out_dir="./plots", fname="all_events"):
+        os.makedirs(out_dir, exist_ok=True)
+        preds_np = preds.squeeze(1).numpy()  # [num_events, T]
+        T = preds_np.shape[1]
+
+        plt.figure(figsize=(12, len(events) * 0.6 + 2))
+        plt.imshow(
+            preds_np,
+            aspect="auto",
+            cmap="Blues",
+            extent=[0, T/sr, 0, len(events)],
+            vmin=0, vmax=1, origin="lower"
+        )
+        plt.colorbar(label="Probability")
+        plt.yticks(np.arange(len(events)) + 0.5, events)
+        plt.xlabel("Time (s)")
+        plt.ylabel("Events")
+        plt.title("Event Predictions")
+
+        save_path = os.path.join(out_dir, f"{fname}.png")
+        plt.savefig(save_path, dpi=200, bbox_inches="tight")
+        plt.close()
+        return save_path
+
+    def to_multi_plot(self, preds, events, out_dir="./plots", fname="all_events"):
+        return self.plot_and_save_multi(preds, events, out_dir=out_dir, fname=fname)
+
+    # ---------- Multi-event video ----------
+    @staticmethod
+    def make_multi_event_video(preds, events, sr=25, out_dir="./videos",
+                               audio_path=None, fps=25, highlight=True, fname="all_events"):
+        from moviepy.editor import ImageSequenceClip, AudioFileClip
+        from tqdm import tqdm
+
+        os.makedirs(out_dir, exist_ok=True)
+        preds_np = preds.squeeze(1).numpy()  # [num_events, T]
+        T = preds_np.shape[1]
+        duration = T / sr
+
+        frames = []
+        n_frames = int(duration * fps)
+
+        for i in tqdm(range(n_frames)):
+            t = int(i * T / n_frames)
+            plt.figure(figsize=(12, len(events) * 0.6 + 2))
+
+            if highlight:
+                mask = np.zeros_like(preds_np)
+                mask[:, :t+1] = preds_np[:, :t+1]
+                plt.imshow(
+                    mask,
+                    aspect="auto",
+                    cmap="Blues",
+                    extent=[0, T/sr, 0, len(events)],
+                    vmin=0, vmax=1, origin="lower"
+                )
+            else:
+                plt.imshow(
+                    preds_np[:, :t+1],
+                    aspect="auto",
+                    cmap="Blues",
+                    extent=[0, (t+1)/sr, 0, len(events)],
+                    vmin=0, vmax=1, origin="lower"
+                )
+
+            plt.colorbar(label="Probability")
+            plt.yticks(np.arange(len(events)) + 0.5, events)
+            plt.xlabel("Time (s)")
+            plt.ylabel("Events")
+            plt.title("Event Predictions")
+
+            frame_path = f"/tmp/frame_{i:04d}.png"
+            plt.savefig(frame_path, dpi=150, bbox_inches="tight")
+            plt.close()
+            frames.append(frame_path)
+
+        clip = ImageSequenceClip(frames, fps=fps)
+        if audio_path is not None:
+            audio = AudioFileClip(audio_path).subclip(0, duration)
+            clip = clip.set_audio(audio)
+
+        save_path = os.path.join(out_dir, f"{fname}.mp4")
+        clip.write_videofile(
+            save_path,
+            fps=fps,
+            codec="mpeg4",
+            audio_codec="aac"
+        )
+
+        for f in frames:
+            os.remove(f)
+
+        return save_path
+
+    def to_multi_video(self, preds, events, audio_path, out_dir="./videos", fname="all_events"):
+        return self.make_multi_event_video(
+            preds, events, audio_path=audio_path, out_dir=out_dir, fname=fname
+        )
+
+
+if __name__ == "__main__":
+    flexsed = FlexSED(device='cuda')
+
+    events = ["Door", "Laughter", "Dog"]
+    preds = flexsed.run_inference("example2.wav", events)
+
+    # Combined plot & video
+    flexsed.to_multi_plot(preds, events, fname="example2")
+    # flexsed.to_multi_video(preds, events, audio_path="example2.wav", fname="example2")
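Since `run_inference` returns frame-level probabilities of shape `[num_events, 1, T]`, a common follow-up is binarizing them into onset/offset intervals. A minimal sketch, not part of the uploaded code, assuming the 25 Hz output frame rate implied by the `sr=25` default in the plotting helpers; the 0.5 threshold is an illustrative choice, not a tuned value:

```python
import torch
import torch.nn.functional as F

def probs_to_intervals(preds, events, threshold=0.5, frame_rate=25):
    """Turn [num_events, 1, T] probabilities into (event, onset_s, offset_s) tuples."""
    intervals = []
    binary = (preds.squeeze(1) > threshold).long()   # [num_events, T]
    for event, row in zip(events, binary):
        edges = torch.diff(F.pad(row, (1, 1)))       # +1 at onsets, -1 just after offsets
        onsets = (edges == 1).nonzero().flatten().tolist()
        offsets = (edges == -1).nonzero().flatten().tolist()
        for on, off in zip(onsets, offsets):
            intervals.append((event, on / frame_rate, off / frame_rate))
    return intervals

# e.g. probs_to_intervals(preds, ["Door", "Laughter", "Dog"])
```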
app.py
CHANGED
@@ -1,7 +1,74 @@
 import gradio as gr
+import torch
+from api import FlexSED
+import tempfile
+import os
 
+# Load model once on startup
+flexsed = FlexSED(device="cuda" if torch.cuda.is_available() else "cpu")
 
+def run_flexsed(audio_file, event_list):
+    """
+    Run inference using FlexSED and return the prediction plot.
+    """
+    if not audio_file:
+        return None
+
+    # Split events by semicolon
+    events = [e.strip() for e in event_list.split(";") if e.strip()]
+    if not events:
+        return None
+
+    # Run inference
+    preds = flexsed.run_inference(audio_file, events)
+
+    # Generate visualization
+    output_fname = os.path.join(tempfile.gettempdir(), "flexsed_output")
+    flexsed.to_multi_plot(preds, events, fname=output_fname)
+    plot_path = f"{output_fname}.png"
+
+    return plot_path
+
+
+# App layout
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as app:
+    # Header
+    gr.Markdown("""
+    ## 🎧 FlexSED: A Flexible Open-Vocabulary Sound Event Detection System
+
+    👋 Welcome to the **FlexSED live demo** — explore **prompt-guided sound event detection** in real audio clips.
+
+    🔗 Learn more on the [FlexSED GitHub Repository](https://github.com/JHU-LCAP/FlexSED)
+    """)
+
+    gr.Markdown("### 🔍 Upload or choose an example below to detect sound events:")
+
+    with gr.Row():
+        # Left column: Inputs
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(type="filepath", label="🎵 Upload Audio (.wav)")
+            text_input = gr.Textbox(label="Event list (semicolon-separated)", value="Male speech; Door; Dog; Laughter")
+
+            with gr.Row():
+                detect_btn = gr.Button("🎯 Detect", variant="primary")
+                clear_btn = gr.Button("🧹 Clear")
+
+        # Right column: Output
+        with gr.Column(scale=1):
+            image_output = gr.Image(label="Prediction Plot", show_label=True, elem_id="output-image")
+            gr.Examples(
+                examples=[
+                    ["example.wav", "Male speech; Door; Dog; Laughter"],
+                    ["example2.wav", "Male speech; Bee; Gunshot, gunfire"],
+                ],
+                inputs=[audio_input, text_input],
+                label="Example Audios"
+            )
+
+    # Function bindings
+    detect_btn.click(run_flexsed, inputs=[audio_input, text_input], outputs=image_output)
+    clear_btn.click(lambda: (None, "Male speech; Door; Dog; Laughter"), outputs=[audio_input, text_input])
+
+
+if __name__ == "__main__":
+    app.launch(share=True)
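For a quick check outside the browser, the callback can be exercised directly. A minimal sketch, using the bundled example audio and the default event list; importing `app` builds the layout but does not launch the server:

```python
# Smoke-test the Gradio callback without launching the UI (illustrative).
from app import run_flexsed

plot_path = run_flexsed("example.wav", "Male speech; Door; Dog; Laughter")
print(plot_path)  # path to the saved PNG, e.g. /tmp/flexsed_output.png
```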
example.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:371ee4358cd3b12330f406d7d576fecb2329057132696360278b602043009562
+size 480044

example2.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ceca83fd7bd5e1ab16dd61a445c3f3fb11b87c67d8a56b277d4ee293c56b23ed
+size 480044
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+gradio
+torch
+soundfile
+matplotlib
+numpy
+librosa
src/.ipynb_checkpoints/prepare_clap-checkpoint.py
ADDED
@@ -0,0 +1,39 @@
+import os
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, ClapTextModelWithProjection
+
+if __name__ == '__main__':
+    # Load the CLAP model and tokenizer
+    model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
+    model.eval()
+    tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+
+    # Path to the input CSV file
+    input_csv_path = '/home/user/SSD/Dataset/Audioset_SL/no_rule_all/label_to_id.csv'
+    output_path = 'clap_embedding/'  # Replace with your desired output folder path
+
+    # Create the output folder if it doesn't exist
+    os.makedirs(output_path, exist_ok=True)
+
+    # Read the CSV file
+    df = pd.read_csv(input_csv_path)
+
+    # Get unique event labels
+    events = df['label'].unique()
+
+    with torch.no_grad():  # Disable gradient computation
+        # Process each event
+        for event in events:
+            text = event.replace('_', ' ')  # Replace underscores with spaces
+            text = f'The sound of {text}'
+            print(text)
+            inputs = tokenizer([text], padding=True, return_tensors="pt")
+            outputs = model(**inputs)
+            text_embeds = outputs.text_embeds
+
+            # Save the embeddings to a .pt file
+            output_file = os.path.join(output_path, f"{event}.pt")
+            torch.save(text_embeds, output_file)
+
+    print("Embedding extraction and saving complete!")
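The saved tensors are consumed at validation time via `torch.load` (see `val-checkpoint.py` below). A minimal sketch of reading one back, assuming an embedding such as `Dog.pt` was produced by the script above and sits alongside the `clap_embedding/*.pt` files in this commit:

```python
import torch

# Load one precomputed CLAP text embedding back from disk.
emb = torch.load("clap_embedding/Dog.pt", map_location="cpu")
print(emb.shape)  # text_embeds from ClapTextModelWithProjection, e.g. torch.Size([1, 512])
```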
src/.ipynb_checkpoints/test-checkpoint.py
ADDED
@@ -0,0 +1,140 @@
+import random
+import argparse
+import os
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+
+from accelerate import Accelerator
+
+from models.transformer import Dasheng_Encoder
+from models.sed_decoder import Decoder, TSED_Wrapper
+from dataset.tsed import TSED_AS
+from dataset.tsed_val import TSED_Val
+from utils import load_yaml_with_includes, get_lr_scheduler, ConcatDatasetBatchSampler
+from utils.data_aug import frame_shift, mixup, time_mask, feature_transformation
+from val import val_psds
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    # Config settings
+    parser.add_argument('--config-name', type=str, default='configs/model.yml')
+    parser.add_argument('--ckpt', type=str, default='20000.pt')
+
+    # Training settings
+    parser.add_argument("--amp", type=str, default='fp16')
+    parser.add_argument('--epochs', type=int, default=20)
+    parser.add_argument('--num-workers', type=int, default=8)
+    parser.add_argument('--num-threads', type=int, default=1)
+    parser.add_argument('--eval-every-step', type=int, default=5000)
+    parser.add_argument('--save-every-step', type=int, default=5000)
+    # parser.add_argument('--dataloader', type=str, default='EACaps')
+    parser.add_argument("--logit-normal-indices", type=bool, default=False)
+
+    # Log and random seed
+    parser.add_argument('--random-seed', type=int, default=2024)
+    parser.add_argument('--log-step', type=int, default=100)
+    parser.add_argument('--log-dir', type=str, default='../logs/')
+    parser.add_argument('--save-dir', type=str, default='../ckpts/')
+    return parser.parse_args()
+
+
+def setup_directories(args, params):
+    args.log_dir = os.path.join(args.log_dir, params['model_name']) + '/'
+    args.save_dir = os.path.join(args.save_dir, params['model_name']) + '/'
+
+    os.makedirs(args.log_dir, exist_ok=True)
+    os.makedirs(args.save_dir, exist_ok=True)
+
+
+def set_device(args):
+    torch.set_num_threads(args.num_threads)
+    if torch.cuda.is_available():
+        args.device = 'cuda'
+        torch.cuda.manual_seed_all(args.random_seed)
+        torch.backends.cuda.matmul.allow_tf32 = True
+        if torch.backends.cudnn.is_available():
+            torch.backends.cudnn.deterministic = True
+            torch.backends.cudnn.benchmark = False
+    else:
+        args.device = 'cpu'
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    params = load_yaml_with_includes(args.config_name)
+    set_device(args)
+    setup_directories(args, params)
+
+    random.seed(args.random_seed)
+    torch.manual_seed(args.random_seed)
+
+    # use accelerator for multi-gpu training
+    accelerator = Accelerator(mixed_precision=args.amp,
+                              gradient_accumulation_steps=params['opt']['accumulation_steps'],
+                              step_scheduler_with_optimizer=False)
+
+    train_set = TSED_AS(**params['data']['train_data'])
+    train_loader = DataLoader(train_set, batch_size=params['opt']['batch_size'], num_workers=args.num_workers)
+
+    # val_set = TSED_Val(**params['data']['val_data'])
+    # val_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False)
+
+    test_set = TSED_Val(**params['data']['test_data'])
+    test_loader = DataLoader(test_set, num_workers=0, batch_size=1, shuffle=False)
+
+    encoder = Dasheng_Encoder(**params['encoder']).to(accelerator.device)
+    pretrained_url = 'https://zenodo.org/records/11511780/files/dasheng_base.pt?download=1'
+    dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu')
+    model_parameters = dump['model']
+    # pretrained_url = 'https://zenodo.org/records/13315686/files/dasheng_audioset_mAP497.pt?download=1'
+    # dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu')
+    # model_parameters = dump
+    encoder.load_state_dict(model_parameters)
+
+    decoder = Decoder(**params['decoder']).to(accelerator.device)
+
+    model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder'])
+    print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M")
+
+    model.load_state_dict(torch.load(args.ckpt, map_location='cpu')['model'])
+
+    if params['frozen_encoder']:
+        optimizer = torch.optim.AdamW(
+            model.parameters(),
+            lr=params['opt']['learning_rate'],
+            weight_decay=params['opt']['weight_decay'],
+            betas=(params['opt']['beta1'], params['opt']['beta2']),
+            eps=params['opt']['adam_epsilon'])
+    else:
+        optimizer = torch.optim.AdamW(
+            [
+                {'params': model.encoder.parameters(), 'lr': 0.1 * params['opt']['learning_rate']},
+                {'params': model.decoder.parameters(), 'lr': params['opt']['learning_rate']}
+            ],
+            weight_decay=params['opt']['weight_decay'],
+            betas=(params['opt']['beta1'], params['opt']['beta2']),
+            eps=params['opt']['adam_epsilon'])
+
+    lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler'])
+
+    strong_loss_func = nn.BCEWithLogitsLoss()
+
+    model, optimizer, lr_scheduler, train_loader, test_loader = accelerator.prepare(
+        model, optimizer, lr_scheduler, train_loader, test_loader)
+
+    global_step = 0.0
+    losses = 0.0
+
+    if accelerator.is_main_process:
+        model_module = model.module if hasattr(model, 'module') else model
+        val_psds(model_module, test_loader, params, epoch='test_full', split='test',
+                 save_path=args.log_dir + 'output/', device=accelerator.device)
src/.ipynb_checkpoints/train-checkpoint.py
ADDED
@@ -0,0 +1,208 @@
+import random
+import argparse
+import os
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+
+from accelerate import Accelerator
+
+from models.transformer import Dasheng_Encoder
+from models.sed_decoder import Decoder, TSED_Wrapper
+from dataset.tsed import TSED_AS
+from dataset.tsed_val import TSED_Val
+from utils import load_yaml_with_includes, get_lr_scheduler, ConcatDatasetBatchSampler
+from utils.data_aug import frame_shift, mixup, time_mask, feature_transformation
+from val import val_psds
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    # Config settings
+    parser.add_argument('--config-name', type=str, default='configs/model.yml')
+
+    # Training settings
+    parser.add_argument("--amp", type=str, default='fp16')
+    parser.add_argument('--epochs', type=int, default=20)
+    parser.add_argument('--num-workers', type=int, default=8)
+    parser.add_argument('--num-threads', type=int, default=1)
+    parser.add_argument('--eval-every-step', type=int, default=5000)
+    parser.add_argument('--save-every-step', type=int, default=5000)
+    # parser.add_argument('--dataloader', type=str, default='EACaps')
+    parser.add_argument("--logit-normal-indices", type=bool, default=False)
+
+    # Log and random seed
+    parser.add_argument('--random-seed', type=int, default=2024)
+    parser.add_argument('--log-step', type=int, default=100)
+    parser.add_argument('--log-dir', type=str, default='../logs/')
+    parser.add_argument('--save-dir', type=str, default='../ckpts/')
+    return parser.parse_args()
+
+
+def setup_directories(args, params):
+    args.log_dir = os.path.join(args.log_dir, params['model_name']) + '/'
+    args.save_dir = os.path.join(args.save_dir, params['model_name']) + '/'
+
+    os.makedirs(args.log_dir, exist_ok=True)
+    os.makedirs(args.save_dir, exist_ok=True)
+
+
+def set_device(args):
+    torch.set_num_threads(args.num_threads)
+    if torch.cuda.is_available():
+        args.device = 'cuda'
+        torch.cuda.manual_seed_all(args.random_seed)
+        torch.backends.cuda.matmul.allow_tf32 = True
+        if torch.backends.cudnn.is_available():
+            torch.backends.cudnn.deterministic = True
+            torch.backends.cudnn.benchmark = False
+    else:
+        args.device = 'cpu'
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    params = load_yaml_with_includes(args.config_name)
+    set_device(args)
+    setup_directories(args, params)
+
+    random.seed(args.random_seed)
+    torch.manual_seed(args.random_seed)
+
+    # use accelerator for multi-gpu training
+    accelerator = Accelerator(mixed_precision=args.amp,
+                              gradient_accumulation_steps=params['opt']['accumulation_steps'],
+                              step_scheduler_with_optimizer=False)
+
+    train_set = TSED_AS(**params['data']['train_data'])
+    train_loader = DataLoader(train_set, shuffle=True,
+                              batch_size=params['opt']['batch_size'],
+                              num_workers=args.num_workers)
+
+    val_set = TSED_Val(**params['data']['val_data'])
+    val_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False)
+
+    # test_set = TSED_Val(**params['data']['test_data'])
+    # test_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False)
+
+    encoder = Dasheng_Encoder(**params['encoder']).to(accelerator.device)
+    pretrained_url = 'https://zenodo.org/records/11511780/files/dasheng_base.pt?download=1'
+    dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu')
+    model_parameters = dump['model']
+    # pretrained_url = 'https://zenodo.org/records/13315686/files/dasheng_audioset_mAP497.pt?download=1'
+    # dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu')
+    # model_parameters = dump
+    encoder.load_state_dict(model_parameters)
+
+    decoder = Decoder(**params['decoder']).to(accelerator.device)
+
+    model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder'])
+    print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M")
+
+    # model.load_state_dict(torch.load('../ckpts/TSED_AS_filter/20000.0.pt', map_location='cpu')['model'])
+
+    if params['frozen_encoder']:
+        optimizer = torch.optim.AdamW(
+            model.parameters(),
+            lr=params['opt']['learning_rate'],
+            weight_decay=params['opt']['weight_decay'],
+            betas=(params['opt']['beta1'], params['opt']['beta2']),
+            eps=params['opt']['adam_epsilon'])
+    else:
+        optimizer = torch.optim.AdamW(
+            [
+                {'params': model.encoder.parameters(), 'lr': 0.1 * params['opt']['learning_rate']},
+                {'params': model.decoder.parameters(), 'lr': params['opt']['learning_rate']}
+            ],
+            weight_decay=params['opt']['weight_decay'],
+            betas=(params['opt']['beta1'], params['opt']['beta2']),
+            eps=params['opt']['adam_epsilon'])
+
+    lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler'])
+
+    strong_loss_func = nn.BCEWithLogitsLoss()
+
+    model, optimizer, lr_scheduler, train_loader, val_loader = accelerator.prepare(
+        model, optimizer, lr_scheduler, train_loader, val_loader)
+
+    global_step = 0.0
+    losses = 0.0
+
+    if accelerator.is_main_process:
+        model_module = model.module if hasattr(model, 'module') else model
+        val_psds(model_module, val_loader, params, epoch='debug', split='val',
+                 save_path=args.log_dir + 'output/', device=accelerator.device)
+
+    for epoch in range(args.epochs):
+        model.train()
+        for step, batch in enumerate(tqdm(train_loader)):
+            with accelerator.accumulate(model):
+                audio, cls, label, _ = batch
+                mel = model.forward_to_spec(audio)
+
+                # data aug
+                mel, label = frame_shift(mel, label, params['net_pooling'])
+                mel, label = time_mask(mel, label, params["net_pooling"],
+                                       mask_ratios=params['data_aug']["time_mask_ratios"])
+                mel, _ = feature_transformation(mel, **params['data_aug']["transform"])
+
+                strong_pred = model(mel, cls)
+
+                B, N, L = label.shape
+                label = label.reshape(B * N, L)
+                label = label.unsqueeze(1)
+
+                loss = strong_loss_func(strong_pred, label)
+
+                accelerator.backward(loss)
+
+                # clip gradients
+                if accelerator.sync_gradients:
+                    if 'grad_clip' in params['opt'] and params['opt']['grad_clip'] > 0:
+                        accelerator.clip_grad_norm_(model.parameters(),
+                                                    max_norm=params['opt']['grad_clip'])
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+                global_step += 1/params['opt']['accumulation_steps']
+                losses += loss.item()/params['opt']['accumulation_steps']
+
+                if accelerator.is_main_process:
+                    if global_step % args.log_step == 0:
+                        current_time = time.asctime(time.localtime(time.time()))
+                        epoch_info = f'Epoch: [{epoch + 1}][{args.epochs}]'
+                        batch_info = f'Global Step: {global_step}'
+                        loss_info = f'Loss: {losses / args.log_step:.6f}'
+
+                        # Extract the learning rate from the optimizer
+                        lr = optimizer.param_groups[0]['lr']
+                        lr_info = f'Learning Rate: {lr:.6f}'
+
+                        log_message = f'{current_time}\n{epoch_info} {batch_info} {loss_info} {lr_info}\n'
+
+                        with open(args.log_dir + 'log.txt', mode='a') as n:
+                            n.write(log_message)
+
+                        losses = 0.0
+
+                # check performance
+                if (global_step + 1) % args.eval_every_step == 0:
+                    if accelerator.is_main_process:
+                        model_module = model.module if hasattr(model, 'module') else model
+                        val_psds(model_module, val_loader, params, epoch=global_step+1, split='val',
+                                 save_path=args.log_dir + 'output/', device=accelerator.device)
+                        # save model
+                        unwrapped_model = accelerator.unwrap_model(model)
+                        accelerator.save({
+                            "model": model.state_dict(),
+                        }, args.save_dir + str(global_step+1) + '.pt')
+                    accelerator.wait_for_everyone()
+                    model.train()
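The reshape just before the loss is worth spelling out: `BCEWithLogitsLoss` needs the labels to match the prediction layout, which here stacks the per-event queries into the batch dimension. A small standalone sketch with assumed sizes:

```python
import torch

B, N, L = 2, 3, 250                 # batch, event queries, output frames (assumed sizes)
label = torch.zeros(B, N, L)
label = label.reshape(B * N, L).unsqueeze(1)
print(label.shape)                  # torch.Size([6, 1, 250]), matching strong_pred's layout
```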
src/.ipynb_checkpoints/val-checkpoint.py
ADDED
@@ -0,0 +1,141 @@
+import torch
+import os
+import pandas as pd
+from tqdm import tqdm
+import sed_scores_eval
+from desed_task.evaluation.evaluation_measures import (compute_per_intersection_macro_f1,
+                                                       compute_psds_from_operating_points,
+                                                       compute_psds_from_scores)
+from local.utils import (batched_decode_preds,)
+from utils.sed import Encoder
+import numpy as np
+
+
+@torch.no_grad()
+def val_psds(model, val_loader, params, epoch, split, save_path, device):
+    label_df = pd.read_csv(params['data'][split]['label'])
+    EVENTS = label_df['label'].tolist()
+
+    clap_emb = []
+    for event in EVENTS:
+        cls = torch.load(params['data']['train_data']['clap_dir'] + event + '.pt').to(device)
+        cls = cls.unsqueeze(1)
+        clap_emb.append(cls)
+    cls = torch.cat(clap_emb, dim=1)
+
+    encoder = Encoder(EVENTS, audio_len=10, frame_len=160, frame_hop=160, net_pooling=4, sr=16000)
+
+    model.eval()
+    test_csv = params['data'][split]["csv"]
+    test_dur = params['data'][split]["dur"]
+
+    gt = pd.read_csv(test_csv, sep='\t')
+
+    test_scores_postprocessed_buffer = {}
+    test_scores_postprocessed_buffer_tsed = {}
+    test_thresholds = [0.5]
+    test_psds_buffer = {k: pd.DataFrame() for k in test_thresholds}
+    test_psds_buffer_tsed = {k: pd.DataFrame() for k in test_thresholds}
+
+    for batch in tqdm(val_loader):
+        audio, filenames = batch
+        B = audio.shape[0]
+        N = cls.shape[1]
+        cls = cls.expand(B, -1, -1)
+
+        audio = audio.to(device)
+        mel = model.forward_to_spec(audio)
+
+        preds = model(mel, cls)
+        preds = torch.sigmoid(preds)
+        preds = preds.reshape(B, N, -1)
+        preds_tsed = preds.clone()
+        # TSED assumes sound existence is known
+        for idx, filename in enumerate(filenames):
+            weak_label = list(gt[gt['filename'] == filename]['event_label'].unique())
+            for j, event in enumerate(EVENTS):
+                if event not in weak_label:
+                    preds_tsed[idx][j] = 0.0
+        # preds = preds.transpose(1, 2)
+
+        (_, scores_postprocessed_strong, _,) = \
+            batched_decode_preds(
+                preds,
+                filenames,
+                encoder,
+                median_filter=9,
+                thresholds=list(test_psds_buffer.keys()), )
+        test_scores_postprocessed_buffer.update(scores_postprocessed_strong)
+
+        (_, scores_postprocessed_strong_tsed, _,) = \
+            batched_decode_preds(
+                preds_tsed,
+                filenames,
+                encoder,
+                median_filter=9,
+                thresholds=list(test_psds_buffer_tsed.keys()), )
+        test_scores_postprocessed_buffer_tsed.update(scores_postprocessed_strong_tsed)
+
+    ground_truth = sed_scores_eval.io.read_ground_truth_events(test_csv)
+    audio_durations = sed_scores_eval.io.read_audio_durations(test_dur)
+
+    ground_truth = {
+        audio_id: ground_truth[audio_id]
+        for audio_id in test_scores_postprocessed_buffer
+    }
+    audio_durations = {
+        audio_id: audio_durations[audio_id]
+        for audio_id in test_scores_postprocessed_buffer
+    }
+
+    psds1_sed_scores_eval, psds1_cls = compute_psds_from_scores(
+        test_scores_postprocessed_buffer,
+        ground_truth,
+        audio_durations,
+        dtc_threshold=0.7,
+        gtc_threshold=0.7,
+        cttc_threshold=None,
+        alpha_ct=0.0,
+        alpha_st=0.0,
+        # save_dir=os.path.join(save_dir, "student", "scenario1"),
+    )
+    psds1_cls['overall'] = psds1_sed_scores_eval
+    psds1_cls['macro_averaged'] = np.array([v for k, v in psds1_cls.items()]).mean()
+    psds1_cls['name'] = 'psds1'
+
+    psds1_sed_scores_eval_tsed, psds1_cls_tsed = compute_psds_from_scores(
+        test_scores_postprocessed_buffer_tsed,
+        ground_truth,
+        audio_durations,
+        dtc_threshold=0.7,
+        gtc_threshold=0.7,
+        cttc_threshold=None,
+        alpha_ct=0.0,
+        alpha_st=0.0,
+        # save_dir=os.path.join(save_dir, "student", "scenario1"),
+    )
+
+    psds1_cls_tsed['overall'] = psds1_sed_scores_eval_tsed
+    psds1_cls_tsed['macro_averaged'] = np.array([v for k, v in psds1_cls_tsed.items()]).mean()
+    psds1_cls_tsed['name'] = 'psds1_tsed'
+
+    # psds2_sed_scores_eval, psds2_cls = compute_psds_from_scores(
+    #     test_scores_postprocessed_buffer,
+    #     ground_truth,
+    #     audio_durations,
+    #     dtc_threshold=0.1,
+    #     gtc_threshold=0.1,
+    #     cttc_threshold=0.3,
+    #     alpha_ct=0.5,
+    #     alpha_st=1,
+    #     # save_dir=os.path.join(save_dir, "student", "scenario1"),
+    # )
+    # psds2_cls['overall'] = psds2_sed_scores_eval
+    # psds2_cls['macro_averaged'] = np.array([v for k, v in psds2_cls.items()]).mean()
+    # psds2_cls['name'] = 'psds2'
+    psds_cls = pd.DataFrame([psds1_cls, psds1_cls_tsed])
+    # psds_cls = pd.DataFrame([psds1_cls, psds2_cls])
+    os.makedirs(f'{save_path}/psds_cls/', exist_ok=True)
+    psds_cls.to_csv(f'{save_path}/psds_cls/{epoch}.csv', index=False)
+
+    return psds1_sed_scores_eval, psds1_sed_scores_eval_tsed
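The TSED branch above scores an easier condition in which clip-level presence is known: predictions for events absent from a clip's weak labels are simply zeroed before decoding. In isolation, the masking looks like this (sizes and labels are assumed for illustration):

```python
import torch

EVENTS = ["Dog", "Bark", "Speech"]
preds = torch.rand(1, len(EVENTS), 250)   # [batch, events, frames]
weak_label = ["Dog"]                      # events known present in this clip

for j, event in enumerate(EVENTS):
    if event not in weak_label:
        preds[0][j] = 0.0                 # suppress events the weak labels rule out
```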
src/clap_embedding/Accelerating, revving, vroom.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b4a351451f29ac729cdf638e3c3e81da4c1ff7963cdbbc17ca64c49f2e0a7f8
+size 3397

src/clap_embedding/Air brake.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34ca30c586a0c92b86b136aa8fd69c27a31a10b454159e6fdfc1197e8c1585b5
+size 3238

src/clap_embedding/Air conditioning.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8238a7ffa14c033b4d41f2331ce94424d925a0e3b1a37f4d5b491a111d518425
+size 3273

src/clap_embedding/Air horn, truck horn.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a054ca636e138e115e085ed4b5f315ffaa60b647400f657dd6d334720fbc8e73
+size 3293

src/clap_embedding/Aircraft engine.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac2b736bed945841a2d066cb4ad5218b55903c88083d34870b6d27eccc9b1d55
+size 3268

src/clap_embedding/Aircraft.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:902502af6d7e3ff22b6650282c1c8e3f98d6c1687b1f3078465bf942c30620cf
+size 3233

src/clap_embedding/Alarm clock.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e0d17a345893c6f0fe6a9d6fb11f060da277b8d82475b95c2249138919beb5b
+size 3248

src/clap_embedding/Alarm.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43fce07d002c946daad296e9b637943a9941cc703bb3c1755fb497f72afcccc1
+size 3154

src/clap_embedding/Alert.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:751b03e62094f66ddd4965d55583ffc8db0f37a621a614b8aec953ab284d5e23
+size 3154

src/clap_embedding/Ambulance (siren).pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b8d7ab047ba136bc0c8979ce97b917cbba1181e60257c046a748612dae58660
+size 3278

src/clap_embedding/Animal.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:060e4de7c1aa9e784f8d4245b8ed7d17e001a68615005bdde858aeb044f61aac
+size 3159

src/clap_embedding/Applause.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:053458a5422c7a2dc316dfe803963118a11093ef324b07c07d79f98e11001bbb
+size 3233

src/clap_embedding/Arrow.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ad03adea0e391dce55cf4e1ef13d4d299ea741d56e06a379915305b4ae56d03
+size 3154

src/clap_embedding/Artillery fire.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a8aafd9fe1fc64424eb28aa0206bf3ad5ad505cb5eeb471164453f6d3a61313
+size 3263

src/clap_embedding/Audio logo.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6e3a7f4f827f4e9d2de401956568f7ff771a7e7c11cb547306454cf0ea0c4ab
+size 3243

src/clap_embedding/Babbling.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c7ccbb7c692f6d2a5a1039039ea127d06189788e2cab5d25302d8d0bd4ddef5
+size 3233

src/clap_embedding/Baby cry, infant cry.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebad3b3fd6e97f3a4681f2cbf3bc7b9dad2eed05715b2ab28d095ee156d204f7
+size 3293

src/clap_embedding/Baby laughter.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13abd466060f1997a8f0251fd7a32456824713cfec0ca8754a01a2f245ae03af
+size 3258

src/clap_embedding/Background noise.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e64c97573b4d05dfcb0e50362afff050b9e287e9c26195b10e6da1182a8b104f
+size 3273

src/clap_embedding/Bang.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e28d04d1f0b6e407dda531d7df5c1883ac907c218feaffabb4a213445d874e5
+size 3149

src/clap_embedding/Bark.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f866f230ad42ff9be34383a120ae6563606d149b1bd00039cd360c61cbbb371a
+size 3149

src/clap_embedding/Basketball bounce.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cac0a6b3d44c42d8d1eeb93ea1fb59abcc095127ae88ea9f7f684ff0ba5f52d
+size 3278

src/clap_embedding/Bathroom sounds.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e781b8e4ea847ad5acfa5cdc7cbecf294019e96d156f5e8e9d71fd384421f3c4
+size 3268

src/clap_embedding/Bathtub (filling or washing).pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7af853e52974d5cd77f2d79b207303df6299c23f508ed481cf343c1b7034bed8
+size 3397

src/clap_embedding/Battle cry.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f06286c4fc69426a7fe8685fa6c74a337b97f552ab503009b11d11b70498a45
+size 3243

src/clap_embedding/Bee, wasp, etc..pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eaabbd88980a77e0e1a4293d912c9f21fefe9dc8cc288c75fd4c07020e86ef58
+size 3268

src/clap_embedding/Beep, bleep.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6d6a2021aa05b8f325efdd8e8aa163df1829cc9350b650da63a46284087d142
+size 3248

src/clap_embedding/Bell.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3252fcd0b5f0e000410ac308b5f012167770d0ac0e27f4897945b922e02485c2
+size 3149

src/clap_embedding/Bellow.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b43dd2c5a59384af15520a845ee7801138265eb28d0050fd96156f096ec822a1
+size 3159

src/clap_embedding/Belly laugh.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61b60d96453309b2791d7e17a36e63e4ea98d5e55f687b5083349ac67a1e9cc7
+size 3248

src/clap_embedding/Bicycle bell.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0a0a337a1b9c19819393b6eab07aace307ad402fec3f9e84f72d6ffd5501e76
+size 3253

src/clap_embedding/Bicycle, tricycle.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f950ffa51035922839dcebe144e6c395c463288870f719a286064c02adf1f4f
+size 3278

src/clap_embedding/Bird flight, flapping wings.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41e0c3d59dcfb50e5169cbee4ca8642f9b78ca047c0f3e44d0fa7f2b46bfb320
+size 3392

src/clap_embedding/Bird vocalization, bird call, bird song.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1445a61059d9164ced63c01abf60de91637b79d26fed015004124158824c04df
+size 3452

src/clap_embedding/Bird.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c94d39ae43d5c16d35c77a6ddbe500dc7aae6a044a3a10f43d33dcf14da48e91
+size 3149

src/clap_embedding/Biting.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5c42b9c0b3fbfa6c0b43428a0bbbb199706c3d1babd7942d2f45e8a8874106d
+size 3159

src/clap_embedding/Bleat.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2b9513225200a2f5ddeea1ee54d167c85c9b180af1bbe3766f5ff0fb044ee2c
+size 3154

src/clap_embedding/Blender, food processor.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b82c822d42ea9ada0d83695e565af6b90a016942f7f02b0bbb27b9f5d755f5c
+size 3372