OpenSound committed on
Commit 3b6a091 · verified · 1 Parent(s): 693498e

Upload 544 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +2 -0
  2. LICENSE +21 -0
  3. README.md +60 -14
  4. api.py +185 -0
  5. app.py +71 -4
  6. example.wav +3 -0
  7. example2.wav +3 -0
  8. requirements.txt +6 -0
  9. src/.ipynb_checkpoints/prepare_clap-checkpoint.py +39 -0
  10. src/.ipynb_checkpoints/test-checkpoint.py +140 -0
  11. src/.ipynb_checkpoints/train-checkpoint.py +208 -0
  12. src/.ipynb_checkpoints/val-checkpoint.py +141 -0
  13. src/clap_embedding/Accelerating, revving, vroom.pt +3 -0
  14. src/clap_embedding/Air brake.pt +3 -0
  15. src/clap_embedding/Air conditioning.pt +3 -0
  16. src/clap_embedding/Air horn, truck horn.pt +3 -0
  17. src/clap_embedding/Aircraft engine.pt +3 -0
  18. src/clap_embedding/Aircraft.pt +3 -0
  19. src/clap_embedding/Alarm clock.pt +3 -0
  20. src/clap_embedding/Alarm.pt +3 -0
  21. src/clap_embedding/Alert.pt +3 -0
  22. src/clap_embedding/Ambulance (siren).pt +3 -0
  23. src/clap_embedding/Animal.pt +3 -0
  24. src/clap_embedding/Applause.pt +3 -0
  25. src/clap_embedding/Arrow.pt +3 -0
  26. src/clap_embedding/Artillery fire.pt +3 -0
  27. src/clap_embedding/Audio logo.pt +3 -0
  28. src/clap_embedding/Babbling.pt +3 -0
  29. src/clap_embedding/Baby cry, infant cry.pt +3 -0
  30. src/clap_embedding/Baby laughter.pt +3 -0
  31. src/clap_embedding/Background noise.pt +3 -0
  32. src/clap_embedding/Bang.pt +3 -0
  33. src/clap_embedding/Bark.pt +3 -0
  34. src/clap_embedding/Basketball bounce.pt +3 -0
  35. src/clap_embedding/Bathroom sounds.pt +3 -0
  36. src/clap_embedding/Bathtub (filling or washing).pt +3 -0
  37. src/clap_embedding/Battle cry.pt +3 -0
  38. src/clap_embedding/Bee, wasp, etc..pt +3 -0
  39. src/clap_embedding/Beep, bleep.pt +3 -0
  40. src/clap_embedding/Bell.pt +3 -0
  41. src/clap_embedding/Bellow.pt +3 -0
  42. src/clap_embedding/Belly laugh.pt +3 -0
  43. src/clap_embedding/Bicycle bell.pt +3 -0
  44. src/clap_embedding/Bicycle, tricycle.pt +3 -0
  45. src/clap_embedding/Bird flight, flapping wings.pt +3 -0
  46. src/clap_embedding/Bird vocalization, bird call, bird song.pt +3 -0
  47. src/clap_embedding/Bird.pt +3 -0
  48. src/clap_embedding/Biting.pt +3 -0
  49. src/clap_embedding/Bleat.pt +3 -0
  50. src/clap_embedding/Blender, food processor.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ example.wav filter=lfs diff=lfs merge=lfs -text
+ example2.wav filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Jiarui Hai
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,14 +1,60 @@
- ---
- title: FlexSED
- emoji: 🏆
- colorFrom: purple
- colorTo: purple
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
- pinned: false
- license: mit
- short_description: 'FlexSED: An Open-Vocabulary Sound Event Detection System'
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # FlexSED: Towards Open-Vocabulary Sound Event Detection
+
+ [![arXiv](https://img.shields.io/badge/arXiv-2509.18606-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2509.18606)
+ [![Hugging Face Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/Higobeatz/FlexSED/tree/main)
+
+
+ ## News
+ - Oct 2025: 📦 Released code and pretrained checkpoint
+ - Sep 2025: 🎉 FlexSED Spotlighted at WASPAA 2025
+
+
+ ## Installation
+
+ Clone the repository:
+ ```
+ git clone git@github.com:JHU-LCAP/FlexSED.git
+ ```
+ Install the dependencies:
+ ```
+ cd FlexSED
+ pip install -r requirements.txt
+ ```
+
+ ## Usage
+ ```python
+ from api import FlexSED
+ import torch
+ import soundfile as sf
+
+ # load model
+ flexsed = FlexSED(device='cuda')
+
+ # run inference
+ events = ["Dog"]
+ preds = flexsed.run_inference("example.wav", events)
+
+ # visualize prediction
+ flexsed.to_multi_plot(preds, events, fname="example2")
+
+ # (Optional) visualize prediction as a video
+ # flexsed.to_multi_video(preds, events, audio_path="example2.wav", fname="example2")
+ ```
+
+ ## Training
+
+ WIP
+
+
+ ## Reference
+
+ If you find the code useful for your research, please consider citing:
+
+ ```bibtex
+ @article{hai2025flexsed,
+   title={FlexSED: Towards Open-Vocabulary Sound Event Detection},
+   author={Hai, Jiarui and Wang, Helin and Guo, Weizhe and Elhilali, Mounya},
+   journal={arXiv preprint arXiv:2509.18606},
+   year={2025}
+ }
+ ```
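`run_inference` returns frame-level probabilities of shape `[num_events, 1, T]` rather than discrete events. Below is a minimal sketch for turning them into time-stamped segments, assuming roughly 25 output frames per second (the `sr=25` default used by the plotting helpers) and a hypothetical 0.5 threshold; `probs_to_segments` is an illustrative helper, not part of the released API.

```python
# Sketch only: convert FlexSED frame probabilities into (event, onset, offset) segments.
# Assumes preds has shape [num_events, 1, T] at ~25 frames/s; threshold 0.5 is arbitrary.
import torch

def probs_to_segments(preds: torch.Tensor, events, threshold: float = 0.5, fps: int = 25):
    segments = []
    for i, event in enumerate(events):
        active = (preds[i, 0] > threshold).tolist()  # frame-level on/off decisions
        start = None
        for t, on in enumerate(active + [False]):    # trailing False closes an open segment
            if on and start is None:
                start = t
            elif not on and start is not None:
                segments.append((event, start / fps, t / fps))
                start = None
    return segments

# e.g. probs_to_segments(preds, ["Dog"]) -> [("Dog", 1.2, 3.4), ...]
```

Median-filtering the probabilities before thresholding, as the evaluation script does with `median_filter=9`, would suppress spurious single-frame flips.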
api.py ADDED
@@ -0,0 +1,185 @@
+ import torch
+ import librosa
+ import os
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from transformers import AutoTokenizer, ClapTextModelWithProjection
+ from src.models.transformer import Dasheng_Encoder
+ from src.models.sed_decoder import Decoder, TSED_Wrapper
+ from src.utils import load_yaml_with_includes
+
+
+ class FlexSED:
+     def __init__(
+         self,
+         config_path='src/configs/model.yml',
+         ckpt_path='ckpts/flexsed_as.pt',
+         ckpt_url='https://huggingface.co/Higobeatz/FlexSED/resolve/main/ckpts/flexsed_as.pt',
+         device='cuda'
+     ):
+         """
+         Initialize FlexSED with model, CLAP, and tokenizer loaded once.
+         If the checkpoint is not available locally, it will be downloaded automatically.
+         """
+         self.device = device
+         params = load_yaml_with_includes(config_path)
+
+         # Ensure checkpoint exists
+         if not os.path.exists(ckpt_path):
+             print(f"[FlexSED] Downloading checkpoint from {ckpt_url} ...")
+             state_dict = torch.hub.load_state_dict_from_url(ckpt_url, map_location="cpu")
+         else:
+             state_dict = torch.load(ckpt_path, map_location="cpu")
+
+         # Encoder + Decoder
+         encoder = Dasheng_Encoder(**params['encoder']).to(self.device)
+         decoder = Decoder(**params['decoder']).to(self.device)
+         self.model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder'])
+         self.model.load_state_dict(state_dict['model'])
+         self.model.eval()
+
+         # CLAP text model
+         self.clap = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
+         self.clap.eval()
+         self.tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+
+     def run_inference(self, audio_path, events, norm_audio=True):
+         """
+         Run inference on audio for given events.
+         """
+         audio, sr = librosa.load(audio_path, sr=16000)
+         audio = torch.tensor([audio]).to(self.device)
+
+         if norm_audio:
+             eps = 1e-9
+             max_val = torch.max(torch.abs(audio))
+             audio = audio / (max_val + eps)
+
+         clap_embeds = []
+         with torch.no_grad():
+             for event in events:
+                 text = f"The sound of {event.replace('_',' ')}"
+                 inputs = self.tokenizer([text], padding=True, return_tensors="pt")
+                 outputs = self.clap(**inputs)
+                 text_embeds = outputs.text_embeds.unsqueeze(1)
+                 clap_embeds.append(text_embeds)
+
+             query = torch.cat(clap_embeds, dim=1).to(self.device)
+             mel = self.model.forward_to_spec(audio)
+             preds = self.model(mel, query)
+             preds = torch.sigmoid(preds).cpu()
+
+         return preds  # shape: [num_events, 1, T]
+
+     # ---------- Multi-event plotting ----------
+     @staticmethod
+     def plot_and_save_multi(preds, events, sr=25, out_dir="./plots", fname="all_events"):
+         os.makedirs(out_dir, exist_ok=True)
+         preds_np = preds.squeeze(1).numpy()  # [num_events, T]
+         T = preds_np.shape[1]
+
+         plt.figure(figsize=(12, len(events) * 0.6 + 2))
+         plt.imshow(
+             preds_np,
+             aspect="auto",
+             cmap="Blues",
+             extent=[0, T/sr, 0, len(events)],
+             vmin=0, vmax=1, origin="lower"
+         )
+         plt.colorbar(label="Probability")
+         plt.yticks(np.arange(len(events)) + 0.5, events)
+         plt.xlabel("Time (s)")
+         plt.ylabel("Events")
+         plt.title("Event Predictions")
+
+         save_path = os.path.join(out_dir, f"{fname}.png")
+         plt.savefig(save_path, dpi=200, bbox_inches="tight")
+         plt.close()
+         return save_path
+
+     def to_multi_plot(self, preds, events, out_dir="./plots", fname="all_events"):
+         return self.plot_and_save_multi(preds, events, out_dir=out_dir, fname=fname)
+
+     # ---------- Multi-event video ----------
+     @staticmethod
+     def make_multi_event_video(preds, events, sr=25, out_dir="./videos",
+                                audio_path=None, fps=25, highlight=True, fname="all_events"):
+         from moviepy.editor import ImageSequenceClip, AudioFileClip
+         from tqdm import tqdm
+
+         os.makedirs(out_dir, exist_ok=True)
+         preds_np = preds.squeeze(1).numpy()  # [num_events, T]
+         T = preds_np.shape[1]
+         duration = T / sr
+
+         frames = []
+         n_frames = int(duration * fps)
+
+         for i in tqdm(range(n_frames)):
+             t = int(i * T / n_frames)
+             plt.figure(figsize=(12, len(events) * 0.6 + 2))
+
+             if highlight:
+                 mask = np.zeros_like(preds_np)
+                 mask[:, :t+1] = preds_np[:, :t+1]
+                 plt.imshow(
+                     mask,
+                     aspect="auto",
+                     cmap="Blues",
+                     extent=[0, T/sr, 0, len(events)],
+                     vmin=0, vmax=1, origin="lower"
+                 )
+             else:
+                 plt.imshow(
+                     preds_np[:, :t+1],
+                     aspect="auto",
+                     cmap="Blues",
+                     extent=[0, (t+1)/sr, 0, len(events)],
+                     vmin=0, vmax=1, origin="lower"
+                 )
+
+             plt.colorbar(label="Probability")
+             plt.yticks(np.arange(len(events)) + 0.5, events)
+             plt.xlabel("Time (s)")
+             plt.ylabel("Events")
+             plt.title("Event Predictions")
+
+             frame_path = f"/tmp/frame_{i:04d}.png"
+             plt.savefig(frame_path, dpi=150, bbox_inches="tight")
+             plt.close()
+             frames.append(frame_path)
+
+         clip = ImageSequenceClip(frames, fps=fps)
+         if audio_path is not None:
+             audio = AudioFileClip(audio_path).subclip(0, duration)
+             clip = clip.set_audio(audio)
+
+         save_path = os.path.join(out_dir, f"{fname}.mp4")
+         clip.write_videofile(
+             save_path,
+             fps=fps,
+             codec="mpeg4",
+             audio_codec="aac"
+         )
+
+         for f in frames:
+             os.remove(f)
+
+         return save_path
+
+     def to_multi_video(self, preds, events, audio_path, out_dir="./videos", fname="all_events"):
+         return self.make_multi_event_video(
+             preds, events, audio_path=audio_path, out_dir=out_dir, fname=fname
+         )
+
+
+ if __name__ == "__main__":
+     flexsed = FlexSED(device='cuda')
+
+     events = ["Door", "Laughter", "Dog"]
+     preds = flexsed.run_inference("example2.wav", events)
+
+     # Combined plot & video
+     flexsed.to_multi_plot(preds, events, fname="example2")
+     # flexsed.to_multi_video(preds, events, audio_path="example2.wav", fname="example2")
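The repository also ships precomputed CLAP text embeddings under `src/clap_embedding/`, which the evaluation script loads with `torch.load` instead of re-running the text encoder. Below is a minimal sketch of querying the model with those cached embeddings; it assumes each `.pt` file holds one text embedding that is unsqueezed to `[1, 1, D]` as in the validation code, and the label names, audio file, and CPU device are illustrative choices.

```python
# Sketch only: reuse the precomputed CLAP text embeddings shipped with the repo,
# mirroring how the validation script builds its query, instead of re-encoding prompts.
import torch
import librosa
from api import FlexSED

flexsed = FlexSED(device="cpu")  # CPU should be fine for a short clip

# Each .pt file stores one text embedding; unsqueeze to [1, 1, D] and concatenate
# along dim=1 to form a multi-event query (labels must match the saved file names).
events = ["Bark", "Applause"]
query = torch.cat(
    [torch.load(f"src/clap_embedding/{e}.pt").unsqueeze(1) for e in events], dim=1
)

audio, _ = librosa.load("example.wav", sr=16000)
audio = torch.tensor([audio])
with torch.no_grad():
    mel = flexsed.model.forward_to_spec(audio)
    preds = torch.sigmoid(flexsed.model(mel, query))  # [num_events, 1, T]
```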
app.py CHANGED
@@ -1,7 +1,74 @@
  import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import torch
+ from api import FlexSED
+ import tempfile
+ import os
+
+ # Load model once on startup
+ flexsed = FlexSED(device="cuda" if torch.cuda.is_available() else "cpu")
+
+ def run_flexsed(audio_file, event_list):
+     """
+     Run inference using FlexSED and return the prediction plot.
+     """
+     if not audio_file:
+         return None
+
+     # Split events by semicolon
+     events = [e.strip() for e in event_list.split(";") if e.strip()]
+     if not events:
+         return None
+
+     # Run inference
+     preds = flexsed.run_inference(audio_file, events)
+
+     # Generate visualization
+     output_fname = os.path.join(tempfile.gettempdir(), "flexsed_output")
+     flexsed.to_multi_plot(preds, events, fname=output_fname)
+     plot_path = f"{output_fname}.png"
+
+     return plot_path
+
+
+ # App layout
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as app:
+     # Header
+     gr.Markdown("""
+     ## 🎧 FlexSED: A Flexible Open-Vocabulary Sound Event Detection System
+
+     👋 Welcome to the **FlexSED live demo** — explore **prompt-guided sound event detection** in real audio clips.
+
+     🔗 Learn more on the [FlexSED GitHub Repository](https://github.com/JHU-LCAP/FlexSED)
+     """)
+
+     gr.Markdown("### 🔍 Upload or choose an example below to detect sound events:")
+
+     with gr.Row():
+         # Left column: Inputs
+         with gr.Column(scale=1):
+             audio_input = gr.Audio(type="filepath", label="🎵 Upload Audio (.wav)")
+             text_input = gr.Textbox(label="Event list (semicolon-separated)", value="Male speech; Door; Dog; Laughter")
+
+             with gr.Row():
+                 detect_btn = gr.Button("🎯 Detect", variant="primary")
+                 clear_btn = gr.Button("🧹 Clear")
+
+         # Right column: Output
+         with gr.Column(scale=1):
+             image_output = gr.Image(label="Prediction Plot", show_label=True, elem_id="output-image")
+             gr.Examples(
+                 examples=[
+                     ["example.wav", "Male speech; Door; Dog; Laughter"],
+                     ["example2.wav", "Male speech; Bee; Gunshot, gunfire"],
+                 ],
+                 inputs=[audio_input, text_input],
+                 label="Example Audios"
+             )
+
+     # Function bindings
+     detect_btn.click(run_flexsed, inputs=[audio_input, text_input], outputs=image_output)
+     clear_btn.click(lambda: (None, "Male speech; Door; Dog; Laughter"), outputs=[audio_input, text_input])
+
+
+ if __name__ == "__main__":
+     app.launch(share=True)
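For a quick sanity check outside the UI, the callback can be called directly; a small sketch, assuming the example audio sits next to `app.py` (importing `app` loads the model but does not launch the interface because of the `__main__` guard):

```python
# Sketch: exercise the Gradio callback directly; prompts are split on ";" as in run_flexsed.
from app import run_flexsed

plot_path = run_flexsed("example.wav", "Dog; Laughter")
print(plot_path)  # path to the PNG written under the system temp directory
```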
example.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:371ee4358cd3b12330f406d7d576fecb2329057132696360278b602043009562
+ size 480044
example2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ceca83fd7bd5e1ab16dd61a445c3f3fb11b87c67d8a56b277d4ee293c56b23ed
+ size 480044
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio
+ torch
+ soundfile
+ matplotlib
+ numpy
+ librosa
src/.ipynb_checkpoints/prepare_clap-checkpoint.py ADDED
@@ -0,0 +1,39 @@
+ import os
+ import pandas as pd
+ import torch
+ from transformers import AutoTokenizer, ClapTextModelWithProjection
+
+ if __name__ == '__main__':
+     # Load the CLAP model and tokenizer
+     model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
+     model.eval()
+     tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+
+     # Path to the input CSV file
+     input_csv_path = '/home/user/SSD/Dataset/Audioset_SL/no_rule_all/label_to_id.csv'
+     output_path = 'clap_embedding/'  # Replace with your desired output folder path
+
+     # Create the output folder if it doesn't exist
+     os.makedirs(output_path, exist_ok=True)
+
+     # Read the CSV file
+     df = pd.read_csv(input_csv_path)
+
+     # Get unique event labels
+     events = df['label'].unique()
+
+     with torch.no_grad():  # Disable gradient computation
+         # Process each event
+         for event in events:
+             text = event.replace('_', ' ')  # Replace underscores with spaces
+             text = f'The sound of {text}'
+             print(text)
+             inputs = tokenizer([text], padding=True, return_tensors="pt")
+             outputs = model(**inputs)
+             text_embeds = outputs.text_embeds
+
+             # Save the embeddings to a .pt file
+             output_file = os.path.join(output_path, f"{event}.pt")
+             torch.save(text_embeds, output_file)
+
+     print("Embedding extraction and saving complete!")
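A small sketch for checking one of the saved embeddings; the `[1, 512]` shape assumes the default projection width of `laion/clap-htsat-unfused`, and the `Applause` label is just an example of a file the script would have written.

```python
# Sketch: inspect an embedding written by the script above.
import torch

emb = torch.load("clap_embedding/Applause.pt")
print(emb.shape)          # expected: torch.Size([1, 512]) for laion/clap-htsat-unfused
query = emb.unsqueeze(1)  # [1, 1, 512], the per-event query shape used at inference time
```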
src/.ipynb_checkpoints/test-checkpoint.py ADDED
@@ -0,0 +1,140 @@
+ import random
+ import argparse
+ import os
+ import time
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from tqdm import tqdm
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+
+ from accelerate import Accelerator
+
+ from models.transformer import Dasheng_Encoder
+ from models.sed_decoder import Decoder, TSED_Wrapper
+ from dataset.tsed import TSED_AS
+ from dataset.tsed_val import TSED_Val
+ from utils import load_yaml_with_includes, get_lr_scheduler, ConcatDatasetBatchSampler
+ from utils.data_aug import frame_shift, mixup, time_mask, feature_transformation
+ from val import val_psds
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+
+     # Config settings
+     parser.add_argument('--config-name', type=str, default='configs/model.yml')
+     parser.add_argument('--ckpt', type=str, default='20000.pt')
+
+     # Training settings
+     parser.add_argument("--amp", type=str, default='fp16')
+     parser.add_argument('--epochs', type=int, default=20)
+     parser.add_argument('--num-workers', type=int, default=8)
+     parser.add_argument('--num-threads', type=int, default=1)
+     parser.add_argument('--eval-every-step', type=int, default=5000)
+     parser.add_argument('--save-every-step', type=int, default=5000)
+     # parser.add_argument('--dataloader', type=str, default='EACaps')
+     parser.add_argument("--logit-normal-indices", type=bool, default=False)
+
+     # Log and random seed
+     parser.add_argument('--random-seed', type=int, default=2024)
+     parser.add_argument('--log-step', type=int, default=100)
+     parser.add_argument('--log-dir', type=str, default='../logs/')
+     parser.add_argument('--save-dir', type=str, default='../ckpts/')
+     return parser.parse_args()
+
+
+ def setup_directories(args, params):
+     args.log_dir = os.path.join(args.log_dir, params['model_name']) + '/'
+     args.save_dir = os.path.join(args.save_dir, params['model_name']) + '/'
+
+     os.makedirs(args.log_dir, exist_ok=True)
+     os.makedirs(args.save_dir, exist_ok=True)
+
+
+ def set_device(args):
+     torch.set_num_threads(args.num_threads)
+     if torch.cuda.is_available():
+         args.device = 'cuda'
+         torch.cuda.manual_seed_all(args.random_seed)
+         torch.backends.cuda.matmul.allow_tf32 = True
+         if torch.backends.cudnn.is_available():
+             torch.backends.cudnn.deterministic = True
+             torch.backends.cudnn.benchmark = False
+     else:
+         args.device = 'cpu'
+
+
+ if __name__ == '__main__':
+     args = parse_args()
+     params = load_yaml_with_includes(args.config_name)
+     set_device(args)
+     setup_directories(args, params)
+
+     random.seed(args.random_seed)
+     torch.manual_seed(args.random_seed)
+
+     # use accelerator for multi-gpu training
+     accelerator = Accelerator(mixed_precision=args.amp,
+                               gradient_accumulation_steps=params['opt']['accumulation_steps'],
+                               step_scheduler_with_optimizer=False)
+
+     train_set = TSED_AS(**params['data']['train_data'])
+     train_loader = DataLoader(train_set, batch_size=params['opt']['batch_size'], num_workers=args.num_workers)
+
+     # val_set = TSED_Val(**params['data']['val_data'])
+     # val_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False)
+
+     test_set = TSED_Val(**params['data']['test_data'])
+     test_loader = DataLoader(test_set, num_workers=0, batch_size=1, shuffle=False)
+
+     encoder = Dasheng_Encoder(**params['encoder']).to(accelerator.device)
+     pretrained_url = 'https://zenodo.org/records/11511780/files/dasheng_base.pt?download=1'
+     dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu')
+     model_parmeters = dump['model']
+     # pretrained_url = 'https://zenodo.org/records/13315686/files/dasheng_audioset_mAP497.pt?download=1'
+     # dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu')
+     # model_parmeters = dump
+     encoder.load_state_dict(model_parmeters)
+
+     decoder = Decoder(**params['decoder']).to(accelerator.device)
+
+     model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder'])
+     print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M")
+
+     model.load_state_dict(torch.load(args.ckpt, map_location='cpu')['model'])
+
+     if params['frozen_encoder']:
+         optimizer = torch.optim.AdamW(
+             model.parameters(),
+             lr=params['opt']['learning_rate'],
+             weight_decay=params['opt']['weight_decay'],
+             betas=(params['opt']['beta1'], params['opt']['beta2']),
+             eps=params['opt']['adam_epsilon'])
+     else:
+         optimizer = torch.optim.AdamW(
+             [
+                 {'params': model.encoder.parameters(), 'lr': 0.1 * params['opt']['learning_rate']},
+                 {'params': model.decoder.parameters(), 'lr': params['opt']['learning_rate']}
+             ],
+             weight_decay=params['opt']['weight_decay'],
+             betas=(params['opt']['beta1'], params['opt']['beta2']),
+             eps=params['opt']['adam_epsilon'])
+
+     lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler'])
+
+     strong_loss_func = nn.BCEWithLogitsLoss()
+
+     model, optimizer, lr_scheduler, train_loader, test_loader = accelerator.prepare(
+         model, optimizer, lr_scheduler, train_loader, test_loader)
+
+     global_step = 0.0
+     losses = 0.0
+
+     if accelerator.is_main_process:
+         model_module = model.module if hasattr(model, 'module') else model
+         val_psds(model_module, test_loader, params, epoch='test_full', split='test',
+                  save_path=args.log_dir + 'output/', device=accelerator.device)
src/.ipynb_checkpoints/train-checkpoint.py ADDED
@@ -0,0 +1,208 @@
+ import random
+ import argparse
+ import os
+ import time
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from tqdm import tqdm
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+
+ from accelerate import Accelerator
+
+ from models.transformer import Dasheng_Encoder
+ from models.sed_decoder import Decoder, TSED_Wrapper
+ from dataset.tsed import TSED_AS
+ from dataset.tsed_val import TSED_Val
+ from utils import load_yaml_with_includes, get_lr_scheduler, ConcatDatasetBatchSampler
+ from utils.data_aug import frame_shift, mixup, time_mask, feature_transformation
+ from val import val_psds
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+
+     # Config settings
+     parser.add_argument('--config-name', type=str, default='configs/model.yml')
+
+     # Training settings
+     parser.add_argument("--amp", type=str, default='fp16')
+     parser.add_argument('--epochs', type=int, default=20)
+     parser.add_argument('--num-workers', type=int, default=8)
+     parser.add_argument('--num-threads', type=int, default=1)
+     parser.add_argument('--eval-every-step', type=int, default=5000)
+     parser.add_argument('--save-every-step', type=int, default=5000)
+     # parser.add_argument('--dataloader', type=str, default='EACaps')
+     parser.add_argument("--logit-normal-indices", type=bool, default=False)
+
+     # Log and random seed
+     parser.add_argument('--random-seed', type=int, default=2024)
+     parser.add_argument('--log-step', type=int, default=100)
+     parser.add_argument('--log-dir', type=str, default='../logs/')
+     parser.add_argument('--save-dir', type=str, default='../ckpts/')
+     return parser.parse_args()
+
+
+ def setup_directories(args, params):
+     args.log_dir = os.path.join(args.log_dir, params['model_name']) + '/'
+     args.save_dir = os.path.join(args.save_dir, params['model_name']) + '/'
+
+     os.makedirs(args.log_dir, exist_ok=True)
+     os.makedirs(args.save_dir, exist_ok=True)
+
+
+ def set_device(args):
+     torch.set_num_threads(args.num_threads)
+     if torch.cuda.is_available():
+         args.device = 'cuda'
+         torch.cuda.manual_seed_all(args.random_seed)
+         torch.backends.cuda.matmul.allow_tf32 = True
+         if torch.backends.cudnn.is_available():
+             torch.backends.cudnn.deterministic = True
+             torch.backends.cudnn.benchmark = False
+     else:
+         args.device = 'cpu'
+
+
+ if __name__ == '__main__':
+     args = parse_args()
+     params = load_yaml_with_includes(args.config_name)
+     set_device(args)
+     setup_directories(args, params)
+
+     random.seed(args.random_seed)
+     torch.manual_seed(args.random_seed)
+
+     # use accelerator for multi-gpu training
+     accelerator = Accelerator(mixed_precision=args.amp,
+                               gradient_accumulation_steps=params['opt']['accumulation_steps'],
+                               step_scheduler_with_optimizer=False)
+
+     train_set = TSED_AS(**params['data']['train_data'])
+     train_loader = DataLoader(train_set, shuffle=True,
+                               batch_size=params['opt']['batch_size'],
+                               num_workers=args.num_workers)
+
+     val_set = TSED_Val(**params['data']['val_data'])
+     val_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False)
+
+     # test_set = TSED_Val(**params['data']['test_data'])
+     # test_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False)
+
+     encoder = Dasheng_Encoder(**params['encoder']).to(accelerator.device)
+     pretrained_url = 'https://zenodo.org/records/11511780/files/dasheng_base.pt?download=1'
+     dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu')
+     model_parmeters = dump['model']
+     # pretrained_url = 'https://zenodo.org/records/13315686/files/dasheng_audioset_mAP497.pt?download=1'
+     # dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu')
+     # model_parmeters = dump
+     encoder.load_state_dict(model_parmeters)
+
+     decoder = Decoder(**params['decoder']).to(accelerator.device)
+
+     model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder'])
+     print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M")
+
+     # model.load_state_dict(torch.load('../ckpts/TSED_AS_filter/20000.0.pt', map_location='cpu')['model'])
+
+     if params['frozen_encoder']:
+         optimizer = torch.optim.AdamW(
+             model.parameters(),
+             lr=params['opt']['learning_rate'],
+             weight_decay=params['opt']['weight_decay'],
+             betas=(params['opt']['beta1'], params['opt']['beta2']),
+             eps=params['opt']['adam_epsilon'])
+     else:
+         optimizer = torch.optim.AdamW(
+             [
+                 {'params': model.encoder.parameters(), 'lr': 0.1 * params['opt']['learning_rate']},
+                 {'params': model.decoder.parameters(), 'lr': params['opt']['learning_rate']}
+             ],
+             weight_decay=params['opt']['weight_decay'],
+             betas=(params['opt']['beta1'], params['opt']['beta2']),
+             eps=params['opt']['adam_epsilon'])
+
+     lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler'])
+
+     strong_loss_func = nn.BCEWithLogitsLoss()
+
+     model, optimizer, lr_scheduler, train_loader, val_loader = accelerator.prepare(
+         model, optimizer, lr_scheduler, train_loader, val_loader)
+
+     global_step = 0.0
+     losses = 0.0
+
+     if accelerator.is_main_process:
+         model_module = model.module if hasattr(model, 'module') else model
+         val_psds(model_module, val_loader, params, epoch='debug', split='val',
+                  save_path=args.log_dir + 'output/', device=accelerator.device)
+
+     for epoch in range(args.epochs):
+         model.train()
+         for step, batch in enumerate(tqdm(train_loader)):
+             with accelerator.accumulate(model):
+                 audio, cls, label, _ = batch
+                 mel = model.forward_to_spec(audio)
+
+                 # data aug
+                 mel, label = frame_shift(mel, label, params['net_pooling'])
+                 mel, label = time_mask(mel, label, params["net_pooling"],
+                                        mask_ratios=params['data_aug']["time_mask_ratios"])
+                 mel, _ = feature_transformation(mel, **params['data_aug']["transform"])
+
+                 strong_pred = model(mel, cls)
+
+                 B, N, L = label.shape
+                 label = label.reshape(B * N, L)
+                 label = label.unsqueeze(1)
+
+                 loss = strong_loss_func(strong_pred, label)
+
+                 accelerator.backward(loss)
+
+                 # clip grad up
+                 if accelerator.sync_gradients:
+                     if 'grad_clip' in params['opt'] and params['opt']['grad_clip'] > 0:
+                         accelerator.clip_grad_norm_(model.parameters(),
+                                                     max_norm=params['opt']['grad_clip'])
+                 optimizer.step()
+                 lr_scheduler.step()
+                 optimizer.zero_grad()
+
+                 global_step += 1/params['opt']['accumulation_steps']
+                 losses += loss.item()/params['opt']['accumulation_steps']
+
+                 if accelerator.is_main_process:
+                     if global_step % args.log_step == 0:
+                         current_time = time.asctime(time.localtime(time.time()))
+                         epoch_info = f'Epoch: [{epoch + 1}][{args.epochs}]'
+                         batch_info = f'Global Step: {global_step}'
+                         loss_info = f'Loss: {losses / args.log_step:.6f}'
+
+                         # Extract the learning rate from the optimizer
+                         lr = optimizer.param_groups[0]['lr']
+                         lr_info = f'Learning Rate: {lr:.6f}'
+
+                         log_message = f'{current_time}\n{epoch_info} {batch_info} {loss_info} {lr_info}\n'
+
+                         with open(args.log_dir + 'log.txt', mode='a') as n:
+                             n.write(log_message)
+
+                         losses = 0.0
+
+                 # check performance
+                 if (global_step + 1) % args.eval_every_step == 0:
+                     if accelerator.is_main_process:
+                         model_module = model.module if hasattr(model, 'module') else model
+                         val_psds(model_module, val_loader, params, epoch=global_step+1, split='val',
+                                  save_path=args.log_dir + 'output/', device=accelerator.device)
+                         # save model
+                         unwrapped_model = accelerator.unwrap_model(model)
+                         accelerator.save({
+                             "model": model.state_dict(),
+                         }, args.save_dir + str(global_step+1) + '.pt')
+                     accelerator.wait_for_everyone()
+                     model.train()
src/.ipynb_checkpoints/val-checkpoint.py ADDED
@@ -0,0 +1,141 @@
+ import torch
+ import os
+ import pandas as pd
+ from tqdm import tqdm
+ import sed_scores_eval
+ from desed_task.evaluation.evaluation_measures import (compute_per_intersection_macro_f1,
+                                                        compute_psds_from_operating_points,
+                                                        compute_psds_from_scores)
+ from local.utils import (batched_decode_preds,)
+ from utils.sed import Encoder
+ import numpy as np
+
+
+ @torch.no_grad()
+ def val_psds(model, val_loader, params, epoch, split, save_path, device):
+     label_df = pd.read_csv(params['data'][split]['label'])
+     EVENTS = label_df['label'].tolist()
+
+     clap_emb = []
+     for event in EVENTS:
+         cls = torch.load(params['data']['train_data']['clap_dir'] + event + '.pt').to(device)
+         cls = cls.unsqueeze(1)
+         clap_emb.append(cls)
+     cls = torch.cat(clap_emb, dim=1)
+
+     encoder = Encoder(EVENTS, audio_len=10, frame_len=160, frame_hop=160, net_pooling=4, sr=16000)
+
+     model.eval()
+     test_csv = params['data'][split]["csv"]
+     test_dur = params['data'][split]["dur"]
+
+     gt = pd.read_csv(test_csv, sep='\t')
+
+     test_scores_postprocessed_buffer = {}
+     test_scores_postprocessed_buffer_tsed = {}
+     test_thresholds = [0.5]
+     test_psds_buffer = {k: pd.DataFrame() for k in test_thresholds}
+     test_psds_buffer_tsed = {k: pd.DataFrame() for k in test_thresholds}
+
+     for batch in tqdm(val_loader):
+         audio, filenames = batch
+         B = audio.shape[0]
+         N = cls.shape[1]
+         cls = cls.expand(B, -1, -1)
+
+         audio = audio.to(device)
+         mel = model.forward_to_spec(audio)
+
+         preds = model(mel, cls)
+         preds = torch.sigmoid(preds)
+         preds = preds.reshape(B, N, -1)
+         preds_tsed = preds.clone()
+         # tsed assumes sound existence is known
+         for idx, filename in enumerate(filenames):
+             weak_label = list(gt[gt['filename'] == filename]['event_label'].unique())
+             for j, event in enumerate(EVENTS):
+                 if event not in weak_label:
+                     preds_tsed[idx][j] = 0.0
+         # preds = preds.transpose(1, 2)
+
+         (_, scores_postprocessed_strong, _,) = \
+             batched_decode_preds(
+                 preds,
+                 filenames,
+                 encoder,
+                 median_filter=9,
+                 thresholds=list(test_psds_buffer.keys()), )
+         test_scores_postprocessed_buffer.update(scores_postprocessed_strong)
+
+         (_, scores_postprocessed_strong_tsed, _,) = \
+             batched_decode_preds(
+                 preds_tsed,
+                 filenames,
+                 encoder,
+                 median_filter=9,
+                 thresholds=list(test_psds_buffer_tsed.keys()), )
+         test_scores_postprocessed_buffer_tsed.update(scores_postprocessed_strong_tsed)
+
+     ground_truth = sed_scores_eval.io.read_ground_truth_events(test_csv)
+     audio_durations = sed_scores_eval.io.read_audio_durations(test_dur)
+
+     ground_truth = {
+         audio_id: ground_truth[audio_id]
+         for audio_id in test_scores_postprocessed_buffer
+     }
+     audio_durations = {
+         audio_id: audio_durations[audio_id]
+         for audio_id in test_scores_postprocessed_buffer
+     }
+
+     psds1_sed_scores_eval, psds1_cls = compute_psds_from_scores(
+         test_scores_postprocessed_buffer,
+         ground_truth,
+         audio_durations,
+         dtc_threshold=0.7,
+         gtc_threshold=0.7,
+         cttc_threshold=None,
+         alpha_ct=0.0,
+         alpha_st=0.0,
+         # save_dir=os.path.join(save_dir, "student", "scenario1"),
+     )
+     psds1_cls['overall'] = psds1_sed_scores_eval
+     psds1_cls['macro_averaged'] = np.array([v for k, v in psds1_cls.items()]).mean()
+     psds1_cls['name'] = 'psds1'
+
+     psds1_sed_scores_eval_tsed, psds1_cls_tsed = compute_psds_from_scores(
+         test_scores_postprocessed_buffer_tsed,
+         ground_truth,
+         audio_durations,
+         dtc_threshold=0.7,
+         gtc_threshold=0.7,
+         cttc_threshold=None,
+         alpha_ct=0.0,
+         alpha_st=0.0,
+         # save_dir=os.path.join(save_dir, "student", "scenario1"),
+     )
+
+     psds1_cls_tsed['overall'] = psds1_sed_scores_eval_tsed
+     psds1_cls_tsed['macro_averaged'] = np.array([v for k, v in psds1_cls_tsed.items()]).mean()
+     psds1_cls_tsed['name'] = 'psds1_tsed'
+
+     # psds2_sed_scores_eval, psds2_cls = compute_psds_from_scores(
+     #     test_scores_postprocessed_buffer,
+     #     ground_truth,
+     #     audio_durations,
+     #     dtc_threshold=0.1,
+     #     gtc_threshold=0.1,
+     #     cttc_threshold=0.3,
+     #     alpha_ct=0.5,
+     #     alpha_st=1,
+     #     # save_dir=os.path.join(save_dir, "student", "scenario1"),
+     # )
+     # psds2_cls['overall'] = psds2_sed_scores_eval
+     # psds2_cls['macro_averaged'] = np.array([v for k, v in psds2_cls.items()]).mean()
+     # psds2_cls['name'] = 'psds2'
+     psds_cls = pd.DataFrame([psds1_cls, psds1_cls_tsed])
+     # psds_cls = pd.DataFrame([psds1_cls, psds2_cls])
+     os.makedirs(f'{save_path}/psds_cls/', exist_ok=True)
+     psds_cls.to_csv(f'{save_path}/psds_cls/{epoch}.csv', index=False)
+
+     return psds1_sed_scores_eval, psds1_sed_scores_eval_tsed
src/clap_embedding/Accelerating, revving, vroom.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b4a351451f29ac729cdf638e3c3e81da4c1ff7963cdbbc17ca64c49f2e0a7f8
+ size 3397
src/clap_embedding/Air brake.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:34ca30c586a0c92b86b136aa8fd69c27a31a10b454159e6fdfc1197e8c1585b5
+ size 3238
src/clap_embedding/Air conditioning.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8238a7ffa14c033b4d41f2331ce94424d925a0e3b1a37f4d5b491a111d518425
+ size 3273
src/clap_embedding/Air horn, truck horn.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a054ca636e138e115e085ed4b5f315ffaa60b647400f657dd6d334720fbc8e73
+ size 3293
src/clap_embedding/Aircraft engine.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac2b736bed945841a2d066cb4ad5218b55903c88083d34870b6d27eccc9b1d55
+ size 3268
src/clap_embedding/Aircraft.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:902502af6d7e3ff22b6650282c1c8e3f98d6c1687b1f3078465bf942c30620cf
+ size 3233
src/clap_embedding/Alarm clock.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e0d17a345893c6f0fe6a9d6fb11f060da277b8d82475b95c2249138919beb5b
+ size 3248
src/clap_embedding/Alarm.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43fce07d002c946daad296e9b637943a9941cc703bb3c1755fb497f72afcccc1
+ size 3154
src/clap_embedding/Alert.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:751b03e62094f66ddd4965d55583ffc8db0f37a621a614b8aec953ab284d5e23
+ size 3154
src/clap_embedding/Ambulance (siren).pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b8d7ab047ba136bc0c8979ce97b917cbba1181e60257c046a748612dae58660
+ size 3278
src/clap_embedding/Animal.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:060e4de7c1aa9e784f8d4245b8ed7d17e001a68615005bdde858aeb044f61aac
+ size 3159
src/clap_embedding/Applause.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:053458a5422c7a2dc316dfe803963118a11093ef324b07c07d79f98e11001bbb
+ size 3233
src/clap_embedding/Arrow.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ad03adea0e391dce55cf4e1ef13d4d299ea741d56e06a379915305b4ae56d03
+ size 3154
src/clap_embedding/Artillery fire.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a8aafd9fe1fc64424eb28aa0206bf3ad5ad505cb5eeb471164453f6d3a61313
+ size 3263
src/clap_embedding/Audio logo.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6e3a7f4f827f4e9d2de401956568f7ff771a7e7c11cb547306454cf0ea0c4ab
+ size 3243
src/clap_embedding/Babbling.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c7ccbb7c692f6d2a5a1039039ea127d06189788e2cab5d25302d8d0bd4ddef5
+ size 3233
src/clap_embedding/Baby cry, infant cry.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebad3b3fd6e97f3a4681f2cbf3bc7b9dad2eed05715b2ab28d095ee156d204f7
+ size 3293
src/clap_embedding/Baby laughter.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:13abd466060f1997a8f0251fd7a32456824713cfec0ca8754a01a2f245ae03af
+ size 3258
src/clap_embedding/Background noise.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e64c97573b4d05dfcb0e50362afff050b9e287e9c26195b10e6da1182a8b104f
+ size 3273
src/clap_embedding/Bang.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e28d04d1f0b6e407dda531d7df5c1883ac907c218feaffabb4a213445d874e5
+ size 3149
src/clap_embedding/Bark.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f866f230ad42ff9be34383a120ae6563606d149b1bd00039cd360c61cbbb371a
+ size 3149
src/clap_embedding/Basketball bounce.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9cac0a6b3d44c42d8d1eeb93ea1fb59abcc095127ae88ea9f7f684ff0ba5f52d
+ size 3278
src/clap_embedding/Bathroom sounds.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e781b8e4ea847ad5acfa5cdc7cbecf294019e96d156f5e8e9d71fd384421f3c4
+ size 3268
src/clap_embedding/Bathtub (filling or washing).pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7af853e52974d5cd77f2d79b207303df6299c23f508ed481cf343c1b7034bed8
+ size 3397
src/clap_embedding/Battle cry.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f06286c4fc69426a7fe8685fa6c74a337b97f552ab503009b11d11b70498a45
+ size 3243
src/clap_embedding/Bee, wasp, etc..pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eaabbd88980a77e0e1a4293d912c9f21fefe9dc8cc288c75fd4c07020e86ef58
+ size 3268
src/clap_embedding/Beep, bleep.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6d6a2021aa05b8f325efdd8e8aa163df1829cc9350b650da63a46284087d142
+ size 3248
src/clap_embedding/Bell.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3252fcd0b5f0e000410ac308b5f012167770d0ac0e27f4897945b922e02485c2
+ size 3149
src/clap_embedding/Bellow.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b43dd2c5a59384af15520a845ee7801138265eb28d0050fd96156f096ec822a1
+ size 3159
src/clap_embedding/Belly laugh.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61b60d96453309b2791d7e17a36e63e4ea98d5e55f687b5083349ac67a1e9cc7
+ size 3248
src/clap_embedding/Bicycle bell.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0a0a337a1b9c19819393b6eab07aace307ad402fec3f9e84f72d6ffd5501e76
+ size 3253
src/clap_embedding/Bicycle, tricycle.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f950ffa51035922839dcebe144e6c395c463288870f719a286064c02adf1f4f
+ size 3278
src/clap_embedding/Bird flight, flapping wings.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41e0c3d59dcfb50e5169cbee4ca8642f9b78ca047c0f3e44d0fa7f2b46bfb320
+ size 3392
src/clap_embedding/Bird vocalization, bird call, bird song.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1445a61059d9164ced63c01abf60de91637b79d26fed015004124158824c04df
+ size 3452
src/clap_embedding/Bird.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c94d39ae43d5c16d35c77a6ddbe500dc7aae6a044a3a10f43d33dcf14da48e91
+ size 3149
src/clap_embedding/Biting.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5c42b9c0b3fbfa6c0b43428a0bbbb199706c3d1babd7942d2f45e8a8874106d
+ size 3159
src/clap_embedding/Bleat.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2b9513225200a2f5ddeea1ee54d167c85c9b180af1bbe3766f5ff0fb044ee2c
+ size 3154
src/clap_embedding/Blender, food processor.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b82c822d42ea9ada0d83695e565af6b90a016942f7f02b0bbb27b9f5d755f5c
+ size 3372