musicgowdam committed on
Commit
cfc7911
·
verified ·
1 Parent(s): 922a2cc

Upload 12 files

Browse files
Files changed (12) hide show
  1. .gitattributes +1 -0
  2. .gitignore +5 -0
  3. Audio_Separator_Colab.ipynb +74 -0
  4. LICENSE +21 -0
  5. README.md +38 -8
  6. app.py +1491 -0
  7. mdx_models/data.json +354 -0
  8. packages.txt +1 -0
  9. pre-requirements.txt +2 -0
  10. requirements.txt +19 -0
  11. test.mp3 +3 -0
  12. utils.py +142 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ test.mp3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ venv
2
+ .venv
3
+ __pycache__
4
+ get-pip.py
5
+ clean_song_output
Audio_Separator_Colab.ipynb ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "colab_type": "text",
7
+ "id": "view-in-github"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/R3gm/Audio_separator_ui/blob/main/Audio_Separator_Colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {
17
+ "cellView": "form",
18
+ "id": "XbteyAzfklwa"
19
+ },
20
+ "outputs": [],
21
+ "source": [
22
+ "# @title Run the `Audio🔹separator` App\n",
23
+ "\n",
24
+ "conf_end = \"/content/Audio_separator_ui/.venv/complete.txt\"\n",
25
+ "def config_complete() -> bool:\n",
26
+ " import os\n",
27
+ " return os.path.exists(conf_end)\n",
28
+ "\n",
29
+ "if not config_complete():\n",
30
+ " print(\"\\033[34mThe installation will take approximately 5 minutes.\\033[0m\")\n",
31
+ " %cd /content\n",
32
+ " !git clone https://github.com/R3gm/Audio_separator_ui\n",
33
+ " %cd Audio_separator_ui\n",
34
+ " !pip install uv==0.8.13 -q\n",
35
+ " !uv venv --python 3.10 --clear -q\n",
36
+ " !curl -sS https://bootstrap.pypa.io/get-pip.py -o get-pip.py\n",
37
+ " !uv run python get-pip.py pip==23.1.2 -q\n",
38
+ " !uv run python -m pip install -q -r pre-requirements.txt\n",
39
+ " !uv run python -m pip install -q torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124\n",
40
+ " !uv run python -m pip install -q -r requirements.txt && touch {conf_end}\n",
41
+ "\n",
42
+ "%cd /content/Audio_separator_ui\n",
43
+ "!uv run python app.py --share --theme NoCrypt/miku"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "metadata": {
49
+ "id": "i18Q7Kf1CU9J"
50
+ },
51
+ "source": [
52
+ "Open the `public URL` when it appears"
53
+ ]
54
+ }
55
+ ],
56
+ "metadata": {
57
+ "accelerator": "GPU",
58
+ "colab": {
59
+ "authorship_tag": "ABX9TyNhZd6FNhKtQGbXlQbsSREk",
60
+ "gpuType": "T4",
61
+ "include_colab_link": true,
62
+ "provenance": []
63
+ },
64
+ "kernelspec": {
65
+ "display_name": "Python 3",
66
+ "name": "python3"
67
+ },
68
+ "language_info": {
69
+ "name": "python"
70
+ }
71
+ },
72
+ "nbformat": 4,
73
+ "nbformat_minor": 0
74
+ }
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Roger Condori
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,14 +1,44 @@
1
  ---
2
- title: DEMO2
3
- emoji: 🐠
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.49.0
8
  app_file: app.py
9
- pinned: false
10
  license: mit
11
- short_description: Music - vocal seperator
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Audio🔹Separator
3
+ emoji: 🏃
4
+ colorFrom: purple
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 5.43.1
8
  app_file: app.py
9
+ pinned: true
10
  license: mit
11
+ short_description: Vocal and background audio separator
12
  ---
13
 
14
+ # Audio Separator
15
+
16
+ ## Overview
17
+ **Audio Separator** is a Hugging Face Space designed to split an audio file into two distinct components: **vocals** and **background music**.
18
+
19
+ | Description | Link |
20
+ | ----------- | ---- |
21
+ | 📙 Colab Notebook | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/R3gm/Audio_separator_ui/blob/main/Audio_Separator_Colab.ipynb) |
22
+ | 🎉 Repository | [![GitHub Repository](https://img.shields.io/badge/GitHub-Repository-black?style=flat-square&logo=github)](https://github.com/R3gm/Audio_separator_ui) |
23
+ | 🚀 Online DEMO | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/r3gm/Audio_separator) |
24
+
25
+ ## Key Functions
26
+ 1. **Audio Upload**
27
+ Upload your own audio files (e.g., songs, recordings) for processing.
28
+
29
+ 2. **Separation Process**
30
+ The tool extracts:
31
+ - **Vocals** – the vocal track containing singing or speaking.
32
+ - **Background/Instrumental** – the rest of the audio excluding vocals.
33
+
34
+ 3. **Output Download**
35
+ After processing, you can download the separated tracks individually.
36
+
37
+ ## Summary Table
38
+
39
+ | Function | Description |
40
+ |-----------------------|---------------------------------------------------------------|
41
+ | **Audio Upload** | Import your audio file into the tool for separation. |
42
+ | **Vocal Extraction** | Isolate and extract vocal content from the original audio. |
43
+ | **Instrumental Track**| Separate and extract the background music component. |
44
+ | **Download Output** | Download the separated vocal and instrumental tracks. |
app.py ADDED
@@ -0,0 +1,1491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import spaces
3
+ import gc
4
+ import hashlib
5
+ import queue
6
+ import threading
7
+ import json
8
+ import shlex
9
+ import sys
10
+ import subprocess
11
+ import librosa
12
+ import numpy as np
13
+ import soundfile as sf
14
+ import torch
15
+ from tqdm import tqdm
16
+ from utils import (
17
+ remove_directory_contents,
18
+ create_directories,
19
+ download_manager,
20
+ )
21
+ import random
22
+ from utils import logger
23
+ import onnxruntime as ort
24
+ import warnings
25
+ import gradio as gr
26
+ import time
27
+ import traceback
28
+ from pedalboard import Pedalboard, Reverb, Delay, Chorus, Compressor, Gain, HighpassFilter, LowpassFilter
29
+ from pedalboard.io import AudioFile
30
+ import argparse
31
+
32
# Command-line options: public sharing toggle and Gradio theme name.
parser = argparse.ArgumentParser(description="Run the app with optional sharing")
parser.add_argument('--share', action='store_true', help='Enable sharing mode')
parser.add_argument(
    '--theme',
    type=str,
    default="NoCrypt/miku",
    help='Set the theme (default: NoCrypt/miku)',
)
args = parser.parse_args()

warnings.filterwarnings("ignore")

# Treated as "Colab" when the google.colab module is loaded or --share is set.
IS_COLAB = bool('google.colab' in sys.modules or args.share)
# Raw value of the SPACES_ZERO_GPU environment variable (None when unset).
IS_ZERO_GPU = os.getenv("SPACES_ZERO_GPU")

# Texts shown in the UI.
title = "<center><strong><font size='7'>Audio🔹separator</font></strong></center>"
base_demo = "This demo uses the "
description = (
    (base_demo if IS_ZERO_GPU else '')
    + "MDX-Net models for vocal and background sound separation."
)
RESOURCES = "- You can also try `Audio🔹separator` in Colab’s free tier, which provides free GPU [link](https://github.com/R3gm/Audio_separator_ui?tab=readme-ov-file#audio-separator)."
theme = args.theme

# Maps a model's primary stem name to the name used for its inverted
# (mix minus stem) output file.
stem_naming = {
    "Vocals": "Instrumental",
    "Other": "Instruments",
    "Instrumental": "Vocals",
    "Drums": "Drumless",
    "Bass": "Bassless",
}
63
+
64
+
65
class MDXModel:
    """STFT/iSTFT front end and hyper-parameters for one MDX-Net model.

    Holds the spectrogram geometry (``dim_f`` frequency bins kept,
    ``dim_t`` time frames, ``n_fft``, ``hop``) together with the stem name
    and the compensation factor, and converts between waveform chunks of
    ``chunk_size`` samples and 4-channel real spectrograms.
    """

    def __init__(
        self,
        device,
        dim_f,
        dim_t,
        n_fft,
        hop=1024,
        stem_name=None,
        compensation=1.000,
    ):
        """Store the parameters and pre-allocate the window and padding.

        Args:
            device: Torch device the window and frequency padding live on.
            dim_f: Number of frequency bins kept by :meth:`stft`.
            dim_t: Number of time frames per chunk.
            n_fft: FFT size.
            hop: Hop length in samples.
            stem_name: Name of the stem this model produces.
            compensation: Factor stored for use by callers.
        """
        self.dim_f, self.dim_t, self.dim_c = dim_f, dim_t, 4
        self.n_fft, self.hop = n_fft, hop
        self.stem_name = stem_name
        self.compensation = compensation

        self.n_bins = self.n_fft // 2 + 1
        # Samples per chunk such that a centred STFT yields dim_t frames.
        self.chunk_size = hop * (self.dim_t - 1)

        hann = torch.hann_window(window_length=self.n_fft, periodic=True)
        self.window = hann.to(device)

        # Zeros used by istft to restore the bins dropped by stft.
        pad_shape = [1, self.dim_c, self.n_bins - self.dim_f, self.dim_t]
        self.freq_pad = torch.zeros(pad_shape).to(device)

    def stft(self, x):
        """Convert waveform chunks to a 4-channel real spectrogram.

        Args:
            x: Tensor reshaped to ``[-1, chunk_size]`` before the transform.

        Returns:
            Tensor of shape ``[-1, 4, dim_f, dim_t]``.
        """
        flat = x.reshape([-1, self.chunk_size])
        spec = torch.stft(
            flat,
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
            return_complex=True,
        )
        # Move the real/imaginary axis next to the batch axis.
        planes = torch.view_as_real(spec).permute([0, 3, 1, 2])
        grouped = planes.reshape([-1, 2, 2, self.n_bins, self.dim_t])
        stacked = grouped.reshape([-1, 4, self.n_bins, self.dim_t])
        return stacked[:, :, : self.dim_f]

    def istft(self, x, freq_pad=None):
        """Convert a 4-channel real spectrogram back to waveform chunks.

        Args:
            x: Tensor of shape ``[batch, 4, dim_f, dim_t]``.
            freq_pad: Optional padding for the dropped bins; defaults to the
                pre-allocated zeros repeated along the batch axis.

        Returns:
            Tensor of shape ``[-1, 2, chunk_size]``.
        """
        if freq_pad is None:
            freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1])
        padded = torch.cat([x, freq_pad], -2)
        grouped = padded.reshape([-1, 2, 2, self.n_bins, self.dim_t])
        planes = grouped.reshape([-1, 2, self.n_bins, self.dim_t])
        # Real/imaginary axis goes last so it can be viewed as complex.
        pairs = planes.permute([0, 2, 3, 1]).contiguous()
        audio = torch.istft(
            torch.view_as_complex(pairs),
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
        )
        return audio.reshape([-1, 2, self.chunk_size])
135
+
136
+
137
class MDX:
    """ONNX Runtime wrapper that runs an MDX-Net model over a waveform.

    The waveform is split into segments (one per worker thread), each
    segment is cut into model-sized chunks, transformed with the
    ``MDXModel`` STFT, passed through the ONNX session and transformed
    back, and the segments are finally joined again.
    """

    DEFAULT_SR = 44100
    # Unit: samples (seconds * sample rate). A chunk size of 0 means the
    # wave is not split.
    DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
    DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR

    def __init__(
        self, model_path: str, params: "MDXModel", processor=0
    ):
        """Create the inference session and warm it up.

        Args:
            model_path: Path to the ONNX model file.
            params: Spectrogram parameters / STFT helper for this model.
            processor: CUDA device index, or a negative value for CPU.
        """
        # Set the device and the provider (CPU or CUDA)
        use_cuda = processor >= 0
        self.device = (
            torch.device(f"cuda:{processor}")
            if use_cuda
            else torch.device("cpu")
        )
        self.provider = (
            ["CUDAExecutionProvider"] if use_cuda else ["CPUExecutionProvider"]
        )

        self.model = params

        # Load the ONNX model using ONNX Runtime
        self.ort = ort.InferenceSession(model_path, providers=self.provider)
        # Preload the model for faster performance
        self.ort.run(
            None,
            {"input": torch.rand(1, 4, params.dim_f, params.dim_t).numpy()},
        )
        self.process = lambda spec: self.ort.run(
            None, {"input": spec.cpu().numpy()}
        )[0]

        # Progress bar; created by process_wave.
        self.prog = None

    @staticmethod
    def get_hash(model_path):
        """Return the MD5 hex digest identifying a model file.

        Only the last 10000 KiB of the file are hashed; files smaller than
        that are hashed in full.

        Args:
            model_path: Path to the model file.

        Returns:
            str: Hexadecimal MD5 digest.
        """
        tail_bytes = 10000 * 1024
        with open(model_path, "rb") as f:
            try:
                f.seek(-tail_bytes, 2)
            except OSError:
                # File is shorter than the tail window: hash all of it.
                f.seek(0)
            return hashlib.md5(f.read()).hexdigest()

    @staticmethod
    def segment(
        wave,
        combine=True,
        chunk_size=DEFAULT_CHUNK_SIZE,
        margin_size=DEFAULT_MARGIN_SIZE,
    ):
        """
        Segment or join segmented wave array

        Args:
            wave: (np.array) Wave array to be segmented or joined
            combine: (bool) If True, combines segmented wave array.
                    If False, segments wave array.
            chunk_size: (int) Size of each segment (in samples)
            margin_size: (int) Size of margin between segments (in samples)

        Returns:
            numpy array: Segmented or joined wave array (a list of arrays
            when segmenting; None when joining an empty list)
        """

        if combine:
            # Mirror the clamp applied when segmenting, so that joining with
            # the same arguments strips exactly the margins that were added.
            if 0 < chunk_size < margin_size:
                margin_size = chunk_size

            last_index = len(wave) - 1
            pieces = []
            for segment_count, segment in enumerate(wave):
                start = 0 if segment_count == 0 else margin_size
                if segment_count == last_index or margin_size == 0:
                    end = None
                else:
                    end = -margin_size
                pieces.append(segment[:, start:end])

            # None (not an empty array) when there is nothing to join.
            if not pieces:
                return None
            return np.concatenate(pieces, axis=-1)

        processed_wave = []
        sample_count = wave.shape[-1]

        if chunk_size <= 0 or chunk_size > sample_count:
            chunk_size = sample_count

        if margin_size > chunk_size:
            margin_size = chunk_size

        for segment_count, skip in enumerate(
            range(0, sample_count, chunk_size)
        ):
            margin = 0 if segment_count == 0 else margin_size
            end = min(skip + chunk_size + margin_size, sample_count)
            start = skip - margin

            processed_wave.append(wave[:, start:end].copy())

            # The segment already reaches the end of the wave.
            if end == sample_count:
                break

        return processed_wave

    def pad_wave(self, wave):
        """
        Pad the wave array to match the required chunk size

        Args:
            wave: (np.array) Wave array to be padded

        Returns:
            tuple: (padded_wave, pad, trim)
                - padded_wave: Tensor of overlapping chunks, each
                  ``model.chunk_size`` samples long
                - pad: Number of samples that were padded
                - trim: Number of samples added on each side (n_fft // 2)
        """
        n_sample = wave.shape[1]
        trim = self.model.n_fft // 2
        gen_size = self.model.chunk_size - 2 * trim
        # Always >= 1, so the later ``[:, :-pad]`` slice is never empty.
        pad = gen_size - n_sample % gen_size

        # Padded wave
        wave_p = np.concatenate(
            (
                np.zeros((2, trim)),
                wave,
                np.zeros((2, pad)),
                np.zeros((2, trim)),
            ),
            1,
        )

        chunks = [
            wave_p[:, i:i + self.model.chunk_size]
            for i in range(0, n_sample + pad, gen_size)
        ]

        # Stack into one ndarray first: building a tensor from a list of
        # arrays is far slower.
        mix_waves = torch.tensor(np.stack(chunks), dtype=torch.float32).to(
            self.device
        )

        return mix_waves, pad, trim

    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
        """
        Process each wave segment in a multi-threaded environment

        Args:
            mix_waves: (torch.Tensor) Wave segments to be processed
            trim: (int) Number of samples trimmed during padding
            pad: (int) Number of samples padded during padding
            q: (queue.Queue) Queue to hold the processed wave segments
            _id: (int) Identifier of the processed wave segment

        Returns:
            numpy array: Processed wave segment
        """
        with torch.no_grad():
            pw = []
            for mix_wave in mix_waves.split(1):
                self.prog.update()
                spec = self.model.stft(mix_wave)
                processed_spec = torch.tensor(self.process(spec))
                processed_wav = self.model.istft(
                    processed_spec.to(self.device)
                )
                # Drop the trim added on both sides by pad_wave.
                processed_wav = (
                    processed_wav[:, :, trim:-trim]
                    .transpose(0, 1)
                    .reshape(2, -1)
                    .cpu()
                    .numpy()
                )
                pw.append(processed_wav)
        processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
        q.put({_id: processed_signal})
        return processed_signal

    def process_wave(self, wave: np.array, mt_threads=1):
        """
        Process the wave array in a multi-threaded environment

        Args:
            wave: (np.array) Wave array to be processed
            mt_threads: (int) Number of threads to be used for processing

        Returns:
            numpy array: Processed wave array

        Raises:
            RuntimeError: If a worker did not deliver its segment.
        """
        self.prog = tqdm(total=0)
        chunk = wave.shape[-1] // mt_threads
        waves = self.segment(wave, False, chunk)

        # Create a queue to hold the processed wave segments
        q = queue.Queue()
        threads = []
        for c, batch in enumerate(waves):
            mix_waves, pad, trim = self.pad_wave(batch)
            self.prog.total = len(mix_waves) * mt_threads
            thread = threading.Thread(
                target=self._process_wave, args=(mix_waves, trim, pad, q, c)
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        self.prog.close()

        # Each queue item is {segment_id: signal}; merge and restore order.
        results = {}
        while not q.empty():
            results.update(q.get())

        # An exception inside a worker thread does not propagate, so a
        # missing result is the only sign of failure. Raised explicitly
        # (rather than asserted) so the check survives ``python -O``.
        if len(results) != len(waves):
            raise RuntimeError(
                "Incomplete processed batches, please reduce batch size!"
            )

        processed_batches = [results[key] for key in sorted(results)]
        return self.segment(processed_batches, True, chunk)
364
+
365
+
366
@spaces.GPU(duration=40)
def run_mdx(
    model_params,
    output_dir,
    model_path,
    filename,
    exclude_main=False,
    exclude_inversion=False,
    suffix=None,
    invert_suffix=None,
    denoise=False,
    keep_orig=True,
    m_threads=2,
    device_base="cuda",
):
    """Separate ``filename`` with an MDX-Net model, on GPU when requested.

    Args:
        model_params: Mapping from model hash to its parameter dict.
        output_dir: Directory the resulting WAV files are written to.
        model_path: Path to the ONNX model.
        filename: Path of the audio file to process.
        exclude_main: Skip writing the model's primary stem.
        exclude_inversion: Skip writing the inverted (mix minus stem) file.
        suffix: File-name suffix for the primary stem; defaults to the
            model's stem name.
        invert_suffix: File-name suffix for the inverted stem; defaults to
            the ``stem_naming`` entry or ``<stem>_diff``.
        denoise: Average the result with a phase-inverted second pass.
        keep_orig: Keep ``filename`` on disk; it is deleted when False.
        m_threads: Ignored; the thread count is derived from the device.
        device_base: ``"cuda"`` to run on GPU 0, anything else for CPU.

    Returns:
        tuple: ``(main_filepath, invert_filepath)``; an entry is None when
        the corresponding output was excluded.
    """
    if device_base == "cuda":
        device = torch.device("cuda:0")
        processor_num = 0
        device_properties = torch.cuda.get_device_properties(device)
        vram_gb = device_properties.total_memory / 1024**3
        m_threads = 1 if vram_gb < 8 else (8 if vram_gb > 32 else 2)
        # NOTE(review): the ``filename=`` keyword is deprecated in newer
        # librosa releases in favour of ``path=`` — confirm against the
        # pinned librosa version before changing it.
        duration = librosa.get_duration(filename=filename)
        if duration < 60:
            m_threads = 1
        logger.info(f"threads: {m_threads} vram: {vram_gb}")
    else:
        device = torch.device("cpu")
        processor_num = -1
        m_threads = 1

    return _mdx_separate_to_files(
        model_params,
        output_dir,
        model_path,
        filename,
        device=device,
        processor_num=processor_num,
        m_threads=m_threads,
        exclude_main=exclude_main,
        exclude_inversion=exclude_inversion,
        suffix=suffix,
        invert_suffix=invert_suffix,
        denoise=denoise,
        keep_orig=keep_orig,
    )


def run_mdx_beta(
    model_params,
    output_dir,
    model_path,
    filename,
    exclude_main=False,
    exclude_inversion=False,
    suffix=None,
    invert_suffix=None,
    denoise=False,
    keep_orig=True,
    m_threads=2,
    device_base="",
):
    """CPU-only variant of :func:`run_mdx`.

    Takes the same arguments and returns the same
    ``(main_filepath, invert_filepath)`` tuple. ``m_threads`` and
    ``device_base`` are ignored: processing always runs on the CPU and the
    thread count is chosen from the audio duration.
    """
    # NOTE(review): see run_mdx regarding the ``filename=`` keyword.
    duration = librosa.get_duration(filename=filename)
    if IS_COLAB or duration < 60:
        m_threads = 1
    elif duration <= 120:
        m_threads = 8
    else:
        m_threads = 16

    logger.info(f"threads: {m_threads}")

    return _mdx_separate_to_files(
        model_params,
        output_dir,
        model_path,
        filename,
        device=torch.device("cpu"),
        processor_num=-1,
        m_threads=m_threads,
        exclude_main=exclude_main,
        exclude_inversion=exclude_inversion,
        suffix=suffix,
        invert_suffix=invert_suffix,
        denoise=denoise,
        keep_orig=keep_orig,
    )


def _mdx_separate_to_files(
    model_params,
    output_dir,
    model_path,
    filename,
    *,
    device,
    processor_num,
    m_threads,
    exclude_main,
    exclude_inversion,
    suffix,
    invert_suffix,
    denoise,
    keep_orig,
):
    """Run the model on ``filename`` and write the requested stems.

    Shared implementation behind run_mdx and run_mdx_beta.

    Raises:
        ValueError: If ``model_params`` has no entry for the model's hash.
    """
    model_hash = MDX.get_hash(model_path)
    mp = model_params.get(model_hash)
    if mp is None:
        raise ValueError(
            f"No MDX parameters found for model '{model_path}' "
            f"(hash {model_hash})"
        )
    model = MDXModel(
        device,
        dim_f=mp["mdx_dim_f_set"],
        dim_t=2 ** mp["mdx_dim_t_set"],
        n_fft=mp["mdx_n_fft_scale_set"],
        stem_name=mp["primary_stem"],
        compensation=mp["compensate"],
    )

    mdx_sess = MDX(model_path, model, processor=processor_num)
    wave, sr = librosa.load(filename, mono=False, sr=44100)

    # normalizing input wave gives better output
    peak = float(np.max(np.abs(wave)))
    if peak > 0:
        wave /= peak
    else:
        # Silent input: dividing by zero would fill the wave with NaNs.
        peak = 1.0

    if denoise:
        # Average a normal pass with a phase-inverted pass.
        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
            mdx_sess.process_wave(wave, m_threads)
        )
        wave_processed *= 0.5
    else:
        wave_processed = mdx_sess.process_wave(wave, m_threads)

    # return to previous peak
    wave_processed *= peak
    # Bring the mix back to its original level as well, so the inverted
    # stem below subtracts two signals that are on the same scale.
    wave *= peak

    base_name = os.path.basename(os.path.splitext(filename)[0])
    stem_name = model.stem_name if suffix is None else suffix

    main_filepath = None
    if not exclude_main:
        main_filepath = os.path.join(
            output_dir, f"{base_name}_{stem_name}.wav"
        )
        sf.write(main_filepath, wave_processed.T, sr)

    invert_filepath = None
    if not exclude_inversion:
        diff_stem_name = (
            stem_naming.get(stem_name)
            if invert_suffix is None
            else invert_suffix
        )
        stem_name = (
            f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
        )
        invert_filepath = os.path.join(
            output_dir, f"{base_name}_{stem_name}.wav"
        )
        # Inverted stem = mix - compensated primary stem.
        sf.write(
            invert_filepath,
            (-wave_processed.T * model.compensation) + wave.T,
            sr,
        )

    if not keep_orig:
        os.remove(filename)

    # Release the session and the large arrays before the next model loads.
    del mdx_sess, wave_processed, wave
    gc.collect()
    torch.cuda.empty_cache()
    return main_filepath, invert_filepath
551
+
552
+
553
# Base URL the model files are downloaded from.
MDX_DOWNLOAD_LINK = (
    "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/"
)
# ONNX model files used by the separation pipeline.
UVR_MODELS = [
    "UVR-MDX-NET-Voc_FT.onnx",
    "UVR_MDXNET_KARA_2.onnx",
    "Reverb_HQ_By_FoxJoy.onnx",
    "UVR-MDX-NET-Inst_HQ_4.onnx",
]
# Paths are relative to the current working directory.
BASE_DIR = "."
mdxnet_models_dir, output_dir = (
    os.path.join(BASE_DIR, folder)
    for folder in ("mdx_models", "clean_song_output")
)
563
+
564
+
565
def convert_to_stereo_and_wav(audio_path):
    """Return a path to a stereo WAV version of ``audio_path``.

    The file is returned unchanged when it is already a multi-channel
    ``.wav``; otherwise it is converted with ffmpeg.

    Args:
        audio_path: (str) Path of the input audio file.

    Returns:
        str: ``audio_path`` itself, or the path of the converted file.

    Raises:
        Exception: If ffmpeg fails or does not produce the output file.
    """
    wave, _ = librosa.load(audio_path, mono=False, sr=44100)

    # With mono=False a mono file loads as a 1-D array.
    is_mono = wave.ndim == 1
    is_wav = audio_path.lower().endswith(".wav")
    if is_wav and not is_mono:
        return audio_path

    stereo_path = f"{os.path.splitext(audio_path)[0]}_stereo.wav"
    # NOTE(review): when audio_path is absolute, os.path.join discards
    # output_dir and the file is written next to the input — confirm this
    # is intended before changing it.
    stereo_path = os.path.join(output_dir, stereo_path)

    # Pass arguments as a list so paths containing quotes or spaces reach
    # ffmpeg unmodified (no shell-style re-parsing of the file name).
    command = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", audio_path,
        "-ac", "2",
        "-f", "wav",
        stereo_path,
    ]
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        creationflags=(
            subprocess.CREATE_NO_WINDOW if sys.platform == "win32" else 0
        ),
    )
    if result.returncode != 0 or not os.path.exists(stereo_path):
        details = result.stderr.decode(errors="replace").strip()
        message = "Error processing audio to stereo wav"
        raise Exception(f"{message}: {details}" if details else message)

    return stereo_path
591
+
592
+
593
def get_hash(filepath):
    """Return the first 18 hex characters of the file's BLAKE2b digest.

    The file is read in 8192-byte blocks.
    """
    digest = hashlib.blake2b()
    with open(filepath, 'rb') as stream:
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)

    return digest.hexdigest()[:18]
600
+
601
+
602
def random_sleep():
    """Pause briefly: 0.1 s, or a random 3.2-5.9 s when IS_ZERO_GPU is set."""
    delay = round(random.uniform(3.2, 5.9), 1) if IS_ZERO_GPU else 0.1
    time.sleep(delay)
607
+
608
+
609
def process_uvr_task(
    orig_song_path: str = "aud_test.mp3",
    main_vocals: bool = False,
    dereverb: bool = True,
    song_id: str = "mdx",  # folder output name
    only_voiceless: bool = False,
    remove_files_output_dir: bool = False,
):
    """Run the separation pipeline on one song.

    Args:
        orig_song_path: Path of the input audio file.
        main_vocals: Also split the vocals into main and backing vocals.
        dereverb: Also produce a de-reverberated vocal track.
        song_id: Name of the sub-folder of ``output_dir`` to write into.
        only_voiceless: Only produce the voiceless track and return early.
        remove_files_output_dir: Empty ``output_dir`` before processing.

    Returns:
        tuple: When ``only_voiceless`` is True, the 2-tuple returned by
        ``run_mdx`` (voiceless path, None). Otherwise the 5-tuple
        ``(vocals_path, instrumentals_path, backup_vocals_path,
        main_vocals_path, vocals_dereverb_path)``. ``backup_vocals_path``
        is None when ``main_vocals`` is False, and a disabled stage reuses
        the path of the previous stage.
    """
    device_base = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Device: {device_base}")

    if remove_files_output_dir:
        remove_directory_contents(output_dir)

    with open(os.path.join(mdxnet_models_dir, "data.json")) as infile:
        mdx_model_params = json.load(infile)

    song_output_dir = os.path.join(output_dir, song_id)
    create_directories(song_output_dir)
    orig_song_path = convert_to_stereo_and_wav(orig_song_path)

    logger.info(f"onnxruntime device >> {ort.get_device()}")

    if only_voiceless:
        logger.info("Voiceless Track Separation...")

        return run_mdx(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"),
            orig_song_path,
            suffix="Voiceless",
            denoise=False,
            keep_orig=True,
            exclude_inversion=True,
            device_base=device_base,
        )

    logger.info("Vocal Track Isolation...")
    vocals_path, instrumentals_path = run_mdx(
        mdx_model_params,
        song_output_dir,
        os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Voc_FT.onnx"),
        orig_song_path,
        denoise=True,
        keep_orig=True,
        device_base=device_base,
    )

    if main_vocals:
        random_sleep()
        msg_main = "Main Voice Separation from Supporting Vocals..."
        logger.info(msg_main)
        gr.Info(msg_main)
        backup_vocals_path, main_vocals_path = _run_mdx_with_cpu_fallback(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdxnet_models_dir, "UVR_MDXNET_KARA_2.onnx"),
            vocals_path,
            suffix="Backup",
            invert_suffix="Main",
            denoise=True,
            device_base=device_base,
        )
    else:
        backup_vocals_path, main_vocals_path = None, vocals_path

    if dereverb:
        random_sleep()
        msg_dereverb = "Vocal Clarity Enhancement through De-Reverberation..."
        logger.info(msg_dereverb)
        gr.Info(msg_dereverb)
        _, vocals_dereverb_path = _run_mdx_with_cpu_fallback(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdxnet_models_dir, "Reverb_HQ_By_FoxJoy.onnx"),
            main_vocals_path,
            invert_suffix="DeReverb",
            exclude_main=True,
            denoise=True,
            device_base=device_base,
        )
    else:
        vocals_dereverb_path = main_vocals_path

    return (
        vocals_path,
        instrumentals_path,
        backup_vocals_path,
        main_vocals_path,
        vocals_dereverb_path,
    )


def _run_mdx_with_cpu_fallback(*args, **kwargs):
    """Call run_mdx; if it raises, log the error and retry with run_mdx_beta."""
    try:
        return run_mdx(*args, **kwargs)
    except Exception as error:
        # Previously the failure was swallowed silently; record it so the
        # switch to the CPU path is visible in the logs.
        logger.warning(
            f"run_mdx failed ({error!r}); retrying with run_mdx_beta"
        )
        return run_mdx_beta(*args, **kwargs)
728
+
729
+
730
def add_vocal_effects(input_file, output_file, reverb_room_size=0.6, vocal_reverb_dryness=0.8, reverb_damping=0.6, reverb_wet_level=0.35,
                      delay_seconds=0.4, delay_mix=0.25,
                      compressor_threshold_db=-25, compressor_ratio=3.5, compressor_attack_ms=10, compressor_release_ms=60,
                      gain_db=3):
    """Apply a vocal effect chain to ``input_file`` and write ``output_file``.

    The chain is: high-pass filter, reverb, compressor, then a delay (when
    ``delay_seconds`` or ``delay_mix`` is positive) and a gain stage (when
    ``gain_db`` is non-zero).
    """
    chain = [
        HighpassFilter(),
        Reverb(
            room_size=reverb_room_size,
            damping=reverb_damping,
            wet_level=reverb_wet_level,
            dry_level=vocal_reverb_dryness,
        ),
        Compressor(
            threshold_db=compressor_threshold_db,
            ratio=compressor_ratio,
            attack_ms=compressor_attack_ms,
            release_ms=compressor_release_ms,
        ),
    ]

    if delay_seconds > 0 or delay_mix > 0:
        chain.append(Delay(delay_seconds=delay_seconds, mix=delay_mix))

    if gain_db:
        chain.append(Gain(gain_db=gain_db))

    board = Pedalboard(chain)

    with AudioFile(input_file) as src, \
            AudioFile(output_file, 'w', src.samplerate, src.num_channels) as dst:
        # Process one second of audio per iteration until the input is
        # exhausted; reset=False carries effect state across iterations.
        while src.tell() < src.frames:
            block = src.read(int(src.samplerate))
            dst.write(board(block, src.samplerate, reset=False))
760
+
761
+
762
def add_instrumental_effects(input_file, output_file, highpass_freq=100, lowpass_freq=12000,
                             reverb_room_size=0.5, reverb_damping=0.5, reverb_wet_level=0.25,
                             compressor_threshold_db=-20, compressor_ratio=2.5, compressor_attack_ms=15, compressor_release_ms=80,
                             gain_db=2):
    """Render ``input_file`` through the instrumental effects chain into ``output_file``.

    The chain is: high-pass filter, low-pass filter, a ``Reverb`` (only when
    at least one of its three settings is positive), a ``Compressor`` and a
    ``Gain`` (only when ``gain_db`` is truthy). The output file is created
    with the sample rate and channel count of the input file.
    """
    chain = [
        HighpassFilter(cutoff_frequency_hz=highpass_freq),
        LowpassFilter(cutoff_frequency_hz=lowpass_freq),
    ]

    reverb_settings = (reverb_room_size, reverb_damping, reverb_wet_level)
    if any(setting > 0 for setting in reverb_settings):
        chain.append(
            Reverb(
                room_size=reverb_room_size,
                damping=reverb_damping,
                wet_level=reverb_wet_level,
            )
        )

    chain.append(
        Compressor(
            threshold_db=compressor_threshold_db,
            ratio=compressor_ratio,
            attack_ms=compressor_attack_ms,
            release_ms=compressor_release_ms,
        )
    )

    if gain_db:
        chain.append(Gain(gain_db=gain_db))

    board = Pedalboard(chain)

    with AudioFile(input_file) as reader, \
            AudioFile(output_file, 'w', reader.samplerate, reader.num_channels) as writer:
        # Stream roughly one second per iteration; reset=False keeps the
        # effect state continuous across chunk borders.
        while reader.tell() < reader.frames:
            block = reader.read(int(reader.samplerate))
            writer.write(board(block, reader.samplerate, reset=False))
789
+
790
+
791
+ COMMON_SAMPLE_RATES = [8000, 16000, 22050, 32000, 44100, 48000, 96000]
792
+
793
+
794
def save_audio(audio_opt: np.ndarray, final_sr: int, output_audio_path: str, target_format: str) -> str:
    """Write ``audio_opt`` to ``output_audio_path`` and return the path actually usable.

    WAV output is written at ``final_sr`` unchanged. For any other extension
    the audio is first resampled to the closest entry of
    ``COMMON_SAMPLE_RATES`` when ``final_sr`` is not already one of them.

    Args:
        audio_opt: Samples in soundfile layout, i.e. ``(frames,)`` or
            ``(frames, channels)`` -- the same layout ``sf.write`` consumes.
        final_sr: Sample rate of ``audio_opt`` in Hz.
        output_audio_path: Destination path; its extension selects the
            WAV / non-WAV branch.
        target_format: Container name handed to ``sf.write`` (e.g. ``"mp3"``).

    Returns:
        ``output_audio_path`` on success. If writing fails, the path of a WAV
        fallback file: an already existing ``<name>.wav`` is reused,
        otherwise the audio is written there as WAV.
    """
    ext = os.path.splitext(output_audio_path)[1].lower()

    try:
        if ext == ".wav":
            sf.write(output_audio_path, audio_opt, final_sr, format=target_format)
        else:
            target_sr = min(COMMON_SAMPLE_RATES, key=lambda altsr: abs(altsr - final_sr))
            resampled = audio_opt
            if target_sr != final_sr:
                logger.warning(f"Resampling from {final_sr} -> {target_sr} for {ext}")
                # The time axis is axis 0 in soundfile layout. librosa's
                # default (axis=-1) would resample across the channels of a
                # stereo (frames, channels) array and destroy the audio.
                resampled = librosa.resample(
                    audio_opt, orig_sr=final_sr, target_sr=target_sr, axis=0
                )
            sf.write(output_audio_path, resampled, target_sr, format=target_format)
    except Exception as e:
        logger.error(e)
        logger.error(f"Error saving {output_audio_path}, performing fallback to WAV")

        fallback_path = output_audio_path.replace(f"_converted.{target_format}", ".wav")
        if fallback_path == output_audio_path:
            # The path did not follow the "_converted.<fmt>" naming scheme.
            fallback_path = os.path.splitext(output_audio_path)[0] + ".wav"

        # Reuse an existing WAV (typically the conversion source) instead of
        # overwriting it; otherwise materialise the fallback so the returned
        # path always points at a real file.
        if fallback_path != output_audio_path and not os.path.exists(fallback_path):
            try:
                sf.write(fallback_path, audio_opt, final_sr, format="WAV")
            except Exception as fallback_error:
                logger.error(f"WAV fallback failed for {fallback_path}: {fallback_error}")

        output_audio_path = fallback_path

    return output_audio_path
815
+
816
+
817
def convert_format(file_paths, media_dir, target_format):
    """Convert each file in ``file_paths`` to ``target_format`` inside ``media_dir``.

    When the target format is WAV (case-insensitive) the input list is
    returned untouched. Otherwise every file is read with soundfile and
    handed to ``save_audio``, which writes
    ``<media_dir>/<basename>_converted.<format>``; the list of paths returned
    by ``save_audio`` is the result.
    """
    target_format = target_format.lower()
    if target_format == "wav":
        # Nothing to convert.
        return file_paths

    results = []
    for source in file_paths:
        source_abs = os.path.abspath(source)
        base_name = os.path.splitext(os.path.basename(source_abs))[0]
        destination = os.path.join(media_dir, f"{base_name}_converted.{target_format}")

        samples, sample_rate = sf.read(source_abs)
        results.append(save_audio(samples, sample_rate, destination, target_format))

    return results
848
+
849
+
850
def sound_separate(
    media_file, stem, main, dereverb, vocal_effects=True, background_effects=True,
    vocal_reverb_room_size=0.6, vocal_reverb_damping=0.6, vocal_reverb_dryness=0.8, vocal_reverb_wet_level=0.35,
    vocal_delay_seconds=0.4, vocal_delay_mix=0.25,
    vocal_compressor_threshold_db=-25, vocal_compressor_ratio=3.5, vocal_compressor_attack_ms=10, vocal_compressor_release_ms=60,
    vocal_gain_db=4,
    background_highpass_freq=120, background_lowpass_freq=11000,
    background_reverb_room_size=0.5, background_reverb_damping=0.5, background_reverb_wet_level=0.25,
    background_compressor_threshold_db=-20, background_compressor_ratio=2.5, background_compressor_attack_ms=15, background_compressor_release_ms=80,
    background_gain_db=3,
    target_format="WAV",
):
    """Separate the requested stems from ``media_file`` and return the result files.

    ``stem`` selects which stems are produced: it is tested with
    ``"vocal" in stem`` and ``"background" in stem``. Each stem is extracted
    with ``process_uvr_task`` and, when the matching ``*_effects`` flag is
    set, post-processed with ``add_vocal_effects`` /
    ``add_instrumental_effects``. A failure in the vocal branch is reported
    through ``gr.Info`` and the logger and does not abort the call; a failure
    in the background branch propagates.

    Returns:
        The output of ``convert_format`` for the collected stem files.

    Raises:
        ValueError: If ``media_file`` or ``stem`` is empty.
        Exception: If no stem file was produced.
    """
    if not media_file:
        raise ValueError("The audio path is missing.")
    if not stem:
        raise ValueError("Please select 'vocal' or 'background' stem.")

    hash_audio = str(get_hash(media_file))
    media_dir = os.path.dirname(media_file)
    produced = []

    def _effects_path(stem_path):
        # Insert "_effects" before the extension of the absolute stem path,
        # then join it onto media_dir exactly as before.
        root, extension = os.path.splitext(os.path.abspath(stem_path))
        return os.path.join(media_dir, root + '_effects' + extension)

    # Best-effort duration report; any failure is only printed.
    try:
        duration_base_ = librosa.get_duration(filename=media_file)
        print("Duration audio:", duration_base_)
    except Exception as e:
        print(e)

    started_at = time.time()

    if "vocal" in stem:
        try:
            _vocals, _instrumental, _backup, _lead, vocal_audio = process_uvr_task(
                orig_song_path=media_file,
                song_id=hash_audio + "mdx",
                main_vocals=main,
                dereverb=dereverb,
                remove_files_output_dir=False,
            )

            if vocal_effects:
                vocal_fx_path = _effects_path(vocal_audio)
                add_vocal_effects(
                    vocal_audio,
                    vocal_fx_path,
                    reverb_room_size=vocal_reverb_room_size,
                    reverb_damping=vocal_reverb_damping,
                    vocal_reverb_dryness=vocal_reverb_dryness,
                    reverb_wet_level=vocal_reverb_wet_level,
                    delay_seconds=vocal_delay_seconds,
                    delay_mix=vocal_delay_mix,
                    compressor_threshold_db=vocal_compressor_threshold_db,
                    compressor_ratio=vocal_compressor_ratio,
                    compressor_attack_ms=vocal_compressor_attack_ms,
                    compressor_release_ms=vocal_compressor_release_ms,
                    gain_db=vocal_gain_db,
                )
                vocal_audio = vocal_fx_path

            produced.append(vocal_audio)
        except Exception as error:
            gr.Info(str(error))
            logger.error(str(error))

    if "background" in stem:
        background_audio, _ = process_uvr_task(
            orig_song_path=media_file,
            song_id=hash_audio + "voiceless",
            only_voiceless=True,
            remove_files_output_dir=False,
        )

        if background_effects:
            background_fx_path = _effects_path(background_audio)
            add_instrumental_effects(
                background_audio,
                background_fx_path,
                highpass_freq=background_highpass_freq,
                lowpass_freq=background_lowpass_freq,
                reverb_room_size=background_reverb_room_size,
                reverb_damping=background_reverb_damping,
                reverb_wet_level=background_reverb_wet_level,
                compressor_threshold_db=background_compressor_threshold_db,
                compressor_ratio=background_compressor_ratio,
                compressor_attack_ms=background_compressor_attack_ms,
                compressor_release_ms=background_compressor_release_ms,
                gain_db=background_gain_db,
            )
            background_audio = background_fx_path

        produced.append(background_audio)

    elapsed = time.time() - started_at
    logger.info(f"Execution time: {elapsed} seconds")

    if not produced:
        raise Exception("Error in sound separation.")

    return convert_format(produced, media_dir, target_format)
941
+
942
+
943
def audio_downloader(
    url_media,
):
    """Download the audio track of ``url_media`` as an ``.m4a`` file via yt-dlp.

    Args:
        url_media: URL typed by the user; ``None`` and blank strings are
            treated as "nothing to download".

    Returns:
        The path of the downloaded file inside the ``downloads`` directory,
        or ``None`` when no URL was given or when YouTube downloads are
        blocked because ``IS_ZERO_GPU`` is set.
    """
    # Tolerate None as well as whitespace-only input.
    url_media = (url_media or "").strip()

    if not url_media:
        return None

    # Short youtu.be links point at the same service as youtube.com URLs.
    youtube_hosts = ("youtube.com", "youtu.be")
    if IS_ZERO_GPU and any(host in url_media for host in youtube_hosts):
        gr.Info("This option isn’t available on Hugging Face.")
        return None

    import yt_dlp
    from yt_dlp.utils import sanitize_filename

    dir_output_downloads = "downloads"
    os.makedirs(dir_output_downloads, exist_ok=True)

    # Fetch metadata only; the context manager releases yt-dlp's resources.
    with yt_dlp.YoutubeDL(
        {"quiet": True, "no_warnings": True, "noplaylist": True}
    ) as ydl_info:
        media_info = ydl_info.extract_info(url_media, download=False)

    # A raw title may contain path separators or other characters that are
    # invalid in file names, so sanitise it before building the path.
    raw_title = media_info.get("title") or media_info.get("id") or "audio"
    safe_title = sanitize_filename(str(raw_title), restricted=True) or "audio"
    download_path = os.path.join(dir_output_downloads, f"{safe_title}.m4a")

    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'postprocessors': [{  # Extract audio using ffmpeg
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'm4a',
        }],
        'force_overwrites': True,
        'noplaylist': True,
        'no_warnings': True,
        'quiet': True,
        'ignore_no_formats_error': True,
        'restrictfilenames': True,
        # outtmpl is a %-style template: escape literal percent signs so the
        # file really lands at download_path.
        'outtmpl': download_path.replace("%", "%%"),
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl_download:
        ydl_download.download([url_media])

    return download_path
985
+
986
+
987
def downloader_conf():
    """Create the unchecked "URL-to-Audio" checkbox, rendered without a container."""
    toggle = gr.Checkbox(value=False, label="URL-to-Audio", container=False)
    return toggle
994
+
995
+
996
def url_media_conf():
    """Create the single-line, initially hidden textbox for the media URL."""
    settings = {
        "value": "",
        "label": "Enter URL",
        "placeholder": "www.youtube.com/watch?v=g_9rPvbENUw",
        "visible": False,
        "lines": 1,
    }
    return gr.Textbox(**settings)
1004
+
1005
+
1006
def url_button_conf():
    """Create the initially hidden secondary "Go" button."""
    go_button = gr.Button(value="Go", variant="secondary", visible=False)
    return go_button
1012
+
1013
+
1014
def show_components_downloader(value_active):
    """Return two visibility updates, both set to ``value_active``."""
    first_update = gr.update(visible=value_active)
    second_update = gr.update(visible=value_active)
    return first_update, second_update
1020
+
1021
+
1022
def audio_conf():
    """Create the "Audio file" upload component that yields a file path."""
    settings = {
        "label": "Audio file",
        "type": "filepath",
        "container": True,
    }
    return gr.File(**settings)
1029
+
1030
+
1031
def stem_conf():
    """Create the "Stem" checkbox group (vocal / background), preset to "vocal"."""
    available_stems = ["vocal", "background"]
    return gr.CheckboxGroup(choices=available_stems, value="vocal", label="Stem")
1038
+
1039
+
1040
def main_conf():
    """Create the unchecked "Main" checkbox."""
    checkbox = gr.Checkbox(value=False, label="Main")
    return checkbox
1046
+
1047
+
1048
def dereverb_conf():
    """Create the unchecked, visible "Dereverb" checkbox."""
    checkbox = gr.Checkbox(value=False, label="Dereverb", visible=True)
    return checkbox
1055
+
1056
+
1057
def vocal_effects_conf():
    """Create the unchecked, visible "Vocal Effects" checkbox."""
    checkbox = gr.Checkbox(value=False, label="Vocal Effects", visible=True)
    return checkbox
1064
+
1065
+
1066
def background_effects_conf():
    """Create the unchecked, initially hidden "Background Effects" checkbox."""
    checkbox = gr.Checkbox(value=False, label="Background Effects", visible=False)
    return checkbox
1073
+
1074
+
1075
def vocal_reverb_room_size_conf():
    """Create the vocal reverb room size input (0.0-1.0, step 0.05, default 0.15)."""
    settings = {
        "value": 0.15,
        "label": "Vocal Reverb Room Size",
        "minimum": 0.0,
        "maximum": 1.0,
        "step": 0.05,
        "visible": True,
    }
    return gr.Number(**settings)
1084
+
1085
+
1086
def vocal_reverb_damping_conf():
    """Create the vocal reverb damping input (0.0-1.0, step 0.01, default 0.7)."""
    settings = {
        "value": 0.7,
        "label": "Vocal Reverb Damping",
        "minimum": 0.0,
        "maximum": 1.0,
        "step": 0.01,
        "visible": True,
    }
    return gr.Number(**settings)
1095
+
1096
+
1097
def vocal_reverb_wet_level_conf():
    """Create the vocal reverb wet level input (0.0-1.0, step 0.05, default 0.2)."""
    settings = {
        "value": 0.2,
        "label": "Vocal Reverb Wet Level",
        "minimum": 0.0,
        "maximum": 1.0,
        "step": 0.05,
        "visible": True,
    }
    return gr.Number(**settings)
1106
+
1107
+
1108
def vocal_reverb_dryness_level_conf():
    """Create the vocal reverb dryness level input (0.0-1.0, step 0.05, default 0.8)."""
    settings = {
        "value": 0.8,
        "label": "Vocal Reverb Dryness Level",
        "minimum": 0.0,
        "maximum": 1.0,
        "step": 0.05,
        "visible": True,
    }
    return gr.Number(**settings)
1117
+
1118
+
1119
def vocal_delay_seconds_conf():
    """Create the vocal delay time input (0.0-1.0, step 0.01, default 0.0)."""
    settings = {
        "value": 0.0,
        "label": "Vocal Delay Seconds",
        "minimum": 0.0,
        "maximum": 1.0,
        "step": 0.01,
        "visible": True,
    }
    return gr.Number(**settings)
1128
+
1129
+
1130
def vocal_delay_mix_conf():
    """Create the vocal delay mix input (0.0-1.0, step 0.01, default 0.0)."""
    settings = {
        "value": 0.0,
        "label": "Vocal Delay Mix",
        "minimum": 0.0,
        "maximum": 1.0,
        "step": 0.01,
        "visible": True,
    }
    return gr.Number(**settings)
1139
+
1140
+
1141
def vocal_compressor_threshold_db_conf():
    """Create the vocal compressor threshold input (-60 to 0, step 1, default -15)."""
    settings = {
        "value": -15,
        "label": "Vocal Compressor Threshold (dB)",
        "minimum": -60,
        "maximum": 0,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1150
+
1151
+
1152
def vocal_compressor_ratio_conf():
    """Create the vocal compressor ratio input (0-20, step 0.1, default 4.0)."""
    settings = {
        "value": 4.0,
        "label": "Vocal Compressor Ratio",
        "minimum": 0,
        "maximum": 20,
        "step": 0.1,
        "visible": True,
    }
    return gr.Number(**settings)
1161
+
1162
+
1163
def vocal_compressor_attack_ms_conf():
    """Create the vocal compressor attack input (0-1000, step 1, default 1.0)."""
    settings = {
        "value": 1.0,
        "label": "Vocal Compressor Attack (ms)",
        "minimum": 0,
        "maximum": 1000,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1172
+
1173
+
1174
def vocal_compressor_release_ms_conf():
    """Create the vocal compressor release input (0-3000, step 1, default 100)."""
    settings = {
        "value": 100,
        "label": "Vocal Compressor Release (ms)",
        "minimum": 0,
        "maximum": 3000,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1183
+
1184
+
1185
def vocal_gain_db_conf():
    """Create the vocal gain input (-40 to 40, step 1, default 0)."""
    settings = {
        "value": 0,
        "label": "Vocal Gain (dB)",
        "minimum": -40,
        "maximum": 40,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1194
+
1195
+
1196
def background_highpass_freq_conf():
    """Create the background high-pass frequency input (0-1000, step 1, default 120)."""
    settings = {
        "value": 120,
        "label": "Background Highpass Frequency (Hz)",
        "minimum": 0,
        "maximum": 1000,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1205
+
1206
+
1207
def background_lowpass_freq_conf():
    """Create the background low-pass frequency input (0-20000, step 1, default 11000)."""
    settings = {
        "value": 11000,
        "label": "Background Lowpass Frequency (Hz)",
        "minimum": 0,
        "maximum": 20000,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1216
+
1217
+
1218
def background_reverb_room_size_conf():
    """Create the background reverb room size input (0.0-1.0, step 0.1, default 0.1)."""
    settings = {
        "value": 0.1,
        "label": "Background Reverb Room Size",
        "minimum": 0.0,
        "maximum": 1.0,
        "step": 0.1,
        "visible": True,
    }
    return gr.Number(**settings)
1227
+
1228
+
1229
def background_reverb_damping_conf():
    """Create the background reverb damping input (0.0-1.0, step 0.1, default 0.5)."""
    settings = {
        "value": 0.5,
        "label": "Background Reverb Damping",
        "minimum": 0.0,
        "maximum": 1.0,
        "step": 0.1,
        "visible": True,
    }
    return gr.Number(**settings)
1238
+
1239
+
1240
def background_reverb_wet_level_conf():
    """Create the background reverb wet level input (0.0-1.0, step 0.05, default 0.25)."""
    settings = {
        "value": 0.25,
        "label": "Background Reverb Wet Level",
        "minimum": 0.0,
        "maximum": 1.0,
        "step": 0.05,
        "visible": True,
    }
    return gr.Number(**settings)
1249
+
1250
+
1251
def background_compressor_threshold_db_conf():
    """Create the background compressor threshold input (-60 to 0, step 1, default -15)."""
    settings = {
        "value": -15,
        "label": "Background Compressor Threshold (dB)",
        "minimum": -60,
        "maximum": 0,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1260
+
1261
+
1262
def background_compressor_ratio_conf():
    """Create the background compressor ratio input (0-20, step 0.1, default 4.0)."""
    settings = {
        "value": 4.0,
        "label": "Background Compressor Ratio",
        "minimum": 0,
        "maximum": 20,
        "step": 0.1,
        "visible": True,
    }
    return gr.Number(**settings)
1271
+
1272
+
1273
def background_compressor_attack_ms_conf():
    """Create the background compressor attack input (0-1000, step 1, default 15)."""
    settings = {
        "value": 15,
        "label": "Background Compressor Attack (ms)",
        "minimum": 0,
        "maximum": 1000,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1282
+
1283
+
1284
def background_compressor_release_ms_conf():
    """Create the background compressor release input (0-3000, step 1, default 60)."""
    settings = {
        "value": 60,
        "label": "Background Compressor Release (ms)",
        "minimum": 0,
        "maximum": 3000,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1293
+
1294
+
1295
def background_gain_db_conf():
    """Create the background gain input (-40 to 40, step 1, default 0)."""
    settings = {
        "value": 0,
        "label": "Background Gain (dB)",
        "minimum": -40,
        "maximum": 40,
        "step": 1,
        "visible": True,
    }
    return gr.Number(**settings)
1304
+
1305
+
1306
def button_conf():
    """Create the primary "Inference" button."""
    run_button = gr.Button(value="Inference", variant="primary")
    return run_button
1311
+
1312
+
1313
def output_conf():
    """Create the read-only "Result" file component that accepts multiple files."""
    settings = {
        "label": "Result",
        "file_count": "multiple",
        "interactive": False,
    }
    return gr.File(**settings)
1319
+
1320
+
1321
def show_vocal_components(value_name):
    """Return four visibility updates derived from the selected stems.

    The first three updates are visible when ``"vocal"`` is in
    ``value_name``; the fourth is visible when ``"background"`` is.
    """
    has_vocal = "vocal" in value_name
    has_background = "background" in value_name

    flags = (has_vocal, has_vocal, has_vocal, has_background)
    return tuple(gr.update(visible=flag) for flag in flags)
1330
+
1331
+
1332
+ FORMAT_OPTIONS = ["WAV", "MP3", "FLAC"]
1333
+
1334
+
1335
def format_conf():
    """Create the output-format dropdown, preset to the first entry of ``FORMAT_OPTIONS``."""
    default_format = FORMAT_OPTIONS[0]
    return gr.Dropdown(choices=FORMAT_OPTIONS, value=default_format, label="Format output:")
1341
+
1342
+
1343
def get_gui(theme):
    """Assemble the Gradio Blocks application and wire up its events.

    Layout, top to bottom: title and description, the URL downloader
    controls, the audio file input, the stem selector, the option
    checkboxes with the two effect-parameter accordions, the output format
    dropdown, the run button, the result files, one example row and the
    resources text.

    Returns:
        The configured ``gr.Blocks`` instance (not launched).
    """
    with gr.Blocks(theme=theme, fill_width=True, fill_height=False, delete_cache=(3200, 10800)) as app:
        gr.Markdown(title)
        gr.Markdown(description)

        # URL downloader: the checkbox reveals the URL box and its button.
        downloader_gui = downloader_conf()
        with gr.Row():
            with gr.Column(scale=2):
                url_media_gui = url_media_conf()
            with gr.Column(scale=1):
                url_button_gui = url_button_conf()

        downloader_gui.change(
            fn=show_components_downloader,
            inputs=[downloader_gui],
            outputs=[url_media_gui, url_button_gui],
        )

        aud = audio_conf()

        # A finished download is placed into the audio file input.
        url_button_gui.click(
            fn=audio_downloader,
            inputs=[url_media_gui],
            outputs=[aud],
        )

        with gr.Column():
            with gr.Row():
                stem_gui = stem_conf()

        with gr.Column():
            with gr.Row():
                main_gui = main_conf()
                dereverb_gui = dereverb_conf()
                vocal_effects_gui = vocal_effects_conf()
                background_effects_gui = background_effects_conf()

            # The list order below is both the on-screen order and the
            # positional order sound_separate expects.
            with gr.Accordion("Vocal Effects Parameters", open=False):
                with gr.Row():
                    vocal_param_guis = [
                        vocal_reverb_room_size_conf(),
                        vocal_reverb_damping_conf(),
                        vocal_reverb_dryness_level_conf(),
                        vocal_reverb_wet_level_conf(),
                        vocal_delay_seconds_conf(),
                        vocal_delay_mix_conf(),
                        vocal_compressor_threshold_db_conf(),
                        vocal_compressor_ratio_conf(),
                        vocal_compressor_attack_ms_conf(),
                        vocal_compressor_release_ms_conf(),
                        vocal_gain_db_conf(),
                    ]

            with gr.Accordion("Background Effects Parameters", open=False):
                with gr.Row():
                    background_param_guis = [
                        background_highpass_freq_conf(),
                        background_lowpass_freq_conf(),
                        background_reverb_room_size_conf(),
                        background_reverb_damping_conf(),
                        background_reverb_wet_level_conf(),
                        background_compressor_threshold_db_conf(),
                        background_compressor_ratio_conf(),
                        background_compressor_attack_ms_conf(),
                        background_compressor_release_ms_conf(),
                        background_gain_db_conf(),
                    ]

            stem_gui.change(
                fn=show_vocal_components,
                inputs=[stem_gui],
                outputs=[main_gui, dereverb_gui, vocal_effects_gui, background_effects_gui],
            )

        target_format_gui = format_conf()
        button_base = button_conf()
        output_base = output_conf()

        # Inputs shared by the run button and the example row.
        separation_inputs = [
            aud,
            stem_gui,
            main_gui,
            dereverb_gui,
            vocal_effects_gui,
            background_effects_gui,
            *vocal_param_guis,
            *background_param_guis,
        ]

        button_base.click(
            fn=sound_separate,
            inputs=separation_inputs + [target_format_gui],
            outputs=[output_base],
        )

        # The example row carries no format value, so sound_separate falls
        # back to its default target_format there.
        example_row = [
            "./test.mp3",
            "vocal",
            False,
            False,
            False,
            False,
            0.15, 0.7, 0.8, 0.2,
            0., 0., -15, 4., 1, 100, 0,
            120, 11000, 0.5, 0.1, 0.25, -15, 4., 15, 60, 0,
        ]
        gr.Examples(
            examples=[example_row],
            fn=sound_separate,
            inputs=separation_inputs,
            outputs=[output_base],
            cache_examples=False,
        )

        gr.Markdown(RESOURCES)

    return app
1474
+
1475
+
1476
if __name__ == "__main__":
    # Fetch every model listed in UVR_MODELS into the models directory
    # before the interface is built.
    for id_model in UVR_MODELS:
        model_url = os.path.join(MDX_DOWNLOAD_LINK, id_model)
        download_manager(model_url, mdxnet_models_dir)

    app = get_gui(theme)
    app.queue(default_concurrency_limit=40)

    # Public share link and debug mode are enabled only when IS_COLAB is set.
    launch_options = {
        "max_threads": 40,
        "share": IS_COLAB,
        "show_error": True,
        "quiet": False,
        "debug": IS_COLAB,
        "ssr_mode": False,
    }
    app.launch(**launch_options)
mdx_models/data.json ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0ddfc0eb5792638ad5dc27850236c246": {
3
+ "compensate": 1.035,
4
+ "mdx_dim_f_set": 2048,
5
+ "mdx_dim_t_set": 8,
6
+ "mdx_n_fft_scale_set": 6144,
7
+ "primary_stem": "Vocals"
8
+ },
9
+ "26d308f91f3423a67dc69a6d12a8793d": {
10
+ "compensate": 1.035,
11
+ "mdx_dim_f_set": 2048,
12
+ "mdx_dim_t_set": 9,
13
+ "mdx_n_fft_scale_set": 8192,
14
+ "primary_stem": "Other"
15
+ },
16
+ "2cdd429caac38f0194b133884160f2c6": {
17
+ "compensate": 1.045,
18
+ "mdx_dim_f_set": 3072,
19
+ "mdx_dim_t_set": 8,
20
+ "mdx_n_fft_scale_set": 7680,
21
+ "primary_stem": "Instrumental"
22
+ },
23
+ "2f5501189a2f6db6349916fabe8c90de": {
24
+ "compensate": 1.035,
25
+ "mdx_dim_f_set": 2048,
26
+ "mdx_dim_t_set": 8,
27
+ "mdx_n_fft_scale_set": 6144,
28
+ "primary_stem": "Vocals"
29
+ },
30
+ "398580b6d5d973af3120df54cee6759d": {
31
+ "compensate": 1.75,
32
+ "mdx_dim_f_set": 3072,
33
+ "mdx_dim_t_set": 8,
34
+ "mdx_n_fft_scale_set": 7680,
35
+ "primary_stem": "Vocals"
36
+ },
37
+ "488b3e6f8bd3717d9d7c428476be2d75": {
38
+ "compensate": 1.035,
39
+ "mdx_dim_f_set": 3072,
40
+ "mdx_dim_t_set": 8,
41
+ "mdx_n_fft_scale_set": 7680,
42
+ "primary_stem": "Instrumental"
43
+ },
44
+ "4910e7827f335048bdac11fa967772f9": {
45
+ "compensate": 1.035,
46
+ "mdx_dim_f_set": 2048,
47
+ "mdx_dim_t_set": 7,
48
+ "mdx_n_fft_scale_set": 4096,
49
+ "primary_stem": "Drums"
50
+ },
51
+ "53c4baf4d12c3e6c3831bb8f5b532b93": {
52
+ "compensate": 1.043,
53
+ "mdx_dim_f_set": 3072,
54
+ "mdx_dim_t_set": 8,
55
+ "mdx_n_fft_scale_set": 7680,
56
+ "primary_stem": "Vocals"
57
+ },
58
+ "5d343409ef0df48c7d78cce9f0106781": {
59
+ "compensate": 1.075,
60
+ "mdx_dim_f_set": 3072,
61
+ "mdx_dim_t_set": 8,
62
+ "mdx_n_fft_scale_set": 7680,
63
+ "primary_stem": "Vocals"
64
+ },
65
+ "5f6483271e1efb9bfb59e4a3e6d4d098": {
66
+ "compensate": 1.035,
67
+ "mdx_dim_f_set": 2048,
68
+ "mdx_dim_t_set": 9,
69
+ "mdx_n_fft_scale_set": 6144,
70
+ "primary_stem": "Vocals"
71
+ },
72
+ "65ab5919372a128e4167f5e01a8fda85": {
73
+ "compensate": 1.035,
74
+ "mdx_dim_f_set": 2048,
75
+ "mdx_dim_t_set": 8,
76
+ "mdx_n_fft_scale_set": 8192,
77
+ "primary_stem": "Other"
78
+ },
79
+ "6703e39f36f18aa7855ee1047765621d": {
80
+ "compensate": 1.035,
81
+ "mdx_dim_f_set": 2048,
82
+ "mdx_dim_t_set": 9,
83
+ "mdx_n_fft_scale_set": 16384,
84
+ "primary_stem": "Bass"
85
+ },
86
+ "6b31de20e84392859a3d09d43f089515": {
87
+ "compensate": 1.035,
88
+ "mdx_dim_f_set": 2048,
89
+ "mdx_dim_t_set": 8,
90
+ "mdx_n_fft_scale_set": 6144,
91
+ "primary_stem": "Vocals"
92
+ },
93
+ "867595e9de46f6ab699008295df62798": {
94
+ "compensate": 1.03,
95
+ "mdx_dim_f_set": 3072,
96
+ "mdx_dim_t_set": 8,
97
+ "mdx_n_fft_scale_set": 7680,
98
+ "primary_stem": "Vocals"
99
+ },
100
+ "a3cd63058945e777505c01d2507daf37": {
101
+ "compensate": 1.03,
102
+ "mdx_dim_f_set": 2048,
103
+ "mdx_dim_t_set": 8,
104
+ "mdx_n_fft_scale_set": 6144,
105
+ "primary_stem": "Vocals"
106
+ },
107
+ "b33d9b3950b6cbf5fe90a32608924700": {
108
+ "compensate": 1.03,
109
+ "mdx_dim_f_set": 3072,
110
+ "mdx_dim_t_set": 8,
111
+ "mdx_n_fft_scale_set": 7680,
112
+ "primary_stem": "Vocals"
113
+ },
114
+ "c3b29bdce8c4fa17ec609e16220330ab": {
115
+ "compensate": 1.035,
116
+ "mdx_dim_f_set": 2048,
117
+ "mdx_dim_t_set": 8,
118
+ "mdx_n_fft_scale_set": 16384,
119
+ "primary_stem": "Bass"
120
+ },
121
+ "ceed671467c1f64ebdfac8a2490d0d52": {
122
+ "compensate": 1.035,
123
+ "mdx_dim_f_set": 3072,
124
+ "mdx_dim_t_set": 8,
125
+ "mdx_n_fft_scale_set": 7680,
126
+ "primary_stem": "Instrumental"
127
+ },
128
+ "d2a1376f310e4f7fa37fb9b5774eb701": {
129
+ "compensate": 1.035,
130
+ "mdx_dim_f_set": 3072,
131
+ "mdx_dim_t_set": 8,
132
+ "mdx_n_fft_scale_set": 7680,
133
+ "primary_stem": "Instrumental"
134
+ },
135
+ "d7bff498db9324db933d913388cba6be": {
136
+ "compensate": 1.035,
137
+ "mdx_dim_f_set": 2048,
138
+ "mdx_dim_t_set": 8,
139
+ "mdx_n_fft_scale_set": 6144,
140
+ "primary_stem": "Vocals"
141
+ },
142
+ "d94058f8c7f1fae4164868ae8ae66b20": {
143
+ "compensate": 1.035,
144
+ "mdx_dim_f_set": 2048,
145
+ "mdx_dim_t_set": 8,
146
+ "mdx_n_fft_scale_set": 6144,
147
+ "primary_stem": "Vocals"
148
+ },
149
+ "dc41ede5961d50f277eb846db17f5319": {
150
+ "compensate": 1.035,
151
+ "mdx_dim_f_set": 2048,
152
+ "mdx_dim_t_set": 9,
153
+ "mdx_n_fft_scale_set": 4096,
154
+ "primary_stem": "Drums"
155
+ },
156
+ "e5572e58abf111f80d8241d2e44e7fa4": {
157
+ "compensate": 1.028,
158
+ "mdx_dim_f_set": 3072,
159
+ "mdx_dim_t_set": 8,
160
+ "mdx_n_fft_scale_set": 7680,
161
+ "primary_stem": "Instrumental"
162
+ },
163
+ "e7324c873b1f615c35c1967f912db92a": {
164
+ "compensate": 1.03,
165
+ "mdx_dim_f_set": 3072,
166
+ "mdx_dim_t_set": 8,
167
+ "mdx_n_fft_scale_set": 7680,
168
+ "primary_stem": "Vocals"
169
+ },
170
+ "1c56ec0224f1d559c42fd6fd2a67b154": {
171
+ "compensate": 1.025,
172
+ "mdx_dim_f_set": 2048,
173
+ "mdx_dim_t_set": 8,
174
+ "mdx_n_fft_scale_set": 5120,
175
+ "primary_stem": "Instrumental"
176
+ },
177
+ "f2df6d6863d8f435436d8b561594ff49": {
178
+ "compensate": 1.035,
179
+ "mdx_dim_f_set": 3072,
180
+ "mdx_dim_t_set": 8,
181
+ "mdx_n_fft_scale_set": 7680,
182
+ "primary_stem": "Instrumental"
183
+ },
184
+ "b06327a00d5e5fbc7d96e1781bbdb596": {
185
+ "compensate": 1.035,
186
+ "mdx_dim_f_set": 3072,
187
+ "mdx_dim_t_set": 8,
188
+ "mdx_n_fft_scale_set": 6144,
189
+ "primary_stem": "Instrumental"
190
+ },
191
+ "94ff780b977d3ca07c7a343dab2e25dd": {
192
+ "compensate": 1.039,
193
+ "mdx_dim_f_set": 3072,
194
+ "mdx_dim_t_set": 8,
195
+ "mdx_n_fft_scale_set": 6144,
196
+ "primary_stem": "Instrumental"
197
+ },
198
+ "73492b58195c3b52d34590d5474452f6": {
199
+ "compensate": 1.043,
200
+ "mdx_dim_f_set": 3072,
201
+ "mdx_dim_t_set": 8,
202
+ "mdx_n_fft_scale_set": 7680,
203
+ "primary_stem": "Vocals"
204
+ },
205
+ "970b3f9492014d18fefeedfe4773cb42": {
206
+ "compensate": 1.009,
207
+ "mdx_dim_f_set": 3072,
208
+ "mdx_dim_t_set": 8,
209
+ "mdx_n_fft_scale_set": 7680,
210
+ "primary_stem": "Vocals"
211
+ },
212
+ "1d64a6d2c30f709b8c9b4ce1366d96ee": {
213
+ "compensate": 1.035,
214
+ "mdx_dim_f_set": 2048,
215
+ "mdx_dim_t_set": 8,
216
+ "mdx_n_fft_scale_set": 5120,
217
+ "primary_stem": "Instrumental"
218
+ },
219
+ "203f2a3955221b64df85a41af87cf8f0": {
220
+ "compensate": 1.035,
221
+ "mdx_dim_f_set": 3072,
222
+ "mdx_dim_t_set": 8,
223
+ "mdx_n_fft_scale_set": 6144,
224
+ "primary_stem": "Instrumental"
225
+ },
226
+ "291c2049608edb52648b96e27eb80e95": {
227
+ "compensate": 1.035,
228
+ "mdx_dim_f_set": 3072,
229
+ "mdx_dim_t_set": 8,
230
+ "mdx_n_fft_scale_set": 6144,
231
+ "primary_stem": "Instrumental"
232
+ },
233
+ "ead8d05dab12ec571d67549b3aab03fc": {
234
+ "compensate": 1.035,
235
+ "mdx_dim_f_set": 3072,
236
+ "mdx_dim_t_set": 8,
237
+ "mdx_n_fft_scale_set": 6144,
238
+ "primary_stem": "Instrumental"
239
+ },
240
+ "cc63408db3d80b4d85b0287d1d7c9632": {
241
+ "compensate": 1.033,
242
+ "mdx_dim_f_set": 3072,
243
+ "mdx_dim_t_set": 8,
244
+ "mdx_n_fft_scale_set": 6144,
245
+ "primary_stem": "Instrumental"
246
+ },
247
+ "cd5b2989ad863f116c855db1dfe24e39": {
248
+ "compensate": 1.035,
249
+ "mdx_dim_f_set": 3072,
250
+ "mdx_dim_t_set": 9,
251
+ "mdx_n_fft_scale_set": 6144,
252
+ "primary_stem": "Other"
253
+ },
254
+ "55657dd70583b0fedfba5f67df11d711": {
255
+ "compensate": 1.022,
256
+ "mdx_dim_f_set": 3072,
257
+ "mdx_dim_t_set": 8,
258
+ "mdx_n_fft_scale_set": 6144,
259
+ "primary_stem": "Instrumental"
260
+ },
261
+ "b6bccda408a436db8500083ef3491e8b": {
262
+ "compensate": 1.02,
263
+ "mdx_dim_f_set": 3072,
264
+ "mdx_dim_t_set": 8,
265
+ "mdx_n_fft_scale_set": 7680,
266
+ "primary_stem": "Instrumental"
267
+ },
268
+ "8a88db95c7fb5dbe6a095ff2ffb428b1": {
269
+ "compensate": 1.026,
270
+ "mdx_dim_f_set": 2048,
271
+ "mdx_dim_t_set": 8,
272
+ "mdx_n_fft_scale_set": 5120,
273
+ "primary_stem": "Instrumental"
274
+ },
275
+ "b78da4afc6512f98e4756f5977f5c6b9": {
276
+ "compensate": 1.021,
277
+ "mdx_dim_f_set": 3072,
278
+ "mdx_dim_t_set": 8,
279
+ "mdx_n_fft_scale_set": 7680,
280
+ "primary_stem": "Instrumental"
281
+ },
282
+ "77d07b2667ddf05b9e3175941b4454a0": {
283
+ "compensate": 1.021,
284
+ "mdx_dim_f_set": 3072,
285
+ "mdx_dim_t_set": 8,
286
+ "mdx_n_fft_scale_set": 7680,
287
+ "primary_stem": "Vocals"
288
+ },
289
+ "0f2a6bc5b49d87d64728ee40e23bceb1": {
290
+ "compensate": 1.019,
291
+ "mdx_dim_f_set": 2560,
292
+ "mdx_dim_t_set": 8,
293
+ "mdx_n_fft_scale_set": 5120,
294
+ "primary_stem": "Instrumental"
295
+ },
296
+ "b02be2d198d4968a121030cf8950b492": {
297
+ "compensate": 1.020,
298
+ "mdx_dim_f_set": 2560,
299
+ "mdx_dim_t_set": 8,
300
+ "mdx_n_fft_scale_set": 5120,
301
+ "primary_stem": "No Crowd"
302
+ },
303
+ "2154254ee89b2945b97a7efed6e88820": {
304
+ "config_yaml": "model_2_stem_061321.yaml"
305
+ },
306
+ "063aadd735d58150722926dcbf5852a9": {
307
+ "config_yaml": "model_2_stem_061321.yaml"
308
+ },
309
+ "fe96801369f6a148df2720f5ced88c19": {
310
+ "config_yaml": "model3.yaml"
311
+ },
312
+ "02e8b226f85fb566e5db894b9931c640": {
313
+ "config_yaml": "model2.yaml"
314
+ },
315
+ "e3de6d861635ab9c1d766149edd680d6": {
316
+ "config_yaml": "model1.yaml"
317
+ },
318
+ "3f2936c554ab73ce2e396d54636bd373": {
319
+ "config_yaml": "modelB.yaml"
320
+ },
321
+ "890d0f6f82d7574bca741a9e8bcb8168": {
322
+ "config_yaml": "modelB.yaml"
323
+ },
324
+ "63a3cb8c37c474681049be4ad1ba8815": {
325
+ "config_yaml": "modelB.yaml"
326
+ },
327
+ "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
328
+ "config_yaml": "modelA.yaml"
329
+ },
330
+ "3567f3dee6e77bf366fcb1c7b8bc3745": {
331
+ "config_yaml": "modelA.yaml"
332
+ },
333
+ "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
334
+ "config_yaml": "modelA.yaml"
335
+ },
336
+ "c9971a18da20911822593dc81caa8be9": {
337
+ "config_yaml": "sndfx.yaml"
338
+ },
339
+ "57d94d5ed705460d21c75a5ac829a605": {
340
+ "config_yaml": "sndfx.yaml"
341
+ },
342
+ "e7a25f8764f25a52c1b96c4946e66ba2": {
343
+ "config_yaml": "sndfx.yaml"
344
+ },
345
+ "104081d24e37217086ce5fde09147ee1": {
346
+ "config_yaml": "model_2_stem_061321.yaml"
347
+ },
348
+ "1e6165b601539f38d0a9330f3facffeb": {
349
+ "config_yaml": "model_2_stem_061321.yaml"
350
+ },
351
+ "fe0108464ce0d8271be5ab810891bd7c": {
352
+ "config_yaml": "model_2_stem_full_band.yaml"
353
+ }
354
+ }
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
pre-requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pip<=23.1.2
2
+ Setuptools<=80.6.0
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ soundfile
2
+ librosa
3
+ torch==2.5.1
4
+ pedalboard
5
+ yt-dlp
6
+ tqdm
7
+ spaces
8
+ numpy<2
9
+ gradio==5.43.1
10
+ ffmpeg-python
11
+ scipy
12
+ scikit-learn
13
+ matplotlib
14
+ matplotlib-inline
15
+ seaborn
16
+ requests
17
+ urllib3
18
+ onnxruntime-gpu==1.22.0
19
+ # onnxruntime  # CPU-only alternative: use this instead of onnxruntime-gpu when no GPU is available
test.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcb83b97e05205b30d99db64f4bb4f8e5234631387f2ed3bd59e1b2417d94cca
3
+ size 193349
utils.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, zipfile, shutil, subprocess, shlex, sys # noqa
2
+ from urllib.parse import urlparse
3
+ import re
4
+ import logging
5
+
6
+
7
def load_file_from_url(
    url: str,
    model_dir: str,
    file_name: str | None = None,
    overwrite: bool = False,
    progress: bool = True,
) -> str:
    """Fetch `url` into `model_dir`, reusing a previously downloaded copy.

    The target name is `file_name` when given, otherwise the last path
    component of the URL. An existing target is discarded first when
    `overwrite` is set or when it is an empty (zero-byte) file.

    Returns the absolute path of the local file.
    """
    os.makedirs(model_dir, exist_ok=True)

    target_name = file_name or os.path.basename(urlparse(url).path)
    cached_file = os.path.abspath(os.path.join(model_dir, target_name))

    # Drop a stale copy: explicit overwrite request or a zero-byte leftover.
    must_discard = os.path.exists(cached_file) and (
        overwrite or os.path.getsize(cached_file) == 0
    )
    if must_discard:
        remove_files(cached_file)

    # Cache hit: nothing to download.
    if os.path.exists(cached_file):
        logger.debug(cached_file)
        return cached_file

    logger.info(f'Downloading: "{url}" to {cached_file}\n')
    # Imported lazily so torch is only loaded when a download is needed.
    from torch.hub import download_url_to_file

    download_url_to_file(url, cached_file, progress=progress)
    return cached_file
40
+
41
+
42
def friendly_name(file: str):
    """Split a file path or URL into `(base name without extension, extension)`.

    For inputs starting with "http" only the URL's path component is used,
    so query strings and fragments are ignored. The extension keeps its
    leading dot and is "" when there is none.
    """
    location = urlparse(file).path if file.startswith("http") else file
    stem, suffix = os.path.splitext(os.path.basename(location))
    return stem, suffix
49
+
50
+
51
def download_manager(
    url: str,
    path: str,
    extension: str = "",
    overwrite: bool = False,
    progress: bool = True,
):
    """Download `url` into directory `path` and return the local file path.

    Parameters:
        url (str): Source location; surrounding whitespace is stripped.
        path (str): Directory that receives the download.
        extension (str): Optional extension that replaces the one found in
            the URL. Accepted with or without a leading dot
            ("onnx" and ".onnx" are equivalent).
        overwrite (bool): Re-download even if the file is already present.
        progress (bool): Show a progress bar while downloading.

    Returns:
        The path returned by `load_file_from_url` for HTTP(S) URLs.
        For any other input, `path` is returned unchanged and nothing is
        downloaded.
    """
    url = url.strip()

    # Non-URL input: nothing to fetch, hand back `path` as-is.
    if not url.startswith("http"):
        return path

    name, ext = friendly_name(url)

    # Normalise the override so ".onnx" does not become "name..onnx".
    clean_extension = extension.lstrip(".") if extension else ""
    suffix = f".{clean_extension}" if clean_extension else ext

    return load_file_from_url(
        url=url,
        model_dir=path,
        file_name=name + suffix,
        overwrite=overwrite,
        progress=progress,
    )
75
+
76
+
77
def remove_files(file_list):
    """Delete one file or every file in an iterable, ignoring missing ones.

    Parameters:
        file_list: A single path (`str`, `bytes` or `os.PathLike`) or an
            iterable of such paths.

    Paths that do not exist are skipped silently. Other errors raised by
    `os.remove` (for example when the path is a directory) propagate.
    """
    # A lone path must be wrapped, otherwise a str would be iterated
    # character by character and a Path object would not iterate at all.
    if isinstance(file_list, (str, bytes, os.PathLike)):
        file_list = [file_list]

    for file in file_list:
        # EAFP: avoids the check-then-delete race when another worker
        # removes the same file in between.
        try:
            os.remove(file)
        except FileNotFoundError:
            pass
84
+
85
+
86
def remove_directory_contents(directory_path):
    """
    Removes all files and subdirectories within a directory.

    The directory itself is kept. Deletion is best-effort: a failure on
    one entry is logged and the remaining entries are still processed.

    Parameters:
    directory_path (str): Path to the directory whose
        contents need to be removed.
    """
    if not os.path.exists(directory_path):
        logger.error(f"Directory '{directory_path}' does not exist.")
        return

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        try:
            # Symlinks are unlinked, never followed: shutil.rmtree refuses
            # a symlink to a directory, and a broken symlink is neither a
            # file nor a directory, so both would otherwise be left behind.
            if os.path.islink(file_path) or os.path.isfile(file_path):
                os.remove(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            logger.error(f"Failed to delete {file_path}. Reason: {e}")
    logger.info(f"Content in '{directory_path}' removed.")
107
+
108
+
109
+ # Create directory if not exists
110
def create_directories(directory_path):
    """Create one directory or several, including missing parents.

    Parameters:
        directory_path: A single path (`str`, `bytes` or `os.PathLike`)
            or an iterable of such paths.

    Paths that already exist are left untouched.
    """
    # Wrap a lone path so a Path object is accepted and a str is not
    # iterated character by character.
    if isinstance(directory_path, (str, bytes, os.PathLike)):
        directory_path = [directory_path]

    for one_dir_path in directory_path:
        if os.path.exists(one_dir_path):
            continue
        # exist_ok guards against another process creating the directory
        # between the check above and this call.
        os.makedirs(one_dir_path, exist_ok=True)
        logger.debug(f"Directory '{one_dir_path}' created.")
117
+
118
+
119
def setup_logger(name_log):
    """Return the logger called `name_log`, configured for console output.

    The logger is set to INFO, does not propagate to the root logger and
    writes through a single stream handler formatted as
    "[LEVEL] >> message". Calling this again with the same name reuses the
    existing handler instead of stacking another one, which would print
    every message multiple times.

    Parameters:
        name_log (str): Name passed to `logging.getLogger`.

    Returns:
        logging.Logger: The configured logger.
    """
    named_logger = logging.getLogger(name_log)
    named_logger.setLevel(logging.INFO)
    named_logger.propagate = False

    # Attach the default handler only once per logger.
    if not named_logger.handlers:
        default_handler = logging.StreamHandler()  # Set sys.stderr as stream.
        default_handler.flush = sys.stderr.flush
        named_logger.addHandler(default_handler)

    # (Re)apply the project format to every handler on this logger.
    formatter = logging.Formatter("[%(levelname)s] >> %(message)s")
    for handler in named_logger.handlers:
        handler.setFormatter(formatter)

    return named_logger


# Module-wide logger shared by the helpers in this file.
logger = setup_logger("ss")
logger.setLevel(logging.INFO)
142
+