---
base_model:
- gru-audio-binary
datasets:
- mozilla-foundation/common_voice_11_0
language:
- zh
metrics:
- accuracy
pipeline_tag: audio-classification
tags:
- speaker_dialect_classification
library_name: transformers
---

# shanghai-binary

Binary classifier: **Shanghai** vs **Not-Shanghai** (audio FBANK → GRU → MLP).

## Files

- `model.safetensors` → PyTorch weights (safetensors)
- `config.json` → model architecture
- `preprocessor_config.json` → audio feature extraction settings
- `label_mapping.json` → index → label
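The inference snippet below reads specific keys from these files. As a quick orientation, this sketch lists the keys it expects; the concrete values and label strings here are illustrative assumptions, not the shipped files:

```python
# Keys read by the inference snippet; the values shown are illustrative
# assumptions only, not the shipped configuration.
example_preprocessor_config = {
    "sampling_rate": 16000,   # Hz (assumed value)
    "n_mels": 40,             # FBANK dimension; should match the model's input_dim
    "n_fft": 400,             # STFT window size (assumed value)
    "hop_length": 160,        # STFT hop (assumed value)
    "max_len_frames": 300,    # fixed frame count after pad/truncate (assumed value)
}

# label_mapping.json is indexed by stringified class index:
example_label_mapping = {"0": "not_shanghai", "1": "shanghai"}  # assumed labels
```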
## Inference (PyTorch)

```python
import json
import os

import librosa
import numpy as np
import torch
from safetensors.torch import load_file as load_safetensors

# Load the model, preprocessor, and label-mapping configs
model_dir = "./hf/models/shanghai-binary"
cfg = json.load(open(os.path.join(model_dir, "config.json")))
pp = json.load(open(os.path.join(model_dir, "preprocessor_config.json")))
lm = json.load(open(os.path.join(model_dir, "label_mapping.json")))

# Define the model class used in training (LanNetBinary):
# a stacked GRU over FBANK frames followed by a two-layer MLP head.
class LanNetBinary(torch.nn.Module):
    def __init__(self, input_dim=40, hidden_dim=512, num_layers=2):
        super().__init__()
        self.gru = torch.nn.GRU(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.linear2 = torch.nn.Linear(hidden_dim, 192)
        self.linear3 = torch.nn.Linear(192, 2)

    def forward(self, x):
        out, _ = self.gru(x)   # (B, T, hidden_dim)
        last = out[:, -1, :]   # last time step as the utterance embedding
        x = self.linear2(last)
        x = self.linear3(x)
        return x               # (B, 2) logits

# Load weights
model = LanNetBinary(cfg["input_dim"], cfg["hidden_dim"], cfg["num_layers"])
sd = load_safetensors(os.path.join(model_dir, "model.safetensors"))
model.load_state_dict(sd, strict=True)
model.eval()

# Feature extraction must match preprocessor_config.json
def fbanks_from_array(y, sr=pp["sampling_rate"], n_mels=pp["n_mels"], n_fft=pp["n_fft"],
                      hop_length=pp["hop_length"], max_len=pp["max_len_frames"]):
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, n_fft=n_fft,
                                         hop_length=hop_length, power=2.0)
    fbanks = librosa.power_to_db(mel).T  # (T, n_mels)
    T = fbanks.shape[0]
    if T < max_len:
        # Zero-pad short clips to a fixed number of frames
        fbanks = np.pad(fbanks, ((0, max_len - T), (0, 0)), mode="constant")
    else:
        # Truncate long clips
        fbanks = fbanks[:max_len, :]
    return torch.tensor(fbanks, dtype=torch.float32).unsqueeze(0)  # (1, T, F)

# Example: predict from a waveform array "y" at 16 kHz
# y, _ = librosa.load("example.wav", sr=pp["sampling_rate"])
# x = fbanks_from_array(y)
# with torch.no_grad():
#     logits = model(x)
# pred = int(torch.argmax(logits, dim=1))
# print(lm[str(pred)])
```
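If you want per-class probabilities instead of a hard argmax, apply a softmax to the logits. A minimal sketch, reusing `logits` and `lm` from the example above (run the commented prediction steps first):

```python
import torch

# Assumes `logits` has shape (1, 2) and `lm` maps "0"/"1" to label strings,
# as in the inference snippet above.
probs = torch.softmax(logits, dim=1)[0]
for idx, p in enumerate(probs.tolist()):
    print(f"{lm[str(idx)]}: {p:.3f}")
```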

## References

- Model based on [https://github.com/Colt1990/chinese-dialect-recognition/tree/master](https://github.com/Colt1990/chinese-dialect-recognition/tree/master)