README.md CHANGED
@@ -3,14 +3,13 @@ language:
3
  - en
4
  - zh
5
  pipeline_tag: text-to-audio
6
- library_name: tencent-song-generation
7
  ---
8
 
9
  # SongGeneration
10
 
11
- <p align="center"><img src="img/logo.jpg" width="40%"></p>
12
  <p align="center">
13
- <a href="https://levo-demo.github.io/">Demo</a> &nbsp;|&nbsp; <a href="https://arxiv.org/abs/2506.07520">Paper</a> &nbsp;|&nbsp; <a href="https://github.com/tencent-ailab/songgeneration">Code</a> &nbsp;|&nbsp; <a href="https://huggingface.co/spaces/tencent/SongGeneration">Space Demo</a>
14
  </p>
15
 
16
 
@@ -18,15 +17,11 @@ This repository is the official weight repository for LeVo: High-Quality Song Ge
18
 
19
  ## Model Versions
20
 
21
- | Model | Max Length | Language | GPU Memory | RFT(A100) | Download Link |
22
- | ------------------------- | :--------: | :------------------: | :---------: | :-------: | ------------------------------------------------------------ |
23
- | SongGeneration-base | 2m30s | zh | 10G/16G | 1.26 | You were here |
24
- | SongGeneration-base-new | 2m30s | zh, en | 10G/16G | 1.26 | [Huggingface](https://huggingface.co/lglg666/SongGeneration-base-new) |
25
- | SongGeneration-base-full | 4m30s | zh, en | 12G/18G | 1.30 | [Huggingface](https://huggingface.co/lglg666/SongGeneration-base-full) |
26
- | SongGeneration-large | 4m30s | zh, en | 22G/28G | 1.51 | [Huggingface](https://huggingface.co/lglg666/SongGeneration-large) |
27
- | SongGeneration-v1.5-small | 2m | zh, en, es, ja, etc. | - | - | Coming soon |
28
- | SongGeneration-v1.5-base | 4m30s | zh, en, es, ja, etc. | - | - | Coming soon |
29
- | SongGeneration-v1.5-large | 4m30s | zh, en, es, ja, etc. | - | - | Coming soon |
30
 
31
  ## Overview
32
 
@@ -36,4 +31,4 @@ We develop the SongGeneration model. It is an LM-based framework consisting of *
36
 
37
  ## License
38
 
39
- The code and weights in this repository are released under the terms of the [LICENSE](LICENSE) file.
 
3
  - en
4
  - zh
5
  pipeline_tag: text-to-audio
6
+ library_name: transformers
7
  ---
8
 
9
  # SongGeneration
10
 
 
11
  <p align="center">
12
+ <a href="https://levo-demo.github.io/">Demo</a> &nbsp;|&nbsp; <a href="https://arxiv.org/abs/2506.07520">Paper</a> &nbsp;|&nbsp; <a href="https://github.com/tencent-ailab/songgeneration">Code</a> &nbsp;|&nbsp; <a href="https://huggingface.co/spaces/waytan22/SongGeneration-LeVo">Space Demo</a>
13
  </p>
14
 
15
 
 
17
 
18
  ## Model Versions
19
 
20
+ | Model | HuggingFace |
21
+ | :----------------------: | :----------------------------------------------------------: |
22
+ | SongGeneration-base(zh) | <a href="https://huggingface.co/tencent/SongGeneration/tree/main/ckpt/songgeneration_base_zh">v20250520</a> |
23
+ | SongGeneration-base(zh&en) | Coming soon |
24
+ | SongGeneration-full(zh&en) | Coming soon |
 
 
 
 
25
 
26
  ## Overview
27
 
 
31
 
32
  ## License
33
 
34
+ The code and weights in this repository are released under the terms of the [LICENSE](LICENSE) file.
ckpt/encode-s12k.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e250df56b035f74c1f66f15133f4c78f664d70fa0b09aa9a752b7871bb58c02f
3
+ size 3957949089
ckpt/model_1rvq/model_2_fixed.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfbc5d4f0057921f64bccb93431fa5820be8cae326d913ad383cac1f61b8052f
3
- size 659473962
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:339a16956b859a82defc02bfd32c3744d11ff942065f6ec9306dfd4400d62110
3
+ size 4704507596
ckpt/model_septoken/model_2.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a82451aeba6e171b47c6be9991698e46cf859eb3f17bfbedc17332341bd86e4
3
- size 3768119184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:758aa342942a7b7c0ae179af1a952e0b944e39128ea816741499b3031113aaee
3
+ size 4808167708
ckpt/models--lengyue233--content-vec-best/.no_exist/c0b9ba13db21beaa4053faae94c102ebe326fd68/model.safetensors ADDED
File without changes
ckpt/models--lengyue233--content-vec-best/.no_exist/c0b9ba13db21beaa4053faae94c102ebe326fd68/model.safetensors.index.json ADDED
File without changes
ckpt/models--lengyue233--content-vec-best/blobs/5186a71b15933aca2d9942db95e1aff02642d1f0 ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModelWithFinalProj"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "sum",
39
+ "ctc_zero_infinity": false,
40
+ "do_stable_layer_norm": false,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_norm": "group",
44
+ "feat_proj_dropout": 0.0,
45
+ "feat_proj_layer_norm": true,
46
+ "final_dropout": 0.1,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout": 0.1,
49
+ "hidden_size": 768,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 3072,
52
+ "layer_norm_eps": 1e-05,
53
+ "layerdrop": 0.1,
54
+ "mask_feature_length": 10,
55
+ "mask_feature_min_masks": 0,
56
+ "mask_feature_prob": 0.0,
57
+ "mask_time_length": 10,
58
+ "mask_time_min_masks": 2,
59
+ "mask_time_prob": 0.05,
60
+ "model_type": "hubert",
61
+ "num_attention_heads": 12,
62
+ "num_conv_pos_embedding_groups": 16,
63
+ "num_conv_pos_embeddings": 128,
64
+ "num_feat_extract_layers": 7,
65
+ "num_hidden_layers": 12,
66
+ "pad_token_id": 0,
67
+ "torch_dtype": "float32",
68
+ "transformers_version": "4.27.3",
69
+ "use_weighted_layer_sum": false,
70
+ "vocab_size": 32
71
+ }
ckpt/models--lengyue233--content-vec-best/blobs/d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e
3
+ size 378342945
ckpt/models--lengyue233--content-vec-best/refs/main ADDED
@@ -0,0 +1 @@
 
 
1
+ c0b9ba13db21beaa4053faae94c102ebe326fd68
ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModelWithFinalProj"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "sum",
39
+ "ctc_zero_infinity": false,
40
+ "do_stable_layer_norm": false,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_norm": "group",
44
+ "feat_proj_dropout": 0.0,
45
+ "feat_proj_layer_norm": true,
46
+ "final_dropout": 0.1,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout": 0.1,
49
+ "hidden_size": 768,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 3072,
52
+ "layer_norm_eps": 1e-05,
53
+ "layerdrop": 0.1,
54
+ "mask_feature_length": 10,
55
+ "mask_feature_min_masks": 0,
56
+ "mask_feature_prob": 0.0,
57
+ "mask_time_length": 10,
58
+ "mask_time_min_masks": 2,
59
+ "mask_time_prob": 0.05,
60
+ "model_type": "hubert",
61
+ "num_attention_heads": 12,
62
+ "num_conv_pos_embedding_groups": 16,
63
+ "num_conv_pos_embeddings": 128,
64
+ "num_feat_extract_layers": 7,
65
+ "num_hidden_layers": 12,
66
+ "pad_token_id": 0,
67
+ "torch_dtype": "float32",
68
+ "transformers_version": "4.27.3",
69
+ "use_weighted_layer_sum": false,
70
+ "vocab_size": 32
71
+ }
ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e
3
+ size 378342945
ckpt/{songgeneration_base → songgeneration_base_zh}/config.yaml RENAMED
@@ -106,36 +106,3 @@ conditioners:
106
  QwTextTokenizer:
107
  token_path: third_party/Qwen2-7B
108
  max_len: 50
109
-
110
- offload:
111
- audiolm:
112
- offload_module: self
113
- cpu_mem_gb: 0
114
- pre_copy_step: 1
115
- clean_cache_after_forward: false
116
- dtype: torch.float16
117
- offload_layer_dict:
118
- transformer: 4
119
- transformer2: 4
120
- ignore_layer_list: []
121
- clean_cache_wrapper:
122
- module: self
123
- method_name: _sample_next_token
124
- diff_mem_gb_thre: 2
125
- debug: false
126
-
127
- wav_tokenizer_diffusion:
128
- offload_module: self.model.model
129
- pre_copy_step: 1
130
- clean_cache_after_forward: false
131
- cpu_mem_gb: -1
132
- dtype: null
133
- offload_layer_dict:
134
- cfm_wrapper: 5
135
- hubert: 4
136
- ignore_layer_list: []
137
- clean_cache_wrapper:
138
- module: self.model.model.cfm_wrapper.estimator
139
- method_name: forward
140
- diff_mem_gb_thre: 1
141
- debug: false
 
106
  QwTextTokenizer:
107
  token_path: third_party/Qwen2-7B
108
  max_len: 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ckpt/{songgeneration_base → songgeneration_base_zh}/model.pt RENAMED
File without changes
img/logo.jpg DELETED
Binary file (70.4 kB)