Upload ultravox_config.py with huggingface_hub
Browse files — ultravox_config.py (+0 -32)
ultravox_config.py
CHANGED
|
@@ -9,7 +9,6 @@ import transformers
|
|
| 9 |
class LoraConfigSimplified:
|
| 10 |
"""
|
| 11 |
Low-Rank Adaptation (LoRA) configuration.
|
| 12 |
-
|
| 13 |
Used for language and audio models separately.
|
| 14 |
"""
|
| 15 |
|
|
@@ -23,17 +22,6 @@ class LoraConfigSimplified:
|
|
| 23 |
unfreeze_layers: Optional[List[str]] = None
|
| 24 |
|
| 25 |
|
| 26 |
-
class LossMaskType(str, Enum):
|
| 27 |
-
"""Type of loss mask to use."""
|
| 28 |
-
|
| 29 |
-
LAST_ASSISTANT = "last_assistant"
|
| 30 |
-
"""This applies the loss mask up until the last assistant token"""
|
| 31 |
-
ALL = "all" # This does not work with KL loss
|
| 32 |
-
"""No loss mask, all inputs are used for loss"""
|
| 33 |
-
AFTER_AUDIO = "after_audio"
|
| 34 |
-
"""Applies the loss mask up until the audio token"""
|
| 35 |
-
|
| 36 |
-
|
| 37 |
class LossFunction(str, Enum):
|
| 38 |
CrossEntropy = "ce"
|
| 39 |
KL_Divergence = "kl"
|
|
@@ -57,10 +45,8 @@ class UltravoxConfig(transformers.PretrainedConfig):
|
|
| 57 |
r"""
|
| 58 |
This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an
|
| 59 |
Ultravox model according to the specified arguments, defining the model architecture.
|
| 60 |
-
|
| 61 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
| 62 |
documentation from [`PretrainedConfig`] for more information.
|
| 63 |
-
|
| 64 |
Args:
|
| 65 |
audio_config (`WhisperConfig`, *optional*):
|
| 66 |
Custom audio config or dict
|
|
@@ -82,28 +68,19 @@ class UltravoxConfig(transformers.PretrainedConfig):
|
|
| 82 |
The LoRA configuration for finetuning the audio model.
|
| 83 |
audio_latency_block_size (`int`, *optional*, defaults to `None`):
|
| 84 |
The latency block size for simulating audio streaming.
|
| 85 |
-
|
| 86 |
-
|
| 87 |
Example:
|
| 88 |
-
|
| 89 |
```python
|
| 90 |
>>> from transformers import UltravoxModel, WhisperConfig, UltravoxConfig, LlamaConfig
|
| 91 |
-
|
| 92 |
>>> # Initializing an audio encoder config
|
| 93 |
>>> audio_config = WhisperConfig()
|
| 94 |
-
|
| 95 |
>>> # Initializing a Llama config
|
| 96 |
>>> text_config = LlamaConfig()
|
| 97 |
-
|
| 98 |
>>> # Initializing a default configuration
|
| 99 |
>>> configuration = UltravoxConfig(audio_config, text_config)
|
| 100 |
-
|
| 101 |
>>> # Initializing a completely untrained model from the configuration
|
| 102 |
>>> model = UltravoxModel(configuration)
|
| 103 |
-
|
| 104 |
>>> # Accessing the model configuration
|
| 105 |
>>> configuration = model.config
|
| 106 |
-
|
| 107 |
>>> # Initialize a model from pretrained checkpoints and random projector weights
|
| 108 |
>>> config = UltravoxConfig(audio_model_id="openai/whisper-tiny", text_model_id="meta-llama/Llama-2-7b-chat-hf")
|
| 109 |
```"""
|
|
@@ -117,9 +94,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
|
|
| 117 |
text_config: dict[str, Any] | transformers.PretrainedConfig | None = None,
|
| 118 |
audio_model_id: str | None = None,
|
| 119 |
text_model_id: str | None = None,
|
| 120 |
-
llm_only_training: bool = False,
|
| 121 |
ignore_index: int = -100,
|
| 122 |
-
audio_token_index: int | None = None,
|
| 123 |
hidden_size: int = 4096,
|
| 124 |
stack_factor: int = 8,
|
| 125 |
norm_init: float = 0.4,
|
|
@@ -135,8 +110,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
|
|
| 135 |
self.audio_model_id = audio_model_id
|
| 136 |
self.text_model_id = text_model_id
|
| 137 |
|
| 138 |
-
self.audio_token_index = audio_token_index
|
| 139 |
-
|
| 140 |
self.hidden_size = hidden_size
|
| 141 |
self.stack_factor = stack_factor
|
| 142 |
self.norm_init = norm_init
|
|
@@ -163,7 +136,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
|
|
| 163 |
self.text_config = text_config
|
| 164 |
self.audio_config = audio_config
|
| 165 |
|
| 166 |
-
self.llm_only_training = llm_only_training
|
| 167 |
self.text_model_lora_config = (
|
| 168 |
text_model_lora_config
|
| 169 |
if isinstance(text_model_lora_config, dict)
|
|
@@ -176,10 +148,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
|
|
| 176 |
)
|
| 177 |
self.audio_latency_block_size = audio_latency_block_size
|
| 178 |
|
| 179 |
-
if hasattr(text_config, "text_config"):
|
| 180 |
-
text_config.vocab_size = text_config.text_config.vocab_size
|
| 181 |
-
text_config.hidden_size = text_config.text_config.hidden_size
|
| 182 |
-
|
| 183 |
self.vocab_size = text_config.vocab_size
|
| 184 |
|
| 185 |
self.initializer_range = text_config.initializer_range
|
|
|
|
| 9 |
class LoraConfigSimplified:
|
| 10 |
"""
|
| 11 |
Low-Rank Adaptation (LoRA) configuration.
|
|
|
|
| 12 |
Used for language and audio models separately.
|
| 13 |
"""
|
| 14 |
|
|
|
|
| 22 |
unfreeze_layers: Optional[List[str]] = None
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
class LossFunction(str, Enum):
|
| 26 |
CrossEntropy = "ce"
|
| 27 |
KL_Divergence = "kl"
|
|
|
|
| 45 |
r"""
|
| 46 |
This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an
|
| 47 |
Ultravox model according to the specified arguments, defining the model architecture.
|
|
|
|
| 48 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
| 49 |
documentation from [`PretrainedConfig`] for more information.
|
|
|
|
| 50 |
Args:
|
| 51 |
audio_config (`WhisperConfig`, *optional*):
|
| 52 |
Custom audio config or dict
|
|
|
|
| 68 |
The LoRA configuration for finetuning the audio model.
|
| 69 |
audio_latency_block_size (`int`, *optional*, defaults to `None`):
|
| 70 |
The latency block size for simulating audio streaming.
|
|
|
|
|
|
|
| 71 |
Example:
|
|
|
|
| 72 |
```python
|
| 73 |
>>> from transformers import UltravoxModel, WhisperConfig, UltravoxConfig, LlamaConfig
|
|
|
|
| 74 |
>>> # Initializing an audio encoder config
|
| 75 |
>>> audio_config = WhisperConfig()
|
|
|
|
| 76 |
>>> # Initializing a Llama config
|
| 77 |
>>> text_config = LlamaConfig()
|
|
|
|
| 78 |
>>> # Initializing a default configuration
|
| 79 |
>>> configuration = UltravoxConfig(audio_config, text_config)
|
|
|
|
| 80 |
>>> # Initializing a completely untrained model from the configuration
|
| 81 |
>>> model = UltravoxModel(configuration)
|
|
|
|
| 82 |
>>> # Accessing the model configuration
|
| 83 |
>>> configuration = model.config
|
|
|
|
| 84 |
>>> # Initialize a model from pretrained checkpoints and random projector weights
|
| 85 |
>>> config = UltravoxConfig(audio_model_id="openai/whisper-tiny", text_model_id="meta-llama/Llama-2-7b-chat-hf")
|
| 86 |
```"""
|
|
|
|
| 94 |
text_config: dict[str, Any] | transformers.PretrainedConfig | None = None,
|
| 95 |
audio_model_id: str | None = None,
|
| 96 |
text_model_id: str | None = None,
|
|
|
|
| 97 |
ignore_index: int = -100,
|
|
|
|
| 98 |
hidden_size: int = 4096,
|
| 99 |
stack_factor: int = 8,
|
| 100 |
norm_init: float = 0.4,
|
|
|
|
| 110 |
self.audio_model_id = audio_model_id
|
| 111 |
self.text_model_id = text_model_id
|
| 112 |
|
|
|
|
|
|
|
| 113 |
self.hidden_size = hidden_size
|
| 114 |
self.stack_factor = stack_factor
|
| 115 |
self.norm_init = norm_init
|
|
|
|
| 136 |
self.text_config = text_config
|
| 137 |
self.audio_config = audio_config
|
| 138 |
|
|
|
|
| 139 |
self.text_model_lora_config = (
|
| 140 |
text_model_lora_config
|
| 141 |
if isinstance(text_model_lora_config, dict)
|
|
|
|
| 148 |
)
|
| 149 |
self.audio_latency_block_size = audio_latency_block_size
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
self.vocab_size = text_config.vocab_size
|
| 152 |
|
| 153 |
self.initializer_range = text_config.initializer_range
|