Parker committed on
Commit 2daa24e · verified · 1 Parent(s): 83a357a

Upload 5 files

4bit/QUANTIZATION_README.md ADDED
@@ -0,0 +1,95 @@
# VibeVoice Quantization Guide

The VibeVoice 7B model has been successfully quantized to both 4-bit and 8-bit versions using bitsandbytes.

## Model Sizes

| Model Version | Size | Memory Usage | Quality |
|---------------|------|--------------|---------|
| Original (fp16/bf16) | 18GB | ~18GB VRAM | Best |
| 8-bit Quantized | 9.9GB | ~10.6GB VRAM | Excellent |
| 4-bit Quantized (nf4) | 6.2GB | ~6.6GB VRAM | Very Good |

## How to Use Pre-Quantized Models

### 1. Loading 4-bit Model

```python
import torch

from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

# Load pre-quantized 4-bit model
model_path = "/path/to/VibeVoice-Large-4bit"
processor = VibeVoiceProcessor.from_pretrained(model_path)
model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    model_path,
    device_map='cuda',
    torch_dtype=torch.bfloat16,
)
```

### 2. Loading 8-bit Model

```python
# Same code, just point to the 8-bit model
model_path = "/path/to/VibeVoice-Large-8bit"
# ... rest is the same
```

## Creating Your Own Quantized Models

Use the provided script to quantize models:

```bash
# 4-bit quantization (nf4)
python quantize_and_save_vibevoice.py \
    --model_path /path/to/original/model \
    --output_dir /path/to/output/4bit \
    --bits 4 \
    --test

# 8-bit quantization
python quantize_and_save_vibevoice.py \
    --model_path /path/to/original/model \
    --output_dir /path/to/output/8bit \
    --bits 8 \
    --test
```

## Benefits

1. **Pre-quantized models load faster** - No on-the-fly quantization needed
2. **Lower VRAM requirements** - 4-bit uses only ~6.6GB vs 18GB
3. **Shareable** - Upload the quantized folder to share with others
4. **Quality preserved** - nf4 quantization maintains excellent output quality

## Distribution

To share quantized models:

1. Upload the entire quantized model directory (e.g., `VibeVoice-Large-4bit/`); a Hub upload sketch follows this list
2. Include the `quantization_config.json` file (automatically created)
3. Users can load it directly without any quantization setup
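
One possible way to do step 1 with the Hugging Face Hub (a minimal sketch; the repo id is a placeholder and this assumes you are logged in, e.g. via `huggingface-cli login`):

```python
from huggingface_hub import HfApi

api = HfApi()

# Create the target repo if it does not exist yet (repo id is illustrative)
api.create_repo("your-username/VibeVoice-Large-4bit", exist_ok=True)

# Upload the whole quantized directory, including quantization_config.json
api.upload_folder(
    folder_path="/path/to/VibeVoice-Large-4bit",
    repo_id="your-username/VibeVoice-Large-4bit",
)
```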

## Performance Notes

- 4-bit (nf4): Best for memory-constrained systems, minimal quality loss
- 8-bit: Better quality than 4-bit, still significant memory savings
- Both versions maintain the same generation speed as the original
- Flash Attention 2 is supported in all quantized versions (see the sketch after this list)
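
Assuming the loader accepts the standard `transformers` keyword argument for selecting an attention backend (an assumption, not verified against the VibeVoice code), Flash Attention 2 could be requested at load time roughly like this:

```python
# Sketch: requires the flash-attn package and a GPU that supports it.
model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    model_path,
    device_map='cuda',
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
```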

## Troubleshooting

If loading fails:
1. Ensure you have `bitsandbytes` installed: `pip install bitsandbytes`
2. Make sure you're on a CUDA-capable GPU
3. Check that all model files are present in the directory (a quick check is sketched below)
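
A quick way to verify all three points (a sketch; adjust `model_path` to your local copy):

```python
import os
import torch
import bitsandbytes  # raises ImportError if bitsandbytes is missing

assert torch.cuda.is_available(), "A CUDA-capable GPU is required"

model_path = "/path/to/VibeVoice-Large-4bit"
# Should list config.json, model.safetensors*, quantization_config.json, etc.
print(sorted(os.listdir(model_path)))
```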

## Files Created

Each quantized model directory contains:
- `model.safetensors.*` - Quantized model weights
- `config.json` - Model configuration with quantization settings
- `quantization_config.json` - Specific quantization parameters
- `processor/` - Audio processor files
- `load_quantized_Xbit.py` - Example loading script
4bit/quantize_and_save_vibevoice.py ADDED
@@ -0,0 +1,330 @@
#!/usr/bin/env python
"""
Quantize and save VibeVoice model using bitsandbytes.
Creates a pre-quantized model that can be shared and loaded directly.
"""

import os
import json
import shutil
import torch
from pathlib import Path
from transformers import BitsAndBytesConfig
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from transformers.utils import logging
from safetensors.torch import save_file

logging.set_verbosity_info()

def quantize_and_save_model(
    model_path: str,
    output_dir: str,
    bits: int = 4,
    quant_type: str = "nf4"
):
    """Quantize VibeVoice model and save it for distribution"""

    print(f"\n{'='*70}")
    print(f"VIBEVOICE QUANTIZATION - {bits}-bit ({quant_type})")
    print(f"{'='*70}")
    print(f"Source: {model_path}")
    print(f"Output: {output_dir}")
    print(f"{'='*70}\n")

    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Configure quantization
    if bits == 4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type=quant_type
        )
    elif bits == 8:
        # Note: 8-bit mode (LLM.int8()) has no separate compute-dtype option
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
        )
    else:
        raise ValueError(f"Unsupported bit width: {bits}")

    print("🔧 Loading and quantizing model...")

    # Load the model with quantization
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )

    # Get memory usage
    memory_gb = torch.cuda.memory_allocated() / 1e9
    print(f"💾 Quantized model memory usage: {memory_gb:.1f} GB")

    # Save the quantized model
    print("\n📦 Saving quantized model...")

    # Method 1: Try using save_pretrained with quantization info
    try:
        # Save model with quantization config
        model.save_pretrained(
            output_path,
            safe_serialization=True,
            max_shard_size="5GB"
        )

        # Save the quantization config separately
        quant_config_dict = {
            "quantization_config": bnb_config.to_dict(),
            "quantization_method": "bitsandbytes",
            "bits": bits,
            "quant_type": quant_type
        }

        with open(output_path / "quantization_config.json", 'w') as f:
            json.dump(quant_config_dict, f, indent=2)

        print("✅ Model saved with integrated quantization")

    except Exception as e:
        print(f"⚠️ Standard save failed: {e}")
        print("Trying alternative save method...")

        # Method 2: Save state dict with quantized weights
        save_quantized_state_dict(model, output_path, bnb_config)

    # Copy processor files
    print("\n📋 Copying processor files...")
    processor = VibeVoiceProcessor.from_pretrained(model_path)
    processor.save_pretrained(output_path)

    # Copy additional config files
    for file in ["config.json", "generation_config.json"]:
        src = Path(model_path) / file
        if src.exists():
            shutil.copy2(src, output_path / file)

    # Update config to indicate quantization
    config_path = output_path / "config.json"
    if config_path.exists():
        with open(config_path, 'r') as f:
            config = json.load(f)

        config["quantization_config"] = bnb_config.to_dict()
        config["_quantization_method"] = "bitsandbytes"

        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

    print(f"\n✅ Quantized model saved to: {output_path}")

    # Create loading script
    create_loading_script(output_path, bits, quant_type)

    return output_path

def save_quantized_state_dict(model, output_path, bnb_config):
    """Alternative method to save quantized weights"""
    print("\n🔧 Saving quantized state dict...")

    # Get the state dict
    state_dict = model.state_dict()

    # Separate quantized and non-quantized parameters
    quantized_state = {}
    metadata = {
        "quantized_modules": [],
        "quantization_config": bnb_config.to_dict()
    }

    for name, param in state_dict.items():
        # Check if this is a quantized parameter
        if hasattr(param, 'quant_state'):
            # Store quantization state
            metadata["quantized_modules"].append(name)
            quantized_state[name] = param.data
        else:
            # Regular parameter
            quantized_state[name] = param

    # Save using safetensors (its metadata must be a flat str-to-str mapping,
    # so the nested dict is serialized to a JSON string)
    save_file(
        quantized_state,
        output_path / "model.safetensors",
        metadata={"quantization": json.dumps(metadata)}
    )

    # Save metadata
    with open(output_path / "quantization_metadata.json", 'w') as f:
        json.dump(metadata, f, indent=2)

def create_loading_script(output_path, bits, quant_type):
    """Create a script to load the quantized model"""

    script_content = f'''#!/usr/bin/env python
"""
Load and use the {bits}-bit quantized VibeVoice model
"""

import torch
from transformers import BitsAndBytesConfig
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

def load_quantized_model(model_path="{output_path}"):
    """Load the pre-quantized VibeVoice model"""

    print("Loading {bits}-bit quantized VibeVoice model...")

    # The model is already quantized, but we need to specify the config
    # to ensure proper loading of quantized weights
    bnb_config = BitsAndBytesConfig(
        load_in_{bits}bit=True,
        bnb_{bits}bit_compute_dtype=torch.bfloat16,
        {"bnb_4bit_use_double_quant=True," if bits == 4 else ""}
        {"bnb_4bit_quant_type='" + quant_type + "'" if bits == 4 else ""}
    )

    # Load processor
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Load model
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )

    model.eval()

    print("✅ Model loaded successfully!")
    print(f"💾 Memory usage: {{torch.cuda.memory_allocated() / 1e9:.1f}} GB")

    return model, processor

# Example usage
if __name__ == "__main__":
    model, processor = load_quantized_model()

    # Generate audio
    text = "Speaker 1: Hello! Speaker 2: Hi there!"
    inputs = processor(
        text=[text],
        voice_samples=[["path/to/voice1.wav", "path/to/voice2.wav"]],
        padding=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = model.generate(**inputs)

    # Save audio
    processor.save_audio(outputs.speech_outputs[0], "output.wav")
'''

    script_path = output_path / f"load_quantized_{bits}bit.py"
    with open(script_path, 'w') as f:
        f.write(script_content)

    print(f"📝 Created loading script: {script_path}")

def test_quantized_model(model_path):
    """Test loading and generating with the quantized model"""
    print(f"\n🧪 Testing quantized model from: {model_path}")

    try:
        # Load the quantized model
        processor = VibeVoiceProcessor.from_pretrained(model_path)

        # Load with auto-detection of quantization
        model = VibeVoiceForConditionalGenerationInference.from_pretrained(
            model_path,
            device_map='cuda',
            torch_dtype=torch.bfloat16,
        )

        print("✅ Model loaded successfully!")

        # Quick generation test
        test_text = "Speaker 1: Testing quantized model. Speaker 2: It works!"
        print(f"\n🎤 Testing generation with: '{test_text}'")

        # Use demo voices
        voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"
        speaker_voices = [
            os.path.join(voices_dir, "en-Alice_woman.wav"),
            os.path.join(voices_dir, "en-Carter_man.wav")
        ]

        inputs = processor(
            text=[test_text],
            voice_samples=[speaker_voices],
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=None,
                cfg_scale=1.3,
                tokenizer=processor.tokenizer,
                generation_config={'do_sample': False},
            )

        print("✅ Generation successful!")

        # Save test output
        output_path = Path(model_path) / "test_output.wav"
        processor.save_audio(outputs.speech_outputs[0], output_path=str(output_path))
        print(f"🔊 Test audio saved to: {output_path}")

        return True

    except Exception as e:
        print(f"❌ Test failed: {e}")
        return False

def main():
    import argparse
    parser = argparse.ArgumentParser(description="Quantize and save VibeVoice model")
    parser.add_argument("--model_path", default="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-pt",
                        help="Path to the original model")
    parser.add_argument("--output_dir", default="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit",
                        help="Output directory for quantized model")
    parser.add_argument("--bits", type=int, default=4, choices=[4, 8],
                        help="Quantization bits (4 or 8)")
    parser.add_argument("--quant_type", default="nf4", choices=["nf4", "fp4"],
                        help="4-bit quantization type")
    parser.add_argument("--test", action="store_true",
                        help="Test the quantized model after saving")

    args = parser.parse_args()

    # Update output dir based on bits
    if str(args.bits) not in args.output_dir:
        args.output_dir = args.output_dir.replace("4bit", f"{args.bits}bit")

    # Quantize and save
    output_path = quantize_and_save_model(
        args.model_path,
        args.output_dir,
        args.bits,
        args.quant_type
    )

    # Test if requested
    if args.test:
        test_quantized_model(output_path)

    print(f"\n🎉 Done! Quantized model ready for distribution at: {output_path}")
    print("\n📦 To share this model:")
    print(f"1. Upload the entire '{output_path}' directory")
    print("2. Users can load it with the provided script or directly with transformers")
    print(f"3. The model will load in {args.bits}-bit without additional quantization")

if __name__ == "__main__":
    main()
4bit/test_accurate_vram.py ADDED
@@ -0,0 +1,207 @@
#!/usr/bin/env python
"""
Accurate VRAM measurement for VibeVoice models.
Shows the difference between allocated and reserved memory.
"""

import os
import gc
import torch
import subprocess
import time
from pathlib import Path
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

def get_gpu_memory_info():
    """Get detailed GPU memory information"""
    if not torch.cuda.is_available():
        return {}

    # PyTorch memory stats
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9

    # Get nvidia-smi info
    try:
        result = subprocess.run([
            'nvidia-smi',
            '--query-gpu=memory.used,memory.total',
            '--format=csv,nounits,noheader'
        ], capture_output=True, text=True)

        if result.returncode == 0:
            used, total = map(int, result.stdout.strip().split(','))
            nvidia_used_gb = used / 1024  # Convert MB to GB
            nvidia_total_gb = total / 1024
        else:
            nvidia_used_gb = 0
            nvidia_total_gb = 0
    except Exception:
        nvidia_used_gb = 0
        nvidia_total_gb = 0

    return {
        'allocated': allocated,
        'reserved': reserved,
        'nvidia_smi': nvidia_used_gb,
        'nvidia_total': nvidia_total_gb
    }

def print_memory_report(label, before, after):
    """Print detailed memory usage report"""
    print(f"\n{label}:")
    print(f" PyTorch Allocated: {before['allocated']:.2f} GB → {after['allocated']:.2f} GB "
          f"(+{after['allocated'] - before['allocated']:.2f} GB)")
    print(f" PyTorch Reserved: {before['reserved']:.2f} GB → {after['reserved']:.2f} GB "
          f"(+{after['reserved'] - before['reserved']:.2f} GB)")
    print(f" nvidia-smi Total: {before['nvidia_smi']:.2f} GB → {after['nvidia_smi']:.2f} GB "
          f"(+{after['nvidia_smi'] - before['nvidia_smi']:.2f} GB)")
    print(f" Memory Overhead: {after['reserved'] - after['allocated']:.2f} GB (PyTorch cache)")

def clear_gpu_memory():
    """Aggressively clear GPU memory"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        # Force memory pool cleanup
        torch.cuda.reset_peak_memory_stats()

def test_model_memory(model_path, model_name):
    """Test model with detailed memory tracking"""
    print(f"\n{'='*70}")
    print(f"Testing {model_name}")
    print(f"{'='*70}")

    # Clear memory and get baseline
    clear_gpu_memory()
    time.sleep(2)  # Let memory settle

    baseline = get_gpu_memory_info()
    print("\nBaseline GPU Memory:")
    print(f" PyTorch Allocated: {baseline['allocated']:.2f} GB")
    print(f" PyTorch Reserved: {baseline['reserved']:.2f} GB")
    print(f" nvidia-smi Shows: {baseline['nvidia_smi']:.2f} GB / {baseline['nvidia_total']:.2f} GB")

    # Load model
    print(f"\nLoading {model_name}...")
    load_start = time.time()

    processor = VibeVoiceProcessor.from_pretrained(model_path)
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )
    model.eval()

    load_time = time.time() - load_start
    print(f"Loaded in {load_time:.1f} s")

    # Get memory after loading
    loaded = get_gpu_memory_info()
    print_memory_report("After Model Loading", baseline, loaded)

    # Test generation to see peak usage
    print("\nTesting generation...")
    test_text = "Speaker 1: Testing memory usage. Speaker 2: Let's see the results!"
    voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"
    speaker_voices = [
        os.path.join(voices_dir, "en-Alice_woman.wav"),
        os.path.join(voices_dir, "en-Carter_man.wav")
    ]

    inputs = processor(
        text=[test_text],
        voice_samples=[speaker_voices],
        padding=True,
        return_tensors="pt",
        return_attention_mask=True,
    )

    # Monitor during generation
    pre_gen = get_gpu_memory_info()

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,
            cfg_scale=1.3,
            tokenizer=processor.tokenizer,
            generation_config={'do_sample': False},
        )

    post_gen = get_gpu_memory_info()
    print_memory_report("During Generation", pre_gen, post_gen)

    # Peak memory stats
    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated() / 1e9
        peak_reserved = torch.cuda.max_memory_reserved() / 1e9
        print("\nPeak Memory Usage:")
        print(f" Peak Allocated: {peak_memory:.2f} GB")
        print(f" Peak Reserved: {peak_reserved:.2f} GB")

    # Clean up
    del model
    del processor
    clear_gpu_memory()

    return {
        'name': model_name,
        'allocated': loaded['allocated'] - baseline['allocated'],
        'reserved': loaded['reserved'] - baseline['reserved'],
        'nvidia_smi': loaded['nvidia_smi'] - baseline['nvidia_smi'],
        'peak_allocated': peak_memory,
        'peak_reserved': peak_reserved
    }

def main():
    print("="*70)
    print("ACCURATE VRAM MEASUREMENT FOR VIBEVOICE")
    print("="*70)
    print("\nNote: PyTorch reserves extra memory for efficiency.")
    print("nvidia-smi shows total reserved memory, not just allocated.")

    models = [
        {
            "path": "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-pt",
            "name": "16-bit Original"
        },
        {
            "path": "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit",
            "name": "4-bit Quantized"
        }
    ]

    results = []
    for model_info in models:
        try:
            result = test_model_memory(model_info["path"], model_info["name"])
            results.append(result)
            time.sleep(5)
        except Exception as e:
            print(f"Error testing {model_info['name']}: {e}")

    # Summary
    print("\n" + "="*70)
    print("MEMORY USAGE SUMMARY")
    print("="*70)
    print(f"\n{'Model':<20} {'Allocated':<12} {'Reserved':<12} {'nvidia-smi':<12} {'Peak':<12}")
    print("-"*70)

    for r in results:
        print(f"{r['name']:<20} "
              f"{r['allocated']:<12.2f} "
              f"{r['reserved']:<12.2f} "
              f"{r['nvidia_smi']:<12.2f} "
              f"{r['peak_allocated']:<12.2f}")

    print("\n💡 Key Insights:")
    print("- 'Allocated' = Actual model weights in memory")
    print("- 'Reserved' = Total GPU memory reserved by PyTorch (includes cache)")
    print("- 'nvidia-smi' = What nvidia-smi reports (includes all overhead)")
    print("- The difference is PyTorch's memory pool for efficiency")

if __name__ == "__main__":
    main()
4bit/use_quantized_model.py ADDED
@@ -0,0 +1,70 @@
#!/usr/bin/env python
"""
Simple example of using the pre-quantized VibeVoice model
No need for on-the-fly quantization - loads much faster!
"""

import os
import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

def main():
    # Path to the pre-quantized model
    model_path = "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit"

    print("Loading pre-quantized VibeVoice 4-bit model...")

    # Load processor
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Load the pre-quantized model
    # The quantization config is already saved in the model
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )
    model.eval()

    # Check memory usage
    memory_gb = torch.cuda.memory_allocated() / 1e9
    print(f"✅ Model loaded! Memory usage: {memory_gb:.1f} GB")

    # Example generation
    text = "Speaker 1: Welcome to our podcast! Speaker 2: Thanks for having me!"

    # Voice samples (using demo voices)
    voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"
    speaker_voices = [
        os.path.join(voices_dir, "en-Alice_woman.wav"),
        os.path.join(voices_dir, "en-Carter_man.wav")
    ]

    # Process inputs
    inputs = processor(
        text=[text],
        voice_samples=[speaker_voices],
        padding=True,
        return_tensors="pt",
        return_attention_mask=True,
    )

    # Generate
    print(f"\nGenerating: '{text}'")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,
            cfg_scale=1.3,
            tokenizer=processor.tokenizer,
            generation_config={'do_sample': False},
        )

    # Save output
    output_path = "quantized_output.wav"
    processor.save_audio(outputs.speech_outputs[0], output_path=output_path)
    print(f"✅ Audio saved to: {output_path}")

if __name__ == "__main__":
    main()
4bit/vibevoice_7gb_target.py ADDED
@@ -0,0 +1,196 @@
#!/usr/bin/env python
"""
Load VibeVoice 4-bit in ~7GB VRAM.
Minimize PyTorch's memory pool overhead.
"""

import os
import gc
import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

# CRITICAL: Set these BEFORE any CUDA operations
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True'

# Reduce memory fraction to force PyTorch to be more conservative
torch.cuda.set_per_process_memory_fraction(0.75)  # This limits reserved memory

def get_memory_stats():
    """Get detailed memory statistics"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        free = torch.cuda.mem_get_info()[0] / 1e9
        total = torch.cuda.mem_get_info()[1] / 1e9
        return {
            'allocated': allocated,
            'reserved': reserved,
            'free': free,
            'total': total,
            'used': total - free
        }
    return {}

def load_model_minimal(model_path):
    """Load model with absolute minimal memory overhead"""
    print("Loading 4-bit model with minimal overhead...")

    # Start clean
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    # Report initial state
    stats = get_memory_stats()
    print("\nInitial state:")
    print(f" GPU total: {stats['total']:.2f} GB")
    print(f" GPU used: {stats['used']:.2f} GB")
    print(f" GPU free: {stats['free']:.2f} GB")

    # Load processor
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Load model - let it use the default device map
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    )

    # Immediately set to eval and disable gradients
    model.eval()
    model.requires_grad_(False)

    # Force cleanup
    gc.collect()
    torch.cuda.empty_cache()

    # Report after loading
    stats = get_memory_stats()
    print("\nAfter loading:")
    print(f" Allocated: {stats['allocated']:.2f} GB (actual model)")
    print(f" Reserved: {stats['reserved']:.2f} GB (PyTorch total)")
    print(f" Overhead: {stats['reserved'] - stats['allocated']:.2f} GB")
    print(f" System reports: {stats['used']:.2f} GB used")

    return model, processor

def generate_minimal(model, processor, text, speaker_voices):
    """Generate with minimal memory overhead"""
    # Process inputs
    inputs = processor(
        text=[text],
        voice_samples=[speaker_voices],
        padding=True,
        return_tensors="pt",
        return_attention_mask=True,
    )

    # Generate without gradients; the KV cache stays enabled since disabling it
    # saves little memory here and slows generation down
    with torch.no_grad():
        # Temporarily reduce memory fragmentation
        torch.cuda.empty_cache()

        outputs = model.generate(
            **inputs,
            max_new_tokens=None,
            cfg_scale=1.3,
            tokenizer=processor.tokenizer,
            generation_config={
                'do_sample': False,
                'use_cache': True,
            },
        )

    # Cleanup
    del inputs
    gc.collect()

    return outputs

def try_memory_reduction_tricks():
    """Additional tricks to reduce memory"""
    print("\n🔧 Applying memory reduction tricks...")

    # 1. Reduce CUDA kernel reservation
    if hasattr(torch.cuda, 'set_allocator_settings'):
        torch.cuda.set_allocator_settings(backend='native')

    # 2. Force synchronization and cleanup
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

    # 3. Try to release unused cached blocks
    allocated_before = torch.cuda.memory_allocated()
    reserved_before = torch.cuda.memory_reserved()

    # This might help
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

    allocated_after = torch.cuda.memory_allocated()
    reserved_after = torch.cuda.memory_reserved()

    if reserved_before > reserved_after:
        print(f" ✓ Freed {(reserved_before - reserved_after) / 1e9:.2f} GB")

def main():
    # Paths
    model_path = "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit"
    voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"

    print("="*60)
    print("VIBEVOICE 4-BIT - 7GB TARGET MODE")
    print("="*60)

    # Apply tricks before loading
    try_memory_reduction_tricks()

    # Load model
    model, processor = load_model_minimal(model_path)

    # Try to compact memory after loading
    try_memory_reduction_tricks()

    # Test generation
    test_text = "Speaker 1: Testing minimal memory. Speaker 2: Hope it works!"
    speaker_voices = [
        os.path.join(voices_dir, "en-Alice_woman.wav"),
        os.path.join(voices_dir, "en-Carter_man.wav")
    ]

    print("\n🎤 Generating audio...")
    outputs = generate_minimal(model, processor, test_text, speaker_voices)

    # Final stats
    stats = get_memory_stats()
    print("\nFinal memory usage:")
    print(f" Allocated: {stats['allocated']:.2f} GB")
    print(f" Reserved: {stats['reserved']:.2f} GB")
    print(f" Total used: {stats['used']:.2f} GB")

    # Save output
    output_path = "7gb_target_output.wav"
    processor.save_audio(outputs.speech_outputs[0], output_path=output_path)
    print(f"\n✅ Audio saved to: {output_path}")

    # Analysis
    print("\n📊 Analysis:")
    overhead = stats['reserved'] - stats['allocated']
    print(f"The {overhead:.2f} GB overhead comes from:")
    print("- PyTorch memory pool fragmentation")
    print("- CUDA kernel workspace")
    print("- Temporary buffers for operations")
    print("\n💡 The model IS 6.6GB, but PyTorch needs workspace!")

    # Extreme options
    print("\n🚀 To truly get to 7GB total, you could:")
    print("1. Use bnb 3-bit quantization (experimental)")
    print("2. Prune some model layers")
    print("3. Use a custom CUDA allocator")
    print("4. Compile with torch.compile() for memory efficiency")

if __name__ == "__main__":
    main()