babycry / app.py
Jitender1278's picture
Update app.py
48a45c6 verified
# app.py - Baby Cry Classifier with WORKING HuggingFace API Support
import gradio as gr
import numpy as np
import librosa
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import os
import json
from datetime import datetime
import uuid
import math
import requests
import tempfile
warnings.filterwarnings('ignore')
class FoduucomStyleBabyCryClassifier:
"""
Baby cry classifier replicating foduucom/baby-cry-classification approach
"""
def __init__(self):
self.model = None
self.scaler = None
self.label_encoder = LabelEncoder()
self.categories = ["belly_pain", "burping", "discomfort", "hunger", "tiredness"]
self.is_trained = False
# Audio processing parameters (matching foduucom model)
self.sr = 16000
self.n_mfcc = 40
self.n_mels = 128
self.n_fft = 2048
self.hop_length = 512
self.win_length = 2048
self.window = 'hann'
self.n_bands = 6
self.fmin = 200.0
print("🍼 Initializing foduucom-style Baby Cry Classifier...")
def extract_features(self, file_path):
"""Extract features exactly like foduucom model"""
try:
# Handle URL inputs
if isinstance(file_path, str) and file_path.startswith('http'):
print(f"πŸ“₯ Downloading audio from URL: {file_path}")
response = requests.get(file_path, timeout=30)
response.raise_for_status()
# Save to temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp_file:
tmp_file.write(response.content)
file_path = tmp_file.name
print(f"βœ… Downloaded to: {file_path}")
# Load audio file
y, sr = librosa.load(file_path, sr=self.sr)
if len(y) < 1024:
return None
# MFCC features (40 coefficients)
mfcc = np.mean(librosa.feature.mfcc(
y=y, sr=sr, n_mfcc=self.n_mfcc,
n_fft=self.n_fft, hop_length=self.hop_length,
win_length=self.win_length, window=self.window
).T, axis=0)
# Mel-spectrogram features
mel = np.mean(librosa.feature.melspectrogram(
y=y, sr=sr,
n_fft=self.n_fft, hop_length=self.hop_length,
win_length=self.win_length, window='hann',
n_mels=self.n_mels
).T, axis=0)
# STFT for chroma and contrast
stft = np.abs(librosa.stft(y))
# Chroma features
chroma = np.mean(librosa.feature.chroma_stft(
S=stft, y=y, sr=sr
).T, axis=0)
# Spectral contrast
contrast = np.mean(librosa.feature.spectral_contrast(
S=stft, y=y, sr=sr,
n_fft=self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
n_bands=self.n_bands,
fmin=self.fmin
).T, axis=0)
# Tonnetz features
tonnetz = np.mean(librosa.feature.tonnetz(y=y, sr=sr).T, axis=0)
# Concatenate all features
features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz))
print(f"βœ… Extracted {len(features)} features")
return features
except Exception as e:
print(f"❌ Feature extraction error: {e}")
return None
def _create_realistic_training_data(self):
"""Create balanced training data"""
np.random.seed(42)
n_samples_per_class = 1000
X_synthetic = []
y_synthetic = []
cry_characteristics = {
"hunger": {
"base_energy": 0.7,
"mfcc_boost": 0.4,
"frequency_range": (60, 100),
"pattern_type": "rhythmic",
"contrast_level": 0.3
},
"belly_pain": {
"base_energy": 0.6,
"mfcc_boost": 0.2,
"frequency_range": (20, 60),
"pattern_type": "strained",
"contrast_level": 0.4
},
"burping": {
"base_energy": 0.8,
"mfcc_boost": 0.5,
"frequency_range": (40, 80),
"pattern_type": "bursts",
"contrast_level": 0.5
},
"discomfort": {
"base_energy": 0.5,
"mfcc_boost": 0.1,
"frequency_range": (80, 120),
"pattern_type": "sharp",
"contrast_level": 0.2
},
"tiredness": {
"base_energy": 0.3,
"mfcc_boost": -0.1,
"frequency_range": (10, 40),
"pattern_type": "declining",
"contrast_level": 0.1
}
}
for category in self.categories:
char = cry_characteristics[category]
for sample_idx in range(n_samples_per_class):
features = []
# MFCC features (40)
if category == "hunger":
mfcc_features = np.random.normal(0.8, 0.2, 40)
for i in range(0, 40, 8):
mfcc_features[i] += 0.5
elif category == "belly_pain":
mfcc_features = np.random.normal(0.4, 0.15, 40)
mfcc_features = np.tanh(mfcc_features * 2) * 0.3
elif category == "burping":
mfcc_features = np.random.normal(0.6, 0.25, 40)
for i in range(0, 40, 10):
end_idx = min(i + 3, 40)
mfcc_features[i:end_idx] += 0.7
elif category == "discomfort":
mfcc_features = np.random.normal(0.2, 0.3, 40)
peak_indices = np.random.choice(40, 3, replace=False)
mfcc_features[peak_indices] += 0.2
else: # tiredness
decline = np.linspace(0.1, -0.2, 40)
mfcc_features = decline + np.random.normal(0, 0.1, 40)
features.extend(mfcc_features)
# Chroma features (12)
if category == "hunger":
chroma_features = np.random.normal(0.5, 0.15, 12)
elif category == "belly_pain":
chroma_features = np.random.normal(0.3, 0.1, 12)
elif category == "burping":
chroma_features = np.random.normal(0.6, 0.2, 12)
elif category == "discomfort":
chroma_features = np.random.normal(0.2, 0.12, 12)
else: # tiredness
chroma_features = np.random.normal(0.1, 0.08, 12)
features.extend(chroma_features)
# Mel-spectrogram features (128)
mel_base = char["base_energy"] * 0.4
mel_features = np.random.normal(mel_base, 0.15, 128)
freq_start, freq_end = char["frequency_range"]
if category == "hunger":
mel_features[freq_start:freq_end] += 0.6
elif category == "belly_pain":
mel_features[freq_start:freq_end] += 0.4
elif category == "burping":
mel_features[freq_start:freq_end] += 0.7
elif category == "discomfort":
mel_features[freq_start:freq_end] += 0.2
else: # tiredness
mel_features[freq_start:freq_end] += 0.1
features.extend(mel_features)
# Spectral contrast (7)
if category == "hunger":
contrast_features = np.random.normal(0.4, 0.1, 7)
elif category == "belly_pain":
contrast_features = np.random.normal(0.3, 0.08, 7)
elif category == "burping":
contrast_features = np.random.normal(0.5, 0.12, 7)
elif category == "discomfort":
contrast_features = np.random.normal(0.15, 0.06, 7)
else: # tiredness
contrast_features = np.random.normal(0.05, 0.04, 7)
features.extend(contrast_features)
# Tonnetz features (6)
if category == "hunger":
tonnetz_features = np.random.normal(0.3, 0.1, 6)
elif category == "belly_pain":
tonnetz_features = np.random.normal(0.2, 0.08, 6)
elif category == "burping":
tonnetz_features = np.random.normal(0.35, 0.12, 6)
elif category == "discomfort":
tonnetz_features = np.random.normal(0.1, 0.06, 6)
else: # tiredness
tonnetz_features = np.random.normal(0.05, 0.04, 6)
features.extend(tonnetz_features)
# Add light noise
features = np.array(features)
features += np.random.normal(0, 0.02, len(features))
X_synthetic.append(features)
y_synthetic.append(category)
# Shuffle data
combined = list(zip(X_synthetic, y_synthetic))
np.random.shuffle(combined)
X_synthetic, y_synthetic = zip(*combined)
return np.array(X_synthetic), np.array(y_synthetic)
def train(self):
"""Train the model"""
print("πŸ”¬ Creating training data...")
X_synthetic, y_synthetic = self._create_realistic_training_data()
# Encode labels
self.label_encoder.fit(self.categories)
y_encoded = self.label_encoder.transform(y_synthetic)
# Split data
X_train, X_val, y_train, y_val = train_test_split(
X_synthetic, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
# Scale features
self.scaler = StandardScaler()
X_train_scaled = self.scaler.fit_transform(X_train)
X_val_scaled = self.scaler.transform(X_val)
# Train model
self.model = RandomForestClassifier(
n_estimators=100,
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
max_features='sqrt',
bootstrap=True,
class_weight='balanced',
random_state=42,
n_jobs=-1
)
self.model.fit(X_train_scaled, y_train)
val_accuracy = self.model.score(X_val_scaled, y_val)
print(f"βœ… Validation accuracy: {val_accuracy:.3f}")
self.is_trained = True
return val_accuracy
def predict(self, audio_input):
"""Make prediction - handles both UI and API inputs"""
if not self.is_trained:
self.train()
# Handle different input types
audio_path = None
if isinstance(audio_input, dict):
if 'path' in audio_input:
audio_path = audio_input['path']
elif 'name' in audio_input:
audio_path = audio_input['name']
elif isinstance(audio_input, str):
audio_path = audio_input
else:
audio_path = audio_input
if audio_path is None:
return {"success": False, "error": "No valid audio input provided"}
features = self.extract_features(audio_path)
if features is None:
return {"success": False, "error": "Feature extraction failed"}
try:
features_scaled = self.scaler.transform(features.reshape(1, -1))
prediction = self.model.predict(features_scaled)[0]
probabilities = self.model.predict_proba(features_scaled)[0]
predicted_label = self.label_encoder.inverse_transform([prediction])[0]
prob_dict = {}
for i, category in enumerate(self.categories):
prob_dict[category] = float(probabilities[i])
confidence = np.max(probabilities)
sorted_probs = sorted(prob_dict.items(), key=lambda x: x[1], reverse=True)
# Add recommendations
recommendations = {
"hunger": {
"immediate": "Offer feeding - check if it's been 2-3 hours since last meal",
"details": "Look for additional hunger cues: rooting reflex, sucking motions, bringing hands to mouth"
},
"tiredness": {
"immediate": "Create calm sleep environment - dim lights, reduce noise, comfortable temperature",
"details": "Try soothing techniques: gentle rocking, swaddling, white noise, pacifier"
},
"discomfort": {
"immediate": "Check diaper immediately and examine clothing fit and room temperature",
"details": "Look for physical irritants: hair wrapped around fingers/toes, skin irritation"
},
"belly_pain": {
"immediate": "Apply gentle clockwise tummy massage and try gas relief positions",
"details": "Hold baby upright, bicycle legs gently, check feeding pace and burping frequency"
},
"burping": {
"immediate": "Try different burping positions - shoulder, lap, face-down positions",
"details": "Be patient (5-10 minutes), gentle back patting with circular motions"
}
}
return {
"success": True,
"prediction": predicted_label,
"confidence": float(confidence),
"probabilities": prob_dict,
"top_predictions": sorted_probs,
"recommendations": recommendations.get(predicted_label, {
"immediate": "Monitor baby closely and try general comfort measures",
"details": "Address basic needs systematically: feeding, diaper, position, temperature"
}),
"timestamp": datetime.now().isoformat(),
"session_id": str(uuid.uuid4())[:8],
"model_info": "foduucom-style implementation"
}
except Exception as e:
return {"success": False, "error": f"Prediction error: {str(e)}"}
# Initialize classifier
classifier = FoduucomStyleBabyCryClassifier()
def predict_baby_cry(audio_url):
"""
Main prediction function for API and UI
"""
print(f"πŸ” Prediction request: {audio_url}")
if not audio_url or audio_url.strip() == "":
return {"success": False, "error": "No audio URL provided"}
result = classifier.predict(audio_url)
print(f"βœ… Prediction result: {result.get('prediction', 'error')}")
return result
def web_interface_predict(audio_file):
"""Web interface function for file uploads"""
if audio_file is None:
return "❌ No audio file provided", "{}"
result = classifier.predict(audio_file)
if not result["success"]:
return f"❌ Error: {result['error']}", json.dumps(result, indent=2)
# Create summary
prediction = result["prediction"]
confidence = result["confidence"]
category_names = {
"hunger": "🍼 Hunger",
"tiredness": "😴 Tiredness",
"discomfort": "😣 Discomfort",
"belly_pain": "🀱 Belly Pain",
"burping": "🫧 Burping"
}
primary_category = category_names.get(prediction, prediction.title())
summary = f"""## 🍼 Baby Cry Analysis
### 🎯 **What Your Baby Needs**
**{primary_category}** ({confidence:.0%} confidence)
### πŸ’‘ **Immediate Action**
{result["recommendations"]["immediate"]}
### πŸ“‹ **Additional Guidance**
{result["recommendations"]["details"]}
### πŸ“ˆ **All Probabilities**
"""
for category, prob_val in result["top_predictions"]:
display_name = category_names.get(category, category.title())
bar_length = int(prob_val * 20)
bar = "β–ˆ" * bar_length + "β–‘" * (20 - bar_length)
summary += f"\n**{display_name}**: {prob_val:.1%} {bar}"
return summary, json.dumps(result, indent=2)
# Create a simple Interface that will work with HuggingFace API
api_interface = gr.Interface(
fn=predict_baby_cry,
inputs=gr.Textbox(
label="Audio URL",
placeholder="https://raw.githubusercontent.com/jiten-kmar/python-projects/main/baby-crying-32232.mp3",
info="Enter the URL of an audio file to analyze"
),
outputs=gr.JSON(label="Baby Cry Analysis"),
title="🍼 Baby Cry Classifier - API Ready",
description="Analyze baby cries to understand what your baby needs. This interface works with both UI and API calls.",
examples=[
["https://raw.githubusercontent.com/jiten-kmar/python-projects/main/baby-crying-32232.mp3"]
]
)
# Create a file upload interface
upload_interface = gr.Interface(
fn=web_interface_predict,
inputs=gr.Audio(label="Upload Baby Cry Audio", type="filepath"),
outputs=[
gr.Markdown(label="Analysis Summary"),
gr.Code(label="JSON Data", language="json")
],
title="🍼 Baby Cry Classifier - File Upload",
description="Upload an audio file directly to analyze baby cries."
)
# Combine interfaces
demo = gr.TabbedInterface(
[api_interface, upload_interface],
["🌐 API Interface", "πŸ“ File Upload"],
title="🍼 Baby Cry Classifier"
)
# Add documentation as a separate Blocks interface
with gr.Blocks() as full_demo:
gr.HTML("""
<div style="text-align: center; margin-bottom: 20px;">
<h1>🍼 Baby Cry Classifier</h1>
<p><em>βœ… API Ready - Use Python Client for Best Results!</em></p>
</div>
""")
# Render the main demo
demo.render()
# Add documentation
with gr.Accordion("πŸ“– API Documentation & Usage", open=True):
gr.Markdown("""
## 🚨 Important: HuggingFace API Limitations
**HuggingFace Spaces no longer supports direct curl commands** due to their queue system.
Here are the working alternatives:
## βœ… Method 1: Python Client (RECOMMENDED)
```python
from gradio_client import Client
# Initialize client
client = Client("https://jitender1278-babycry.hf.space/")
# Make prediction
result = client.predict(
"https://raw.githubusercontent.com/jiten-kmar/python-projects/main/baby-crying-32232.mp3",
api_name="/predict"
)
print(result)
```
## βœ… Method 2: JavaScript/Node.js
```javascript
import { Client } from "@gradio/client";
const client = await Client.connect("https://jitender1278-babycry.hf.space/");
const result = await client.predict("/predict", {
audio_url: "https://raw.githubusercontent.com/jiten-kmar/python-projects/main/baby-crying-32232.mp3"
});
console.log(result.data);
```
## βœ… Method 3: Python Requests (Alternative)
```python
import requests
import json
# This uses the gradio_client internally
from gradio_client import Client
def analyze_baby_cry(audio_url):
client = Client("https://jitender1278-babycry.hf.space/")
result = client.predict(audio_url, api_name="/predict")
return result
# Usage
audio_url = "https://raw.githubusercontent.com/jiten-kmar/python-projects/main/baby-crying-32232.mp3"
analysis = analyze_baby_cry(audio_url)
print(json.dumps(analysis, indent=2))
```
## πŸ“ Response Format
```json
{
"success": true,
"prediction": "hunger",
"confidence": 0.85,
"probabilities": {
"hunger": 0.85,
"tiredness": 0.10,
"discomfort": 0.03,
"belly_pain": 0.01,
"burping": 0.01
},
"recommendations": {
"immediate": "Offer feeding - check if it's been 2-3 hours since last meal",
"details": "Look for additional hunger cues: rooting reflex, sucking motions, bringing hands to mouth"
},
"timestamp": "2025-06-12T12:00:00.000000",
"session_id": "abc12345",
"model_info": "foduucom-style implementation"
}
```
## πŸ”§ Installation
```bash
pip install gradio-client
```
## πŸ“‹ Supported Audio Formats
- MP3, WAV, M4A, FLAC, OGG
- Maximum file size: ~10MB
- Audio URLs must be publicly accessible
## ⚠️ Why Curl Doesn't Work
HuggingFace Spaces now uses a queue system that requires WebSocket connections for real-time processing.
Direct HTTP POST requests are blocked to prevent abuse and ensure fair resource allocation.
Use the Python client above for the best API experience!
""")
if __name__ == "__main__":
print("πŸš€ Starting Baby Cry Classifier...")
print("πŸ“ Note: Use Python gradio_client for API access (curl not supported)")
full_demo.launch(
server_name="0.0.0.0",
server_port=7860,
show_error=True
)