Spaces:
Runtime error
Runtime error
# Multi-GPU Neural OS Startup Script
#
# Usage: <script> [--num-gpus N] [--port PORT]

# Default values
NUM_GPUS=2
DISPATCHER_PORT=8000

#######################################
# Parse command line arguments and validate the resulting settings.
# Globals:   NUM_GPUS (written), DISPATCHER_PORT (written)
# Arguments: the script's command line ("$@")
# Outputs:   usage text on stdout for -h/--help; diagnostics on stderr
# Returns:   0 on success; exits 0 after --help; exits 1 on an unknown
#            option, a missing option value, or an invalid value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --num-gpus | --port)
        # Without this guard "shift 2" fails when the value is missing,
        # nothing is shifted, and the loop would spin forever.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value" >&2
          exit 1
        fi
        if [[ "$1" == "--num-gpus" ]]; then
          NUM_GPUS="$2"
        else
          DISPATCHER_PORT="$2"
        fi
        shift 2
        ;;
      -h | --help)
        echo "Usage: $0 [--num-gpus N] [--port PORT]"
        echo " --num-gpus N Number of GPU workers to start (default: 2)"
        echo " --port PORT Dispatcher port (default: 8000)"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # Both values are used in arithmetic and integer comparisons later on,
  # so reject anything that is not a plain decimal number up front.
  if [[ ! "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
    echo "Invalid --num-gpus value: $NUM_GPUS (expected a positive integer)" >&2
    exit 1
  fi
  # 10# forces base 10 so a leading zero is not read as octal.
  if [[ ! "$DISPATCHER_PORT" =~ ^[0-9]{1,5}$ ]] ||
    ((10#$DISPATCHER_PORT < 1 || 10#$DISPATCHER_PORT > 65535)); then
    echo "Invalid --port value: $DISPATCHER_PORT (expected 1-65535)" >&2
    exit 1
  fi
}

parse_args "$@"
#######################################
# Stop the dispatcher and all GPU workers, then terminate the script.
# Globals:   DISPATCHER_PID (read) - PID of the background dispatcher;
#            may be unset if the dispatcher was never started
# Arguments: $1 - exit status for the script (optional, default 1).
#            The default is non-zero because an argument-less call is
#            made from failure paths; the signal handler passes 0.
# Outputs:   progress messages on stdout
# Returns:   never returns; exits with the requested status
#######################################
cleanup() {
  local exit_code="${1:-1}"
  local worker_pattern="python.*worker.py.*--gpu-id"

  echo ""
  echo "π Shutting down system..."

  # Kill dispatcher
  if [[ -n "${DISPATCHER_PID:-}" ]]; then
    echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
    kill "$DISPATCHER_PID" 2>/dev/null
    wait "$DISPATCHER_PID" 2>/dev/null
  fi

  # Kill workers by matching their command lines; "|| true" because
  # pkill returns non-zero when nothing matched, which is fine here.
  echo "Stopping workers..."
  pkill -f "$worker_pattern" 2>/dev/null || true

  # Give the workers a moment to exit on SIGTERM before forcing it
  sleep 2
  pkill -9 -f "$worker_pattern" 2>/dev/null || true

  echo "β System stopped"
  exit "$exit_code"
}

# Set up signal handlers: an operator-requested stop is a clean exit (0)
trap 'cleanup 0' SIGINT SIGTERM
# Startup banner: summarize the configuration that is about to be used.
last_worker_port=$((8000 + NUM_GPUS))
worker_port_list=$(seq -s', ' 8001 $last_worker_port)
analytics_stamp=$(date +%Y%m%d_%H%M%S)

echo "π Starting Multi-GPU Neural OS System"
echo "========================================"
echo "π Number of GPUs: $NUM_GPUS"
echo "π Dispatcher port: $DISPATCHER_PORT"
echo "π» Worker ports: $worker_port_list"
echo "π Analytics logging: system_analytics_${analytics_stamp}.log"
echo ""

# Preflight: every script launched below must exist in the current directory.
# The first missing file is reported and aborts the startup.
for required_file in dispatcher.py worker.py start_workers.py; do
  [[ -f "$required_file" ]] && continue
  echo "β Error: $required_file not found"
  exit 1
done
# Start dispatcher in the background; its output goes to dispatcher.log
echo "π― Starting dispatcher..."
python dispatcher.py --port "$DISPATCHER_PORT" > dispatcher.log 2>&1 &
DISPATCHER_PID=$!

# Wait a bit for dispatcher to start
sleep 3

# Check if dispatcher started successfully (kill -0 only probes the PID)
if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
  echo "β Failed to start dispatcher. Check dispatcher.log for errors."
  exit 1
fi
echo "β Dispatcher started (PID: $DISPATCHER_PID)"

# Start workers. start_workers.py runs in the foreground here, so the
# script continues only after it has returned.
echo "π§ Starting $NUM_GPUS GPU workers..."
python start_workers.py --num-gpus "$NUM_GPUS" --no-monitor > workers.log 2>&1
WORKER_START_EXIT_CODE=$?

# Wait a bit for workers to register
sleep 3

# Check if workers started successfully by checking the exit code
if [[ "$WORKER_START_EXIT_CODE" -ne 0 ]]; then
  echo "β Failed to start workers. Check workers.log for errors."
  cleanup
  exit 1 # only reached if cleanup returns
fi

# Count running worker processes. pgrep never reports itself, unlike
# "ps aux | grep", whose own command line matches the pattern. The
# arithmetic expansion normalizes wc's output to a bare integer ("0" when
# nothing matches).
RUNNING_WORKERS=$(( $(pgrep -f "python.*worker.py.*--gpu-id" | wc -l) ))
if (( RUNNING_WORKERS < NUM_GPUS )); then
  echo "β Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
  cleanup
  exit 1 # only reached if cleanup returns
fi
echo "β Workers started successfully ($RUNNING_WORKERS workers running)"
# Tell the operator where everything lives now that the system is up.
echo ""
echo "π System is ready!"
echo "================================"
echo "π Web interface: http://localhost:$DISPATCHER_PORT"
echo "π Dispatcher health: http://localhost:$DISPATCHER_PORT"

# One health URL per worker; worker ports are listed starting at 8001.
echo "π§ Worker health checks:"
gpu=0
while (( gpu < NUM_GPUS )); do
  echo " GPU $gpu: http://localhost:$((8001 + gpu))/health"
  gpu=$((gpu + 1))
done

echo ""
echo "π Log files:"
echo " System analytics: system_analytics_*.log (real-time monitoring)"
echo " Dispatcher: dispatcher.log"
echo " Workers summary: workers.log"

# One log file name per worker.
gpu=0
while (( gpu < NUM_GPUS )); do
  echo " GPU $gpu worker: worker_gpu_$gpu.log"
  gpu=$((gpu + 1))
done

echo ""
echo "π‘ Monitor system in real-time: tail -f system_analytics_*.log"
echo "Press Ctrl+C to stop the system"
echo "================================"
# Keep the script running and wait for interrupt. Every 5 seconds verify
# that the dispatcher and the expected number of workers are still alive;
# tear everything down as soon as either check fails.
while true; do
  # Check if the dispatcher is still running (kill -0 only probes the PID)
  if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
    echo "β οΈ Dispatcher process died unexpectedly"
    cleanup
    exit 1 # only reached if cleanup returns
  fi

  # Check if workers are still running. pgrep never reports itself, unlike
  # "ps aux | grep", whose own command line matches the pattern. The
  # arithmetic expansion normalizes wc's output to a bare integer.
  CURRENT_WORKERS=$(( $(pgrep -f "python.*worker.py.*--gpu-id" | wc -l) ))
  if (( CURRENT_WORKERS < NUM_GPUS )); then
    echo "β οΈ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
    cleanup
    exit 1 # only reached if cleanup returns
  fi

  sleep 5
done