NLPGenius committed
Commit faca925 · Parent: 3de5747

Optimize for GPU acceleration on Hugging Face Spaces

Files changed (3):
  1. Dockerfile +37 -8
  2. app.py +31 -1
  3. requirements.txt +20 -20
Dockerfile CHANGED
@@ -1,16 +1,48 @@
-FROM python:3.11
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
+
+# Install Python 3.11 from the deadsnakes PPA; pip is bootstrapped with
+# get-pip.py, since deadsnakes ships no python3.11-pip package
+RUN apt-get update && apt-get install -y \
+        software-properties-common \
+        curl \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update && apt-get install -y \
+        python3.11 \
+        python3.11-dev \
+        python3.11-distutils \
+        build-essential \
+    && rm -rf /var/lib/apt/lists/* \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
+
+# Set Python 3.11 as default
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
+RUN update-alternatives --install /usr/bin/pip3 pip3 /usr/local/bin/pip3.11 1
+
+# Set environment variables for GPU optimization
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PIP_NO_CACHE_DIR=1
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1
+ENV CUDA_VISIBLE_DEVICES=0
+ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6"
 
 WORKDIR /code
 
+# Copy and install requirements first (for better caching)
 COPY ./requirements.txt /code/requirements.txt
-RUN python3 -m pip install --no-cache-dir --upgrade pip
-RUN python3 -m pip install --no-cache-dir --upgrade -r /code/requirements.txt
+RUN pip3 install --no-cache-dir --upgrade pip && \
+    pip3 install --no-cache-dir -r /code/requirements.txt && \
+    python3 -c "import nltk; nltk.download('punkt', quiet=True); nltk.download('stopwords', quiet=True)"
 
+# Copy application files
 COPY . .
 
-RUN mkdir /.cache
-RUN chmod 777 /.cache
-RUN mkdir .chroma
-RUN chmod 777 .chroma
+# Create necessary directories with proper permissions
+RUN mkdir -p /.cache .chroma /root/.cache/huggingface && \
+    chmod 777 /.cache .chroma /root/.cache/huggingface
+
+# Expose port
+EXPOSE 7860
 
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+# Use exec form with GPU-optimized settings
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import torch
 from fastapi import FastAPI, Request
 from fastapi.responses import HTMLResponse, JSONResponse
 from fastapi.staticfiles import StaticFiles
@@ -14,6 +15,18 @@ from lawchatbot.retrievers import (
 )
 from lawchatbot.rag_chain import initialize_llm, build_rag_chain, run_rag_query
 
+# GPU optimization setup
+def setup_gpu_optimization():
+    """Configure GPU settings for optimal performance"""
+    if torch.cuda.is_available():
+        torch.backends.cudnn.benchmark = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        print(f"🚀 GPU detected: {torch.cuda.get_device_name(0)}")
+        print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
+    else:
+        print("⚠️ No GPU detected, using CPU")
+
 app = FastAPI()
 
 # Set up static and template directories (relative to this file)
@@ -26,6 +39,9 @@ _system = {}
 
 @app.on_event("startup")
 def startup_event():
+    # Setup GPU optimization first
+    setup_gpu_optimization()
+
     with _init_lock:
         if not _system:
             config = AppConfig(
@@ -39,6 +55,8 @@ def startup_event():
                 bm25_k=10,
                 alpha=0.5
             )
+
+            print("🔄 Initializing system components with GPU acceleration...")
             client = initialize_weaviate_client(config)
             vectorstore = initialize_vector_store(client, config)
             semantic_ret = initialize_semantic_retriever(vectorstore, config)
@@ -47,15 +65,22 @@ def startup_event():
             hybrid_ret = wrap_retriever_with_source(hybrid_ret)
             llm = initialize_llm()
             rag_chain = build_rag_chain(llm, hybrid_ret)
+
             _system.update({
                 "client": client,
                 "rag_chain": rag_chain
             })
+
             try:
                 print("⏳ Pre-warming system with dummy query...")
                 dummy_question = "This is a warmup question."
                 rag_chain.invoke({"question": dummy_question})
-                print("✅ System pre-warmed and ready for fast responses.")
+
+                # Clear GPU cache after warmup
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+                print("✅ System pre-warmed and ready for fast GPU-accelerated responses.")
             except Exception as e:
                 print(f"Warmup failed: {e}")
 
@@ -70,6 +95,11 @@ def shutdown_event():
             sys.close()
         except Exception:
             pass
+
+    # Clear GPU memory on shutdown
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        print("🧹 GPU memory cleared on shutdown")
 
 @app.get("/", response_class=HTMLResponse)
 def chat_page(request: Request):
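
The pre-warm step exists because the first query pays one-time costs: CUDA context creation, kernel caching, and cuDNN autotuning when benchmark mode is on. A standalone sketch of the effect, with a small torch module standing in for the real rag_chain (illustrative only, not part of this commit):

# warmup_demo.py -- cold vs. warm latency for the same forward pass
import time
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(4096, 4096).to(device)
x = torch.randn(8, 4096, device=device)

for label in ("cold (context init, kernel caches)", "warm"):
    start = time.perf_counter()
    with torch.no_grad():
        model(x)
    if device == "cuda":
        torch.cuda.synchronize()  # wait for the kernel before stopping the clock
    print(f"{label}: {(time.perf_counter() - start) * 1e3:.2f} ms")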
requirements.txt CHANGED
@@ -1,20 +1,22 @@
-neumai
-weaviate-client
-langchain-weaviate
-langchain_community
-rank_bm25
-transformers>=4.42.0
-accelerate
-torch
-torchvision
-pillow
-nltk
-python-dotenv
-pydantic
-pydantic-settings
-langchain
-langchain-core
-langchain-openai
-fastapi
-uvicorn
-jinja2
+# GPU-optimized for Hugging Face Spaces (16GB VRAM)
+--extra-index-url https://download.pytorch.org/whl/cu118
+weaviate-client==3.24.2
+langchain-weaviate==0.0.3
+langchain-community==0.2.16
+langchain==0.2.16
+langchain-core==0.2.38
+langchain-openai==0.1.25
+rank_bm25==0.2.2
+transformers==4.42.0
+torch==2.1.0+cu118
+accelerate==0.24.1
+sentence-transformers==2.2.2
+python-dotenv==1.0.0
+pydantic==2.8.2
+pydantic-settings==2.4.0
+fastapi==0.112.0
+uvicorn[standard]==0.30.6
+jinja2==3.1.4
+nltk==3.8.1
+numpy==1.24.3
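
Note that pip does not accept an inline --index-url after a requirement, which is why the PyTorch wheel index is given on its own --extra-index-url line above (extra, rather than replacing PyPI, so the remaining packages still resolve normally). After installation, a quick check that the CUDA build of torch was actually picked up (an illustrative sketch, not part of this commit):

# env_check.py -- confirm the pinned +cu118 wheel, not a CPU-only build
import torch

assert "+cu118" in torch.__version__, f"unexpected torch build: {torch.__version__}"
print("torch:", torch.__version__)           # expected: 2.1.0+cu118
print("CUDA runtime:", torch.version.cuda)   # expected: 11.8
print("GPU available:", torch.cuda.is_available())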