NLPGenius committed
Commit faca925 · Parent: 3de5747

Optimize for GPU acceleration on Hugging Face Spaces

Files changed (3):
  1. Dockerfile +37 -8
  2. app.py +31 -1
  3. requirements.txt +20 -20
Dockerfile CHANGED
@@ -1,16 +1,48 @@
-FROM python:3.11
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
+
+# Install Python 3.11 from the deadsnakes PPA; pip is bootstrapped with
+# get-pip.py, since deadsnakes ships no python3.11-pip package
+RUN apt-get update && apt-get install -y \
+        software-properties-common \
+        curl \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update && apt-get install -y \
+        python3.11 \
+        python3.11-dev \
+        python3.11-distutils \
+        build-essential \
+    && rm -rf /var/lib/apt/lists/* \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
+
+# Set Python 3.11 as default
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
+RUN update-alternatives --install /usr/bin/pip3 pip3 /usr/local/bin/pip3.11 1
+
+# Set environment variables for GPU optimization
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PIP_NO_CACHE_DIR=1
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1
+ENV CUDA_VISIBLE_DEVICES=0
+ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6"
 
 WORKDIR /code
 
+# Copy and install requirements first (for better caching)
 COPY ./requirements.txt /code/requirements.txt
-RUN python3 -m pip install --no-cache-dir --upgrade pip
-RUN python3 -m pip install --no-cache-dir --upgrade -r /code/requirements.txt
+RUN pip3 install --no-cache-dir --upgrade pip && \
+    pip3 install --no-cache-dir -r /code/requirements.txt && \
+    python3 -c "import nltk; nltk.download('punkt', quiet=True); nltk.download('stopwords', quiet=True)"
 
+# Copy application files
 COPY . .
 
-RUN mkdir /.cache
-RUN chmod 777 /.cache
-RUN mkdir .chroma
-RUN chmod 777 .chroma
+# Create necessary directories with proper permissions
+RUN mkdir -p /.cache .chroma /root/.cache/huggingface && \
+    chmod 777 /.cache .chroma /root/.cache/huggingface
+
+# Expose port
+EXPOSE 7860
 
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+# Use exec form with GPU-optimized settings
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import torch
 from fastapi import FastAPI, Request
 from fastapi.responses import HTMLResponse, JSONResponse
 from fastapi.staticfiles import StaticFiles
@@ -14,6 +15,18 @@ from lawchatbot.retrievers import (
 )
 from lawchatbot.rag_chain import initialize_llm, build_rag_chain, run_rag_query
 
+# GPU optimization setup
+def setup_gpu_optimization():
+    """Configure GPU settings for optimal performance"""
+    if torch.cuda.is_available():
+        torch.backends.cudnn.benchmark = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        print(f"🚀 GPU detected: {torch.cuda.get_device_name(0)}")
+        print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
+    else:
+        print("⚠️ No GPU detected, using CPU")
+
 app = FastAPI()
 
 # Set up static and template directories (relative to this file)
@@ -26,6 +39,9 @@ _system = {}
 
 @app.on_event("startup")
 def startup_event():
+    # Setup GPU optimization first
+    setup_gpu_optimization()
+
     with _init_lock:
         if not _system:
             config = AppConfig(
@@ -39,6 +55,8 @@ def startup_event():
                 bm25_k=10,
                 alpha=0.5
             )
+
+            print("🔄 Initializing system components with GPU acceleration...")
             client = initialize_weaviate_client(config)
             vectorstore = initialize_vector_store(client, config)
             semantic_ret = initialize_semantic_retriever(vectorstore, config)
@@ -47,15 +65,22 @@ def startup_event():
             hybrid_ret = wrap_retriever_with_source(hybrid_ret)
             llm = initialize_llm()
             rag_chain = build_rag_chain(llm, hybrid_ret)
+
             _system.update({
                 "client": client,
                 "rag_chain": rag_chain
             })
+
             try:
                 print("⏳ Pre-warming system with dummy query...")
                 dummy_question = "This is a warmup question."
                 rag_chain.invoke({"question": dummy_question})
-                print("✅ System pre-warmed and ready for fast responses.")
+
+                # Clear GPU cache after warmup
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+                print("✅ System pre-warmed and ready for fast GPU-accelerated responses.")
             except Exception as e:
                 print(f"Warmup failed: {e}")
 
@@ -70,6 +95,11 @@ def shutdown_event():
             sys.close()
         except Exception:
             pass
+
+    # Clear GPU memory on shutdown
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        print("🧹 GPU memory cleared on shutdown")
 
 @app.get("/", response_class=HTMLResponse)
 def chat_page(request: Request):
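
The pre-warm step exists because the first query pays one-time costs: CUDA context creation, kernel caching, and cuDNN autotuning when benchmark mode is on. A standalone sketch of the effect, with a small torch module standing in for the real rag_chain (illustrative only, not part of this commit):

# warmup_demo.py -- cold vs. warm latency for the same forward pass
import time
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(4096, 4096).to(device)
x = torch.randn(8, 4096, device=device)

for label in ("cold (context init, kernel caches)", "warm"):
    start = time.perf_counter()
    with torch.no_grad():
        model(x)
    if device == "cuda":
        torch.cuda.synchronize()  # wait for the kernel before stopping the clock
    print(f"{label}: {(time.perf_counter() - start) * 1e3:.2f} ms")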
requirements.txt CHANGED
@@ -1,20 +1,22 @@
-neumai
-weaviate-client
-langchain-weaviate
-langchain_community
-rank_bm25
-transformers>=4.42.0
-accelerate
-torch
-torchvision
-pillow
-nltk
-python-dotenv
-pydantic
-pydantic-settings
-langchain
-langchain-core
-langchain-openai
-fastapi
-uvicorn
-jinja2
+# GPU-optimized for Hugging Face Spaces (16GB VRAM)
+--extra-index-url https://download.pytorch.org/whl/cu118
+weaviate-client==3.24.2
+langchain-weaviate==0.0.3
+langchain-community==0.2.16
+langchain==0.2.16
+langchain-core==0.2.38
+langchain-openai==0.1.25
+rank_bm25==0.2.2
+transformers==4.42.0
+torch==2.1.0+cu118
+accelerate==0.24.1
+sentence-transformers==2.2.2
+python-dotenv==1.0.0
+pydantic==2.8.2
+pydantic-settings==2.4.0
+fastapi==0.112.0
+uvicorn[standard]==0.30.6
+jinja2==3.1.4
+nltk==3.8.1
+numpy==1.24.3
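
Note that pip does not accept an inline --index-url after a requirement, which is why the PyTorch wheel index is given on its own --extra-index-url line above (extra, rather than replacing PyPI, so the remaining packages still resolve normally). After installation, a quick check that the CUDA build of torch was actually picked up (an illustrative sketch, not part of this commit):

# env_check.py -- confirm the pinned +cu118 wheel, not a CPU-only build
import torch

assert "+cu118" in torch.__version__, f"unexpected torch build: {torch.__version__}"
print("torch:", torch.__version__)           # expected: 2.1.0+cu118
print("CUDA runtime:", torch.version.cuda)   # expected: 11.8
print("GPU available:", torch.cuda.is_available())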