ameliakris committed
Commit 613c8f7 · 1 parent: 16815f0

Update Dockerfile and improve error handling

Files changed (3):
  1. Dockerfile  +13 -3
  2. app.py      +5 -0
  3. llm.py      +19 -6
Dockerfile CHANGED
@@ -1,4 +1,4 @@
-FROM python:3.9-slim
+FROM python:3.9
 
 WORKDIR /app
 
@@ -6,12 +6,21 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
+    git \
     software-properties-common \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy requirements first to leverage Docker cache
 COPY requirements.txt .
-RUN pip install -r requirements.txt
+
+# Install Python packages
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Pre-download the model
+RUN python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; \
+    model_id='mradermacher/Huihui-gemma-3n-E4B-it-abliterated-GGUF'; \
+    tokenizer = AutoTokenizer.from_pretrained(model_id); \
+    model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')"
 
 # Copy the rest of the application
 COPY . .
@@ -19,9 +28,10 @@ COPY . .
 # Set environment variables
 ENV HOST=0.0.0.0
 ENV PORT=7860
+ENV PYTHONUNBUFFERED=1
 
 # Expose the port HF Spaces expects
 EXPOSE 7860
 
 # Start the FastAPI app
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
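Note on the new pre-download step: instantiating the tokenizer and model during docker build bakes the weights into the image so cold starts skip the download, but the build machine then needs enough RAM to hold the full model. A lighter variant (a sketch only, not part of this commit) caches the repo files without loading the model, using huggingface_hub:

    # Sketch only: cache the model files at build time without loading them.
    # snapshot_download fetches the repo into the local HF cache, so this
    # needs disk space but not enough RAM to instantiate the model.
    from huggingface_hub import snapshot_download

    snapshot_download(repo_id="mradermacher/Huihui-gemma-3n-E4B-it-abliterated-GGUF")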
app.py CHANGED
@@ -22,6 +22,11 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy", "version": "1.0.0"}
+
 # Initialize Supabase client
 supabase_url = os.getenv("SUPABASE_URL")
 supabase_key = os.getenv("SUPABASE_SERVICE_KEY")
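The new /health endpoint gives HF Spaces, or any load balancer, something cheap to poll. A quick way to exercise it, assuming app.py exposes the FastAPI instance as app (which the uvicorn command "app:app" in the Dockerfile indicates):

    # Sketch: exercising the new health endpoint with FastAPI's test client.
    from fastapi.testclient import TestClient
    from app import app

    client = TestClient(app)
    resp = client.get("/health")
    assert resp.status_code == 200
    assert resp.json() == {"status": "healthy", "version": "1.0.0"}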
llm.py CHANGED
@@ -8,12 +8,25 @@ load_dotenv()
 class LLMPipeline:
     def __init__(self):
         model_id = os.getenv("HF_MODEL_ID", "mradermacher/Huihui-gemma-3n-E4B-it-abliterated-GGUF")
-        self.pipeline = pipeline(
-            "text-generation",
-            model=model_id,
-            torch_dtype=torch.float16,
-            device_map="auto"
-        )
+        try:
+            # Try to use CUDA if available
+            if torch.cuda.is_available():
+                device = "cuda"
+                dtype = torch.float16
+            else:
+                device = "cpu"
+                dtype = torch.float32
+
+            self.pipeline = pipeline(
+                "text-generation",
+                model=model_id,
+                torch_dtype=dtype,
+                device_map="auto" if device == "cuda" else None,
+                model_kwargs={"low_cpu_mem_usage": True}
+            )
+        except Exception as e:
+            print(f"Error loading model: {e}")
+            raise
 
     async def generate(self, prompt: str, max_length: int = 100) -> str:
         """Generate text using the local Gemma model."""