lfoppiano committed on
Commit
f1ac57a
·
1 Parent(s): b7b1a78

add modal inference

Browse files
Files changed (1) hide show
  1. deployment/modal_inference.py +67 -0
deployment/modal_inference.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import modal
4
+
5
# Container image for the vLLM server, built up one layer at a time.
vllm_image = modal.Image.debian_slim(python_version="3.10")

# Python dependencies; flashinfer is resolved from its own wheel index.
vllm_image = vllm_image.pip_install(
    "vllm",
    "huggingface_hub[hf_transfer]==0.26.2",
    "flashinfer-python==0.2.0.post2",  # pinning, very unstable
    extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
)

# Turn on hf_transfer for faster model transfers.
vllm_image = vllm_image.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
15
+
16
# --- Model selection -------------------------------------------------------
# The model is pinned to an exact revision hash.
MODEL_NAME = "microsoft/Phi-4-mini-instruct"
MODEL_REVISION = "c0fb9e74abda11b496b7907a9c6c9009a7a0488f"
MODELS_DIR = "/llamas"

# --- Persistent volumes ----------------------------------------------------
# Named Modal volumes, created on first use if they do not exist yet.
hf_cache_vol = modal.Volume.from_name(
    "huggingface-cache",
    create_if_missing=True,
)
vllm_cache_vol = modal.Volume.from_name(
    "vllm-cache",
    create_if_missing=True,
)

# --- Application -----------------------------------------------------------
app = modal.App("phi-4-mini-instruct-qa-vllm")

# --- Runtime settings ------------------------------------------------------
MINUTES = 60  # number of seconds in one minute; used as a time multiplier
N_GPU = 1
VLLM_PORT = 8000
29
+
30
+
31
@app.function(
    image=vllm_image,
    # Alternative GPU type: gpu=f"L40S:{N_GPU}"
    gpu=f"A10G:{N_GPU}",
    # how long should we stay up with no requests?
    scaledown_window=5 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("document-qa-api-key")],
)
@modal.concurrent(
    max_inputs=5
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    """Start ``vllm serve`` as a background process on ``VLLM_PORT``.

    The server is launched for ``MODEL_NAME`` at ``MODEL_REVISION``, bound to
    all interfaces, and protected by the key read from the ``API_KEY``
    environment variable (presumably injected by the ``document-qa-api-key``
    secret -- confirm the secret defines that variable).

    The function returns right after spawning the process; it does not wait
    for the server to exit.

    Raises:
        KeyError: if ``API_KEY`` is not set in the environment.
    """
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--max-model-len",
        "32768",
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    # Hand the argv list straight to the OS instead of joining it into a
    # shell string: the API key is then passed verbatim as one argument and
    # can never be word-split or interpreted by a shell.
    subprocess.Popen(cmd)