Spaces:

lfoppiano
/

document-qa

Running

App Files Files Community

lfoppiano committed on May 23

Commit

f1ac57a

1 Parent(s): b7b1a78

add modal inference

Browse files

Files changed (1) hide show

deployment/modal_inference.py +67 -0

deployment/modal_inference.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+import modal
+# Container image for the inference server: Debian slim with Python 3.10,
+# plus vLLM, huggingface_hub (with the hf_transfer extra) and FlashInfer.
+# NOTE(review): "vllm" is left unpinned while the other packages are pinned,
+# so an image rebuild may resolve a different vLLM version -- consider pinning.
+vllm_image = (
+    modal.Image.debian_slim(python_version="3.10")
+    .pip_install(
+        "vllm",
+        "huggingface_hub[hf_transfer]==0.26.2",
+        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
+        # extra wheel index used to resolve the flashinfer-python package
+        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
+    )
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
+)
+# NOTE(review): MODELS_DIR is not referenced anywhere in the visible code --
+# confirm whether it is still needed.
+MODELS_DIR = "/llamas"
+# Model identifier and exact revision passed to `vllm serve` by serve().
+MODEL_NAME = "microsoft/Phi-4-mini-instruct"
+MODEL_REVISION = "c0fb9e74abda11b496b7907a9c6c9009a7a0488f"
+# Named Modal volumes (created on first use); serve() mounts them over the
+# Hugging Face and vLLM cache directories.
+hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
+vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
+app = modal.App("phi-4-mini-instruct-qa-vllm")
+# Number of GPUs requested per container (interpolated into the gpu spec).
+N_GPU = 1
+MINUTES = 60  # seconds
+# Port that vLLM listens on and that the web_server decorator exposes.
+VLLM_PORT = 8000
+@app.function(
+    image=vllm_image,
+    # gpu=f"L40S:{N_GPU}",
+    gpu=f"A10G:{N_GPU}",
+    # how long should we stay up with no requests?
+    scaledown_window=5 * MINUTES,
+    volumes={
+        "/root/.cache/huggingface": hf_cache_vol,
+        "/root/.cache/vllm": vllm_cache_vol,
+    },
+    secrets=[modal.Secret.from_name("document-qa-api-key")]
+)
+@modal.concurrent(
+    max_inputs=5
+)  # how many requests can one replica handle? tune carefully!
+@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
+def serve():
+    import subprocess
+    cmd = [
+        "vllm",
+        "serve",
+        "--uvicorn-log-level=info",
+        MODEL_NAME,
+        "--revision",
+        MODEL_REVISION,
+        "--max-model-len",
+        "32768",
+        "--host",
+        "0.0.0.0",
+        "--port",
+        str(VLLM_PORT),
+        "--api-key",
+        os.environ["API_KEY"],
+    ]
+    subprocess.Popen(" ".join(cmd), shell=True)