# Requires: fastapi, uvicorn, pydantic, and llama-cpp-python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
import threading
import uvicorn

app = FastAPI(
	title="OmniVLM API",
	description="API for text generation using OmniVLM model",
	version="1.0.0"
)

# Initialize the model once at startup. from_pretrained downloads the GGUF
# weights from the Hugging Face Hub on first run and caches them locally.
llm = Llama.from_pretrained(
	repo_id="NexaAIDev/OmniVLM-968M",
	filename="omnivision-text-optimized-llm-Q8_0.gguf",
)

# A llama.cpp context is not thread-safe; serialize inference across
# FastAPI's worker threads.
llm_lock = threading.Lock()

class GenerationRequest(BaseModel):
	prompt: str
	# Sampling parameters; the defaults apply when a field is omitted.
	max_tokens: int = 100
	temperature: float = 0.7
	top_p: float = 0.9

class GenerationResponse(BaseModel):
	generated_text: str

@app.post("/generate", response_model=GenerationResponse)
def generate_text(request: GenerationRequest):
	# A plain `def` endpoint runs in FastAPI's threadpool, so the blocking
	# llama.cpp call does not stall the event loop (as it would inside an
	# `async def` endpoint).
	try:
		with llm_lock:
			output = llm(
				request.prompt,
				max_tokens=request.max_tokens,
				temperature=request.temperature,
				top_p=request.top_p,
			)
		# llama-cpp-python returns an OpenAI-style completion dict
		return GenerationResponse(generated_text=output["choices"][0]["text"])
	except Exception as e:
		raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health_check():
	return {"status": "healthy"}

if __name__ == "__main__":
	uvicorn.run(app, host="0.0.0.0", port=8000)
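
# Example usage: a minimal client sketch, assuming the server above is
# running locally on port 8000 (the default in the __main__ block); the
# prompt text is illustrative. Run as a separate script while the server
# is up:
#
#     import json
#     import urllib.request
#
#     payload = json.dumps({
#         "prompt": "Describe the OmniVLM model in one sentence.",
#         "max_tokens": 64,
#     }).encode("utf-8")
#
#     req = urllib.request.Request(
#         "http://localhost:8000/generate",
#         data=payload,
#         headers={"Content-Type": "application/json"},
#         method="POST",
#     )
#
#     with urllib.request.urlopen(req) as resp:
#         print(json.load(resp)["generated_text"])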