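"""FastAPI service exposing the OmniVLM-968M GGUF model (loaded via llama-cpp-python)
for text generation, intended for deployment on Hugging Face Spaces."""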
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
from typing import Optional
import uvicorn
import huggingface_hub
import os

app = FastAPI(
	title="OmniVLM API",
	description="API for text generation using OmniVLM model",
	version="1.0.0"
)

# Download the model from Hugging Face Hub
model_path = huggingface_hub.hf_hub_download(
	repo_id="NexaAIDev/OmniVLM-968M",
	filename="omnivision-text-optimized-llm-Q8_0.gguf"
)

# Initialize the model with the downloaded file
llm = Llama(
	model_path=model_path,
	n_ctx=2048,        # Context window
	n_threads=4,       # Number of CPU threads to use
	n_batch=512,       # Number of tokens to process in parallel
	verbose=True       # Enable verbose logging for debugging
)

class GenerationRequest(BaseModel):
	prompt: str
	max_tokens: Optional[int] = 100
	temperature: Optional[float] = 0.7
	top_p: Optional[float] = 0.9

class GenerationResponse(BaseModel):
	generated_text: str

@app.post("/generate", response_model=GenerationResponse)
async def generate_text(request: GenerationRequest):
	try:
		output = llm(
			request.prompt,
			max_tokens=request.max_tokens,
			temperature=request.temperature,
			top_p=request.top_p
		)
		
		return GenerationResponse(generated_text=output["choices"][0]["text"])
	except Exception as e:
		raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health_check():
	return {"status": "healthy"}

if __name__ == "__main__":
	port = int(os.environ.get("PORT", 7860))  # Hugging Face Spaces uses port 7860 by default
	uvicorn.run(app, host="0.0.0.0", port=port)
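
# Example requests (a quick sketch, assuming the server is running locally on the
# default port 7860; adjust host/port to match your deployment):
#
#   curl http://localhost:7860/health
#
#   curl -X POST http://localhost:7860/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Describe the OmniVLM model in one sentence.", "max_tokens": 64}'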