"""FastAPI service exposing a text-generation endpoint for a Mongolian Llama model."""

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

app = FastAPI()

# Load the Mongolian Llama model and tokenizer once, at import time, so that
# every request reuses the same in-memory weights.
model_name = "Dorjzodovsuren/Mongolian_Llama3-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Generation settings.
MAX_NEW_TOKENS = 100  # Upper bound on newly generated tokens (prompt excluded)
TEMPERATURE = 0.7  # Adjust for creativity
TOP_P = 0.9  # Adjust for response diversity


class UserInput(BaseModel):
    """Request body for the ``/generate/`` endpoint."""

    # Prompt text that is tokenized and passed to the model.
    text: str


@app.post("/generate/")
def generate_response(user_input: UserInput):
    """Generate a completion for the submitted prompt.

    Args:
        user_input: Request payload; ``user_input.text`` is the prompt.

    Returns:
        A dict ``{"response": <decoded text>}``. The whole output sequence
        is decoded (special tokens skipped), so the text starts with the
        prompt followed by the model's continuation.
    """
    # Tokenize the input text
    inputs = tokenizer(user_input.text, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # ``max_length`` would count the prompt tokens as well, leaving a
            # long prompt with little or no room for an answer; bound only
            # the newly generated tokens instead.
            max_new_tokens=MAX_NEW_TOKENS,
            num_return_sequences=1,
            # ``temperature`` / ``top_p`` only take effect when sampling is
            # enabled; request it explicitly rather than relying on the
            # checkpoint's default generation config.
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
        )

    # Decode the generated text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}