from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import snapshot_download
from safetensors.torch import load_file

class ModelInput(BaseModel):
    prompt: str
    max_new_tokens: int = 50

app = FastAPI()
# Define model paths
base_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
adapter_path = "khurrameycon/SmolLM-135M-Instruct-qa_pairs_converted.json-25epochs"

try:
    # First load the base model
    print("Loading base model...")
    model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        device_map="auto"
    )

    # Load tokenizer from base model
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    # Download the adapter repository
    print("Loading adapter weights...")
    adapter_path_local = snapshot_download(adapter_path)

    # Load the adapter weights (.safetensors files must be read with
    # safetensors' load_file, not torch.load)
    state_dict = load_file(f"{adapter_path_local}/adapter_model.safetensors")
    # strict=False only copies keys that match the base model; if this is a
    # PEFT/LoRA adapter, peft.PeftModel.from_pretrained would be the usual way to apply it
    model.load_state_dict(state_dict, strict=False)
print("Model and adapter loaded successfully!")
except Exception as e:
print(f"Error during model loading: {e}")
raise
def generate_response(model, tokenizer, instruction, max_new_tokens=128):
"""Generate a response from the model based on an instruction."""
try:
messages = [{"role": "user", "content": instruction}]
input_text = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(
inputs,
max_new_tokens=max_new_tokens,
temperature=0.2,
top_p=0.9,
do_sample=True,
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
return response
except Exception as e:
raise ValueError(f"Error generating response: {e}")
@app.post("/generate")
async def generate_text(input: ModelInput):
    try:
        response = generate_response(
            model=model,
            tokenizer=tokenizer,
            instruction=input.prompt,
            max_new_tokens=input.max_new_tokens
        )
        return {"generated_text": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/")
async def root():
return {"message": "Welcome to the Model API!"} |