# Hugging Face Space: FastAPI server exposing several quantized GGUF models
# via llama.cpp. (Scrape artifacts — Space status banner, file-size line,
# commit-hash row, and line-number gutter — replaced with this header.)
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import uvicorn
import re
from dotenv import load_dotenv
import spaces
# Pull environment variables (e.g. HF credentials) into the process first.
load_dotenv()

app = FastAPI()

# Shared module-level state: loaded model handles, plus the attribute names
# of the special tokens; each value is simply "<key>_token".
global_data = {
    'models': {},
    'tokens': {
        name: f'{name}_token'
        for name in ('eos', 'pad', 'padding', 'unk', 'bos', 'sep', 'cls', 'mask')
    },
}
# Catalog of GGUF checkpoints to load from the Hugging Face Hub.
# Each entry: repo_id (Hub repo), filename (GGUF file inside it), name (display label).
# Fix: the "Mistral Nemo Instruct 2407" entry was listed twice, which made
# ModelManager download and load the same model two times; the duplicate is removed.
model_configs = [
    {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
    {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
    {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
    {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
    {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
    {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
]
class ModelManager:
    """Downloads every configured GGUF model once and caches the handles in
    ``global_data['models']``."""

    def __init__(self):
        # Set to True after the first successful load_all_models() pass so
        # subsequent calls return the cached handles instead of reloading.
        self.loaded = False

    def load_model(self, model_config):
        """Fetch one model from the Hub; return {'model', 'name'} or None on failure."""
        try:
            llm = Llama.from_pretrained(
                repo_id=model_config['repo_id'],
                filename=model_config['filename'],
            )
        except Exception as e:
            print(f"Error loading model {model_config['name']}: {e}")
            return None
        return {"model": llm, "name": model_config['name']}

    def load_all_models(self):
        """Load every entry of model_configs concurrently and cache the results."""
        if self.loaded:
            return global_data['models']
        try:
            with ThreadPoolExecutor() as pool:
                pending = [pool.submit(self.load_model, cfg) for cfg in model_configs]
                # Collect in completion order, dropping failed loads (None).
                loaded = [f.result() for f in as_completed(pending) if f.result() is not None]
            global_data['models'] = loaded
            self.loaded = True
            return loaded
        except Exception as e:
            print(f"Error loading all models: {e}")
            return []
# Eagerly load every model at import time.
# NOTE(review): this blocks module import until all downloads/loads finish —
# confirm this is intended for the Space's startup budget.
model_manager = ModelManager()
model_manager.load_all_models()
class ChatRequest(BaseModel):
    """Request payload for the /generate endpoint."""
    message: str            # raw user prompt; whitespace is stripped before inference
    top_k: int = 50         # sampling: number of highest-probability tokens kept
    top_p: float = 0.95     # sampling: nucleus probability mass
    temperature: float = 0.7  # sampling temperature; lower = more deterministic
def normalize_input(input_text):
    """Return the user message with leading/trailing whitespace removed."""
    trimmed = input_text.strip()
    return trimmed
def remove_duplicates(text):
    """Clean a model response of known prompt echoes and repeated lines.

    Collapses two-or-more consecutive repetitions of known echoed prompt
    fragments into a single copy, strips every ``[/INST]`` marker, then
    removes duplicate lines while preserving their first-seen order.

    Args:
        text: raw text produced by a model.

    Returns:
        The cleaned text, lines joined with ``\\n``.
    """
    # Collapse runs of the two prompt echoes this Space is known to produce.
    text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
    text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
    text = text.replace('[/INST]', '')
    # dict.fromkeys keeps insertion order (guaranteed since 3.7): an O(n),
    # order-preserving replacement for the manual seen-set/append loop.
    return '\n'.join(dict.fromkeys(text.split('\n')))
def remove_repetitive_responses(responses):
    """Drop responses whose cleaned text duplicates an earlier one.

    Each entry's 'response' field is normalized via remove_duplicates();
    only the first entry per normalized text is kept, in original order.
    """
    seen_texts = set()
    unique_responses = []
    for entry in responses:
        cleaned = remove_duplicates(entry['response'])
        if cleaned in seen_texts:
            continue
        seen_texts.add(cleaned)
        unique_responses.append(entry)
    return unique_responses
def generate_chat_response(request, model_data):
    """Run one model on the normalized user message.

    Returns whatever the llama.cpp callable produces, or None when
    inference (or input normalization) raises.
    """
    llm = model_data['model']
    try:
        prompt = normalize_input(request.message)
        return llm(
            prompt,
            top_k=request.top_k,
            top_p=request.top_p,
            temperature=request.temperature,
        )
    except Exception as e:
        print(f"Error generating response with model {model_data['name']}: {e}")
        return None
@app.post("/generate")
@spaces.GPU(duration=0)
async def generate(request: ChatRequest):
    """Run the chat request through every loaded model.

    Returns:
        dict with "best_response" (first unique response, or {}) and
        "all_responses" (the de-duplicated list of per-model responses).

    Raises:
        HTTPException: 500 when no model produced a response, or when an
            unexpected error occurs during generation.
    """
    try:
        responses = []
        for model_data in global_data['models']:
            response = generate_chat_response(request, model_data)
            if response:
                responses.append({
                    "model": model_data['name'],
                    "response": response
                })
        if not responses:
            raise HTTPException(status_code=500, detail="Error: No responses generated.")
        responses = remove_repetitive_responses(responses)
        return {
            "best_response": responses[0] if responses else {},
            "all_responses": responses
        }
    except HTTPException:
        # Fix: the old `except Exception: pass` swallowed even the deliberate
        # HTTPException above and made the endpoint return HTTP 200 with null.
        raise
    except Exception as exc:
        # Surface unexpected failures as an explicit 500 instead of silence.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
@app.api_route("/{method_name:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"])
async def handle_request(method_name: str, request: Request):
    """Catch-all route: acknowledge any request and echo its JSON body.

    Fix: the old `except Exception: pass` returned None (HTTP 200 null) for
    every request without a valid JSON body — i.e. all GETs. Only the JSON
    parse is guarded now; such requests get body=None in the same envelope.
    """
    try:
        body = await request.json()
    except Exception:
        # No body, or body is not valid JSON.
        body = None
    return {"message": "Request handled successfully", "body": body}
# Run the ASGI app directly; port 7860 is the Hugging Face Spaces convention.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)