from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import uvicorn
import re
from dotenv import load_dotenv
from spaces import GPU
load_dotenv()

app = FastAPI()

# Initialize ZeroGPU
try:
    GPU.initialize()
except Exception as e:
    print(f"ZeroGPU initialization failed: {e}")
# Global data dictionary
global_data = {
    'models': {},
    'tokens': {
        'eos': 'eos_token',
        'pad': 'pad_token',
        'padding': 'padding_token',
        'unk': 'unk_token',
        'bos': 'bos_token',
        'sep': 'sep_token',
        'cls': 'cls_token',
        'mask': 'mask_token'
    }
}
# GGUF checkpoints to download from the Hugging Face Hub
model_configs = [
    {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
    {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
    {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
    {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
    {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
    {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"}
]
class ModelManager:
    def __init__(self):
        self.loaded = False

    def load_model(self, model_config):
        # Download and load a single GGUF model from the Hugging Face Hub.
        try:
            return {
                "model": Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename']),
                "name": model_config['name']
            }
        except Exception as e:
            print(f"Error loading model {model_config['name']}: {e}")
            return None

    def load_all_models(self):
        # Load every configured model in parallel and cache the results.
        if self.loaded:
            return global_data['models']
        try:
            with ThreadPoolExecutor() as executor:
                futures = [executor.submit(self.load_model, config) for config in model_configs]
                models = []
                for future in as_completed(futures):
                    model = future.result()
                    if model:
                        models.append(model)
            global_data['models'] = {model['name']: model['model'] for model in models}
            self.loaded = True
            return global_data['models']
        except Exception as e:
            print(f"Error loading models: {e}")
            return {}

model_manager = ModelManager()
model_manager.load_all_models()
class ChatRequest(BaseModel):
    message: str
    top_k: int = 50
    top_p: float = 0.95
    temperature: float = 0.7

def normalize_input(input_text):
    return input_text.strip()
def remove_duplicates(text):
    # Collapse repeated instruction-template fragments, strip the [/INST]
    # marker, and drop duplicate lines while preserving order.
    text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
    text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
    text = text.replace('[/INST]', '')
    lines = text.split('\n')
    unique_lines = []
    seen_lines = set()
    for line in lines:
        if line not in seen_lines:
            seen_lines.add(line)
            unique_lines.append(line)
    return '\n'.join(unique_lines)

def remove_repetitive_responses(responses):
    # Keep only the first response among models whose outputs normalize
    # to the same text.
    seen = set()
    unique_responses = []
    for response in responses:
        normalized_response = remove_duplicates(response['response'])
        if normalized_response not in seen:
            seen.add(normalized_response)
            unique_responses.append({'model': response['model'], 'response': normalized_response})
    return unique_responses
@app.post("/generate/")
@GPU(duration=0)
async def generate(request: ChatRequest):
try:
inputs = normalize_input(request.message)
futures = [
executor.submit(model.generate, inputs, top_k=request.top_k, top_p=request.top_p, temperature=request.temperature)
for model in global_data['models'].values()
]
responses = [{'model': model, 'response': future.result()} for model, future in zip(global_data['models'].keys(), as_completed(futures))]
unique_responses = remove_repetitive_responses(responses)
return unique_responses
except Exception as e:
print(f"Error generating responses: {e}")
raise HTTPException(status_code=500, detail="Error generating responses")
@app.middleware("http")
async def process_request(request: Request, call_next):
try:
response = await call_next(request)
return response
except Exception as e:
print(f"Request error: {e}")
raise HTTPException(status_code=500, detail="Internal Server Error")
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
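
# Example request (a minimal sketch, assuming the server is running locally on
# port 8000 and at least one model loaded successfully; the payload fields
# mirror the ChatRequest schema above):
#
#   import requests
#   payload = {"message": "Hello there, how are you?", "top_k": 50,
#              "top_p": 0.95, "temperature": 0.7}
#   r = requests.post("http://localhost:8000/generate/", json=payload)
#   print(r.json())  # -> [{"model": "...", "response": "..."}, ...]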