"""FastAPI service that sends one prompt to several GGUF models and returns
their de-duplicated answers.

All models listed in ``model_configs`` are loaded at import time through
``llama_cpp.Llama.from_pretrained``; ``POST /generate/`` then queries every
loaded model in parallel.
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from pydantic import BaseModel
from spaces import GPU
import uvicorn

load_dotenv()

app = FastAPI()

# Initialize ZeroGPU
# NOTE(review): best-effort only -- any failure here (including `GPU` not
# exposing `initialize`) is reported and startup continues. Confirm against
# the installed `spaces` version whether this call is still needed.
try:
    GPU.initialize()
except Exception as e:
    print(f"ZeroGPU initialization failed: {e}")

# Global data dictionary
global_data = {
    'models': {},
    'tokens': {
        'eos': 'eos_token',
        'pad': 'pad_token',
        'padding': 'padding_token',
        'unk': 'unk_token',
        'bos': 'bos_token',
        'sep': 'sep_token',
        'cls': 'cls_token',
        'mask': 'mask_token'
    }
}

# One entry per model to load: Hugging Face repo id, GGUF filename inside that
# repo, and the display name used as the key in global_data['models'].
model_configs = [
    {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
    {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
    {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
    {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
    {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
    {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
    {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"}
]

# Patterns collapsed by remove_duplicates(); compiled once at import.
_REPEATED_GREETING_RE = re.compile(r'(Hello there, how are you\? \[/INST\]){2,}')
_REPEATED_QUESTION_RE = re.compile(r'(How are you\? \[/INST\]){2,}')


class ModelManager:
    """Loads the models in ``model_configs`` into ``global_data['models']``."""

    def __init__(self):
        # Set once load_all_models() has completed, so later calls are no-ops.
        self.loaded = False

    def load_model(self, model_config):
        """Load one model described by *model_config*.

        Returns a dict ``{"model": <Llama>, "name": <str>}``, or ``None`` when
        loading fails (the error is printed, not raised).
        """
        try:
            return {
                "model": Llama.from_pretrained(
                    repo_id=model_config['repo_id'],
                    filename=model_config['filename'],
                ),
                "name": model_config['name'],
            }
        except Exception as e:
            print(f"Error loading model {model_config['name']}: {e}")
            return None

    def load_all_models(self):
        """Load every configured model in parallel and cache the result.

        Returns the ``name -> Llama`` mapping stored in
        ``global_data['models']``. Models that fail to load are skipped. If
        the loading pass itself raises, the error is printed and ``{}`` is
        returned without marking the manager as loaded.
        """
        if self.loaded:
            return global_data['models']
        try:
            # model_configs repeats at least one name; the result dict is
            # keyed by name, so loading the same entry twice only wastes
            # time and memory. Keep one config per name.
            unique_configs = list(
                {config['name']: config for config in model_configs}.values()
            )
            with ThreadPoolExecutor() as executor:
                futures = [
                    executor.submit(self.load_model, config)
                    for config in unique_configs
                ]
                loaded = [future.result() for future in as_completed(futures)]
            global_data['models'] = {
                entry['name']: entry['model'] for entry in loaded if entry
            }
            self.loaded = True
            return global_data['models']
        except Exception as e:
            print(f"Error loading models: {e}")
            return {}


model_manager = ModelManager()
model_manager.load_all_models()


class ChatRequest(BaseModel):
    """Request body for ``POST /generate/``."""

    message: str
    top_k: int = 50
    top_p: float = 0.95
    temperature: float = 0.7


def normalize_input(input_text: str) -> str:
    """Return *input_text* with leading and trailing whitespace removed."""
    return input_text.strip()


def remove_duplicates(text: str) -> str:
    """Clean repeated fragments out of a model response.

    Collapses consecutive repeats of two known greeting fragments, strips
    every ``[/INST]`` marker, then drops any line that already appeared
    earlier in the text (first occurrence wins, order preserved; this also
    applies to blank lines).
    """
    text = _REPEATED_GREETING_RE.sub('Hello there, how are you? [/INST]', text)
    text = _REPEATED_QUESTION_RE.sub('How are you? [/INST]', text)
    text = text.replace('[/INST]', '')
    # dict preserves insertion order, giving an order-stable de-duplication.
    return '\n'.join(dict.fromkeys(text.split('\n')))


def remove_repetitive_responses(responses):
    """Drop responses whose cleaned text duplicates an earlier one.

    *responses* is an iterable of ``{'model': ..., 'response': str}`` dicts.
    Returns a new list of dicts with the same keys, where ``'response'`` is
    the text after ``remove_duplicates`` and only the first entry for each
    distinct cleaned text is kept.
    """
    seen = set()
    unique_responses = []
    for response in responses:
        normalized_response = remove_duplicates(response['response'])
        if normalized_response not in seen:
            seen.add(normalized_response)
            unique_responses.append(
                {'model': response['model'], 'response': normalized_response}
            )
    return unique_responses


def _run_model(model, prompt, request):
    """Ask one model for a reply to *prompt* and return the reply text.

    Uses the chat-completion API: ``Llama.generate`` works on token ids and
    yields token ids, so it cannot be called with a prompt string.
    """
    completion = model.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        top_k=request.top_k,
        top_p=request.top_p,
        temperature=request.temperature,
    )
    # Guard against a missing/None content so the text helpers always get str.
    return completion['choices'][0]['message']['content'] or ''


# NOTE(review): `duration=0` and GPU-wrapping an async handler are kept from
# the original; confirm both against the `spaces` documentation.
@app.post("/generate/")
@GPU(duration=0)
async def generate(request: ChatRequest):
    """Query every loaded model with the request message.

    Returns a list of ``{'model': name, 'response': text}`` dicts with
    duplicate responses removed, in the order the models finished.

    Raises:
        HTTPException: status 500 if any model call (or the post-processing)
            fails.
    """
    try:
        inputs = normalize_input(request.message)
        responses = []
        # NOTE(review): future.result() blocks the event loop while models
        # run; acceptable for a single-purpose service, revisit if other
        # endpoints are added.
        with ThreadPoolExecutor() as executor:
            # Map each future back to its model name: as_completed() yields in
            # completion order, not submission order.
            future_to_name = {
                executor.submit(_run_model, model, inputs, request): name
                for name, model in global_data['models'].items()
            }
            for future in as_completed(future_to_name):
                responses.append(
                    {'model': future_to_name[future], 'response': future.result()}
                )
        unique_responses = remove_repetitive_responses(responses)
        return unique_responses
    except Exception as e:
        print(f"Error generating responses: {e}")
        raise HTTPException(status_code=500, detail="Error generating responses") from e


@app.middleware("http")
async def process_request(request: Request, call_next):
    """Turn any exception escaping the application into a JSON 500 response."""
    try:
        response = await call_next(request)
        return response
    except Exception as e:
        print(f"Request error: {e}")
        # An HTTPException raised here would bypass FastAPI's exception
        # handlers (they sit inside the middleware stack), so build the
        # response directly.
        return JSONResponse(
            status_code=500,
            content={"detail": "Internal Server Error"},
        )


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)