File size: 6,890 Bytes
4cf05e2
87928b2
 
 
 
1b83353
 
ebc22be
87928b2
 
 
 
 
ebc22be
 
 
 
 
 
 
87928b2
ec80f26
50f9f62
 
 
 
 
 
 
 
 
 
87928b2
 
 
 
 
b560d3f
 
 
 
 
 
 
 
 
 
 
87928b2
 
b560d3f
 
 
87928b2
 
 
 
 
b560d3f
de36b22
87928b2
a1dedb3
 
c8e35b7
 
ebc22be
4e76cb1
87928b2
b560d3f
ec80f26
b560d3f
7dc650e
 
 
 
 
 
 
 
 
2aca525
7dc650e
2aca525
c8e35b7
 
2aca525
87928b2
 
7dc650e
87928b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16e52aa
 
 
 
 
 
35ae555
87928b2
 
 
 
 
35ae555
87928b2
50f9f62
2aca525
87928b2
 
ebc22be
90624da
ebc22be
1b83353
90624da
 
 
 
 
 
c8e35b7
 
da3119b
90624da
 
 
 
 
 
 
 
 
 
 
67ca568
87928b2
2aca525
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import uvicorn
import re
from dotenv import load_dotenv
from spaces import GPU

# Run python-dotenv's loader before anything else in the module is built.
load_dotenv()

# ASGI application object; the route and middleware further down register
# themselves on it through decorators.
app = FastAPI()

# Initialize ZeroGPU
# Best-effort: any exception raised here is printed and module import
# continues.
# NOTE(review): confirm that `spaces.GPU` really exposes `initialize()`; if it
# does not, the resulting AttributeError is swallowed by this handler.
try:
    GPU.initialize()
except Exception as e:
    print(f"ZeroGPU initialization failed: {e}")

# Shared module-level state:
#   models - filled in by ModelManager.load_all_models(), keyed by display name
#   tokens - short alias -> special-token placeholder name
global_data = dict(
    models={},
    tokens=dict(
        eos='eos_token',
        pad='pad_token',
        padding='padding_token',
        unk='unk_token',
        bos='bos_token',
        sep='sep_token',
        cls='cls_token',
        mask='mask_token',
    ),
)

# Models to fetch at startup. Each entry gives the Hugging Face repo, the GGUF
# file inside it, and the display name used as the key in
# global_data['models'].
#
# Display names must be unique: the loader builds a dict keyed by name, so a
# repeated entry would fetch and load the same weights twice only to keep one.
# (A second, identical "Mistral Nemo Instruct 2407" entry was removed for that
# reason.)
model_configs = [
    {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
    {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
    {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
    {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
    {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
    {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
]

class ModelManager:
    """Load every entry of ``model_configs`` once and publish the result.

    Successfully loaded models are stored in ``global_data['models']`` keyed
    by their display name; ``loaded`` records that this has already happened
    so later calls return the cached mapping.
    """

    def __init__(self):
        # Becomes True after the first bulk load that completes without error.
        self.loaded = False

    def load_model(self, model_config):
        """Load a single model described by *model_config*.

        Returns a ``{"model": ..., "name": ...}`` dict, or ``None`` (after
        printing the error) if anything raises while loading.
        """
        try:
            llm = Llama.from_pretrained(
                repo_id=model_config['repo_id'],
                filename=model_config['filename'],
            )
            return {"model": llm, "name": model_config['name']}
        except Exception as e:
            print(f"Error loading model {model_config['name']}: {e}")
            return None

    def load_all_models(self):
        """Load all configured models in a thread pool and cache them.

        Returns the name -> model mapping, or an empty dict (after printing
        the error) if the bulk load itself fails. Individual models that
        fail to load are skipped.
        """
        if not self.loaded:
            try:
                with ThreadPoolExecutor() as pool:
                    pending = [pool.submit(self.load_model, cfg) for cfg in model_configs]
                    finished = (task.result() for task in as_completed(pending))
                    # load_model() yields None on failure; keep only real entries.
                    usable = [entry for entry in finished if entry]

                global_data['models'] = {entry['name']: entry['model'] for entry in usable}
                self.loaded = True
            except Exception as e:
                print(f"Error loading models: {e}")
                return {}

        return global_data['models']

# Module-level manager instance. load_all_models() is called right here, at
# import time, so importing this module attempts to load every entry in
# model_configs before the app can serve a request.
model_manager = ModelManager()
model_manager.load_all_models()

class ChatRequest(BaseModel):
    # Request body accepted by the POST /generate/ endpoint.

    # Prompt text; generate() strips surrounding whitespace before using it.
    message: str
    # Sampling settings that generate() forwards by keyword to each model.
    top_k: int = 50
    top_p: float = 0.95
    temperature: float = 0.7

def normalize_input(input_text):
    """Return *input_text* with leading and trailing whitespace removed."""
    trimmed = input_text.strip()
    return trimmed

def remove_duplicates(text):
    """Clean repeated prompt echoes and repeated lines out of *text*.

    Steps, in order:
      1. Collapse back-to-back repeats of two known echoed prompts.
      2. Delete every ``[/INST]`` marker.
      3. Drop any line that has already appeared, keeping first occurrences
         in their original order (this applies to blank lines too).
    """
    collapse_rules = (
        (r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]'),
        (r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]'),
    )
    for pattern, single in collapse_rules:
        text = re.sub(pattern, single, text)

    without_markers = text.replace('[/INST]', '')

    # dict keys keep insertion order, so fromkeys() retains the first
    # occurrence of each line and discards later repeats.
    first_occurrences = dict.fromkeys(without_markers.split('\n'))
    return '\n'.join(first_occurrences)

def remove_repetitive_responses(responses):
    """Clean each response and drop those whose cleaned text repeats.

    *responses* is an iterable of dicts with ``'model'`` and ``'response'``
    keys. Each response text is passed through ``remove_duplicates``; only the
    first entry producing a given cleaned text is kept. Returns a new list of
    ``{'model', 'response'}`` dicts carrying the cleaned text.
    """
    emitted_texts = set()
    kept = []
    for entry in responses:
        cleaned = remove_duplicates(entry['response'])
        if cleaned in emitted_texts:
            continue
        emitted_texts.add(cleaned)
        kept.append(dict(model=entry['model'], response=cleaned))
    return kept

def _complete_with_model(model, prompt, request):
    """Run one model on *prompt* and return the generated text.

    Uses the text-completion API (``create_completion``), which accepts a
    string prompt and the ``top_k`` / ``top_p`` / ``temperature`` keywords and
    returns a completion dict; the generated text is ``choices[0]['text']``.
    """
    completion = model.create_completion(
        prompt,
        top_k=request.top_k,
        top_p=request.top_p,
        temperature=request.temperature,
    )
    return completion['choices'][0]['text']


@app.post("/generate/")
@GPU(duration=0)
async def generate(request: ChatRequest):
    """Send the prompt to every loaded model and return de-duplicated replies.

    Args:
        request: Prompt text plus sampling settings.

    Returns:
        A list of ``{'model': name, 'response': text}`` dicts in the order the
        models finished, with repeated texts removed. Empty when no models
        are loaded.

    Raises:
        HTTPException: status 500 if any model call or post-processing fails.
    """
    try:
        inputs = normalize_input(request.message)
        models = global_data['models']
        if not models:
            return []

        # NOTE(review): waiting on the futures below blocks inside an async
        # handler; consider offloading if request concurrency matters.
        with ThreadPoolExecutor() as executor:
            # Key each future by its model name so a reply is attributed to
            # the model that produced it, regardless of completion order.
            name_by_future = {
                executor.submit(_complete_with_model, model, inputs, request): name
                for name, model in models.items()
            }
            responses = [
                {'model': name_by_future[future], 'response': future.result()}
                for future in as_completed(name_by_future)
            ]

        return remove_repetitive_responses(responses)
    except Exception as e:
        print(f"Error generating responses: {e}")
        raise HTTPException(status_code=500, detail="Error generating responses") from e

@app.middleware("http")
async def process_request(request: Request, call_next):
    """Pass the request down the stack, turning stray exceptions into a 500.

    Args:
        request: The incoming HTTP request.
        call_next: Callable that forwards the request to the rest of the app.

    Returns:
        The downstream response, or a JSON 500 response with
        ``{"detail": "Internal Server Error"}`` if the downstream raised.
    """
    # Local import keeps this fix self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.responses import JSONResponse

    try:
        return await call_next(request)
    except Exception as e:
        print(f"Request error: {e}")
        # Return the error response directly: an HTTPException raised from
        # middleware is outside the layer that converts HTTPException into a
        # response, so raising here would surface as an unhandled error.
        return JSONResponse(status_code=500, content={"detail": "Internal Server Error"})

# Script entry point: when run directly, serve `app` with uvicorn bound to
# host 0.0.0.0 on port 8000.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)