Added GPU fallback mechanism with CPU implementation
Reduced GPU duration limits to prevent timeouts
Improved error handling and logging
Split the generate_response function into GPU and CPU versions
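
The shape shared by all of these changes is sketched below: request a short, duration-limited ZeroGPU slot, run the GPU path inside it, and drop to a CPU path if the allocation fails or times out. This is a minimal sketch, not code from app.py; it assumes spaces.GPU can be used as a context manager, as it is in the diff, and run_on_gpu / run_on_cpu are hypothetical stand-ins for the _generate_response_gpu / _generate_response_cpu split introduced below.

import spaces  # Hugging Face ZeroGPU helper, imported by app.py

def answer(question, history, run_on_gpu, run_on_cpu):
    # GPU-first execution with a CPU fallback; run_on_gpu / run_on_cpu are
    # hypothetical callables standing in for the GPU and CPU response paths.
    try:
        # Shorter durations make ZeroGPU queue timeouts less likely.
        with spaces.GPU(duration=60):
            return run_on_gpu(question, history)
    except Exception as e:
        # Any allocation or timeout error drops execution to the CPU path.
        print(f"[WARNING] GPU failed, falling back to CPU: {e}")
        return run_on_cpu(question, history)
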
app.py
CHANGED
@@ -66,21 +66,124 @@ text_embeddings = embeddings.embed_documents(texts)
 nn = NearestNeighbors(n_neighbors=5, metric='cosine')
 nn.fit(np.array(text_embeddings))
 
-@spaces.GPU(duration=30)
 def get_relevant_documents(query, k=5):
     """
     Retrieves the k most relevant documents to the query.
     """
-
-
-
-
+    try:
+        # Try GPU first
+        with spaces.GPU(duration=15):
+            query_embedding = embeddings.embed_query(query)
+            distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
+            relevant_docs = [texts[i] for i in indices[0]]
+            return relevant_docs
+    except Exception as e:
+        print(f"[WARNING] GPU failed, falling back to CPU: {str(e)}")
+        # Fallback to CPU
+        embeddings.model_kwargs["device"] = "cpu"
+        query_embedding = embeddings.embed_query(query)
+        distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
+        relevant_docs = [texts[i] for i in indices[0]]
+        return relevant_docs
 
-@spaces.GPU(duration=120)
 def generate_response(question, history):
     try:
-
-
+        # Try GPU first with reduced duration
+        with spaces.GPU(duration=60):
+            return _generate_response_gpu(question, history)
+    except Exception as e:
+        print(f"[WARNING] GPU failed, falling back to CPU: {str(e)}")
+        return _generate_response_cpu(question, history)
+
+def _generate_response_gpu(question, history):
+    print(f"\n[LOG] Received question: {question}")
+
+    # Get relevant documents based on the query
+    relevant_docs = get_relevant_documents(question, k=3)
+    print(f"[LOG] Retrieved {len(relevant_docs)} relevant documents")
+
+    # Create the prompt for the LLM
+    context = "\n".join(relevant_docs)
+    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
+    print(f"[LOG] Generated prompt: {prompt[:200]}...")  # Log first 200 chars of prompt
+
+    if model_provider.lower() == "huggingface":
+        messages = [
+            {
+                "role": "system",
+                "content": '''### MEMORY ###
+Recall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence. Use the details from the last interaction to guide your response.
+
+### VISIONARY GUIDANCE ###
+This prompt is designed to empower users to seamlessly convert their requests into AutoGen v0.4 agent code. By harnessing the advanced features of AutoGen v0.4, we aim to provide a scalable and flexible solution that is both user-friendly and technically robust. The collaborative effort of the personas ensures a comprehensive, innovative, and user-centric approach to meet the user's objectives.
+
+### CONTEXT ###
+AutoGen v0.4 is a comprehensive rewrite aimed at building robust, scalable, and cross-language AI agents. Key features include asynchronous messaging, scalable distributed agents support, modular extensibility, cross-language capabilities, improved observability, and full typing integration.
+
+### OBJECTIVE ###
+Translate user requests into AutoGen v0.4 agent code that leverages the framework's new features. Ensure the code is syntactically correct, scalable, and aligns with best practices.
+
+### STYLE ###
+Professional, clear, and focused on code quality.
+
+### TONE ###
+Informative, helpful, and user-centric.
+
+### AUDIENCE ###
+Users seeking to implement their requests using AutoGen v0.4 agents.
+
+### RESPONSE FORMAT ###
+Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize features like asynchronous messaging and modular design where appropriate. Include comments to explain key components and enhance understandability.
+
+### TEAM PERSONAS’ CONTRIBUTIONS ###
+- **Analyst:** Ensured the prompt provides clear, structured instructions to accurately convert user requests into code, emphasizing full typing integration for precision.
+- **Creative:** Suggested incorporating comments and explanations within the code to foster innovative usage and enhance user engagement with AutoGen v0.4 features.
+- **Strategist:** Focused on aligning the prompt with long-term scalability by encouraging the use of modular and extensible design principles inherent in AutoGen v0.4.
+- **Empathizer:** Enhanced the prompt to be user-centric, ensuring it addresses user needs effectively and makes the code accessible and easy to understand.
+- **Researcher:** Integrated the latest information about AutoGen v0.4, ensuring the prompt and generated code reflect current capabilities and best practices.
+
+### SYSTEM GUARDRAILS ###
+- If unsure about the user's request, ask clarifying questions rather than making assumptions.
+- Do not fabricate data or features not supported by AutoGen v0.4.
+- Ensure the code is scalable, modular, and adheres to best practices.
+
+### START ###
+'''
+            },
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ]
+
+        completion = hf_client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=500
+        )
+        response = completion.choices[0].message.content
+        print(f"[LOG] Using Hugging Face model (serverless): {MODEL_NAME}")
+        print(f"[LOG] Hugging Face response: {response[:200]}...")
+
+    elif model_provider.lower() == "openai":
+        response = client.chat.completions.create(
+            model=os.environ.get("OPENAI_MODEL"),
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
+                {"role": "user", "content": prompt},
+            ]
+        )
+        response = response.choices[0].message.content
+        print(f"[LOG] Using OpenAI model: {os.environ.get('OPENAI_MODEL')}")
+        print(f"[LOG] OpenAI response: {response[:200]}...")  # Log first 200 chars of response
+
+    # Update chat history with new message pair
+    history.append((question, response))
+    return history
+
+def _generate_response_cpu(question, history):
+    print(f"[LOG] Running on CPU")
+    try:
         # Get relevant documents based on the query
         relevant_docs = get_relevant_documents(question, k=3)
         print(f"[LOG] Retrieved {len(relevant_docs)} relevant documents")
@@ -91,6 +194,7 @@ def generate_response(question, history):
         print(f"[LOG] Generated prompt: {prompt[:200]}...")  # Log first 200 chars of prompt
 
         if model_provider.lower() == "huggingface":
+            # Use CPU version of the model
            messages = [
                {
                    "role": "system",
@@ -145,7 +249,7 @@ Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize fe
                max_tokens=500
            )
            response = completion.choices[0].message.content
-           print(f"[LOG] Using Hugging Face model (serverless): {MODEL_NAME}")
+           print(f"[LOG] Using Hugging Face model (CPU): {MODEL_NAME}")
            print(f"[LOG] Hugging Face response: {response[:200]}...")
 
        elif model_provider.lower() == "openai":