tosin2013 committed
Commit c6155ce · 1 Parent(s): 71875a9

Added GPU fallback mechanism with CPU implementation

Reduced GPU duration limits to prevent timeouts
Improved error handling and logging
Split the generate_response function into GPU and CPU versions
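
In practice, both get_relevant_documents and generate_response now wrap their GPU work in a try/except and redo it on the CPU when the GPU request fails. A minimal sketch of that fallback pattern, mirroring the `with spaces.GPU(duration=...)` usage from the diff below; the `_run_on_gpu` / `_run_on_cpu` helpers are hypothetical stand-ins for the real `_generate_response_gpu` / `_generate_response_cpu` functions:

import spaces  # Hugging Face Spaces helper, already used in app.py

def _run_on_gpu(question, history):
    ...  # hypothetical stand-in for the GPU implementation

def _run_on_cpu(question, history):
    ...  # hypothetical stand-in for the CPU implementation

def generate_response(question, history):
    try:
        # Ask for a short GPU slot; the reduced duration makes timeouts less likely
        with spaces.GPU(duration=60):
            return _run_on_gpu(question, history)
    except Exception as e:
        # Any GPU failure (timeout, quota, unavailable hardware) drops to the CPU path
        print(f"[WARNING] GPU failed, falling back to CPU: {e}")
        return _run_on_cpu(question, history)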

Files changed (1)
  1. app.py +113 -9
app.py CHANGED
@@ -66,21 +66,124 @@ text_embeddings = embeddings.embed_documents(texts)
 nn = NearestNeighbors(n_neighbors=5, metric='cosine')
 nn.fit(np.array(text_embeddings))
 
-@spaces.GPU(duration=30)
 def get_relevant_documents(query, k=5):
     """
     Retrieves the k most relevant documents to the query.
     """
-    query_embedding = embeddings.embed_query(query)
-    distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
-    relevant_docs = [texts[i] for i in indices[0]]
-    return relevant_docs
+    try:
+        # Try GPU first
+        with spaces.GPU(duration=15):
+            query_embedding = embeddings.embed_query(query)
+            distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
+            relevant_docs = [texts[i] for i in indices[0]]
+            return relevant_docs
+    except Exception as e:
+        print(f"[WARNING] GPU failed, falling back to CPU: {str(e)}")
+        # Fallback to CPU
+        embeddings.model_kwargs["device"] = "cpu"
+        query_embedding = embeddings.embed_query(query)
+        distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
+        relevant_docs = [texts[i] for i in indices[0]]
+        return relevant_docs
 
-@spaces.GPU(duration=120)
 def generate_response(question, history):
     try:
-        print(f"\n[LOG] Received question: {question}")
-
+        # Try GPU first with reduced duration
+        with spaces.GPU(duration=60):
+            return _generate_response_gpu(question, history)
+    except Exception as e:
+        print(f"[WARNING] GPU failed, falling back to CPU: {str(e)}")
+        return _generate_response_cpu(question, history)
+
+def _generate_response_gpu(question, history):
+    print(f"\n[LOG] Received question: {question}")
+
+    # Get relevant documents based on the query
+    relevant_docs = get_relevant_documents(question, k=3)
+    print(f"[LOG] Retrieved {len(relevant_docs)} relevant documents")
+
+    # Create the prompt for the LLM
+    context = "\n".join(relevant_docs)
+    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
+    print(f"[LOG] Generated prompt: {prompt[:200]}...")  # Log first 200 chars of prompt
+
+    if model_provider.lower() == "huggingface":
+        messages = [
+            {
+                "role": "system",
+                "content": '''### MEMORY ###
+Recall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence. Use the details from the last interaction to guide your response.
+
+### VISIONARY GUIDANCE ###
+This prompt is designed to empower users to seamlessly convert their requests into AutoGen v0.4 agent code. By harnessing the advanced features of AutoGen v0.4, we aim to provide a scalable and flexible solution that is both user-friendly and technically robust. The collaborative effort of the personas ensures a comprehensive, innovative, and user-centric approach to meet the user's objectives.
+
+### CONTEXT ###
+AutoGen v0.4 is a comprehensive rewrite aimed at building robust, scalable, and cross-language AI agents. Key features include asynchronous messaging, scalable distributed agents support, modular extensibility, cross-language capabilities, improved observability, and full typing integration.
+
+### OBJECTIVE ###
+Translate user requests into AutoGen v0.4 agent code that leverages the framework's new features. Ensure the code is syntactically correct, scalable, and aligns with best practices.
+
+### STYLE ###
+Professional, clear, and focused on code quality.
+
+### TONE ###
+Informative, helpful, and user-centric.
+
+### AUDIENCE ###
+Users seeking to implement their requests using AutoGen v0.4 agents.
+
+### RESPONSE FORMAT ###
+Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize features like asynchronous messaging and modular design where appropriate. Include comments to explain key components and enhance understandability.
+
+### TEAM PERSONAS’ CONTRIBUTIONS ###
+- **Analyst:** Ensured the prompt provides clear, structured instructions to accurately convert user requests into code, emphasizing full typing integration for precision.
+- **Creative:** Suggested incorporating comments and explanations within the code to foster innovative usage and enhance user engagement with AutoGen v0.4 features.
+- **Strategist:** Focused on aligning the prompt with long-term scalability by encouraging the use of modular and extensible design principles inherent in AutoGen v0.4.
+- **Empathizer:** Enhanced the prompt to be user-centric, ensuring it addresses user needs effectively and makes the code accessible and easy to understand.
+- **Researcher:** Integrated the latest information about AutoGen v0.4, ensuring the prompt and generated code reflect current capabilities and best practices.
+
+### SYSTEM GUARDRAILS ###
+- If unsure about the user's request, ask clarifying questions rather than making assumptions.
+- Do not fabricate data or features not supported by AutoGen v0.4.
+- Ensure the code is scalable, modular, and adheres to best practices.
+
+### START ###
+'''
+            },
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ]
+
+        completion = hf_client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=500
+        )
+        response = completion.choices[0].message.content
+        print(f"[LOG] Using Hugging Face model (serverless): {MODEL_NAME}")
+        print(f"[LOG] Hugging Face response: {response[:200]}...")
+
+    elif model_provider.lower() == "openai":
+        response = client.chat.completions.create(
+            model=os.environ.get("OPENAI_MODEL"),
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
+                {"role": "user", "content": prompt},
+            ]
+        )
+        response = response.choices[0].message.content
+        print(f"[LOG] Using OpenAI model: {os.environ.get('OPENAI_MODEL')}")
+        print(f"[LOG] OpenAI response: {response[:200]}...")  # Log first 200 chars of response
+
+    # Update chat history with new message pair
+    history.append((question, response))
+    return history
+
+def _generate_response_cpu(question, history):
+    print(f"[LOG] Running on CPU")
+    try:
         # Get relevant documents based on the query
         relevant_docs = get_relevant_documents(question, k=3)
         print(f"[LOG] Retrieved {len(relevant_docs)} relevant documents")
@@ -91,6 +194,7 @@ def generate_response(question, history):
         print(f"[LOG] Generated prompt: {prompt[:200]}...")  # Log first 200 chars of prompt
 
         if model_provider.lower() == "huggingface":
+            # Use CPU version of the model
             messages = [
                 {
                     "role": "system",
@@ -145,7 +249,7 @@ Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize fe
                 max_tokens=500
             )
             response = completion.choices[0].message.content
-            print(f"[LOG] Using Hugging Face model (serverless): {MODEL_NAME}")
+            print(f"[LOG] Using Hugging Face model (CPU): {MODEL_NAME}")
             print(f"[LOG] Hugging Face response: {response[:200]}...")
 
         elif model_provider.lower() == "openai":