tosin2013 committed
Commit c6155ce · 1 Parent(s): 71875a9

Added GPU fallback mechanism with CPU implementation

Reduced GPU duration limits to prevent timeouts
Improved error handling and logging
Split the generate_response function into GPU and CPU versions
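
In practice, both get_relevant_documents and generate_response now wrap their GPU work in a try/except and redo it on the CPU when the GPU request fails. A minimal sketch of that fallback pattern, mirroring the `with spaces.GPU(duration=...)` usage from the diff below; the `_run_on_gpu` / `_run_on_cpu` helpers are hypothetical stand-ins for the real `_generate_response_gpu` / `_generate_response_cpu` functions:

import spaces  # Hugging Face Spaces helper, already used in app.py

def _run_on_gpu(question, history):
    ...  # hypothetical stand-in for the GPU implementation

def _run_on_cpu(question, history):
    ...  # hypothetical stand-in for the CPU implementation

def generate_response(question, history):
    try:
        # Ask for a short GPU slot; the reduced duration makes timeouts less likely
        with spaces.GPU(duration=60):
            return _run_on_gpu(question, history)
    except Exception as e:
        # Any GPU failure (timeout, quota, unavailable hardware) drops to the CPU path
        print(f"[WARNING] GPU failed, falling back to CPU: {e}")
        return _run_on_cpu(question, history)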

Files changed (1)
  1. app.py +113 -9
app.py CHANGED
@@ -66,21 +66,124 @@ text_embeddings = embeddings.embed_documents(texts)
 nn = NearestNeighbors(n_neighbors=5, metric='cosine')
 nn.fit(np.array(text_embeddings))
 
-@spaces.GPU(duration=30)
 def get_relevant_documents(query, k=5):
     """
     Retrieves the k most relevant documents to the query.
     """
-    query_embedding = embeddings.embed_query(query)
-    distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
-    relevant_docs = [texts[i] for i in indices[0]]
-    return relevant_docs
+    try:
+        # Try GPU first
+        with spaces.GPU(duration=15):
+            query_embedding = embeddings.embed_query(query)
+            distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
+            relevant_docs = [texts[i] for i in indices[0]]
+            return relevant_docs
+    except Exception as e:
+        print(f"[WARNING] GPU failed, falling back to CPU: {str(e)}")
+        # Fallback to CPU
+        embeddings.model_kwargs["device"] = "cpu"
+        query_embedding = embeddings.embed_query(query)
+        distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
+        relevant_docs = [texts[i] for i in indices[0]]
+        return relevant_docs
 
-@spaces.GPU(duration=120)
 def generate_response(question, history):
     try:
-        print(f"\n[LOG] Received question: {question}")
-
+        # Try GPU first with reduced duration
+        with spaces.GPU(duration=60):
+            return _generate_response_gpu(question, history)
+    except Exception as e:
+        print(f"[WARNING] GPU failed, falling back to CPU: {str(e)}")
+        return _generate_response_cpu(question, history)
+
+def _generate_response_gpu(question, history):
+    print(f"\n[LOG] Received question: {question}")
+
+    # Get relevant documents based on the query
+    relevant_docs = get_relevant_documents(question, k=3)
+    print(f"[LOG] Retrieved {len(relevant_docs)} relevant documents")
+
+    # Create the prompt for the LLM
+    context = "\n".join(relevant_docs)
+    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
+    print(f"[LOG] Generated prompt: {prompt[:200]}...")  # Log first 200 chars of prompt
+
+    if model_provider.lower() == "huggingface":
+        messages = [
+            {
+                "role": "system",
+                "content": '''### MEMORY ###
+Recall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence. Use the details from the last interaction to guide your response.
+
+### VISIONARY GUIDANCE ###
+This prompt is designed to empower users to seamlessly convert their requests into AutoGen v0.4 agent code. By harnessing the advanced features of AutoGen v0.4, we aim to provide a scalable and flexible solution that is both user-friendly and technically robust. The collaborative effort of the personas ensures a comprehensive, innovative, and user-centric approach to meet the user's objectives.
+
+### CONTEXT ###
+AutoGen v0.4 is a comprehensive rewrite aimed at building robust, scalable, and cross-language AI agents. Key features include asynchronous messaging, scalable distributed agents support, modular extensibility, cross-language capabilities, improved observability, and full typing integration.
+
+### OBJECTIVE ###
+Translate user requests into AutoGen v0.4 agent code that leverages the framework's new features. Ensure the code is syntactically correct, scalable, and aligns with best practices.
+
+### STYLE ###
+Professional, clear, and focused on code quality.
+
+### TONE ###
+Informative, helpful, and user-centric.
+
+### AUDIENCE ###
+Users seeking to implement their requests using AutoGen v0.4 agents.
+
+### RESPONSE FORMAT ###
+Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize features like asynchronous messaging and modular design where appropriate. Include comments to explain key components and enhance understandability.
+
+### TEAM PERSONAS’ CONTRIBUTIONS ###
+- **Analyst:** Ensured the prompt provides clear, structured instructions to accurately convert user requests into code, emphasizing full typing integration for precision.
+- **Creative:** Suggested incorporating comments and explanations within the code to foster innovative usage and enhance user engagement with AutoGen v0.4 features.
+- **Strategist:** Focused on aligning the prompt with long-term scalability by encouraging the use of modular and extensible design principles inherent in AutoGen v0.4.
+- **Empathizer:** Enhanced the prompt to be user-centric, ensuring it addresses user needs effectively and makes the code accessible and easy to understand.
+- **Researcher:** Integrated the latest information about AutoGen v0.4, ensuring the prompt and generated code reflect current capabilities and best practices.
+
+### SYSTEM GUARDRAILS ###
+- If unsure about the user's request, ask clarifying questions rather than making assumptions.
+- Do not fabricate data or features not supported by AutoGen v0.4.
+- Ensure the code is scalable, modular, and adheres to best practices.
+
+### START ###
+'''
+            },
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ]
+
+        completion = hf_client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=500
+        )
+        response = completion.choices[0].message.content
+        print(f"[LOG] Using Hugging Face model (serverless): {MODEL_NAME}")
+        print(f"[LOG] Hugging Face response: {response[:200]}...")
+
+    elif model_provider.lower() == "openai":
+        response = client.chat.completions.create(
+            model=os.environ.get("OPENAI_MODEL"),
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
+                {"role": "user", "content": prompt},
+            ]
+        )
+        response = response.choices[0].message.content
+        print(f"[LOG] Using OpenAI model: {os.environ.get('OPENAI_MODEL')}")
+        print(f"[LOG] OpenAI response: {response[:200]}...")  # Log first 200 chars of response
+
+    # Update chat history with new message pair
+    history.append((question, response))
+    return history
+
+def _generate_response_cpu(question, history):
+    print(f"[LOG] Running on CPU")
+    try:
         # Get relevant documents based on the query
         relevant_docs = get_relevant_documents(question, k=3)
         print(f"[LOG] Retrieved {len(relevant_docs)} relevant documents")
@@ -91,6 +194,7 @@ def generate_response(question, history):
         print(f"[LOG] Generated prompt: {prompt[:200]}...")  # Log first 200 chars of prompt
 
         if model_provider.lower() == "huggingface":
+            # Use CPU version of the model
             messages = [
                 {
                     "role": "system",
@@ -145,7 +249,7 @@ Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize fe
                 max_tokens=500
             )
             response = completion.choices[0].message.content
-            print(f"[LOG] Using Hugging Face model (serverless): {MODEL_NAME}")
+            print(f"[LOG] Using Hugging Face model (CPU): {MODEL_NAME}")
             print(f"[LOG] Hugging Face response: {response[:200]}...")
 
         elif model_provider.lower() == "openai":