Spaces:

LamiaYT
/

gaia-llamaindex-agent

Sleeping

App Files Files Community

LamiaYT committed on Jun 25

Commit

e4f0435

1 Parent(s): bbe4b6b

Fix Dockerfile & Gradio compatibility

Browse files

Files changed (1) hide show

agent/local_llm.py +23 -48

agent/local_llm.py CHANGED Viewed

@@ -8,19 +8,16 @@ except ImportError as e:
 class LocalLLM:
     def __init__(self):
-        # Use a simple, reliable model that works well with LlamaIndex
-        self.model_name = "microsoft/DialoGPT-small"  # Changed to smaller model
         print(f"Initializing LocalLLM with model: {self.model_name}")
         self.llm = self._create_llama_index_llm()
     def _create_llama_index_llm(self):
-        """Create LlamaIndex compatible LLM"""
         try:
             print("Loading tokenizer...")
             tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
             print("Loading model...")
             model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
@@ -28,79 +25,57 @@ class LocalLLM:
                 device_map="auto" if torch.cuda.is_available() else None,
                 low_cpu_mem_usage=True
             )
-            print("Creating LlamaIndex LLM...")
-            # Fix the generate_kwargs to avoid conflicts
             llm = HuggingFaceLLM(
                 model=model,
                 tokenizer=tokenizer,
-                # Simplified generate_kwargs to avoid conflicts
-                generate_kwargs={
-                    "do_sample": True,
-                    "temperature": 0.7,
-                    "pad_token_id": tokenizer.eos_token_id
-                },
-                # Set these parameters at the LLM level instead
-                max_new_tokens=256,
                 device_map="auto" if torch.cuda.is_available() else None
             )
-            print("LLM created successfully!")
             return llm
         except Exception as e:
-            print(f"Failed to load model {self.model_name}: {str(e)}")
-            # Fallback to even simpler model
             return self._create_fallback_llm()
     def _create_fallback_llm(self):
-        """Fallback to a very basic model"""
-        print("Using fallback model: gpt2")
         model_name = "gpt2"
         try:
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             tokenizer.pad_token = tokenizer.eos_token
             model = AutoModelForCausalLM.from_pretrained(model_name)
             return HuggingFaceLLM(
                 model=model,
                 tokenizer=tokenizer,
-                generate_kwargs={
-                    "do_sample": True,
-                    "temperature": 0.7,
-                    "pad_token_id": tokenizer.eos_token_id
-                },
-                max_new_tokens=256
             )
         except Exception as e:
-            print(f"Even fallback model failed: {str(e)}")
-            # Return a mock LLM for testing
             return self._create_mock_llm()
     def _create_mock_llm(self):
-        """Create a mock LLM for testing when models fail"""
-        print("Creating mock LLM for testing...")
         class MockLLM:
             def chat(self, messages, **kwargs):
-                # Simple mock response
                 class MockResponse:
                     def __init__(self, text):
                         self.message = type('obj', (object,), {'content': text})
-                return MockResponse("This is a mock response. The actual LLM failed to load.")
             def complete(self, prompt, **kwargs):
                 class MockCompletion:
                     def __init__(self, text):
                         self.text = text
                 return MockCompletion("Mock completion response.")
         return MockLLM()
     def get_llm(self):
-        """Return the LlamaIndex LLM instance"""
-        return self.llm

 class LocalLLM:
     """Load a local Hugging Face model and expose it as a LlamaIndex LLM.

     Construction tries ``self.model_name`` first, then GPT-2, and finally a
     small stub object exposing ``chat``/``complete``, so ``get_llm`` always
     has something to return even when no model could be loaded.
     """

     # Generation settings shared by the primary and the fallback model.
     _CONTEXT_WINDOW = 4096
     _MAX_NEW_TOKENS = 256
     _TEMPERATURE = 0.7

     def __init__(self):
         """Pick the model name and eagerly build the LLM (with fallbacks)."""
         # Use a chat-compatible model
         self.model_name = "HuggingFaceH4/zephyr-7b-alpha"
         print(f"Initializing LocalLLM with model: {self.model_name}")
         self.llm = self._create_llama_index_llm()

     @classmethod
     def _generation_kwargs(cls, tokenizer):
         """Return the extra keyword arguments forwarded to ``generate``.

         ``max_new_tokens`` is deliberately absent: it is handed to
         ``HuggingFaceLLM`` as its own argument, and repeating it here would
         supply the same keyword twice when text is generated.
         ``do_sample`` is switched on because ``temperature`` only influences
         sampling-based decoding.

         Args:
             tokenizer: Object exposing ``eos_token_id``; that id is reused as
                 the padding id.

         Returns:
             A new ``dict`` of generation keyword arguments.
         """
         return {
             "do_sample": True,
             "temperature": cls._TEMPERATURE,
             "pad_token_id": tokenizer.eos_token_id,
         }

     def _create_llama_index_llm(self):
         """Build the primary LLM, falling back to GPT-2 on any failure.

         Returns:
             The ``HuggingFaceLLM`` for ``self.model_name``, or whatever
             ``_create_fallback_llm`` returns if loading raised.
         """
         try:
             print("Loading tokenizer...")
             tokenizer = AutoTokenizer.from_pretrained(self.model_name)
             # Make sure a padding token exists before it is used for generation.
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token

             device_map = "auto" if torch.cuda.is_available() else None

             print("Loading model...")
             model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
                 device_map=device_map,
                 low_cpu_mem_usage=True,
             )

             print("Creating LlamaIndex-compatible LLM...")
             llm = HuggingFaceLLM(
                 model=model,
                 tokenizer=tokenizer,
                 context_window=self._CONTEXT_WINDOW,
                 max_new_tokens=self._MAX_NEW_TOKENS,
                 generate_kwargs=self._generation_kwargs(tokenizer),
                 tokenizer_kwargs={"use_fast": True},
                 device_map=device_map,
             )
             print("✅ LLM created successfully!")
             return llm
         except Exception as e:
             # Deliberately broad: any load problem should degrade to the
             # fallback model instead of aborting start-up.
             print(f"❌ Failed to load {self.model_name}: {e}")
             return self._create_fallback_llm()

     def _create_fallback_llm(self):
         """Build a GPT-2 backed LLM, falling back to the mock on any failure.

         Returns:
             A ``HuggingFaceLLM`` wrapping GPT-2, or the object returned by
             ``_create_mock_llm`` if loading raised.
         """
         print("⚠️ Falling back to GPT2 model")
         model_name = "gpt2"
         try:
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             tokenizer.pad_token = tokenizer.eos_token
             model = AutoModelForCausalLM.from_pretrained(model_name)
             return HuggingFaceLLM(
                 model=model,
                 tokenizer=tokenizer,
                 max_new_tokens=self._MAX_NEW_TOKENS,
                 generate_kwargs=self._generation_kwargs(tokenizer),
             )
         except Exception as e:
             # Deliberately broad: the mock below is the last resort.
             print(f"⚠️ Fallback model also failed: {e}")
             return self._create_mock_llm()

     def _create_mock_llm(self):
         """Return a stub with ``chat``/``complete`` that yields canned text."""
         print("⚠️ Using mock LLM")

         class MockLLM:
             def chat(self, messages, **kwargs):
                 class MockResponse:
                     def __init__(self, text):
                         # Lightweight holder so ``response.message.content`` works.
                         self.message = type('obj', (object,), {'content': text})
                 return MockResponse("Mock chat response.")

             def complete(self, prompt, **kwargs):
                 class MockCompletion:
                     def __init__(self, text):
                         self.text = text
                 return MockCompletion("Mock completion response.")

         return MockLLM()

     def get_llm(self):
         """Return the LLM object built during construction."""
         return self.llm