LamiaYT committed on
Commit e4f0435 · 1 Parent(s): bbe4b6b

Fix Dockerfile & Gradio compatibility

Files changed (1)
  1. agent/local_llm.py +23 -48
agent/local_llm.py CHANGED
@@ -8,19 +8,16 @@ except ImportError as e:
 
 class LocalLLM:
     def __init__(self):
-        # Use a simple, reliable model that works well with LlamaIndex
-        self.model_name = "microsoft/DialoGPT-small"  # Changed to smaller model
+        # Use a chat-compatible model
+        self.model_name = "HuggingFaceH4/zephyr-7b-alpha"
         print(f"Initializing LocalLLM with model: {self.model_name}")
         self.llm = self._create_llama_index_llm()
-
+
     def _create_llama_index_llm(self):
-        """Create LlamaIndex compatible LLM"""
         try:
             print("Loading tokenizer...")
             tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
-
+
             print("Loading model...")
             model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
@@ -28,79 +25,57 @@ class LocalLLM:
                 device_map="auto" if torch.cuda.is_available() else None,
                 low_cpu_mem_usage=True
             )
-
-            print("Creating LlamaIndex LLM...")
-            # Fix the generate_kwargs to avoid conflicts
+
+            print("Creating LlamaIndex-compatible LLM...")
             llm = HuggingFaceLLM(
                 model=model,
                 tokenizer=tokenizer,
-                # Simplified generate_kwargs to avoid conflicts
-                generate_kwargs={
-                    "do_sample": True,
-                    "temperature": 0.7,
-                    "pad_token_id": tokenizer.eos_token_id
-                },
-                # Set these parameters at the LLM level instead
-                max_new_tokens=256,
+                context_window=4096,
+                generate_kwargs={"temperature": 0.7, "max_new_tokens": 256},
+                tokenizer_kwargs={"use_fast": True},
                 device_map="auto" if torch.cuda.is_available() else None
             )
-
-            print("LLM created successfully!")
+            print("✅ LLM created successfully!")
             return llm
-
+
         except Exception as e:
-            print(f"Failed to load model {self.model_name}: {str(e)}")
-            # Fallback to even simpler model
+            print(f"Failed to load {self.model_name}: {e}")
             return self._create_fallback_llm()
-
+
     def _create_fallback_llm(self):
-        """Fallback to a very basic model"""
-        print("Using fallback model: gpt2")
+        print("⚠️ Falling back to GPT2 model")
         model_name = "gpt2"
-
         try:
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             tokenizer.pad_token = tokenizer.eos_token
-
+
             model = AutoModelForCausalLM.from_pretrained(model_name)
-
+
             return HuggingFaceLLM(
                 model=model,
                 tokenizer=tokenizer,
-                generate_kwargs={
-                    "do_sample": True,
-                    "temperature": 0.7,
-                    "pad_token_id": tokenizer.eos_token_id
-                },
-                max_new_tokens=256
+                generate_kwargs={"temperature": 0.7, "max_new_tokens": 256},
             )
         except Exception as e:
-            print(f"Even fallback model failed: {str(e)}")
-            # Return a mock LLM for testing
+            print(f"⚠️ Fallback model also failed: {e}")
             return self._create_mock_llm()
-
+
     def _create_mock_llm(self):
-        """Create a mock LLM for testing when models fail"""
-        print("Creating mock LLM for testing...")
-
+        print("⚠️ Using mock LLM")
         class MockLLM:
             def chat(self, messages, **kwargs):
-                # Simple mock response
                 class MockResponse:
                     def __init__(self, text):
                         self.message = type('obj', (object,), {'content': text})
-
-                return MockResponse("This is a mock response. The actual LLM failed to load.")
+                return MockResponse("Mock chat response.")
 
             def complete(self, prompt, **kwargs):
                 class MockCompletion:
                     def __init__(self, text):
                         self.text = text
-
                 return MockCompletion("Mock completion response.")
 
         return MockLLM()
-
+
     def get_llm(self):
-        """Return the LlamaIndex LLM instance"""
-        return self.llm
+        return self.llm
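
For context, a minimal usage sketch of the class after this change. The import path agent.local_llm and the example prompt are assumptions for illustration only; the snippet relies solely on the LocalLLM/get_llm() interface shown in the diff and on transformers plus llama-index being installed.

# Hypothetical usage sketch; not part of this commit.
from agent.local_llm import LocalLLM

wrapper = LocalLLM()      # tries zephyr-7b-alpha, falls back to gpt2, then to the mock
llm = wrapper.get_llm()   # LlamaIndex HuggingFaceLLM, or MockLLM if both loads failed

# Both HuggingFaceLLM and the mock expose complete(); the result carries a .text field.
response = llm.complete("Summarize what this agent does.")
print(response.text)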