LamiaYT committed on
Commit
e0860a0
·
1 Parent(s): 8ac5ef4

Fix quantization dependencies and add fallback

Files changed (4)
  1. README.md +1 -0
  2. agent/local_llm.py +25 -1
  3. app.py +8 -3
  4. requirements.txt +10 -9
README.md CHANGED
@@ -11,3 +11,4 @@ short_description: Test To Pass GAIA
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
agent/local_llm.py CHANGED
@@ -1,5 +1,6 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import torch
+from accelerate import Accelerator
 
 class LocalLLM:
     def __init__(self):
@@ -8,12 +9,35 @@ class LocalLLM:
         self.pipeline = self._load_model()
 
     def _load_model(self):
+        try:
+            # First try with 4-bit quantization
+            return self._load_quantized_model()
+        except Exception as e:
+            print(f"4-bit loading failed: {str(e)}. Trying without quantization...")
+            return self._load_fallback_model()
+
+    def _load_quantized_model(self):
         tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
             torch_dtype=torch.float16,
             device_map="auto",
-            load_in_4bit=True  # Quantization to save memory
+            load_in_4bit=True,
+            low_cpu_mem_usage=True
+        )
+        return pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device=self.device
+        )
+
+    def _load_fallback_model(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
         return pipeline(
             "text-generation",
app.py CHANGED
@@ -5,9 +5,14 @@ from llama_index.core.agent import ReActAgent
 from utils.gaia_api import GaiaAPI
 
 # Initialize components
-llm = LocalLLM()
-agent = ReActAgent.from_tools(gaia_tools, llm=llm.pipeline)
-
+try:
+    from agent.local_llm import LocalLLM
+    llm = LocalLLM()
+    agent = ReActAgent.from_tools(gaia_tools, llm=llm.pipeline)
+except Exception as e:
+    print(f"Failed to initialize LLM: {str(e)}")
+    # Fallback to a simpler agent if needed
+    agent = None
 def process_question(question_text: str) -> str:
     """Process GAIA question through agent"""
     try:
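Because the except branch leaves agent = None, process_question presumably has to guard against a missing agent before dispatching. A minimal sketch of such a guard, assuming the agent is invoked via ReActAgent.chat() (the rest of the function body is not shown in this diff):

# Sketch only: guard the handler against the fallback case where agent is None.
# The .chat() call is an assumption about how the ReActAgent is invoked here.
def process_question(question_text: str) -> str:
    """Process GAIA question through agent"""
    if agent is None:
        return "LLM backend failed to initialize; cannot answer this question."
    try:
        response = agent.chat(question_text)
        return str(response)
    except Exception as e:
        return f"Agent error: {str(e)}"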
requirements.txt CHANGED
@@ -1,9 +1,10 @@
-llama-index==0.10.0
-transformers==4.34.0
-torch==2.0.1
-accelerate==0.23.0
-sentence-transformers==2.2.2
-python-dotenv==1.0.0
-gradio==3.41.0
-requests==2.31.0
-bitsandbytes==0.41.1
+accelerate>=0.23.0
+bitsandbytes>=0.41.1
+torch>=2.0.1
+transformers>=4.34.0
+llama-index>=0.10.0
+gradio>=3.41.0
+sentence-transformers>=2.2.2
+python-dotenv>=1.0.0
+requests>=2.31.0
+nltk>=3.8.1
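Moving from == pins to >= lower bounds lets pip resolve mutually compatible transformers/accelerate/bitsandbytes releases, which is the usual source of 4-bit loading failures. A small startup probe along these lines (illustrative helper name, not part of the commit) can decide up front whether the quantized path is worth attempting:

# Sketch only: check the quantization stack before trying a 4-bit load.
def quantization_available() -> bool:
    try:
        import bitsandbytes  # noqa: F401  (needed for load_in_4bit)
        import accelerate    # noqa: F401  (needed for device_map="auto")
        import torch
    except ImportError:
        return False
    # bitsandbytes 4-bit kernels require a CUDA device
    return torch.cuda.is_available()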