try:
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from llama_index.llms.huggingface import HuggingFaceLLM
    import torch
except ImportError as e:
    print(f"Import error in local_llm.py: {e}")
    raise

class LocalLLM:
    def __init__(self):
        # Use a simple, reliable model that works well with LlamaIndex
        self.model_name = "microsoft/DialoGPT-small"  # Changed to smaller model
        print(f"Initializing LocalLLM with model: {self.model_name}")
        self.llm = self._create_llama_index_llm()

    def _create_llama_index_llm(self):
        """Create a LlamaIndex-compatible LLM."""
        try:
            print("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            print("Loading model...")
            model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None,
                low_cpu_mem_usage=True
            )

            print("Creating LlamaIndex LLM...")
            # Keep generate_kwargs minimal to avoid conflicts; token limits and
            # device placement are set at the LLM level instead.
            llm = HuggingFaceLLM(
                model=model,
                tokenizer=tokenizer,
                generate_kwargs={
                    "do_sample": True,
                    "temperature": 0.7,
                    "pad_token_id": tokenizer.eos_token_id
                },
                max_new_tokens=256,
                device_map="auto" if torch.cuda.is_available() else None
            )
            print("LLM created successfully!")
            return llm
        except Exception as e:
            print(f"Failed to load model {self.model_name}: {str(e)}")
            # Fall back to an even simpler model
            return self._create_fallback_llm()

    def _create_fallback_llm(self):
        """Fall back to a very basic model."""
        print("Using fallback model: gpt2")
        model_name = "gpt2"
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            tokenizer.pad_token = tokenizer.eos_token
            model = AutoModelForCausalLM.from_pretrained(model_name)
            return HuggingFaceLLM(
                model=model,
                tokenizer=tokenizer,
                generate_kwargs={
                    "do_sample": True,
                    "temperature": 0.7,
                    "pad_token_id": tokenizer.eos_token_id
                },
                max_new_tokens=256
            )
        except Exception as e:
            print(f"Even fallback model failed: {str(e)}")
            # Return a mock LLM for testing
            return self._create_mock_llm()

    def _create_mock_llm(self):
        """Create a mock LLM for testing when real models fail to load."""
        print("Creating mock LLM for testing...")

        class MockLLM:
            def chat(self, messages, **kwargs):
                # Return a canned response shaped like a chat result
                class MockResponse:
                    def __init__(self, text):
                        # Lightweight stand-in for a message object with a .content attribute
                        self.message = type('obj', (object,), {'content': text})

                return MockResponse("This is a mock response. The actual LLM failed to load.")

            def complete(self, prompt, **kwargs):
                class MockCompletion:
                    def __init__(self, text):
                        self.text = text

                return MockCompletion("Mock completion response.")

        return MockLLM()

    def get_llm(self):
        """Return the LlamaIndex LLM instance."""
        return self.llm