from types import SimpleNamespace

try:
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from llama_index.llms.huggingface import HuggingFaceLLM
    import torch
except ImportError as e:
    print(f"Import error in local_llm.py: {e}")
    raise
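
# Note: this module needs the transformers, torch, and
# llama-index-llms-huggingface packages (assumed here to be pinned in a
# requirements file alongside this script).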


class LocalLLM:
    def __init__(self):
        # A lightweight chat-tuned model that is publicly available on the Hub
        self.model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        print(f"Initializing LocalLLM with model: {self.model_name}")
        self.llm = self._create_llama_index_llm()
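
    # Fallback chain: TinyLlama → GPT-2 → in-process mock; each step runs only
    # when the previous one fails, so callers always receive a usable object.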

    def _create_llama_index_llm(self):
        try:
            print("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)

            print("Loading model...")
            model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None,
                low_cpu_mem_usage=True,
            )

            print("Creating LlamaIndex-compatible LLM...")
            llm = HuggingFaceLLM(
                model=model,
                tokenizer=tokenizer,
                context_window=2048,
                # max_new_tokens has a dedicated parameter; keeping it out of
                # generate_kwargs avoids passing the same keyword twice
                max_new_tokens=256,
                # do_sample=True so the temperature setting actually takes effect
                generate_kwargs={"temperature": 0.7, "do_sample": True},
                tokenizer_kwargs={"use_fast": True},
                device_map="auto" if torch.cuda.is_available() else None,
            )
            print("✅ LLM created successfully!")
            return llm
        except Exception as e:
            print(f"❌ Failed to load {self.model_name}: {e}")
            return self._create_fallback_llm()

    def _create_fallback_llm(self):
        print("⚠️ Falling back to GPT2 model")
        model_name = "gpt2"
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            # GPT-2 ships without a pad token; reuse EOS so generation works
            tokenizer.pad_token = tokenizer.eos_token
            model = AutoModelForCausalLM.from_pretrained(model_name)
            return HuggingFaceLLM(
                model=model,
                tokenizer=tokenizer,
                # GPT-2 only supports 1024 positions; the library default is larger
                context_window=1024,
                max_new_tokens=256,
                generate_kwargs={"temperature": 0.7, "do_sample": True},
            )
        except Exception as e:
            print(f"⚠️ Fallback model also failed: {e}")
            return self._create_mock_llm()

    def _create_mock_llm(self):
        print("⚠️ Using mock LLM")

        class MockLLM:
            """Last-resort stub that mimics the chat/complete response shapes."""

            def chat(self, messages, **kwargs):
                # Callers read response.message.content, as with a real ChatResponse
                return SimpleNamespace(
                    message=SimpleNamespace(
                        content="Mock chat response: I would analyze this question and provide an answer."
                    )
                )

            def complete(self, prompt, **kwargs):
                # Callers read response.text, as with a real CompletionResponse
                return SimpleNamespace(
                    text="Mock completion response: I would analyze this question and provide an answer."
                )

        return MockLLM()

    def get_llm(self):
        return self.llm
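

# --- Usage sketch (illustrative addition, not part of the original class) ---
# A minimal smoke test, assuming this module is saved as local_llm.py with the
# dependencies above installed. Both HuggingFaceLLM and the mock fallback
# expose complete(), whose result carries a .text attribute, so this runs
# regardless of which backend was created. To make the LLM the default for
# LlamaIndex pipelines, uncomment the Settings lines (needs llama-index-core).
if __name__ == "__main__":
    llm = LocalLLM().get_llm()

    # from llama_index.core import Settings
    # Settings.llm = llm

    response = llm.complete("In one sentence, what is this module for?")
    print(response.text)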