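"""local_llm.py

Wraps a local Hugging Face causal LM behind the LlamaIndex ``HuggingFaceLLM`` interface.
Load order: TinyLlama-1.1B-Chat first, then a GPT-2 fallback, then an in-memory mock
so callers still receive chat/complete responses if no model can be loaded.
"""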
try:
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from llama_index.llms.huggingface import HuggingFaceLLM
    import torch
except ImportError as e:
    print(f"Import error in local_llm.py: {e}")
    raise

class LocalLLM:
    def __init__(self):
        # Use a lightweight chat-tuned model that is actually published on the Hugging Face Hub
        self.model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        print(f"Initializing LocalLLM with model: {self.model_name}")
        self.llm = self._create_llama_index_llm()

    def _create_llama_index_llm(self):
        try:
            print("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)

            print("Loading model...")
            model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None,
                low_cpu_mem_usage=True
            )

            print("Creating LlamaIndex-compatible LLM...")
            llm = HuggingFaceLLM(
                model=model,
                tokenizer=tokenizer,
                context_window=2048,
                generate_kwargs={"temperature": 0.7, "max_new_tokens": 256},
                tokenizer_kwargs={"use_fast": True},
                device_map="auto" if torch.cuda.is_available() else None
            )
            print("✅ LLM created successfully!")
            return llm

        except Exception as e:
            print(f"❌ Failed to load {self.model_name}: {e}")
            return self._create_fallback_llm()

    def _create_fallback_llm(self):
        print("⚠️ Falling back to GPT2 model")
        model_name = "gpt2"
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS

            model = AutoModelForCausalLM.from_pretrained(model_name)

            return HuggingFaceLLM(
                model=model,
                tokenizer=tokenizer,
                generate_kwargs={"temperature": 0.7, "max_new_tokens": 256},
            )
        except Exception as e:
            print(f"⚠️ Fallback model also failed: {e}")
            return self._create_mock_llm()

    def _create_mock_llm(self):
        print("⚠️ Using mock LLM")
        from types import SimpleNamespace  # stdlib; keeps the mock response objects lightweight

        class MockLLM:
            """Minimal stand-in that mimics the llama_index chat/complete response shapes."""

            def chat(self, messages, **kwargs):
                text = "Mock chat response: I would analyze this question and provide an answer."
                # Mirrors ChatResponse: callers read response.message.content
                return SimpleNamespace(message=SimpleNamespace(content=text))

            def complete(self, prompt, **kwargs):
                text = "Mock completion response: I would analyze this question and provide an answer."
                # Mirrors CompletionResponse: callers read response.text
                return SimpleNamespace(text=text)

        return MockLLM()

    def get_llm(self):
        return self.llm
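

# Usage sketch, assuming this file is run directly as a quick smoke test;
# the first run downloads TinyLlama weights from the Hugging Face Hub.
if __name__ == "__main__":
    llm = LocalLLM().get_llm()
    # `complete` is provided by HuggingFaceLLM and mirrored by the mock fallback above.
    response = llm.complete("Hello! What can you do?")
    print(response.text)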