from transformers import pipeline
import torch
import os
from dotenv import load_dotenv

load_dotenv()


class LLMPipeline:
    def __init__(self):
        model_id = os.getenv("HF_MODEL_ID", "mradermacher/Huihui-gemma-3n-E4B-it-abliterated-GGUF")
        try:
            # Use CUDA with half precision if available, otherwise fall back to CPU
            if torch.cuda.is_available():
                device = "cuda"
                dtype = torch.float16
            else:
                device = "cpu"
                dtype = torch.float32

            self.pipeline = pipeline(
                "text-generation",
                model=model_id,
                torch_dtype=dtype,
                device_map="auto" if device == "cuda" else None,
                model_kwargs={"low_cpu_mem_usage": True},
            )
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    async def generate(self, prompt: str, max_length: int = 100) -> str:
        """Generate text using the local Gemma model.

        Note: the underlying pipeline call is synchronous and will block the
        event loop for the duration of generation.
        """
        try:
            result = self.pipeline(
                prompt,
                max_length=max_length,
                num_return_sequences=1,
                do_sample=True,  # required for temperature/top_p to take effect
                temperature=0.7,
                top_p=0.9,
            )
            return result[0]["generated_text"]
        except Exception as e:
            print(f"Error in LLM generation: {e}")
            return ""
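

# Minimal usage sketch (not part of the original module): assumes this class is
# used directly from a script and that the model referenced by HF_MODEL_ID can
# be loaded locally. The prompt and max_length below are illustrative only.
import asyncio


async def _demo():
    llm = LLMPipeline()  # loads the model once up front
    text = await llm.generate("Write a one-sentence greeting.", max_length=50)
    # With default pipeline settings, generated_text includes the prompt itself.
    print(text)


if __name__ == "__main__":
    asyncio.run(_demo())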