# app/core/llm.py
# Loads models locally to avoid 403 errors from the hosted inference API.
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import sys
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Add project root to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from app.config import HF_API_KEY, LLM_MODEL, EMBEDDING_MODEL, DEFAULT_TEMPERATURE, MAX_TOKENS
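
# The app.config constants above are assumed to look roughly like this
# (illustrative values only; the real ones live in app/config.py):
#   LLM_MODEL = "distilgpt2"                                    # small causal LM
#   EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim embeddings
#   DEFAULT_TEMPERATURE = 0.7
#   MAX_TOKENS = 512
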
def get_llm():
    """Initialize and return the language model."""
    # Set up a cache directory with permissive permissions (the default HF
    # cache location may not be writable inside a Spaces container)
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # An API token is not required for local pipelines; read it only so a
    # Spaces secret can still be picked up if one is configured
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")

    logger.info(f"Using model: {LLM_MODEL}")

    # Always try a local pipeline first (most reliable in Spaces)
    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

        logger.info(f"Loading model {LLM_MODEL} as local pipeline")

        # Try loading with explicit model/tokenizer classes for better compatibility
        try:
            tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, cache_dir=cache_dir)
            model = AutoModelForCausalLM.from_pretrained(LLM_MODEL, cache_dir=cache_dir)

            # Create the pipeline from the loaded components
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_length=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
            return HuggingFacePipeline(pipeline=pipe)
        except Exception as e:
            logger.warning(f"Error loading with explicit model/tokenizer: {e}")

            # Fall back to the simpler pipeline instantiation
            pipe = pipeline(
                "text-generation",
                model=LLM_MODEL,
                max_length=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
            return HuggingFacePipeline(pipeline=pipe)
    except Exception as e:
        logger.warning(f"Error creating local pipeline: {e}")

        # Last resort: mock LLM so the app keeps running without a real model
        from langchain_community.llms.fake import FakeListLLM

        logger.warning("Using mock LLM as fallback")
        return FakeListLLM(
            responses=[
                "I'm running in fallback mode due to model loading issues. I have limited capabilities right now.",
                "I can't access the language model currently. Please check the Space logs for more information.",
                "I'm operating with a simplified model. For better performance, try running this app locally with proper models configured.",
            ]
        )

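# Illustrative usage (a sketch, not part of the original module). Both the
# HuggingFacePipeline and the FakeListLLM fallback implement langchain's LLM
# interface, so recent langchain versions can call them via .invoke():
#
#     llm = get_llm()
#     print(llm.invoke("Briefly explain what this app does."))
#
# On older langchain releases the equivalent calls are llm(prompt) or
# llm.predict(prompt).
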
def get_embeddings():
    """Initialize and return the embeddings model."""
    # Set up a cache directory with permissive permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Try to use local embeddings
    try:
        logger.info(f"Loading embeddings model: {EMBEDDING_MODEL}")
        return HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            cache_folder=cache_dir
        )
    except Exception as e:
        logger.warning(f"Error initializing embeddings: {e}")

        # Fall back to mock embeddings that return random vectors
        from langchain_community.embeddings.fake import FakeEmbeddings

        logger.warning("Using mock embeddings as fallback")
        return FakeEmbeddings(size=384)  # standard dimension for small embedding models

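# Illustrative usage (a sketch): HuggingFaceEmbeddings and the FakeEmbeddings
# fallback expose the same langchain Embeddings interface:
#
#     emb = get_embeddings()
#     vector = emb.embed_query("hello world")            # list[float], ~384 dims for MiniLM-class models
#     vectors = emb.embed_documents(["doc one", "doc two"])
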
def get_chat_model():
    """
    Create a chat-like interface using a regular LLM.
    This is necessary because many free HF models don't have chat interfaces.
    """
    llm = get_llm()

    # Create a chat-like prompt template
    chat_template = """
Context: {context}
Chat History:
{chat_history}
User: {question}
AI Assistant:
"""

    prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=chat_template
    )

    # Create a chain
    return LLMChain(llm=llm, prompt=prompt)
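
if __name__ == "__main__":
    # Minimal smoke test (a sketch, assuming app.config is importable from the
    # current working directory). It exercises the chat chain end to end; if
    # the real models cannot be loaded, the mock LLM fallback answers instead.
    chain = get_chat_model()
    result = chain.invoke({
        "context": "The app answers questions about uploaded documents.",
        "chat_history": "",
        "question": "What can you help me with?",
    })
    print(result)  # on recent langchain this is a dict containing a "text" key
    # On older langchain releases, chain({...}) or
    # chain.run(context=..., chat_history=..., question=...) is the equivalent call.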