# app/core/llm.py
# Loads models locally to avoid 403 errors from the hosted inference API.
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import sys
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Add project root to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from app.config import HF_API_KEY, LLM_MODEL, EMBEDDING_MODEL, DEFAULT_TEMPERATURE, MAX_TOKENS
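
# The app.config constants above are assumed to look roughly like this
# (illustrative values only; the real ones live in app/config.py):
#   LLM_MODEL = "distilgpt2"                                    # small causal LM
#   EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim embeddings
#   DEFAULT_TEMPERATURE = 0.7
#   MAX_TOKENS = 512
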
def get_llm():
    """Initialize and return the language model."""
    # Set up a cache directory with permissive permissions (the default HF
    # cache location may not be writable inside a Spaces container)
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # An API token is not required for local pipelines; read it only so a
    # Spaces secret can still be picked up if one is configured
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")

    logger.info(f"Using model: {LLM_MODEL}")

    # Always try a local pipeline first (most reliable in Spaces)
    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

        logger.info(f"Loading model {LLM_MODEL} as local pipeline")

        # Try loading with explicit model/tokenizer classes for better compatibility
        try:
            tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, cache_dir=cache_dir)
            model = AutoModelForCausalLM.from_pretrained(LLM_MODEL, cache_dir=cache_dir)

            # Create the pipeline from the loaded components
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_length=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
            return HuggingFacePipeline(pipeline=pipe)
        except Exception as e:
            logger.warning(f"Error loading with explicit model/tokenizer: {e}")

            # Fall back to the simpler pipeline instantiation
            pipe = pipeline(
                "text-generation",
                model=LLM_MODEL,
                max_length=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
            return HuggingFacePipeline(pipeline=pipe)
    except Exception as e:
        logger.warning(f"Error creating local pipeline: {e}")

        # Last resort: mock LLM so the app keeps running without a real model
        from langchain_community.llms.fake import FakeListLLM

        logger.warning("Using mock LLM as fallback")
        return FakeListLLM(
            responses=[
                "I'm running in fallback mode due to model loading issues. I have limited capabilities right now.",
                "I can't access the language model currently. Please check the Space logs for more information.",
                "I'm operating with a simplified model. For better performance, try running this app locally with proper models configured.",
            ]
        )

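# Illustrative usage (a sketch, not part of the original module). Both the
# HuggingFacePipeline and the FakeListLLM fallback implement langchain's LLM
# interface, so recent langchain versions can call them via .invoke():
#
#     llm = get_llm()
#     print(llm.invoke("Briefly explain what this app does."))
#
# On older langchain releases the equivalent calls are llm(prompt) or
# llm.predict(prompt).
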
def get_embeddings():
    """Initialize and return the embeddings model."""
    # Set up a cache directory with permissive permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Try to use local embeddings
    try:
        logger.info(f"Loading embeddings model: {EMBEDDING_MODEL}")
        return HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            cache_folder=cache_dir
        )
    except Exception as e:
        logger.warning(f"Error initializing embeddings: {e}")

        # Fall back to mock embeddings that return random vectors
        from langchain_community.embeddings.fake import FakeEmbeddings

        logger.warning("Using mock embeddings as fallback")
        return FakeEmbeddings(size=384)  # standard dimension for small embedding models

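# Illustrative usage (a sketch): HuggingFaceEmbeddings and the FakeEmbeddings
# fallback expose the same langchain Embeddings interface:
#
#     emb = get_embeddings()
#     vector = emb.embed_query("hello world")            # list[float], ~384 dims for MiniLM-class models
#     vectors = emb.embed_documents(["doc one", "doc two"])
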
def get_chat_model():
    """
    Create a chat-like interface using a regular LLM.
    This is necessary because many free HF models don't have chat interfaces.
    """
    llm = get_llm()

    # Create a chat-like prompt template
    chat_template = """
Context: {context}
Chat History:
{chat_history}
User: {question}
AI Assistant:
"""

    prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=chat_template
    )

    # Create a chain
    return LLMChain(llm=llm, prompt=prompt)
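
if __name__ == "__main__":
    # Minimal smoke test (a sketch, assuming app.config is importable from the
    # current working directory). It exercises the chat chain end to end; if
    # the real models cannot be loaded, the mock LLM fallback answers instead.
    chain = get_chat_model()
    result = chain.invoke({
        "context": "The app answers questions about uploaded documents.",
        "chat_history": "",
        "question": "What can you help me with?",
    })
    print(result)  # on recent langchain this is a dict containing a "text" key
    # On older langchain releases, chain({...}) or
    # chain.run(context=..., chat_history=..., question=...) is the equivalent call.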