import os

import streamlit as st
import google.generativeai as genai
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

from db import get_dataset_summary, get_entry_by_index

# Configure Gemini API key
GEMINI_API_KEY = os.getenv("gemini_api")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    st.error("⚠️ Google API key is missing! Set it in Hugging Face Secrets.")

# Load pre-trained sentiment analysis model
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
except Exception as e:
    st.error(f"❌ Error loading sentiment model: {e}")

# Load topic extraction model (zero-shot classification)
try:
    topic_pipeline = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
except Exception as e:
    st.error(f"❌ Error loading topic extraction model: {e}")

# Predefined topic labels for classification
TOPIC_LABELS = [
    "Technology", "Politics", "Business", "Sports", "Entertainment",
    "Health", "Science", "Education", "Finance", "Travel", "Food",
]


def analyze_sentiment(text):
    """Classify the text's sentiment and return (label, confidence)."""
    try:
        sentiment_result = sentiment_pipeline(text)[0]
        label = sentiment_result["label"]
        score = sentiment_result["score"]
        # The cardiffnlp model emits LABEL_0/1/2; map them to readable names.
        sentiment_mapping = {
            "LABEL_0": "Negative",
            "LABEL_1": "Neutral",
            "LABEL_2": "Positive",
        }
        return sentiment_mapping.get(label, "Unknown"), score
    except Exception as e:
        return f"Error analyzing sentiment: {e}", None


def extract_topic(text):
    """Return the highest-scoring topic label and its confidence."""
    try:
        topic_result = topic_pipeline(text, TOPIC_LABELS)
        top_topic = topic_result["labels"][0]
        confidence = topic_result["scores"][0]
        return top_topic, confidence
    except Exception as e:
        return f"Error extracting topic: {e}", None


def is_dataset_query(prompt):
    """Heuristically detect whether the prompt refers to the stored dataset."""
    keywords = ["dataset", "data", "csv", "mongodb", "historical"]
    return any(keyword in prompt.lower() for keyword in keywords)


def extract_entry_index(prompt):
    """Map ordinal words in the prompt to a 0-based dataset index."""
    ordinals = {
        "first": 0, "1st": 0,
        "second": 1, "2nd": 1,
        "third": 2, "3rd": 2,
        "fourth": 3, "4th": 3,
        "fifth": 4, "5th": 4,
    }
    for word, index in ordinals.items():
        if word in prompt.lower():
            return index
    return None


def chatbot_response(user_prompt):
    """Generate a Gemini response plus sentiment and topic analysis for the prompt."""
    if not user_prompt:
        return None, None, None, None, None

    # Check if the query is about a specific dataset entry.
    entry_index = extract_entry_index(user_prompt)
    if entry_index is not None:
        entry_text = get_entry_by_index(entry_index)
        if entry_text:
            # Create a combined prompt for Gemini to generate detailed insights.
            combined_prompt = (
                f"Analyze the following dataset entry from MongoDB:\n\n{entry_text}\n\n"
                "Provide detailed insights, including sentiment analysis and category extraction."
            )
            model_gen = genai.GenerativeModel("gemini-1.5-pro")
            ai_response = model_gen.generate_content(combined_prompt)

            # Analyze the entry text itself.
            sentiment_label, sentiment_confidence = analyze_sentiment(entry_text)
            topic_label, topic_confidence = extract_topic(entry_text)
            return ai_response.text, sentiment_label, sentiment_confidence, topic_label, topic_confidence
        else:
            return f"❌ No entry found for index {entry_index + 1}.", None, None, None, None

    # Otherwise, if the query is about the dataset in general, include its summary.
    if is_dataset_query(user_prompt):
        dataset_insights = get_dataset_summary()
        combined_prompt = (
            f"{user_prompt}\n\nDataset Insights:\n{dataset_insights}\n\n"
            "Provide a detailed answer that incorporates these dataset insights."
        )
    else:
        combined_prompt = user_prompt

    model_gen = genai.GenerativeModel("gemini-1.5-pro")
    ai_response = model_gen.generate_content(combined_prompt)

    # Run sentiment analysis and topic extraction on the original user prompt.
    sentiment_label, sentiment_confidence = analyze_sentiment(user_prompt)
    topic_label, topic_confidence = extract_topic(user_prompt)
    return ai_response.text, sentiment_label, sentiment_confidence, topic_label, topic_confidence
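

# --- Hypothetical usage sketch (not part of the original module) ---
# A minimal illustration of how chatbot_response() might be wired into a
# Streamlit page. The page title, widget labels, and layout below are
# assumptions; the actual app may consume the returned tuple differently.
if __name__ == "__main__":
    st.title("Dataset Chatbot")
    user_prompt = st.text_input("Ask a question about the dataset:")
    if user_prompt:
        reply, sentiment, sentiment_conf, topic, topic_conf = chatbot_response(user_prompt)
        st.markdown(reply)
        # Confidences are None when the corresponding model failed to load or errored.
        if sentiment_conf is not None and topic_conf is not None:
            st.caption(
                f"Sentiment: {sentiment} ({sentiment_conf:.2f}) | "
                f"Topic: {topic} ({topic_conf:.2f})"
            )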