File size: 4,600 Bytes
7268351
 
 
be89ae1
6e2dc41
b83a640
be89ae1
7268351
 
 
 
 
 
be89ae1
 
7268351
be89ae1
 
f5b718b
7268351
be89ae1
7268351
5a94c8e
 
 
 
 
 
 
 
 
 
 
f763dd0
7268351
 
 
f16063a
 
7268351
 
 
 
 
 
 
 
 
5a94c8e
f763dd0
5a94c8e
f16063a
f5b718b
5a94c8e
f763dd0
5a94c8e
f763dd0
e332fa0
 
 
867c886
6e2dc41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7268351
 
5a94c8e
7268351
6e2dc41
 
 
 
 
 
e332fa0
6e2dc41
 
e332fa0
6e2dc41
 
 
 
 
 
e332fa0
6e2dc41
e332fa0
6e2dc41
 
 
 
 
 
 
 
 
e332fa0
6e2dc41
 
e332fa0
6e2dc41
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import re

import google.generativeai as genai
import streamlit as st
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

from db import get_dataset_summary, get_entry_by_index

# Read the Gemini key from the "gemini_api" environment variable and hand it
# to the SDK. A missing or empty value is reported in the Streamlit UI; the
# module keeps loading either way.
GEMINI_API_KEY = os.getenv("gemini_api")
if not GEMINI_API_KEY:
    st.error("⚠️ Google API key is missing! Set it in Hugging Face Secrets.")
else:
    genai.configure(api_key=GEMINI_API_KEY)

# Sentiment model: the tokenizer and classifier weights are loaded explicitly
# and wrapped in a "sentiment-analysis" pipeline. Any failure is shown in the
# Streamlit UI instead of aborting the import; in that case
# `sentiment_pipeline` is never bound.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
    )
except Exception as load_error:
    st.error(f"❌ Error loading sentiment model: {load_error}")

# Topic model: a zero-shot classifier that is later asked to rank candidate
# labels. As above, a load failure is only reported and leaves
# `topic_pipeline` unbound.
try:
    topic_pipeline = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
    )
except Exception as load_error:
    st.error(f"❌ Error loading topic extraction model: {load_error}")

# Candidate labels passed to the zero-shot topic classifier.
TOPIC_LABELS = [
    "Technology",
    "Politics",
    "Business",
    "Sports",
    "Entertainment",
    "Health",
    "Science",
    "Education",
    "Finance",
    "Travel",
    "Food",
]

def analyze_sentiment(text):
    """Classify the sentiment of *text* with the module-level pipeline.

    Args:
        text: The string to classify.

    Returns:
        A ``(label, score)`` pair. ``label`` is "Negative", "Neutral" or
        "Positive" for the raw labels LABEL_0/LABEL_1/LABEL_2 and "Unknown"
        for any other raw label; ``score`` is the value the pipeline
        reported. If anything raises, the pair is
        ``("Error analyzing sentiment: <exception>", None)`` instead.
    """
    readable_labels = {
        "LABEL_0": "Negative",
        "LABEL_1": "Neutral",
        "LABEL_2": "Positive",
    }
    try:
        # Only the first prediction returned by the pipeline is used.
        prediction = sentiment_pipeline(text)[0]
        raw_label, confidence = prediction['label'], prediction['score']
        return readable_labels.get(raw_label, "Unknown"), confidence
    except Exception as err:
        # Errors are reported through the return value, never raised.
        return f"Error analyzing sentiment: {err}", None

def extract_topic(text):
    """Pick a topic for *text* from ``TOPIC_LABELS`` via zero-shot classification.

    Args:
        text: The string to classify.

    Returns:
        A ``(topic, confidence)`` pair taken from the first entry of the
        pipeline's "labels" and "scores" lists (presumably the top-ranked
        candidate — confirm against the pipeline documentation). If anything
        raises, the pair is ``("Error extracting topic: <exception>", None)``.
    """
    try:
        ranking = topic_pipeline(text, TOPIC_LABELS)
        best_label, best_score = ranking["labels"][0], ranking["scores"][0]
        return best_label, best_score
    except Exception as err:
        # Errors are reported through the return value, never raised.
        return f"Error extracting topic: {err}", None

def is_dataset_query(prompt):
    """Return True when *prompt* contains a dataset-related keyword.

    The check is a case-insensitive substring search for any of "dataset",
    "data", "csv", "mongodb" or "historical".

    Args:
        prompt: The user's message.

    Returns:
        True if at least one keyword occurs in the prompt, else False.
    """
    lowered = prompt.lower()
    for keyword in ("dataset", "data", "csv", "mongodb", "historical"):
        if keyword in lowered:
            return True
    return False

def extract_entry_index(prompt):
    """Return the 0-based entry index named by an ordinal in *prompt*.

    Recognises "first" through "fifth" and "1st" through "5th",
    case-insensitively. Ordinals are matched as whole words, so text such as
    "seconds", "21st" or "14th" is not mistaken for "second", "1st" or "4th".

    Args:
        prompt: The user's message.

    Returns:
        The index (0-4) for the lowest-numbered supported ordinal present in
        the prompt, or None when no supported ordinal appears.
    """
    # Map ordinal words to indices (0-indexed), in ascending order.
    ordinals = {
        "first": 0,
        "1st": 0,
        "second": 1,
        "2nd": 1,
        "third": 2,
        "3rd": 2,
        "fourth": 3,
        "4th": 3,
        "fifth": 4,
        "5th": 4,
    }
    # Tokenise into alphanumeric words so matching is per word; a plain
    # substring test would find "1st" inside "21st" or "second" in "seconds".
    words = set(re.findall(r"[a-z0-9]+", prompt.lower()))
    # Dict order is ascending, so the first hit is the lowest ordinal named.
    for word, index in ordinals.items():
        if word in words:
            return index
    return None

def chatbot_response(user_prompt):
    """Answer *user_prompt* with Gemini and attach sentiment/topic analysis.

    Three kinds of prompt are distinguished, in this order:

    1. The prompt names an entry by ordinal (``extract_entry_index`` returns
       an index). The entry is fetched with ``get_entry_by_index``; Gemini is
       asked to analyse it, and sentiment/topic are computed on the entry
       text. This check runs before the dataset-keyword check.
    2. The prompt is a general dataset question (``is_dataset_query``). The
       output of ``get_dataset_summary`` is appended to the prompt sent to
       Gemini; sentiment/topic are computed on the user's prompt.
    3. Anything else is sent to Gemini unchanged; sentiment/topic are
       computed on the user's prompt.

    Args:
        user_prompt: The user's message.

    Returns:
        A 5-tuple ``(response_text, sentiment_label, sentiment_confidence,
        topic_label, topic_confidence)``. All five are None for an empty
        prompt. When the requested entry does not exist, the first element is
        a "No entry found" message and the rest are None.
    """
    if not user_prompt:
        return None, None, None, None, None

    requested_index = extract_entry_index(user_prompt)
    if requested_index is not None:
        entry_text = get_entry_by_index(requested_index)
        if not entry_text:
            # The message shows the 1-based position of the missing entry.
            return f"❌ No entry found for index {requested_index+1}.", None, None, None, None
        gemini_prompt = (
            f"Analyze the following dataset entry from MongoDB:\n\n{entry_text}\n\n"
            "Provide detailed insights, including sentiment analysis and category extraction."
        )
        # For a specific entry, the entry itself is what gets analysed.
        analysed_text = entry_text
    elif is_dataset_query(user_prompt):
        dataset_insights = get_dataset_summary()
        gemini_prompt = (
            f"{user_prompt}\n\nDataset Insights:\n{dataset_insights}\n\n"
            "Provide a detailed answer that incorporates these dataset insights."
        )
        analysed_text = user_prompt
    else:
        gemini_prompt = user_prompt
        analysed_text = user_prompt

    # Single generation + analysis path shared by all three prompt kinds.
    reply = genai.GenerativeModel("gemini-1.5-pro").generate_content(gemini_prompt)
    sentiment_label, sentiment_confidence = analyze_sentiment(analysed_text)
    topic_label, topic_confidence = extract_topic(analysed_text)

    return reply.text, sentiment_label, sentiment_confidence, topic_label, topic_confidence