Spaces:
Sleeping
Sleeping
KrSharangrav
committed on
Commit
Β·
6e2dc41
1
Parent(s):
e332fa0
change to integrate mongodb
Browse files- app.py +4 -3
- chatbot.py +51 -21
- db.py +9 -5
app.py
CHANGED
@@ -6,11 +6,12 @@ from chatbot import chatbot_response
|
|
6 |
# Insert historical data into MongoDB if not already present.
|
7 |
insert_data_if_empty()
|
8 |
|
9 |
-
# Connect to MongoDB (
|
10 |
collection = get_mongo_client()
|
11 |
|
12 |
-
st.subheader("π¬ Chatbot with Sentiment, Topic Analysis, and Dataset Insights")
|
13 |
-
user_prompt = st.text_area(
|
|
|
14 |
|
15 |
if st.button("Get AI Response"):
|
16 |
ai_response, sentiment_label, sentiment_confidence, topic_label, topic_confidence = chatbot_response(user_prompt)
|
|
|
6 |
# Insert historical data into MongoDB if not already present.
|
7 |
insert_data_if_empty()
|
8 |
|
9 |
+
# Connect to MongoDB (for additional visualizations if needed).
|
10 |
collection = get_mongo_client()
|
11 |
|
12 |
+
st.subheader("π¬ Chatbot with Sentiment, Topic Analysis, and Dataset Entry Insights")
|
13 |
+
user_prompt = st.text_area(
|
14 |
+
"Ask me something (e.g., 'What is the sentiment and category for the first data entry in the dataset'):")
|
15 |
|
16 |
if st.button("Get AI Response"):
|
17 |
ai_response, sentiment_label, sentiment_confidence, topic_label, topic_confidence = chatbot_response(user_prompt)
|
chatbot.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import streamlit as st
|
3 |
import google.generativeai as genai
|
4 |
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
|
5 |
-
from db import get_dataset_summary
|
6 |
|
7 |
# Configure Gemini API key
|
8 |
GEMINI_API_KEY = os.getenv("gemini_api")
|
@@ -32,7 +32,6 @@ TOPIC_LABELS = [
|
|
32 |
"Health", "Science", "Education", "Finance", "Travel", "Food"
|
33 |
]
|
34 |
|
35 |
-
# Function to analyze sentiment using the pre-trained model
|
36 |
def analyze_sentiment(text):
|
37 |
try:
|
38 |
sentiment_result = sentiment_pipeline(text)[0]
|
@@ -47,7 +46,6 @@ def analyze_sentiment(text):
|
|
47 |
except Exception as e:
|
48 |
return f"Error analyzing sentiment: {e}", None
|
49 |
|
50 |
-
# Function to extract topic using zero-shot classification
|
51 |
def extract_topic(text):
|
52 |
try:
|
53 |
topic_result = topic_pipeline(text, TOPIC_LABELS)
|
@@ -57,35 +55,67 @@ def extract_topic(text):
|
|
57 |
except Exception as e:
|
58 |
return f"Error extracting topic: {e}", None
|
59 |
|
60 |
-
# Helper to check if the user query is about the dataset
|
61 |
def is_dataset_query(prompt):
|
62 |
keywords = ["dataset", "data", "csv", "mongodb", "historical"]
|
63 |
return any(keyword in prompt.lower() for keyword in keywords)
|
64 |
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
def chatbot_response(user_prompt):
|
67 |
if not user_prompt:
|
68 |
return None, None, None, None, None
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
74 |
combined_prompt = (
|
75 |
-
f"
|
76 |
-
"Provide
|
77 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
else:
|
79 |
-
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
-
|
86 |
-
|
87 |
-
topic_label, topic_confidence = extract_topic(user_prompt)
|
88 |
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
2 |
import streamlit as st
|
3 |
import google.generativeai as genai
|
4 |
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
|
5 |
+
from db import get_dataset_summary, get_entry_by_index
|
6 |
|
7 |
# Configure Gemini API key
|
8 |
GEMINI_API_KEY = os.getenv("gemini_api")
|
|
|
32 |
"Health", "Science", "Education", "Finance", "Travel", "Food"
|
33 |
]
|
34 |
|
|
|
35 |
def analyze_sentiment(text):
|
36 |
try:
|
37 |
sentiment_result = sentiment_pipeline(text)[0]
|
|
|
46 |
except Exception as e:
|
47 |
return f"Error analyzing sentiment: {e}", None
|
48 |
|
|
|
49 |
def extract_topic(text):
|
50 |
try:
|
51 |
topic_result = topic_pipeline(text, TOPIC_LABELS)
|
|
|
55 |
except Exception as e:
|
56 |
return f"Error extracting topic: {e}", None
|
57 |
|
|
|
58 |
def is_dataset_query(prompt):
    """Return True when the prompt appears to ask about the stored dataset."""
    lowered = prompt.lower()
    for keyword in ("dataset", "data", "csv", "mongodb", "historical"):
        if keyword in lowered:
            return True
    return False
|
61 |
|
62 |
+
def extract_entry_index(prompt):
    """Map an ordinal mention in *prompt* to a 0-based dataset index.

    Recognizes "first"/"1st" through "fifth"/"5th" (case-insensitive).
    Returns the index for the first ordinal found, or None when the prompt
    contains no recognized ordinal.
    """
    import re  # local import: avoids touching the module's top-level imports

    # Ordinal words mapped to 0-indexed positions.
    ordinals = {
        "first": 0, "1st": 0,
        "second": 1, "2nd": 1,
        "third": 2, "3rd": 2,
        "fourth": 3, "4th": 3,
        "fifth": 4, "5th": 4,
    }
    lowered = prompt.lower()
    for word, index in ordinals.items():
        # \b word boundaries prevent false hits inside longer tokens,
        # e.g. "21st" must not match "1st" and "firstly" must not match "first".
        if re.search(rf"\b{re.escape(word)}\b", lowered):
            return index
    return None
|
80 |
+
|
81 |
def chatbot_response(user_prompt):
    """Answer *user_prompt* with Gemini plus sentiment/topic analysis.

    Returns a 5-tuple:
        (ai_text, sentiment_label, sentiment_confidence,
         topic_label, topic_confidence)
    All five elements are None when the prompt is empty. When the prompt
    names a specific dataset entry ("first", "2nd", ...) the analysis is
    run on that MongoDB document's text instead of the raw prompt.
    """
    if not user_prompt:
        return None, None, None, None, None

    # Entry-specific query: fetch the referenced document and analyze it.
    entry_index = extract_entry_index(user_prompt)
    if entry_index is not None:
        entry_text = get_entry_by_index(entry_index)
        if not entry_text:
            # 1-based index in the message to match how users count entries.
            return f"β No entry found for index {entry_index+1}.", None, None, None, None
        combined_prompt = (
            f"Analyze the following dataset entry from MongoDB:\n\n{entry_text}\n\n"
            "Provide detailed insights, including sentiment analysis and category extraction."
        )
        ai_text = _generate_ai_text(combined_prompt)
        sentiment_label, sentiment_confidence = analyze_sentiment(entry_text)
        topic_label, topic_confidence = extract_topic(entry_text)
        return ai_text, sentiment_label, sentiment_confidence, topic_label, topic_confidence

    # General dataset question: append summary statistics as context.
    if is_dataset_query(user_prompt):
        dataset_insights = get_dataset_summary()
        combined_prompt = (
            f"{user_prompt}\n\nDataset Insights:\n{dataset_insights}\n\n"
            "Provide a detailed answer that incorporates these dataset insights."
        )
    else:
        combined_prompt = user_prompt

    ai_text = _generate_ai_text(combined_prompt)
    # Sentiment/topic are computed on the original user prompt here,
    # mirroring the entry branch which analyzes the entry text.
    sentiment_label, sentiment_confidence = analyze_sentiment(user_prompt)
    topic_label, topic_confidence = extract_topic(user_prompt)
    return ai_text, sentiment_label, sentiment_confidence, topic_label, topic_confidence


def _generate_ai_text(prompt):
    """Send *prompt* to Gemini 1.5 Pro and return the generated text."""
    model_gen = genai.GenerativeModel("gemini-1.5-pro")
    return model_gen.generate_content(prompt).text
|
db.py
CHANGED
@@ -3,13 +3,11 @@ import requests
|
|
3 |
import io
|
4 |
from pymongo import MongoClient
|
5 |
|
6 |
-
# Function to connect to MongoDB.
|
7 |
def get_mongo_client():
|
8 |
client = MongoClient("mongodb+srv://groupA:[email protected]/?retryWrites=true&w=majority&appName=SentimentCluster")
|
9 |
db = client["sentiment_db"]
|
10 |
return db["tweets"]
|
11 |
|
12 |
-
# Function to insert data if the collection is empty.
|
13 |
def insert_data_if_empty():
|
14 |
collection = get_mongo_client()
|
15 |
if collection.count_documents({}) == 0:
|
@@ -24,15 +22,12 @@ def insert_data_if_empty():
|
|
24 |
except Exception as e:
|
25 |
print(f"β Error loading dataset: {e}")
|
26 |
|
27 |
-
# Function to get dataset summary from MongoDB.
|
28 |
def get_dataset_summary():
|
29 |
collection = get_mongo_client()
|
30 |
-
# Aggregate counts for each sentiment target.
|
31 |
pipeline = [
|
32 |
{"$group": {"_id": "$target", "count": {"$sum": 1}}}
|
33 |
]
|
34 |
results = list(collection.aggregate(pipeline))
|
35 |
-
# Map the sentiment target values to labels.
|
36 |
mapping = {"0": "Negative", "2": "Neutral", "4": "Positive"}
|
37 |
summary_parts = []
|
38 |
total = 0
|
@@ -44,3 +39,12 @@ def get_dataset_summary():
|
|
44 |
summary_parts.append(f"{label}: {count}")
|
45 |
summary = f"Total tweets: {total}. " + ", ".join(summary_parts) + "."
|
46 |
return summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import io
|
4 |
from pymongo import MongoClient
|
5 |
|
|
|
6 |
def get_mongo_client():
    # Return a handle to the "tweets" collection of the "sentiment_db" database.
    # Despite the name, this returns a Collection, not a MongoClient.
    # NOTE(review): credentials are hard-coded in the connection URI — move them
    # into an environment variable or a secrets store before deploying.
    client = MongoClient("mongodb+srv://groupA:[email protected]/?retryWrites=true&w=majority&appName=SentimentCluster")
    db = client["sentiment_db"]
    return db["tweets"]
|
10 |
|
|
|
11 |
def insert_data_if_empty():
|
12 |
collection = get_mongo_client()
|
13 |
if collection.count_documents({}) == 0:
|
|
|
22 |
except Exception as e:
|
23 |
print(f"β Error loading dataset: {e}")
|
24 |
|
|
|
25 |
def get_dataset_summary():
|
26 |
collection = get_mongo_client()
|
|
|
27 |
pipeline = [
|
28 |
{"$group": {"_id": "$target", "count": {"$sum": 1}}}
|
29 |
]
|
30 |
results = list(collection.aggregate(pipeline))
|
|
|
31 |
mapping = {"0": "Negative", "2": "Neutral", "4": "Positive"}
|
32 |
summary_parts = []
|
33 |
total = 0
|
|
|
39 |
summary_parts.append(f"{label}: {count}")
|
40 |
summary = f"Total tweets: {total}. " + ", ".join(summary_parts) + "."
|
41 |
return summary
|
42 |
+
|
43 |
+
def get_entry_by_index(index):
    """Return the "text" field of the dataset document at position *index*.

    Documents are ordered by ascending _id (assumed to follow insertion
    order). Returns None when the collection holds fewer than index+1
    documents, or when the matched document has no "text" field.
    """
    collection = get_mongo_client()
    cursor = collection.find({}, {"_id": 0}).sort("_id", 1).skip(index).limit(1)
    for doc in cursor:
        return doc.get("text", None)
    return None
|