Spaces:

sharangrav24
/

SentimentAnalysis

Sleeping

App Files Files Community

KrSharangrav committed on Mar 15

Commit

be89ae1

1 Parent(s): f16063a

Further changes to the model: the chatbot now interacts with the dataset

Browse files

Files changed (2) hide show

app.py +4 -4
chatbot.py +37 -94

app.py CHANGED Viewed

@@ -1,15 +1,15 @@
 import streamlit as st
 import pandas as pd
 from db import insert_data_if_empty, get_mongo_client
-from chatbot import chatbot_response  # Import the updated chatbot functionality
-# Ensure that historical data is in the database
 insert_data_if_empty()
-# Connect to MongoDB
 collection = get_mongo_client()
-st.subheader("💬 Chatbot with Sentiment & Topic Analysis")
 user_prompt = st.text_area("Ask me something:")
 if st.button("Get AI Response"):

 import streamlit as st
 import pandas as pd
 from db import insert_data_if_empty, get_mongo_client
+from chatbot import chatbot_response
+# Insert historical data into MongoDB if not already present
 insert_data_if_empty()
+# Connect to MongoDB (available for further extension or analysis)
 collection = get_mongo_client()
+st.subheader("💬 Chatbot with Sentiment, Topic Analysis, and Dataset Insights")
 user_prompt = st.text_area("Ask me something:")
 if st.button("Get AI Response"):

chatbot.py CHANGED Viewed

@@ -1,103 +1,23 @@
 import os
 import streamlit as st
 import google.generativeai as genai
-from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
-from datasets import load_dataset
-# --- Monkey Patch for Accelerator ---
-try:
-    import accelerate
-    from accelerate import Accelerator
-    import inspect
-    # If the Accelerator.__init__ does not accept "dispatch_batches", remove it from kwargs.
-    if 'dispatch_batches' not in inspect.signature(Accelerator.__init__).parameters:
-        old_init = Accelerator.__init__
-        def new_init(self, *args, **kwargs):
-            if 'dispatch_batches' in kwargs:
-                kwargs.pop('dispatch_batches')
-            old_init(self, *args, **kwargs)
-        Accelerator.__init__ = new_init
-except Exception as e:
-    st.error(f"Error patching Accelerator: {e}")
-# --- Configure Gemini API ---
 GEMINI_API_KEY = os.getenv("gemini_api")
 if GEMINI_API_KEY:
     genai.configure(api_key=GEMINI_API_KEY)
 else:
     st.error("⚠️ Google API key is missing! Set it in Hugging Face Secrets.")
-# Path to save/load the fine-tuned model
-FINE_TUNED_MODEL_DIR = "fine-tuned-sentiment-model"
-# --- Fine-tune the Sentiment Model ---
-def fine_tune_model():
-    st.info("Fine-tuning sentiment model. This may take a while...")
-    # Load the dataset from the local CSV file.
-    try:
-        dataset = load_dataset('csv', data_files={'train': 'sentiment140.csv'}, encoding='ISO-8859-1')
-    except Exception as e:
-        st.error(f"❌ Error loading dataset: {e}")
-        return None, None
-    # Convert sentiment labels: sentiment140 labels are 0 (Negative), 2 (Neutral), 4 (Positive).
-    def convert_labels(example):
-        mapping = {0: 0, 2: 1, 4: 2}
-        example["label"] = mapping[int(example["target"])]
-        return example
-    dataset = dataset.map(convert_labels)
-    base_model_name = "cardiffnlp/twitter-roberta-base-sentiment"
-    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=3)
-    # Tokenize the dataset; assuming the CSV has a column named "text"
-    def tokenize_function(examples):
-        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
-    tokenized_dataset = dataset.map(tokenize_function, batched=True)
-    training_args = TrainingArguments(
-        output_dir="./results",
-        num_train_epochs=1,                  # For demonstration, we train for 1 epoch.
-        per_device_train_batch_size=8,
-        logging_steps=10,
-        save_steps=50,
-        evaluation_strategy="no",
-        learning_rate=2e-5,
-        weight_decay=0.01,
-        logging_dir='./logs',
-        disable_tqdm=False
-    )
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_dataset["train"]
-    )
-    trainer.train()
-    model.save_pretrained(FINE_TUNED_MODEL_DIR)
-    tokenizer.save_pretrained(FINE_TUNED_MODEL_DIR)
-    st.success("✅ Fine-tuning complete and model saved.")
-    return model, tokenizer
-# Load or fine-tune the sentiment model
-if not os.path.exists(FINE_TUNED_MODEL_DIR):
-    model, tokenizer = fine_tune_model()
-    if model is None or tokenizer is None:
-        st.error("❌ Failed to fine-tune the sentiment analysis model.")
-else:
-    tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_MODEL_DIR)
-    model = AutoModelForSequenceClassification.from_pretrained(FINE_TUNED_MODEL_DIR)
-# Create sentiment analysis pipeline from the fine-tuned model
 try:
     sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
 except Exception as e:
-    st.error(f"❌ Error loading sentiment pipeline: {e}")
 # Load Topic Extraction Model
 try:
@@ -111,7 +31,7 @@ TOPIC_LABELS = [
     "Health", "Science", "Education", "Finance", "Travel", "Food"
 ]
-# Function to analyze sentiment
 def analyze_sentiment(text):
     try:
         sentiment_result = sentiment_pipeline(text)[0]
@@ -126,7 +46,7 @@ def analyze_sentiment(text):
     except Exception as e:
         return f"Error analyzing sentiment: {e}", None
-# Function to extract topic
 def extract_topic(text):
     try:
         topic_result = topic_pipeline(text, TOPIC_LABELS)
@@ -136,22 +56,45 @@ def extract_topic(text):
     except Exception as e:
         return f"Error extracting topic: {e}", None
-# Function to generate AI response along with sentiment and topic analysis
 def chatbot_response(user_prompt):
     if not user_prompt:
         return None, None, None, None, None
     try:
-        # Generate AI Response using Gemini
         model_gen = genai.GenerativeModel("gemini-1.5-pro")
         ai_response = model_gen.generate_content(user_prompt)
-        # Sentiment Analysis
         sentiment_label, sentiment_confidence = analyze_sentiment(user_prompt)
-        # Topic Extraction
         topic_label, topic_confidence = extract_topic(user_prompt)
-        return ai_response.text, sentiment_label, sentiment_confidence, topic_label, topic_confidence
     except Exception as e:
         return f"❌ Error: {e}", None, None, None, None

 import os
 import streamlit as st
 import google.generativeai as genai
+from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
+# Configure Gemini API key
 GEMINI_API_KEY = os.getenv("gemini_api")
 if GEMINI_API_KEY:
     genai.configure(api_key=GEMINI_API_KEY)
 else:
     st.error("⚠️ Google API key is missing! Set it in Hugging Face Secrets.")
+# Load pre-trained sentiment analysis model
+MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
 try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
     sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
 except Exception as e:
+    st.error(f"❌ Error loading sentiment model: {e}")
 # Load Topic Extraction Model
 try:
     "Health", "Science", "Education", "Finance", "Travel", "Food"
 ]
+# Function to analyze sentiment using the pre-trained model
 def analyze_sentiment(text):
     try:
         sentiment_result = sentiment_pipeline(text)[0]
     except Exception as e:
         return f"Error analyzing sentiment: {e}", None
+# Function to extract topic using zero-shot classification
 def extract_topic(text):
     try:
         topic_result = topic_pipeline(text, TOPIC_LABELS)
     except Exception as e:
         return f"Error extracting topic: {e}", None
+# Function to generate AI response along with sentiment and topic analysis.
+# Also, if the query relates to the dataset, fetch statistics from MongoDB.
 def chatbot_response(user_prompt):
     if not user_prompt:
         return None, None, None, None, None
     try:
+        # Generate AI response using Gemini
         model_gen = genai.GenerativeModel("gemini-1.5-pro")
         ai_response = model_gen.generate_content(user_prompt)
+        # Perform sentiment analysis on the user prompt
         sentiment_label, sentiment_confidence = analyze_sentiment(user_prompt)
+        # Perform topic extraction on the user prompt
         topic_label, topic_confidence = extract_topic(user_prompt)
+        # If the prompt seems related to the dataset, get MongoDB statistics.
+        if any(keyword in user_prompt.lower() for keyword in ["sentiment140", "dataset", "historical", "mongodb", "stored data"]):
+            from db import get_mongo_client
+            collection = get_mongo_client()
+            # Aggregate counts by the 'target' field (assumed to be in the CSV)
+            pipeline = [
+                {"$group": {"_id": "$target", "count": {"$sum": 1}}}
+            ]
+            results = list(collection.aggregate(pipeline))
+            sentiment_map = {0: "Negative", 2: "Neutral", 4: "Positive"}
+            stats_str = ""
+            total = 0
+            for r in results:
+                key = sentiment_map.get(r["_id"], r["_id"])
+                count = r["count"]
+                total += count
+                stats_str += f"{key}: {count}\n"
+            stats_str += f"Total records: {total}"
+            ai_response_text = ai_response.text + "\n\nDataset Information:\n" + stats_str
+        else:
+            ai_response_text = ai_response.text
+        return ai_response_text, sentiment_label, sentiment_confidence, topic_label, topic_confidence
     except Exception as e:
         return f"❌ Error: {e}", None, None, None, None