# eda_functions.py — exploratory data analysis helpers:
# text preprocessing, per-category word clouds, and 2D embedding visualization.
import pandas as pd
import os
import re
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
import spacy
import string
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Load the small English spaCy pipeline (used by process_text for tokenization
# and lemmatization; requires `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")
# English stop words from NLTK as a set for O(1) membership tests
# (requires nltk.download("stopwords") to have been run once).
stop_words = set(stopwords.words("english"))
def process_text(text):
    """
    Tokenize and normalize raw text for downstream EDA.

    Pipeline:
    1. Lowercase the input.
    2. Tokenize/lemmatize with the module-level spaCy pipeline.
    3. Drop stop words and single-character punctuation tokens.
    4. Strip non-alphanumeric characters from each lemma.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: Cleaned, lemmatized tokens with empty strings removed.
    """
    cleaned = []
    for token in nlp(text.lower()):
        # Guard clauses: skip stop words and plain punctuation tokens.
        if token.text in stop_words:
            continue
        if token.text in string.punctuation:
            continue
        # Keep only the alphanumeric part of the lemma.
        lemma = re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        if lemma:
            cleaned.append(lemma)
    return cleaned
def generate_word_clouds_by_category(df, output_dir="wordclouds"):
    """
    Generate and save one word-cloud image per category in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with a 'processed_quote' column (a list
            of tokens per row) and a 'label' column (the category name).
        output_dir (str): Directory to save the word cloud images into
            (created if it does not exist).
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    # Pool every row's tokens under its category label
    category_word_map = defaultdict(list)
    for _, row in df.iterrows():
        category_word_map[row["label"]].extend(row["processed_quote"])
    # Generate and save a word cloud for each category
    for category, words in category_word_map.items():
        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))
        # Plot and save the word cloud
        plt.figure(figsize=(10, 5))
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(category)
        # Build a filesystem-safe filename from the category name
        filename = os.path.join(output_dir, f"{category.replace(' ', '_').lower()}_wordcloud.png")
        plt.savefig(filename, bbox_inches='tight')
        # Fixed: original printed the literal "(unknown)" instead of the saved path
        print(f"Word cloud saved for category '{category}' at {filename}")
        plt.close()  # Close the figure to avoid memory issues
def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
    """
    Reduce high-dimensional embeddings to 2D and plot them, colored by label.

    Args:
        df (pd.DataFrame): DataFrame containing the embeddings and labels.
        embedding_column (str): Column holding one embedding vector per row.
        label_column (str): Column holding the class label per row. Numeric
            labels are used directly; string/categorical labels are encoded
            to integer codes for coloring.
        method (str): Dimensionality reduction method, 'PCA' or 'tSNE'.
        random_state (int): Random state for reproducibility.

    Raises:
        ValueError: If `method` is neither 'PCA' nor 'tSNE'.
    """
    # Step 1: Choose the dimensionality reducer (PCA or t-SNE)
    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=random_state)
    elif method == 'tSNE':
        reducer = TSNE(n_components=2, random_state=random_state)
    else:
        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")
    # Reduce the embeddings to 2D
    embeddings_2d = reducer.fit_transform(df[embedding_column].tolist())
    # Fixed: plt.scatter(c=...) requires numeric values; encode string labels
    # to integer codes (numeric labels pass through unchanged).
    labels = df[label_column]
    if pd.api.types.is_numeric_dtype(labels):
        colors = labels
    else:
        colors, _ = pd.factorize(labels)
    # Step 2: Plot the 2D embeddings, coloring points by their label
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=colors, cmap='viridis')
    # Create a legend for the classes (labels)
    plt.legend(*scatter.legend_elements(), title="Classes")
    # Adding labels and title
    plt.title("2D Visualization of Embeddings")
    plt.xlabel("Principal Component 1" if method == 'PCA' else "t-SNE Dimension 1")
    plt.ylabel("Principal Component 2" if method == 'PCA' else "t-SNE Dimension 2")
    plt.colorbar(scatter)
    plt.show()