import os
import re
import string
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from wordcloud import WordCloud

# Load the spaCy English pipeline once at import time (tokenization + lemmas).
nlp = spacy.load("en_core_web_sm")

# English stop words from NLTK (requires the 'stopwords' corpus to be available).
stop_words = set(stopwords.words("english"))


def process_text(text):
    """Normalize raw text into a list of cleaned, lemmatized tokens.

    Steps:
        1. Lowercase the input.
        2. Tokenize and lemmatize with spaCy.
        3. Drop stop words and punctuation tokens.
        4. Strip any remaining non-alphanumeric characters from each lemma.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: Non-empty, cleaned lemma strings.
    """
    # Tokenization, tagging, and lemmatization happen in one spaCy pass.
    doc = nlp(text.lower())

    # Filter stop words / punctuation, then strip non-alphanumerics per lemma.
    processed_tokens = [
        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        for token in doc
        if token.text not in stop_words and token.text not in string.punctuation
    ]

    # Drop tokens that became empty after the regex strip (e.g. pure symbols).
    return [word for word in processed_tokens if word]


def generate_word_clouds_by_category(df, output_dir="wordclouds"):
    """Generate and save one word-cloud image per label in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with a 'processed_quote' column (list of
            tokens per row) and a 'label' column (the category name).
        output_dir (str): Directory to save the word cloud images.
    """
    # Create the output directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)

    # Pool all tokens belonging to each label.
    category_word_map = defaultdict(list)
    for _, row in df.iterrows():
        category_word_map[row["label"]].extend(row["processed_quote"])

    # Generate and save one word cloud per category.
    for category, words in category_word_map.items():
        word_cloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate(" ".join(words))

        plt.figure(figsize=(10, 5))
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(category)

        # Save the plot as an image named after the category.
        filename = os.path.join(
            output_dir, f"{category.replace(' ', '_').lower()}_wordcloud.png"
        )
        plt.savefig(filename, bbox_inches='tight')
        # Fix: report the actual saved path (original printed a literal "(unknown)").
        print(f"Word cloud saved for category '{category}' at {filename}")
        plt.close()  # Close the figure to avoid memory buildup.


def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
    """Reduce high-dimensional embeddings to 2D and plot them, colored by label.

    Args:
        df (pd.DataFrame): DataFrame containing the embeddings and labels.
        embedding_column (str): Column holding the embedding vectors.
        label_column (str): Column holding class labels used to color points
            (must be numeric for the colormap/legend to work — TODO confirm
            against callers).
        method (str): Dimensionality reduction method, 'PCA' or 'tSNE'.
        random_state (int): Random state for reproducibility.

    Raises:
        ValueError: If `method` is neither 'PCA' nor 'tSNE'.
    """
    # Step 1: choose the dimensionality-reduction algorithm.
    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=random_state)
    elif method == 'tSNE':
        reducer = TSNE(n_components=2, random_state=random_state)
    else:
        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")

    # Reduce the embeddings to 2D.
    embeddings_2d = reducer.fit_transform(df[embedding_column].tolist())

    # Step 2: scatter-plot the 2D embeddings, colored by label.
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(
        embeddings_2d[:, 0], embeddings_2d[:, 1],
        c=df[label_column], cmap='viridis'
    )

    # Legend for the classes (derived from the scatter's color mapping).
    plt.legend(*scatter.legend_elements(), title="Classes")

    plt.title("2D Visualization of Embeddings")
    plt.xlabel("Principal Component 1" if method == 'PCA' else "t-SNE Dimension 1")
    plt.ylabel("Principal Component 2" if method == 'PCA' else "t-SNE Dimension 2")
    plt.colorbar(scatter)
    plt.show()