File size: 4,185 Bytes
60d8d9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pandas as pd
import os
import re
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
import spacy
import string
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Load the small English spaCy pipeline (provides tokenization and lemmatization
# used by process_text); requires `python -m spacy download en_core_web_sm`.
nlp = spacy.load("en_core_web_sm")

# English stop words from NLTK; requires the 'stopwords' corpus
# (`nltk.download('stopwords')`) to have been fetched beforehand.
stop_words = set(stopwords.words("english"))

def process_text(text):
    """
    Normalize raw text into a list of cleaned, lemmatized tokens.

    Pipeline:
    1. Lowercase the input.
    2. Tokenize and lemmatize with the module-level spaCy pipeline.
    3. Drop stop words and bare punctuation tokens.
    4. Strip remaining non-alphanumeric characters from each lemma and
       discard tokens that end up empty.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: Cleaned lemmas, in original token order.
    """
    doc = nlp(text.lower())

    cleaned = []
    for token in doc:
        # Guard: skip stop words and single-character punctuation tokens.
        if token.text in stop_words or token.text in string.punctuation:
            continue
        # Keep only the alphanumeric characters of the lemma.
        lemma = re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        # The regex may leave an empty string (e.g. a token that was all
        # symbols); drop those.
        if lemma:
            cleaned.append(lemma)

    return cleaned



def generate_word_clouds_by_category(df, output_dir="wordclouds"):
    """
    Generates and saves one word cloud image per label in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with a 'processed_quote' column
            (list of tokens per row) and a 'label' column.
        output_dir (str): Directory to save the word cloud images.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Pool every row's tokens under its label.
    category_word_map = defaultdict(list)
    for _, row in df.iterrows():
        category_word_map[row["label"]].extend(row["processed_quote"])

    # Generate and save one word cloud per label.
    for category, words in category_word_map.items():
        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))

        # Plot the word cloud
        plt.figure(figsize=(10, 5))
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(category)

        # Save the plot as an image named after the category; str() guards
        # against non-string labels (e.g. integer class ids).
        filename = os.path.join(output_dir, f"{str(category).replace(' ', '_').lower()}_wordcloud.png")
        plt.savefig(filename, bbox_inches='tight')
        print(f"Word cloud saved for category '{category}' at {filename}")

        plt.close()  # Close the figure to avoid memory build-up across iterations




def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
    """
    Reduce high-dimensional embeddings to 2D and scatter-plot them.

    Args:
        df (pd.DataFrame): DataFrame containing the embeddings and labels.
        embedding_column (str): Column holding the embedding vectors.
        label_column (str): Column holding the class labels. Labels may be
            numeric or strings; they are encoded to integer codes so
            matplotlib can color the points either way.
        method (str): Dimensionality reduction method, 'PCA' or 'tSNE'.
        random_state (int): Random state for reproducibility.

    Raises:
        ValueError: If `method` is neither 'PCA' nor 'tSNE'.
    """
    # Step 1: Choose the dimensionality reducer (PCA or t-SNE).
    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=random_state)
    elif method == 'tSNE':
        reducer = TSNE(n_components=2, random_state=random_state)
    else:
        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")

    # Reduce the embeddings to 2D.
    embeddings_2d = reducer.fit_transform(df[embedding_column].tolist())

    # Encode labels as integer codes: plt.scatter's `c=` and
    # scatter.legend_elements() both require numeric values, so string
    # labels would otherwise raise.
    codes, _uniques = pd.factorize(df[label_column])

    # Step 2: Plot the 2D embeddings, coloring points by class code.
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=codes, cmap='viridis')

    # Legend derived from the scatter's color mapping.
    plt.legend(*scatter.legend_elements(), title="Classes")

    # Axis labels reflect the chosen reduction method.
    plt.title("2D Visualization of Embeddings")
    plt.xlabel("Principal Component 1" if method == 'PCA' else "t-SNE Dimension 1")
    plt.ylabel("Principal Component 2" if method == 'PCA' else "t-SNE Dimension 2")

    plt.colorbar(scatter)
    plt.show()