Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import os | |
| import re | |
| from nltk.corpus import stopwords | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
| from collections import defaultdict | |
| import spacy | |
| import string | |
| from sklearn.decomposition import PCA | |
| from sklearn.manifold import TSNE | |
| # Load the spaCy "en_core_web_sm" English model once at import time; process_text() reuses this shared pipeline on every call. | |
| nlp = spacy.load("en_core_web_sm") | |
| # Materialize NLTK's English stop-word list as a set so the membership tests in process_text() are constant-time lookups. | |
| stop_words = set(stopwords.words("english")) | |
# Compiled once at import time; strips every character that is not an ASCII letter or digit.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9]")


def process_text(text):
    """
    Turn raw text into a list of cleaned, lemmatized tokens.

    Processing steps:
        1. Lowercase the text and tokenize it with the module-level spaCy pipeline.
        2. Drop tokens whose surface form is a stop word or punctuation.
        3. Lemmatize each remaining token and strip non-alphanumeric characters.
        4. Drop tokens that became empty or whose cleaned lemma is itself a stop word.

    Args:
        text: The text to process. Anything that is not a string (e.g. None or a
            NaN from a pandas column) and any blank string yields an empty list.

    Returns:
        list[str]: The cleaned lemmas, in their original order.
    """
    # Missing values would otherwise crash on `.lower()`; blank strings cannot
    # produce tokens, so skip the (comparatively expensive) spaCy call for both.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text.lower())

    processed_tokens = []
    for token in doc:
        # Cheap pre-filter on the surface form before doing any regex work.
        if token.text in stop_words or token.text in string.punctuation:
            continue

        cleaned = _NON_ALNUM_RE.sub('', token.lemma_)

        # The surface form can pass the stop-word check while its cleaned lemma
        # is a stop word: spaCy splits contractions into fragments such as "n't"
        # (lemma "not"), "'ll" (lemma "will") or "'s" (cleaned to "s").
        if cleaned and cleaned not in stop_words:
            processed_tokens.append(cleaned)

    return processed_tokens
def generate_word_clouds_by_category(df, output_dir="wordclouds"):
    """
    Generate and save one word-cloud image per category in the DataFrame.

    All tokens of the rows sharing a 'label' value are pooled, rendered as a
    word cloud, and written to ``<output_dir>/<label>_wordcloud.png`` (label
    lowercased, spaces replaced by underscores). Categories without any token
    are skipped with a message, since a word cloud cannot be built from no words.

    Args:
        df (pd.DataFrame): DataFrame with a 'label' column (the category of each
            row) and a 'processed_quote' column (an iterable of tokens per row).
        output_dir (str): Directory to save the word cloud images; created if it
            does not exist.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Pool the tokens of every row under its category.
    category_word_map = defaultdict(list)
    for label, tokens in zip(df["label"], df["processed_quote"]):
        category_word_map[label].extend(tokens)

    for category, words in category_word_map.items():
        if not words:
            print(f"Skipping category '{category}': no words to plot")
            continue

        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))

        # Labels may be non-string (e.g. integer class ids), so normalise once.
        category_name = str(category)

        plt.figure(figsize=(10, 5))
        try:
            plt.imshow(word_cloud, interpolation='bilinear')
            plt.axis('off')
            plt.title(category_name)

            filename = os.path.join(output_dir, f"{category_name.replace(' ', '_').lower()}_wordcloud.png")
            plt.savefig(filename, bbox_inches='tight')
            print(f"Word cloud saved for category '{category}' at {filename}")
        finally:
            # Always release the figure, even if saving fails, to avoid
            # accumulating open figures across categories.
            plt.close()
def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
    """
    Reduce high-dimensional embeddings to 2D and show them as a scatter plot.

    Points are coloured by their label. Numeric labels are passed to matplotlib
    directly and get both a legend and a colorbar. Non-numeric labels (e.g.
    class names) are encoded to integer codes for colouring and the legend
    shows the original class names.

    Args:
        df (pd.DataFrame): DataFrame containing the embeddings and labels.
        embedding_column (str): The column containing the embeddings (one
            equal-length vector per row).
        label_column (str): The column containing the labels.
        method (str): The dimensionality reduction method ('PCA' or 'tSNE').
        random_state (int): Random state for reproducibility.

    Raises:
        ValueError: If ``method`` is not 'PCA' or 'tSNE', or if ``df`` has no rows.
        KeyError: If ``embedding_column`` or ``label_column`` is not in ``df``.
    """
    if method not in ('PCA', 'tSNE'):
        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")

    # Read the inputs before building the reducer so bad input fails early.
    embeddings = df[embedding_column].tolist()
    labels = df[label_column]
    n_samples = len(embeddings)
    if n_samples == 0:
        raise ValueError("Cannot plot embeddings: the DataFrame is empty.")

    # Step 1: Use dimensionality reduction (PCA or t-SNE)
    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=random_state)
    else:
        # t-SNE requires perplexity < n_samples; the default of 30 would fail
        # on small datasets, so clamp it while keeping 30 for larger ones.
        perplexity = min(30.0, max(1.0, n_samples - 1))
        reducer = TSNE(n_components=2, random_state=random_state, perplexity=perplexity)

    embeddings_2d = reducer.fit_transform(embeddings)
    xs, ys = embeddings_2d[:, 0], embeddings_2d[:, 1]

    # Step 2: Plot the 2D embeddings
    plt.figure(figsize=(10, 8))

    if pd.api.types.is_numeric_dtype(labels):
        scatter = plt.scatter(xs, ys, c=labels, cmap='viridis')
        plt.legend(*scatter.legend_elements(), title="Classes")
        plt.colorbar(scatter)
    else:
        # matplotlib cannot map arbitrary strings to colours, so encode each
        # class as an integer code. sort=True makes code i correspond to
        # class_names[i]; astype(str) keeps missing labels as their own class.
        codes, class_names = pd.factorize(labels.astype(str), sort=True)
        scatter = plt.scatter(xs, ys, c=codes, cmap='viridis')
        # num=None yields one handle per unique code, in ascending code order,
        # which lines up with class_names.
        handles, _ = scatter.legend_elements(num=None)
        plt.legend(handles, list(class_names), title="Classes")

    # Adding labels and title
    plt.title("2D Visualization of Embeddings")
    plt.xlabel("Principal Component 1" if method == 'PCA' else "t-SNE Dimension 1")
    plt.ylabel("Principal Component 2" if method == 'PCA' else "t-SNE Dimension 2")
    plt.show()