import os
import re
import string
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from wordcloud import WordCloud

# Load the spaCy model for English
# (requires: python -m spacy download en_core_web_sm)
nlp = spacy.load("en_core_web_sm")

# Get English stop words from NLTK
# (requires the corpus to be available, e.g. via nltk.download("stopwords"))
stop_words = set(stopwords.words("english"))
def process_text(text):
    """
    Process text by:
    1. Lowercasing
    2. Removing punctuation and non-alphanumeric characters
    3. Removing stop words
    4. Lemmatization
    """
    # Step 1: Tokenize and annotate the lowercased text with spaCy
    doc = nlp(text.lower())

    # Step 2: Filter out stop words and punctuation, lemmatize, and
    # strip any remaining non-alphanumeric characters
    processed_tokens = [
        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        for token in doc
        if token.text not in stop_words and token.text not in string.punctuation
    ]

    # Drop empty strings left over after the regex replacement
    processed_tokens = [word for word in processed_tokens if word]

    return processed_tokens
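
# Example usage (a minimal sketch; the input sentence is made up, and the
# exact output depends on the spaCy model's lemmatization):
# >>> process_text("The mind is everything. What you think you become.")
# roughly: ['mind', 'everything', 'think', 'become']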
def generate_word_clouds_by_category(df, output_dir="wordclouds"):
    """
    Generates and saves a word cloud for each category in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with 'processed_quote' (token lists)
            and 'label' (category) columns.
        output_dir (str): Directory to save the word cloud images.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Group tokens by category
    category_word_map = defaultdict(list)
    for _, row in df.iterrows():
        category_word_map[row["label"]].extend(row["processed_quote"])

    # Generate and save one word cloud per category
    for category, words in category_word_map.items():
        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))

        # Plot the word cloud
        plt.figure(figsize=(10, 5))
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(category)

        # Save the plot as an image
        filename = os.path.join(output_dir, f"{category.replace(' ', '_').lower()}_wordcloud.png")
        plt.savefig(filename, bbox_inches='tight')
        print(f"Word cloud saved for category '{category}' at {filename}")
        plt.close()  # Close the figure to avoid memory buildup
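
# Example usage (a minimal sketch; the DataFrame contents are hypothetical):
# df = pd.DataFrame({
#     "quote": ["Stay hungry, stay foolish.",
#               "Simplicity is the ultimate sophistication."],
#     "label": ["motivation", "design"],
# })
# df["processed_quote"] = df["quote"].apply(process_text)
# generate_word_clouds_by_category(df, output_dir="wordclouds")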
def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
    """
    Reduces high-dimensional embeddings to 2D and visualizes them.

    Args:
        df (pd.DataFrame): DataFrame containing the embeddings and labels.
        embedding_column (str): The column containing the embeddings.
        label_column (str): The column containing the labels.
        method (str): The dimensionality reduction method ('PCA' or 'tSNE').
        random_state (int): Random state for reproducibility.
    """
    # Step 1: Choose a dimensionality reduction method (PCA or t-SNE)
    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=random_state)
    elif method == 'tSNE':
        reducer = TSNE(n_components=2, random_state=random_state)
    else:
        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")

    # Reduce the embeddings to 2D
    embeddings_2d = reducer.fit_transform(df[embedding_column].tolist())

    # Step 2: Plot the 2D embeddings
    plt.figure(figsize=(10, 8))

    # Encode labels as integers so they can be used as scatter colors
    # (plt.scatter's `c` argument expects numeric values)
    codes, uniques = pd.factorize(df[label_column])
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=codes, cmap='viridis')

    # Create a legend mapping the colors back to the original class labels
    handles, _ = scatter.legend_elements()
    plt.legend(handles, uniques, title="Classes")

    # Add axis labels and a title
    plt.title("2D Visualization of Embeddings")
    plt.xlabel("Principal Component 1" if method == 'PCA' else "t-SNE Dimension 1")
    plt.ylabel("Principal Component 2" if method == 'PCA' else "t-SNE Dimension 2")
    plt.show()
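
# Minimal end-to-end sketch with synthetic data (the quotes, labels, and
# random stand-in "embeddings" below are made up purely for demonstration):
if __name__ == "__main__":
    import numpy as np

    demo_df = pd.DataFrame({
        "quote": [
            "The journey of a thousand miles begins with a single step.",
            "Life is what happens when you're busy making other plans.",
            "The only true wisdom is in knowing you know nothing.",
            "In the middle of difficulty lies opportunity.",
        ],
        "label": ["motivation", "life", "wisdom", "motivation"],
    })

    # Preprocess each quote into a list of tokens
    demo_df["processed_quote"] = demo_df["quote"].apply(process_text)

    # One word cloud per label
    generate_word_clouds_by_category(demo_df)

    # Random vectors in place of real sentence embeddings
    rng = np.random.default_rng(42)
    demo_df["embedding"] = list(rng.normal(size=(len(demo_df), 16)))

    plot_embeddings_2d(demo_df, embedding_column="embedding",
                       label_column="label", method="PCA")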