|
import pandas as pd |
|
import os |
|
import re |
|
from nltk.corpus import stopwords |
|
from wordcloud import WordCloud |
|
import matplotlib.pyplot as plt |
|
from collections import defaultdict |
|
import spacy |
|
import string |
|
from sklearn.decomposition import PCA |
|
from sklearn.manifold import TSNE |
|
|
|
|
|
# Load the spaCy pipeline named "en_core_web_sm" once, at import time, so that
# process_text() can reuse it for tokenization and lemmatization on every call.
nlp = spacy.load("en_core_web_sm")

# NLTK's English stop-word list, stored as a set because process_text() only
# ever tests token membership against it.
stop_words = set(stopwords.words("english"))
|
|
|
def process_text(text):
    """
    Tokenize and normalize a piece of text into a list of cleaned lemmas.

    Steps:
        1. Lowercase the text and run it through the module-level spaCy
           pipeline (``nlp``).
        2. Drop tokens whose surface form is a stop word or consists only of
           ASCII punctuation.
        3. Lemmatize the remaining tokens and strip every character that is
           not an ASCII letter or digit from each lemma.
        4. Drop lemmas that end up empty or that are themselves stop words.

    Args:
        text: The raw text to process. Non-string values (for example ``None``
            or ``NaN`` coming from a DataFrame with missing entries) are
            treated as empty input.

    Returns:
        list[str]: The cleaned lemmas, in document order.
    """
    # Missing values would otherwise crash on ``.lower()``.
    if not isinstance(text, str):
        return []

    non_alnum = re.compile(r'[^a-zA-Z0-9]')
    punctuation_chars = set(string.punctuation)

    processed_tokens = []
    for token in nlp(text.lower()):
        surface = token.text
        if surface in stop_words:
            continue
        # Explicit per-character test; ``surface in string.punctuation`` would
        # be a substring check against the punctuation string.
        if all(ch in punctuation_chars for ch in surface):
            continue

        lemma = non_alnum.sub('', token.lemma_)
        # Re-check against the stop-word list after lemmatization: a token
        # whose surface form is not a stop word (e.g. a contraction clitic
        # such as "n't" or "'re") can still lemmatize to one.
        if lemma and lemma not in stop_words:
            processed_tokens.append(lemma)

    return processed_tokens
|
|
|
|
|
|
|
def generate_word_clouds_by_category(
    df,
    output_dir="wordclouds",
    label_column="label",
    text_column="processed_quote",
):
    """
    Generate and save one word-cloud image per category in the DataFrame.

    All token lists belonging to the same category are pooled, rendered as a
    single word cloud, and written to
    ``<output_dir>/<category>_wordcloud.png`` (category lowercased, spaces
    replaced by underscores). A line is printed for each saved or skipped
    category.

    Args:
        df (pd.DataFrame): DataFrame with one row per document.
        output_dir (str): Directory to save the word cloud images. It is
            created if it does not exist.
        label_column (str): Name of the column holding each row's category.
            Defaults to ``"label"``.
        text_column (str): Name of the column holding each row's iterable of
            tokens. Defaults to ``"processed_quote"``.

    Returns:
        None
    """
    os.makedirs(output_dir, exist_ok=True)

    # Pool the tokens of every row under its category.
    category_word_map = defaultdict(list)
    for category, tokens in zip(df[label_column], df[text_column]):
        category_word_map[category].extend(tokens)

    for category, words in category_word_map.items():
        # WordCloud cannot render an empty vocabulary, so skip instead of
        # aborting the remaining categories.
        if not words:
            print(f"Skipping category '{category}': no words to plot")
            continue

        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))

        # Labels are not guaranteed to be strings (e.g. integer class ids).
        category_name = str(category)
        filename = os.path.join(output_dir, f"{category_name.replace(' ', '_').lower()}_wordcloud.png")

        plt.figure(figsize=(10, 5))
        try:
            plt.imshow(word_cloud, interpolation='bilinear')
            plt.axis('off')
            plt.title(category_name)
            plt.savefig(filename, bbox_inches='tight')
        finally:
            # Always release the figure, even if rendering or saving fails.
            plt.close()

        print(f"Word cloud saved for category '{category}' at {filename}")
|
|
|
|
|
|
|
|
|
def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
    """
    Reduce high-dimensional embeddings to 2D and show them as a scatter plot.

    Points are coloured by label. Numeric labels are passed to matplotlib
    directly and get both a legend and a colorbar. Non-numeric labels (e.g.
    class names) are mapped to integer codes for colouring and get a legend
    that shows the original label values.

    Args:
        df (pd.DataFrame): DataFrame containing the embeddings and labels.
        embedding_column (str): The column containing the embeddings (one
            vector per row).
        label_column (str): The column containing the labels.
        method (str): The dimensionality reduction method, 'PCA' or 'tSNE'.
            Matching ignores case and hyphens, so 'pca' and 't-SNE' also work.
        random_state (int): Random state for reproducibility.

    Raises:
        ValueError: If ``method`` is neither PCA nor t-SNE.
    """
    method_key = str(method).replace('-', '').lower()
    if method_key not in ('pca', 'tsne'):
        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")

    embeddings = df[embedding_column].tolist()

    if method_key == 'pca':
        reducer = PCA(n_components=2, random_state=random_state)
        axis_name = "Principal Component"
    else:
        # t-SNE requires perplexity < n_samples; keep the default of 30 and
        # only shrink it for small datasets.
        perplexity = min(30.0, max(len(embeddings) - 1, 1))
        reducer = TSNE(n_components=2, random_state=random_state, perplexity=perplexity)
        axis_name = "t-SNE Dimension"

    embeddings_2d = reducer.fit_transform(embeddings)
    labels = df[label_column]
    numeric_labels = pd.api.types.is_numeric_dtype(labels)

    plt.figure(figsize=(10, 8))

    if numeric_labels:
        scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=labels, cmap='viridis')
        plt.legend(*scatter.legend_elements(), title="Classes")
    else:
        # Matplotlib cannot colour-map arbitrary strings, so encode each
        # distinct label as an integer and label the legend with the names.
        codes, names = pd.factorize(labels.astype(str), sort=True)
        scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=codes, cmap='viridis')
        # num=None yields one handle per distinct code, in ascending order,
        # which lines up with the sorted label names.
        handles, _ = scatter.legend_elements(num=None)
        plt.legend(handles, list(names), title="Classes")

    plt.title("2D Visualization of Embeddings")
    plt.xlabel(f"{axis_name} 1")
    plt.ylabel(f"{axis_name} 2")

    # A colorbar is only meaningful when the colours encode numeric values.
    if numeric_labels:
        plt.colorbar(scatter)

    plt.show()