submission-template

Running

App Files Files Community

submission-template / eda_functions.py

sumesh4C

Upload 2 files

60d8d9d verified 4 months ago

raw

history blame

4.19 kB

	import pandas as pd
	import os
	import re
	from nltk.corpus import stopwords
	from wordcloud import WordCloud
	import matplotlib.pyplot as plt
	from collections import defaultdict
	import spacy
	import string
	from sklearn.decomposition import PCA
	from sklearn.manifold import TSNE

	# Load the spaCy English pipeline once, at module import; process_text()
	# reuses this shared `nlp` object for every call instead of reloading it.
	nlp = spacy.load("en_core_web_sm")

	# Keep the NLTK English stop-word list as a set; process_text() tests each
	# token's text for membership in it.
	stop_words = set(stopwords.words("english"))

	def process_text(text):
	    """
	    Turn raw text into a list of cleaned, lemmatized tokens.

	    The text is lowercased and run through the module-level spaCy pipeline
	    (``nlp``). Tokens whose text is in the module-level ``stop_words`` set or
	    is found in ``string.punctuation`` are dropped. Every remaining token is
	    replaced by its lemma with all characters outside ``[a-zA-Z0-9]``
	    removed, and tokens left empty by that removal are discarded.

	    Note that the character filter is ASCII-only, so accented or other
	    non-ASCII letters are removed as well (``"café"`` becomes ``"caf"``).

	    Args:
	        text (str): Raw input text. ``None`` and float NaN (a missing value
	            when this function is applied over a DataFrame column) are
	            treated as empty text.

	    Returns:
	        list[str]: Processed tokens in document order; ``[]`` for missing,
	            empty, or whitespace-only input.
	    """
	    # Missing values would otherwise fail on `.lower()`; `text != text` is
	    # the NaN test that needs no extra import.
	    if text is None or (isinstance(text, float) and text != text):
	        return []

	    # Nothing to tokenize: skip the spaCy call entirely (the result would be
	    # an empty list anyway, since whitespace tokens are stripped to "").
	    if isinstance(text, str) and not text.strip():
	        return []

	    doc = nlp(text.lower())

	    # Resolve the pattern once per call rather than once per token.
	    strip_non_alnum = re.compile(r'[^a-zA-Z0-9]').sub

	    processed_tokens = []
	    for token in doc:
	        # `in string.punctuation` is a substring test against the
	        # punctuation string; stop words are matched on the surface text,
	        # not on the lemma.
	        if token.text in stop_words or token.text in string.punctuation:
	            continue
	        cleaned = strip_non_alnum('', token.lemma_)
	        if cleaned:  # drop tokens that consisted only of stripped characters
	            processed_tokens.append(cleaned)

	    return processed_tokens



	def generate_word_clouds_by_category(df, output_dir="wordclouds",
	                                     text_column="processed_quote",
	                                     label_column="label"):
	    """
	    Generate and save one word-cloud image per category in the DataFrame.

	    All token lists belonging to the same category are concatenated, joined
	    with spaces, and rendered with ``WordCloud``. Each image is written to
	    ``<output_dir>/<category>_wordcloud.png``, where the category name is
	    converted to a string, lowercased, and has spaces replaced by
	    underscores. A message is printed for every image saved.

	    Args:
	        df (pd.DataFrame): DataFrame holding the token lists and categories.
	        output_dir (str): Directory to save the word cloud images; created
	            if it does not exist.
	        text_column (str): Column whose cells are iterables of word strings
	            (for example the output of a tokenizer). Defaults to
	            ``"processed_quote"``.
	        label_column (str): Column holding the category of each row.
	            Defaults to ``"label"``. Values may be strings or numbers.

	    Returns:
	        None
	    """
	    # Create output directory if it doesn't exist
	    os.makedirs(output_dir, exist_ok=True)

	    # Group words by category. Iterating the two columns directly avoids
	    # building a Series per row, which is what iterrows() does.
	    category_word_map = defaultdict(list)
	    for category, tokens in zip(df[label_column], df[text_column]):
	        # A missing cell (None / NaN) has no words to contribute.
	        if tokens is None or isinstance(tokens, float):
	            continue
	        category_word_map[category].extend(tokens)

	    # Generate and save word clouds
	    for category, words in category_word_map.items():
	        if not words:
	            # WordCloud cannot render an empty text, so skip the category
	            # rather than abort the remaining ones.
	            print(f"Skipping category '{category}': no words to plot")
	            continue

	        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))

	        # str() so that numeric category labels work as well as string ones.
	        category_name = str(category)
	        filename = os.path.join(output_dir, f"{category_name.replace(' ', '_').lower()}_wordcloud.png")

	        figure = plt.figure(figsize=(10, 5))
	        try:
	            plt.imshow(word_cloud, interpolation='bilinear')
	            plt.axis('off')
	            plt.title(category_name)

	            # Save the plot as an image
	            plt.savefig(filename, bbox_inches='tight')
	            print(f"Word cloud saved for category '{category}' at {filename}")
	        finally:
	            # Always release the figure, even if plotting or saving failed.
	            plt.close(figure)



	def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
	    """
	    Reduce high-dimensional embeddings to 2D and show them as a scatter plot.

	    Points are coloured by class. Labels may be of any sortable type
	    (strings or numbers): they are mapped to integer codes for colouring and
	    the legend shows the original label values.

	    Args:
	        df (pd.DataFrame): DataFrame containing the embeddings and labels.
	        embedding_column (str): The column containing the embeddings, one
	            equal-length numeric vector per row.
	        label_column (str): The column containing the labels.
	        method (str): The dimensionality reduction method ('PCA' or 'tSNE').
	        random_state (int): Random state for reproducibility.

	    Returns:
	        None. The figure is displayed with ``plt.show()``.

	    Raises:
	        ValueError: If ``method`` is neither 'PCA' nor 'tSNE'.
	    """
	    # Reject a bad method before doing any work on the data.
	    if method not in ('PCA', 'tSNE'):
	        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")

	    embeddings = df[embedding_column].tolist()

	    # Step 1: Use dimensionality reduction (PCA or t-SNE)
	    if method == 'PCA':
	        reducer = PCA(n_components=2, random_state=random_state)
	    else:
	        # t-SNE needs perplexity < number of samples; keep the default of 30
	        # for normal-sized data and shrink it only for small frames.
	        perplexity = min(30.0, max(1.0, len(embeddings) - 1.0))
	        reducer = TSNE(n_components=2, random_state=random_state, perplexity=perplexity)

	    # Reduce the embeddings to 2D
	    embeddings_2d = reducer.fit_transform(embeddings)

	    # Map labels to integer codes so that non-numeric labels can be coloured.
	    # Missing labels receive the code -1.
	    codes, classes = pd.factorize(df[label_column], sort=True)

	    # Step 2: Plot the 2D embeddings
	    plt.figure(figsize=(10, 8))

	    # Scatter plot, coloring points by their label code
	    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=codes, cmap='viridis')

	    # One legend entry per distinct code (num=None), in ascending code order,
	    # relabelled with the original class values.
	    handles, _ = scatter.legend_elements(num=None)
	    present_codes = sorted(set(codes.tolist()))
	    class_names = ["missing" if code < 0 else str(classes[code]) for code in present_codes]
	    plt.legend(handles, class_names, title="Classes")

	    # Adding labels and title
	    axis_name = "Principal Component" if method == 'PCA' else "t-SNE Dimension"
	    plt.title("2D Visualization of Embeddings")
	    plt.xlabel(f"{axis_name} 1")
	    plt.ylabel(f"{axis_name} 2")

	    plt.colorbar(scatter)
	    plt.show()