sumesh4C committed on
Commit
60d8d9d
·
verified ·
1 Parent(s): 9685f7b

Upload 2 files

Browse files
Files changed (2) hide show
  1. eda.py +12 -0
  2. eda_functions.py +117 -0
eda.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from datasets import load_dataset
from src.utils.eda_functions import process_text, generate_word_clouds_by_category

# Download the Frugal AI challenge text dataset from the Hugging Face Hub
# and materialize both splits as pandas DataFrames.
dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")

train = dataset["train"].to_pandas()
test = dataset["test"].to_pandas()

# train["processed_quote"] = train["quote"].apply(process_text)

# to_csv does not create missing directories — ensure the output dir exists
# so the script does not crash on a fresh checkout.
os.makedirs("outputs", exist_ok=True)

train.to_csv("outputs/train_v1.csv", sep=";", index=False)
test.to_csv("outputs/test.csv", sep=";", index=False)

# Generate word clouds
# generate_word_clouds_by_category(train)
eda_functions.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import os
import re
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
import spacy
import string
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Load the spaCy model for English
# NOTE(review): requires `python -m spacy download en_core_web_sm` to have
# been run beforehand; spacy.load raises OSError otherwise.
nlp = spacy.load("en_core_web_sm")

# Get English stop words from NLTK
# NOTE(review): requires the NLTK "stopwords" corpus to be downloaded
# (nltk.download("stopwords")); a set gives O(1) membership tests below.
stop_words = set(stopwords.words("english"))
18
+
19
def process_text(text):
    """
    Tokenize and normalize *text* into a list of cleaned lemmas.

    Steps:
    1. Lowercase the input and run it through the spaCy pipeline.
    2. Drop tokens that are NLTK stop words or bare punctuation.
    3. Lemmatize each surviving token and strip any non-alphanumeric
       characters from the lemma.
    4. Discard tokens that become empty after cleaning.

    Returns a list of strings (possibly empty).
    """
    cleaned_tokens = []
    # Process the lowercased text with spaCy, then filter token by token.
    for token in nlp(text.lower()):
        # Skip stop words and punctuation before lemmatizing.
        if token.text in stop_words or token.text in string.punctuation:
            continue
        # Keep only alphanumeric characters of the lemma.
        lemma = re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        # Regex stripping can leave an empty string — drop those.
        if lemma:
            cleaned_tokens.append(lemma)
    return cleaned_tokens
41
+
42
+
43
+
44
def generate_word_clouds_by_category(df, output_dir="wordclouds"):
    """
    Generates and saves one word-cloud PNG per category in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with a 'processed_quote' column
            (list of tokens per row) and a 'label' column (the category).
        output_dir (str): Directory to save the word cloud images.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Group words by category
    category_word_map = defaultdict(list)
    for _, row in df.iterrows():
        category_word_map[row["label"]].extend(row["processed_quote"])

    # Generate and save word clouds
    for category, words in category_word_map.items():
        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))

        # Plot and save the word cloud
        plt.figure(figsize=(10, 5))
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(category)

        # Save the plot as an image.
        # NOTE(review): assumes `category` is a string — verify the dtype of
        # the 'label' column; a numeric label would make .replace() fail.
        filename = os.path.join(output_dir, f"{category.replace(' ', '_').lower()}_wordcloud.png")
        plt.savefig(filename, bbox_inches='tight')
        # Fix: the log message previously did not interpolate the path.
        print(f"Word cloud saved for category '{category}' at {filename}")

        plt.close()  # Close the figure to avoid memory issues
76
+
77
+
78
+
79
+
80
def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
    """
    Project high-dimensional embeddings into 2D and show a labeled scatter plot.

    Args:
        df (pd.DataFrame): DataFrame containing the embeddings and labels.
        embedding_column (str): Column holding the per-row embedding vectors.
        label_column (str): Column holding the class labels (used for coloring).
        method (str): Dimensionality-reduction method, 'PCA' or 'tSNE'.
        random_state (int): Seed for reproducible projections.

    Raises:
        ValueError: If *method* is neither 'PCA' nor 'tSNE'.
    """
    # Guard clause: reject unknown methods up front.
    if method not in ('PCA', 'tSNE'):
        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")

    # Pick the reducer class, then fit-transform the embeddings to 2D.
    reducer_cls = PCA if method == 'PCA' else TSNE
    reducer = reducer_cls(n_components=2, random_state=random_state)
    points_2d = reducer.fit_transform(df[embedding_column].tolist())

    # Scatter the projected points, colored by class label.
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(points_2d[:, 0], points_2d[:, 1], c=df[label_column], cmap='viridis')

    # Legend built from the scatter's color mapping.
    plt.legend(*scatter.legend_elements(), title="Classes")

    # Axis titles reflect the chosen reduction technique.
    using_pca = method == 'PCA'
    plt.title("2D Visualization of Embeddings")
    plt.xlabel("Principal Component 1" if using_pca else "t-SNE Dimension 1")
    plt.ylabel("Principal Component 2" if using_pca else "t-SNE Dimension 2")

    plt.colorbar(scatter)
    plt.show()