sumesh4C committed on
Commit
60d8d9d
·
verified ·
1 Parent(s): 9685f7b

Upload 2 files

Browse files
Files changed (2) hide show
  1. eda.py +12 -0
  2. eda_functions.py +117 -0
eda.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from datasets import load_dataset
from src.utils.eda_functions import process_text, generate_word_clouds_by_category

# Download the Frugal AI challenge text dataset from the Hugging Face Hub
# and materialize both splits as pandas DataFrames.
dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")

train = dataset["train"].to_pandas()
test = dataset["test"].to_pandas()

# train["processed_quote"] = train["quote"].apply(process_text)

# to_csv does not create missing directories — ensure the output dir exists
# so the script does not crash on a fresh checkout.
os.makedirs("outputs", exist_ok=True)

train.to_csv("outputs/train_v1.csv", sep=";", index=False)
test.to_csv("outputs/test.csv", sep=";", index=False)

# Generate word clouds
# generate_word_clouds_by_category(train)
eda_functions.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import os
import re
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
import spacy
import string
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Load the spaCy model for English
# NOTE(review): requires `python -m spacy download en_core_web_sm` to have
# been run beforehand; spacy.load raises OSError otherwise.
nlp = spacy.load("en_core_web_sm")

# Get English stop words from NLTK
# NOTE(review): requires the NLTK "stopwords" corpus to be downloaded
# (nltk.download("stopwords")); a set gives O(1) membership tests below.
stop_words = set(stopwords.words("english"))
18
+
19
def process_text(text):
    """
    Tokenize and normalize *text* into a list of cleaned lemmas.

    Steps:
    1. Lowercase the input and run it through the spaCy pipeline.
    2. Drop tokens that are NLTK stop words or bare punctuation.
    3. Lemmatize each surviving token and strip any non-alphanumeric
       characters from the lemma.
    4. Discard tokens that become empty after cleaning.

    Returns a list of strings (possibly empty).
    """
    cleaned_tokens = []
    # Process the lowercased text with spaCy, then filter token by token.
    for token in nlp(text.lower()):
        # Skip stop words and punctuation before lemmatizing.
        if token.text in stop_words or token.text in string.punctuation:
            continue
        # Keep only alphanumeric characters of the lemma.
        lemma = re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        # Regex stripping can leave an empty string — drop those.
        if lemma:
            cleaned_tokens.append(lemma)
    return cleaned_tokens
41
+
42
+
43
+
44
def generate_word_clouds_by_category(df, output_dir="wordclouds"):
    """
    Generates and saves one word-cloud PNG per category in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with a 'processed_quote' column
            (list of tokens per row) and a 'label' column (the category).
        output_dir (str): Directory to save the word cloud images.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Group words by category
    category_word_map = defaultdict(list)
    for _, row in df.iterrows():
        category_word_map[row["label"]].extend(row["processed_quote"])

    # Generate and save word clouds
    for category, words in category_word_map.items():
        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))

        # Plot and save the word cloud
        plt.figure(figsize=(10, 5))
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(category)

        # Save the plot as an image.
        # NOTE(review): assumes `category` is a string — verify the dtype of
        # the 'label' column; a numeric label would make .replace() fail.
        filename = os.path.join(output_dir, f"{category.replace(' ', '_').lower()}_wordcloud.png")
        plt.savefig(filename, bbox_inches='tight')
        # Fix: the log message previously did not interpolate the path.
        print(f"Word cloud saved for category '{category}' at {filename}")

        plt.close()  # Close the figure to avoid memory issues
76
+
77
+
78
+
79
+
80
def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
    """
    Project high-dimensional embeddings into 2D and show a labeled scatter plot.

    Args:
        df (pd.DataFrame): DataFrame containing the embeddings and labels.
        embedding_column (str): Column holding the per-row embedding vectors.
        label_column (str): Column holding the class labels (used for coloring).
        method (str): Dimensionality-reduction method, 'PCA' or 'tSNE'.
        random_state (int): Seed for reproducible projections.

    Raises:
        ValueError: If *method* is neither 'PCA' nor 'tSNE'.
    """
    # Guard clause: reject unknown methods up front.
    if method not in ('PCA', 'tSNE'):
        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")

    # Pick the reducer class, then fit-transform the embeddings to 2D.
    reducer_cls = PCA if method == 'PCA' else TSNE
    reducer = reducer_cls(n_components=2, random_state=random_state)
    points_2d = reducer.fit_transform(df[embedding_column].tolist())

    # Scatter the projected points, colored by class label.
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(points_2d[:, 0], points_2d[:, 1], c=df[label_column], cmap='viridis')

    # Legend built from the scatter's color mapping.
    plt.legend(*scatter.legend_elements(), title="Classes")

    # Axis titles reflect the chosen reduction technique.
    using_pca = method == 'PCA'
    plt.title("2D Visualization of Embeddings")
    plt.xlabel("Principal Component 1" if using_pca else "t-SNE Dimension 1")
    plt.ylabel("Principal Component 2" if using_pca else "t-SNE Dimension 2")

    plt.colorbar(scatter)
    plt.show()