Upload 2 files
- eda.py +12 -0
- eda_functions.py +117 -0
eda.py
ADDED
@@ -0,0 +1,12 @@
+from datasets import load_dataset
+from src.utils.eda_functions import process_text, generate_word_clouds_by_category
+dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")
+train = dataset["train"].to_pandas()
+test = dataset["test"].to_pandas()
+# train["processed_quote"] = train["quote"].apply(process_text)
+
+train.to_csv("outputs/train_v1.csv", sep=";", index=False)
+test.to_csv("outputs/test.csv", sep=";", index=False)
+
+# Generate word clouds
+# generate_word_clouds_by_category(train)
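Note on running eda.py: pandas' to_csv does not create missing directories, so the outputs/ folder must exist, and the commented-out process_text call additionally needs the NLTK stop-word list and the spaCy en_core_web_sm model installed. A minimal pre-flight sketch (the directory name is taken from the script; the download steps are standard NLTK/spaCy setup, not part of this commit):

import os
import nltk

os.makedirs("outputs", exist_ok=True)  # to_csv() raises OSError if outputs/ is missing
nltk.download("stopwords")             # stop-word list used by eda_functions
# The spaCy model is installed from the shell:
#   python -m spacy download en_core_web_sm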
eda_functions.py
ADDED
@@ -0,0 +1,117 @@
+import pandas as pd
+import os
+import re
+from nltk.corpus import stopwords
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+from collections import defaultdict
+import spacy
+import string
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+
+# Load the spaCy model for English
+nlp = spacy.load("en_core_web_sm")
+
+# Get English stop words from NLTK
+stop_words = set(stopwords.words("english"))
+
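Both spacy.load and stopwords.words run at import time, so importing eda_functions fails outright when either resource is absent. A guarded variant (a sketch, not part of the commit) that surfaces a clearer error:

try:
    nlp = spacy.load("en_core_web_sm")
except OSError as err:  # model not downloaded yet
    raise OSError(
        "spaCy model 'en_core_web_sm' not found; run: python -m spacy download en_core_web_sm"
    ) from err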
+def process_text(text):
+    """
+    Process text by:
+    1. Lowercasing
+    2. Removing punctuation and non-alphanumeric characters
+    3. Removing stop words
+    4. Lemmatization
+    """
+    # Step 1: Tokenization & processing with spaCy
+    doc = nlp(text.lower())  # Process text with spaCy
+
+    # Step 2: Filter out stop words and punctuation, lemmatize, and strip non-alphanumeric characters
+    processed_tokens = [
+        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)  # Remove non-alphanumeric characters
+        for token in doc
+        if token.text not in stop_words and token.text not in string.punctuation
+    ]
+
+    # Optional: Filter out empty strings resulting from the regex replacement
+    processed_tokens = [word for word in processed_tokens if word]
+
+    return processed_tokens
+
+
+
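For reference, a quick check of what process_text returns (exact lemmas depend on the spaCy model version, so treat the output as indicative):

tokens = process_text("The cats are running!")
print(tokens)  # likely ['cat', 'run']: 'the'/'are' are stop words, '!' is punctuation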
+def generate_word_clouds_by_category(df, output_dir="wordclouds"):
+    """
+    Generates and saves word clouds for each category in the DataFrame.
+
+    Args:
+        df (pd.DataFrame): DataFrame with 'processed_quote' and 'label' columns.
+        output_dir (str): Directory to save the word cloud images.
+    """
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Group words by category
+    category_word_map = defaultdict(list)
+    for _, row in df.iterrows():
+        category_word_map[row["label"]].extend(row["processed_quote"])
+
+    # Generate and save word clouds
+    for category, words in category_word_map.items():
+        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))
+
+        # Plot and save the word cloud
+        plt.figure(figsize=(10, 5))
+        plt.imshow(word_cloud, interpolation='bilinear')
+        plt.axis('off')
+        plt.title(category)
+
+        # Save the plot as an image
+        filename = os.path.join(output_dir, f"{category.replace(' ', '_').lower()}_wordcloud.png")
+        plt.savefig(filename, bbox_inches='tight')
+        print(f"Word cloud saved for category '{category}' at {filename}")
+
+        plt.close()  # Close the figure to avoid memory issues
+
+
+
+
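A minimal smoke test for generate_word_clouds_by_category; the toy DataFrame below is illustrative only (in eda.py the input would be train with its processed_quote column filled in):

import pandas as pd

toy = pd.DataFrame({
    "label": ["climate", "other"],
    "processed_quote": [["warming", "ocean", "carbon"], ["economy", "market"]],
})
generate_word_clouds_by_category(toy, output_dir="wordclouds")
# Expected: wordclouds/climate_wordcloud.png and wordclouds/other_wordcloud.png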
+def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
+    """
+    Reduces high-dimensional embeddings to 2D and visualizes them.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing the text data and labels.
+        embedding_column (str): The column containing the embeddings.
+        label_column (str): The column containing the labels.
+        method (str): The dimensionality reduction method ('PCA' or 'tSNE').
+        random_state (int): Random state for reproducibility.
+    """
+    # Step 1: Choose the dimensionality reduction method (PCA or t-SNE)
+    if method == 'PCA':
+        reducer = PCA(n_components=2, random_state=random_state)
+    elif method == 'tSNE':
+        reducer = TSNE(n_components=2, random_state=random_state)
+    else:
+        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")
+
+    # Reduce the embeddings to 2D
+    embeddings_2d = reducer.fit_transform(df[embedding_column].tolist())
+
+    # Step 2: Plot the 2D embeddings
+    plt.figure(figsize=(10, 8))
+
+    # Scatter plot, coloring points by their label
+    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=df[label_column], cmap='viridis')
+
+    # Create a legend for the classes (labels)
+    plt.legend(*scatter.legend_elements(), title="Classes")
+
+    # Add axis labels and a title
+    plt.title("2D Visualization of Embeddings")
+    plt.xlabel("Principal Component 1" if method == 'PCA' else "t-SNE Dimension 1")
+    plt.ylabel("Principal Component 2" if method == 'PCA' else "t-SNE Dimension 2")
+
+    plt.colorbar(scatter)
+    plt.show()
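plot_embeddings_2d is never called in this commit and no embeddings are computed yet, so the sketch below is hypothetical end to end: the embedding and label_id columns are made up, and the labels must be numeric for matplotlib's c= coloring and scatter.legend_elements() to work.

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
demo = pd.DataFrame({
    "embedding": list(rng.normal(size=(100, 384))),  # stand-in for 384-d sentence embeddings
    "label_id": rng.integers(0, 3, size=100),        # numeric class ids
})
plot_embeddings_2d(demo, embedding_column="embedding", label_column="label_id", method="PCA")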