# auto-notebook-creator/utils/notebook_utils.py
# Author: asoria (HF staff)
# Commit 4dc6cd8: EDA template partially finished (need to filter numerical operations)
def replace_wildcards(templates, wildcards, replacements):
    """Return copies of the template cells with each wildcard substituted.

    Args:
        templates: Iterable of cell dicts, each with "cell_type" and "source"
            keys.
        wildcards: Sequence of placeholder substrings to search for in each
            cell's source.
        replacements: Sequence of strings paired by position with
            ``wildcards``.

    Returns:
        A new list of dicts with "cell_type" and "source" keys; the input
        cells are not modified.

    Raises:
        ValueError: If ``wildcards`` and ``replacements`` differ in length.
    """
    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )

    def _render(cell):
        # Substitutions run in the given order, so a later wildcard can also
        # match text that an earlier replacement inserted.
        text = cell["source"]
        for placeholder, value in zip(wildcards, replacements):
            text = text.replace(placeholder, value)
        return {"cell_type": cell["cell_type"], "source": text}

    return [_render(cell) for cell in templates]
# Starter cells for the RAG notebook: a title heading followed by a single
# empty code cell.
rag_cells = [
    {
        "cell_type": "markdown",
        "source": "# Retrieval-Augmented Generation (RAG) System Notebook",
    },
    {
        "cell_type": "code",
        "source": "",
    },
]
# Starter cells for the embeddings notebook: a title heading followed by a
# single empty code cell.
# NOTE(review): the variable name looks like a misspelling of "embeddings";
# it is left unchanged because other modules may reference it.
embeggins_cells = [
    {
        "cell_type": "markdown",
        "source": "# Embeddings Generation Notebook",
    },
    {
        "cell_type": "code",
        "source": "",
    },
]
# Template cells for the Exploratory Data Analysis (EDA) notebook.
#
# The sources contain two placeholders, "{dataset_name}" and "{first_code}",
# which are presumably filled in via replace_wildcards() before the notebook
# is written (verify against the caller).
#
# Cell sources are kept at column 0 because they become the literal text of
# notebook cells; only loop bodies are indented, so that the generated code
# cells are valid Python.
eda_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# 1. Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        "cell_type": "code",
        "source": """
# 2. Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Understanding the Dataset",
    },
    {
        "cell_type": "code",
        "source": """
# First rows of the dataset and info
print(df.head())
print(df.info())
print(df.describe())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate descriptive statistics
print(df.describe())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Unique values in categorical columns
df.select_dtypes(include=['object']).nunique()
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Data Visualization",
    },
    {
        "cell_type": "code",
        "source": """
# Correlation matrix for numerical columns
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Distribution plots for numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Count plots for categorical columns
for column in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=column, data=df)
    plt.title(f'Count Plot of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Box plots for detecting outliers in numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(df[column])
    plt.title(f'Box Plot of {column}')
    plt.xlabel(column)
    plt.show()
""",
    },
]
def generate_embedding_system_prompt():
    """Return the system prompt for generating an embeddings notebook.

    The prompt text was previously the function's only statement (its
    docstring), so calling the function returned ``None``; it is now returned
    explicitly.

    Returns:
        str: The instruction text for the model, ending with a newline.
    """
    # Continuation lines stay at column 0 so that no source indentation leaks
    # into the prompt text.
    return """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
The notebook should include:
1. Install necessary libraries with !pip install.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column to generate embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
Ensure the notebook is well-organized with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""
def generate_rag_system_prompt():
    """Return the system prompt for generating a RAG notebook.

    The prompt text was previously the function's only statement (its
    docstring), so calling the function returned ``None``; it is now returned
    explicitly.

    Returns:
        str: The instruction text for the model, ending with a newline.
    """
    # Continuation lines stay at column 0 so that no source indentation leaks
    # into the prompt text.
    return """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
The dataset is provided as a pandas DataFrame.
Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
The RAG notebook should include:
1. Install necessary libraries.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column for generating embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
13. Send the prompt to the pipeline and display the answer.
Ensure the notebook is well-organized with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide the dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""