# Notebook cell templates and LLM system prompts for generated dataset notebooks.
def replace_wildcards(
    templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
):
    """Fill wildcard placeholders in a list of notebook cell templates.

    Args:
        templates: Iterable of cell dicts. Each has "cell_type" and "source"
            keys and may carry an optional "type" key ("numeric" or
            "categoric").
        wildcards: Placeholder substrings to look for in each "source".
        replacements: Replacement strings, positionally paired with
            ``wildcards``.
        has_numeric_columns: When falsy, cells tagged "numeric" are dropped.
        has_categoric_columns: When falsy, cells tagged "categoric" are
            dropped.

    Returns:
        A new list of ``{"cell_type": ..., "source": ...}`` dicts; the
        optional "type" tag is not copied to the output.

    Raises:
        ValueError: If ``wildcards`` and ``replacements`` differ in length.
    """
    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )

    new_templates = []
    for tmp in templates:
        # Cells without a "type" tag are always kept.
        column_kind = tmp.get("type")
        if column_kind == "numeric" and not has_numeric_columns:
            continue
        if column_kind == "categoric" and not has_categoric_columns:
            continue

        tmp_text = tmp["source"]
        # Replacements are applied in order, so a later wildcard can match
        # text introduced by an earlier replacement.
        for wildcard, replacement in zip(wildcards, replacements):
            tmp_text = tmp_text.replace(wildcard, replacement)
        new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
    return new_templates
# Cell templates for the RAG notebook: a title cell followed by an empty
# code cell.
rag_cells = [
    {
        "cell_type": "markdown",
        "source": "# Retrieval-Augmented Generation (RAG) System Notebook",
    },
    {"cell_type": "code", "source": ""},
]
# Cell templates for the embeddings notebook. The sources contain the
# placeholders {dataset_name}, {first_code} and {longest_col}.
# NOTE(review): the name is misspelled ("embeggins"); it is kept as-is because
# code outside this view may import it under this name.
embeggins_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Embeddings Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas sentence-transformers faiss-cpu
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Loading embedding model and creating FAISS index",
    },
    {
        "cell_type": "code",
        "source": """
# Remove duplicate entries based on the specified column
df = df.drop_duplicates(subset=column_to_generate_embeddings)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Convert the column data to a list of text entries
text_list = df[column_to_generate_embeddings].tolist()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the embedding model you want to use
model = SentenceTransformer('distiluse-base-multilingual-cased')
""",
    },
    {
        "cell_type": "code",
        # The index dimension is read from the encoded vectors, so the
        # generated comment does not hard-code a model-specific size.
        "source": """
vectors = model.encode(text_list)
vector_dimension = vectors.shape[1]
# Initialize the FAISS index with the dimension of the generated embeddings
index = faiss.IndexFlatL2(vector_dimension)
# Encode the text list into embeddings and add them to the FAISS index
index.add(vectors)
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Perform a text search",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the text you want to search for in the list
text_to_search = text_list[0]
print(f"Text to search: {text_to_search}")
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate the embedding for the search query
query_embedding = model.encode([text_to_search])
""",
    },
    {
        "cell_type": "code",
        "source": """
# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
D, I = index.search(query_embedding, k=10)
# Print the similar documents found
print(f"Similar documents: {[text_list[i] for i in I[0]]}")
""",
    },
]
# Cell templates for the exploratory data analysis notebook. The sources
# contain the placeholders {dataset_name} and {first_code}. Cells carrying a
# "type" tag ("numeric" / "categoric") apply only to datasets that have
# columns of that kind.
eda_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Understanding the Dataset",
    },
    {
        "cell_type": "code",
        "source": """
# First rows of the dataset and info
print(df.head())
print(df.info())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate descriptive statistics
print(df.describe())
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Unique values in categorical columns
df.select_dtypes(include=['object']).nunique()
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Data Visualization",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Correlation matrix for numerical columns
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Distribution plots for numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Count plots for categorical columns
for column in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=column, data=df)
    plt.title(f'Count Plot of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Box plots for detecting outliers in numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(df[column])
    plt.title(f'Box Plot of {column}')
    plt.xlabel(column)
    plt.show()
""",
    },
]
def generate_embedding_system_prompt():
    """Return the system prompt that asks for an embeddings notebook.

    Returns:
        The prompt text as a string. Its lines start at column 0 so that the
        Markdown headings inside it are not indented.
    """
    # The prompt was previously a bare string statement, which made it the
    # docstring and left the function returning None.
    return """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
The notebook should include:
1. Install necessary libraries with !pip install.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column to generate embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
Ensure the notebook is well-organized with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""
def generate_rag_system_prompt():
    """Return the system prompt that asks for a RAG demonstration notebook.

    Returns:
        The prompt text as a string. Its lines start at column 0 so that the
        Markdown headings inside it are not indented.
    """
    # The prompt was previously a bare string statement, which made it the
    # docstring and left the function returning None.
    return """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
The dataset is provided as a pandas DataFrame.
Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
The RAG notebook should include:
1. Install necessary libraries.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column for generating embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
13. Send the prompt to the pipeline and display the answer.
Ensure the notebook is well-organized with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide the dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""