def replace_wildcards(templates, wildcards, replacements):
    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )
    new_templates = []
    for tmp in templates:
        tmp_text = tmp["source"]
        for wildcard, replacement in zip(wildcards, replacements):
            tmp_text = tmp_text.replace(wildcard, replacement)
        new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
    return new_templates
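
# A minimal, illustrative example of what replace_wildcards does (the template
# and wildcard token below are made up for demonstration):
#
#   >>> replace_wildcards(
#   ...     [{"cell_type": "markdown", "source": "# EDA for {dataset_name}"}],
#   ...     ["{dataset_name}"],
#   ...     ["imdb"],
#   ... )
#   [{'cell_type': 'markdown', 'source': '# EDA for imdb'}]
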
rag_cells = [
    {
        "cell_type": "markdown",
        "source": "# Retrieval-Augmented Generation (RAG) System Notebook",
    },
    {"cell_type": "code", "source": ""},
]

embeddings_cells = [
    {
        "cell_type": "markdown",
        "source": "# Embeddings Generation Notebook",
    },
    {"cell_type": "code", "source": ""},
]

eda_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Set up necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# 1. Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        "cell_type": "code",
        "source": """
# 2. Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Understanding the Dataset",
    },
    {
        "cell_type": "code",
        "source": """
# First rows of the dataset and overall info
print(df.head())
df.info()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate descriptive statistics
print(df.describe())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Unique values in categorical columns
df.select_dtypes(include=['object']).nunique()
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Data Visualization",
    },
    {
        "cell_type": "code",
        "source": """
# Correlation matrix for numerical columns
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Distribution plots for numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Count plots for categorical columns
for column in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=column, data=df)
    plt.title(f'Count Plot of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Box plots for detecting outliers in numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(df[column])
    plt.title(f'Box Plot of {column}')
    plt.xlabel(column)
    plt.show()
""",
    },
]
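
# Illustrative wiring of the EDA templates through replace_wildcards. The
# wildcard tokens come from the eda_cells sources above; the dataset name and
# loading snippet are hypothetical placeholders, not values from this project:
#
#   filled_eda_cells = replace_wildcards(
#       eda_cells,
#       ["{dataset_name}", "{first_code}"],
#       ["my-dataset", "df = pd.read_csv('my-dataset.csv')"],
#   )
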
def generate_embedding_system_prompt():
    return """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
The notebook should include:
1. Install necessary libraries with !pip install.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column for generating embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
Ensure the notebook is well-organized, with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide the dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""
def generate_rag_system_prompt():
    return """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
The dataset is provided as a pandas DataFrame.
Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
The RAG notebook should include:
1. Install necessary libraries.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column for generating embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
12. Create a prompt with two parts: a 'system' part with instructions based on a 'context' built from the retrieved documents, and a 'user' part for the query.
13. Send the prompt to the pipeline and display the answer.
Ensure the notebook is well-organized, with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide the dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""