def replace_wildcards(
    templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
):
    """Fill wildcard placeholders in notebook cell templates.

    Cells tagged with "type": "numeric" or "type": "categoric" are dropped
    when the dataset has no columns of that kind.
    """
    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )
    new_templates = []
    for tmp in templates:
        # Skip type-specific cells when the dataset lacks matching columns
        if tmp.get("type") == "numeric" and not has_numeric_columns:
            continue
        if tmp.get("type") == "categoric" and not has_categoric_columns:
            continue
        tmp_text = tmp["source"].strip()
        for wildcard, replacement in zip(wildcards, replacements):
            tmp_text = tmp_text.replace(wildcard, replacement)
        new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
    return new_templates
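
# Minimal usage sketch for replace_wildcards. The helper below is hypothetical
# (added for illustration, not part of the original module), and the template
# text and wildcard values are made up: it just shows that wildcards are
# substituted verbatim via str.replace.
def _demo_replace_wildcards():
    demo_templates = [
        {"cell_type": "markdown", "source": "# Notebook for {dataset_name}"},
    ]
    # Expected result:
    # [{'cell_type': 'markdown', 'source': '# Notebook for my-dataset'}]
    return replace_wildcards(
        demo_templates,
        wildcards=["{dataset_name}"],
        replacements=["my-dataset"],
        has_numeric_columns=True,
        has_categoric_columns=True,
    )
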
embeddings_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Embeddings Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas sentence-transformers faiss-cpu
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Loading embedding model and creating FAISS index",
    },
    {
        "cell_type": "code",
        "source": """
# Remove duplicate entries based on the specified column
df = df.drop_duplicates(subset=column_to_generate_embeddings)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Convert the column data to a list of text entries
text_list = df[column_to_generate_embeddings].tolist()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the embedding model you want to use
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
""",
    },
    {
        "cell_type": "code",
        "source": """
# Encode the text list into embeddings
vectors = model.encode(text_list)
vector_dimension = vectors.shape[1]
# Initialize the FAISS index with the embedding dimension (384 for this model)
index = faiss.IndexFlatL2(vector_dimension)
# Add the embeddings to the FAISS index
index.add(vectors)
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Perform a text search",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the text you want to search for in the list
text_to_search = text_list[0]
print(f"Text to search: {text_to_search}")
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate the embedding for the search query
query_embedding = model.encode([text_to_search])
""",
    },
    {
        "cell_type": "code",
        "source": """
# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
D, I = index.search(query_embedding, k=10)
# Print the similar documents found
print(f"Similar documents: {[text_list[i] for i in I[0]]}")
""",
    },
]
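
# Hedged sketch of turning the cell dicts above into a real Jupyter notebook.
# Assumptions: the nbformat package is installed (it is not used elsewhere in
# this module), and _write_notebook is a hypothetical helper added only for
# illustration. Each dict already carries the two fields a Jupyter cell needs
# ("cell_type" and "source"), so the mapping is direct.
def _write_notebook(cells, path):
    import nbformat
    from nbformat.v4 import new_code_cell, new_markdown_cell, new_notebook

    nb = new_notebook()
    for cell in cells:
        if cell["cell_type"] == "markdown":
            nb.cells.append(new_markdown_cell(cell["source"]))
        else:
            nb.cells.append(new_code_cell(cell["source"]))
    # nbformat.write accepts a file path and serializes the notebook as JSON
    nbformat.write(nb, path)
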
eda_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Understanding the Dataset",
    },
    {
        "cell_type": "code",
        "source": """
# First rows of the dataset and info
print(df.head())
print(df.info())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate descriptive statistics
print(df.describe())
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Unique values in categorical columns
df.select_dtypes(include=['object']).nunique()
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Data Visualization",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Correlation matrix for numerical columns
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Distribution plots for numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Count plots for categorical columns
for column in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=column, data=df)
    plt.title(f'Count Plot of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Box plots for detecting outliers in numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(df[column])
    plt.title(f'Box Plot of {column}')
    plt.xlabel(column)
    plt.show()
""",
    },
]
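
# Illustrative sketch of how the "type" tags in eda_cells interact with the
# has_numeric_columns / has_categoric_columns flags of replace_wildcards.
# _demo_filter_eda_cells is a hypothetical helper and the replacement values
# are made up; for a dataset with no numeric columns, the correlation-matrix,
# histogram, and box-plot cells tagged "type": "numeric" are all dropped.
def _demo_filter_eda_cells():
    return replace_wildcards(
        eda_cells,
        wildcards=["{dataset_name}", "{first_code}"],
        replacements=["my-dataset", "df = pd.read_csv('data.csv')"],
        has_numeric_columns=False,
        has_categoric_columns=True,
    )
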
rag_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas sentence-transformers faiss-cpu transformers torch huggingface_hub
""",
    },
    {
        "cell_type": "code",
        "source": """
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import InferenceClient
import pandas as pd
import faiss
import torch
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Loading embedding model and creating FAISS index",
    },
    {
        "cell_type": "code",
        "source": """
# Remove duplicate entries based on the specified column
df = df.drop_duplicates(subset=column_to_generate_embeddings)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Convert the column data to a list of text entries
text_list = df[column_to_generate_embeddings].tolist()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the embedding model you want to use
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
""",
    },
    {
        "cell_type": "code",
        "source": """
# Encode the text list into embeddings
vectors = model.encode(text_list)
vector_dimension = vectors.shape[1]
# Initialize the FAISS index with the embedding dimension (384 for this model)
index = faiss.IndexFlatL2(vector_dimension)
# Add the embeddings to the FAISS index
index.add(vectors)
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Perform a text search",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the text you want to search for in the list
query = "How to prepare a cake?"
# Generate the embedding for the search query
query_embedding = model.encode([query])
""",
    },
    {
        "cell_type": "code",
        "source": """
# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
D, I = index.search(query_embedding, k=10)
# Print the similar documents found
print(f"Similar documents: {[text_list[i] for i in I[0]]}")
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 4. Load pipeline and perform inference locally",
    },
    {
        "cell_type": "code",
        "source": """
# Adjust model name as needed
checkpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query
selected_elements = [text_list[i] for i in I[0].tolist()]
context = ','.join(selected_elements)
messages = [
    {
        "role": "system",
        "content": f"You are an intelligent assistant tasked with providing accurate and concise answers based on the following context. Use the information retrieved to construct your response. Context: {context}",
    },
    {"role": "user", "content": query},
]
""",
    },
    {
        "cell_type": "code",
        "source": """
# Send the prompt to the pipeline and show the answer
output = generator(messages)
print("Generated result:")
print(output[0]['generated_text'][-1]['content'])  # Print the assistant's response content
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 5. Alternatively, call the inference client",
    },
    {
        "cell_type": "code",
        "source": """
# Adjust model name as needed
checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
# Set your Hugging Face API token here
token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
inference_client = InferenceClient(checkpoint, token=token)
output = inference_client.chat_completion(messages=messages, stream=False)
print("Generated result:")
print(output.choices[0].message.content)
""",
    },
]
def generate_rag_system_prompt():
    """Return the system prompt used to ask an LLM to generate a RAG notebook."""
    return """
1. Install necessary libraries.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column for generating embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
13. Send the prompt to the pipeline and display the answer.
Ensure the notebook is well-organized with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide the dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""