def replace_wildcards(
    templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
):
    """Fill wildcard placeholders in notebook cell templates.

    Cells tagged with "type": "numeric" or "type": "categoric" are dropped
    when the dataset has no columns of that kind.
    """
    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )
    new_templates = []
    for tmp in templates:
        # Skip type-specific cells when the dataset lacks matching columns
        if tmp.get("type") == "numeric" and not has_numeric_columns:
            continue
        if tmp.get("type") == "categoric" and not has_categoric_columns:
            continue
        tmp_text = tmp["source"].strip()
        for wildcard, replacement in zip(wildcards, replacements):
            tmp_text = tmp_text.replace(wildcard, replacement)
        new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
    return new_templates
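
# Minimal usage sketch for replace_wildcards. The helper below is hypothetical
# (added for illustration, not part of the original module), and the template
# text and wildcard values are made up: it just shows that wildcards are
# substituted verbatim via str.replace.
def _demo_replace_wildcards():
    demo_templates = [
        {"cell_type": "markdown", "source": "# Notebook for {dataset_name}"},
    ]
    # Expected result:
    # [{'cell_type': 'markdown', 'source': '# Notebook for my-dataset'}]
    return replace_wildcards(
        demo_templates,
        wildcards=["{dataset_name}"],
        replacements=["my-dataset"],
        has_numeric_columns=True,
        has_categoric_columns=True,
    )
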
embeddings_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Embeddings Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas sentence-transformers faiss-cpu
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Loading embedding model and creating FAISS index",
    },
    {
        "cell_type": "code",
        "source": """
# Remove duplicate entries based on the specified column
df = df.drop_duplicates(subset=column_to_generate_embeddings)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Convert the column data to a list of text entries
text_list = df[column_to_generate_embeddings].tolist()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the embedding model you want to use
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
""",
    },
    {
        "cell_type": "code",
        "source": """
# Encode the text list into embeddings
vectors = model.encode(text_list)
vector_dimension = vectors.shape[1]
# Initialize the FAISS index with the embedding dimension (384 for this model)
index = faiss.IndexFlatL2(vector_dimension)
# Add the embeddings to the FAISS index
index.add(vectors)
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Perform a text search",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the text you want to search for in the list
text_to_search = text_list[0]
print(f"Text to search: {text_to_search}")
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate the embedding for the search query
query_embedding = model.encode([text_to_search])
""",
    },
    {
        "cell_type": "code",
        "source": """
# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
D, I = index.search(query_embedding, k=10)
# Print the similar documents found
print(f"Similar documents: {[text_list[i] for i in I[0]]}")
""",
    },
]
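
# Hedged sketch of turning the cell dicts above into a real Jupyter notebook.
# Assumptions: the nbformat package is installed (it is not used elsewhere in
# this module), and _write_notebook is a hypothetical helper added only for
# illustration. Each dict already carries the two fields a Jupyter cell needs
# ("cell_type" and "source"), so the mapping is direct.
def _write_notebook(cells, path):
    import nbformat
    from nbformat.v4 import new_code_cell, new_markdown_cell, new_notebook

    nb = new_notebook()
    for cell in cells:
        if cell["cell_type"] == "markdown":
            nb.cells.append(new_markdown_cell(cell["source"]))
        else:
            nb.cells.append(new_code_cell(cell["source"]))
    # nbformat.write accepts a file path and serializes the notebook as JSON
    nbformat.write(nb, path)
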
eda_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Understanding the Dataset",
    },
    {
        "cell_type": "code",
        "source": """
# First rows of the dataset and info
print(df.head())
print(df.info())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate descriptive statistics
print(df.describe())
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Unique values in categorical columns
df.select_dtypes(include=['object']).nunique()
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Data Visualization",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Correlation matrix for numerical columns
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Distribution plots for numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Count plots for categorical columns
for column in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=column, data=df)
    plt.title(f'Count Plot of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Box plots for detecting outliers in numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(df[column])
    plt.title(f'Box Plot of {column}')
    plt.xlabel(column)
    plt.show()
""",
    },
]
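
# Illustrative sketch of how the "type" tags in eda_cells interact with the
# has_numeric_columns / has_categoric_columns flags of replace_wildcards.
# _demo_filter_eda_cells is a hypothetical helper and the replacement values
# are made up; for a dataset with no numeric columns, the correlation-matrix,
# histogram, and box-plot cells tagged "type": "numeric" are all dropped.
def _demo_filter_eda_cells():
    return replace_wildcards(
        eda_cells,
        wildcards=["{dataset_name}", "{first_code}"],
        replacements=["my-dataset", "df = pd.read_csv('data.csv')"],
        has_numeric_columns=False,
        has_categoric_columns=True,
    )
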
rag_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas sentence-transformers faiss-cpu transformers torch huggingface_hub
""",
    },
    {
        "cell_type": "code",
        "source": """
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import InferenceClient
import pandas as pd
import faiss
import torch
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Loading embedding model and creating FAISS index",
    },
    {
        "cell_type": "code",
        "source": """
# Remove duplicate entries based on the specified column
df = df.drop_duplicates(subset=column_to_generate_embeddings)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Convert the column data to a list of text entries
text_list = df[column_to_generate_embeddings].tolist()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the embedding model you want to use
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
""",
    },
    {
        "cell_type": "code",
        "source": """
# Encode the text list into embeddings
vectors = model.encode(text_list)
vector_dimension = vectors.shape[1]
# Initialize the FAISS index with the embedding dimension (384 for this model)
index = faiss.IndexFlatL2(vector_dimension)
# Add the embeddings to the FAISS index
index.add(vectors)
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Perform a text search",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the text you want to search for in the list
query = "How to prepare a cake?"
# Generate the embedding for the search query
query_embedding = model.encode([query])
""",
    },
    {
        "cell_type": "code",
        "source": """
# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
D, I = index.search(query_embedding, k=10)
# Print the similar documents found
print(f"Similar documents: {[text_list[i] for i in I[0]]}")
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 4. Load pipeline and perform inference locally",
    },
    {
        "cell_type": "code",
        "source": """
# Adjust model name as needed
checkpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query
selected_elements = [text_list[i] for i in I[0].tolist()]
context = ','.join(selected_elements)
messages = [
    {
        "role": "system",
        "content": f"You are an intelligent assistant tasked with providing accurate and concise answers based on the following context. Use the information retrieved to construct your response. Context: {context}",
    },
    {"role": "user", "content": query},
]
""",
    },
    {
        "cell_type": "code",
        "source": """
# Send the prompt to the pipeline and show the answer
output = generator(messages)
print("Generated result:")
print(output[0]['generated_text'][-1]['content'])  # Print the assistant's response content
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 5. Alternatively, call the inference client",
    },
    {
        "cell_type": "code",
        "source": """
# Adjust model name as needed
checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
# Set your Hugging Face API token here
token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
inference_client = InferenceClient(checkpoint, token=token)
output = inference_client.chat_completion(messages=messages, stream=False)
print("Generated result:")
print(output.choices[0].message.content)
""",
    },
]
def generate_rag_system_prompt():
    """Return the system prompt used to ask an LLM to generate a RAG notebook."""
    return """
1. Install necessary libraries.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column for generating embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
13. Send the prompt to the pipeline and display the answer.
Ensure the notebook is well-organized with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide the dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""