diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..ab764002ae74f4991e3a9d991f1030ed9505c1f9 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+docs/images/comaprision.png filter=lfs diff=lfs merge=lfs -text
+docs/images/iterative_flowchart-1.png filter=lfs diff=lfs merge=lfs -text
+docs/images/overview.png filter=lfs diff=lfs merge=lfs -text
+docs/images/ppc_1.png filter=lfs diff=lfs merge=lfs -text
+docs/images/ppc.png filter=lfs diff=lfs merge=lfs -text
+images/iterative_flowchart-1.png filter=lfs diff=lfs merge=lfs -text
+images/overview.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..8a30d258ed9d0288d3e2889b3d0e2174b2028225
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,398 @@
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+##
+## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
+
+# User-specific files
+*.rsuser
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Mono auto generated files
+mono_crash.*
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+[Ww][Ii][Nn]32/
+[Aa][Rr][Mm]/
+[Aa][Rr][Mm]64/
+bld/
+[Bb]in/
+[Oo]bj/
+[Ll]og/
+[Ll]ogs/
+
+# Visual Studio 2015/2017 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# Visual Studio 2017 auto generated files
+Generated\ Files/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUnit
+*.VisualState.xml
+TestResult.xml
+nunit-*.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# Benchmark Results
+BenchmarkDotNet.Artifacts/
+
+# .NET Core
+project.lock.json
+project.fragment.lock.json
+artifacts/
+
+# ASP.NET Scaffolding
+ScaffoldingReadMe.txt
+
+# StyleCop
+StyleCopReport.xml
+
+# Files built by Visual Studio
+*_i.c
+*_p.c
+*_h.h
+*.ilk
+*.meta
+*.obj
+*.iobj
+*.pch
+*.pdb
+*.ipdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*_wpftmp.csproj
+*.log
+*.tlog
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+*.VC.VC.opendb
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+*.sap
+
+# Visual Studio Trace Files
+*.e2e
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# AxoCover is a Code Coverage Tool
+.axoCover/*
+!.axoCover/settings.json
+
+# Coverlet is a free, cross platform Code Coverage Tool
+coverage*.json
+coverage*.xml
+coverage*.info
+
+# Visual Studio code coverage results
+*.coverage
+*.coveragexml
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# Note: Comment the next line if you want to checkin your web deploy settings,
+# but database connection strings (with potential passwords) will be unencrypted
+*.pubxml
+*.publishproj
+
+# Microsoft Azure Web App publish settings. Comment the next line if you want to
+# checkin your Azure Web App publish settings, but sensitive information contained
+# in these scripts will be unencrypted
+PublishScripts/
+
+# NuGet Packages
+*.nupkg
+# NuGet Symbol Packages
+*.snupkg
+# The packages folder can be ignored because of Package Restore
+**/[Pp]ackages/*
+# except build/, which is used as an MSBuild target.
+!**/[Pp]ackages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/[Pp]ackages/repositories.config
+# NuGet v3's project.json files produces more ignorable files
+*.nuget.props
+*.nuget.targets
+
+# Microsoft Azure Build Output
+csx/
+*.build.csdef
+
+# Microsoft Azure Emulator
+ecf/
+rcf/
+
+# Windows Store app package directories and files
+AppPackages/
+BundleArtifacts/
+Package.StoreAssociation.xml
+_pkginfo.txt
+*.appx
+*.appxbundle
+*.appxupload
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!?*.[Cc]ache/
+
+# Others
+ClientBin/
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.jfm
+*.pfx
+*.publishsettings
+orleans.codegen.cs
+
+# Including strong name files can present a security risk
+# (https://github.com/github/gitignore/pull/2483#issue-259490424)
+#*.snk
+
+# Since there are multiple workflows, uncomment next line to ignore bower_components
+# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+#bower_components/
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+ServiceFabricBackup/
+*.rptproj.bak
+
+# SQL Server files
+*.mdf
+*.ldf
+*.ndf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+*.rptproj.rsuser
+*- [Bb]ackup.rdl
+*- [Bb]ackup ([0-9]).rdl
+*- [Bb]ackup ([0-9][0-9]).rdl
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# GhostDoc plugin setting file
+*.GhostDoc.xml
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+node_modules/
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+*.vbw
+
+# Visual Studio 6 auto-generated project file (contains which files were open etc.)
+*.vbp
+
+# Visual Studio 6 workspace and project file (working project files containing files to include in project)
+*.dsw
+*.dsp
+
+# Visual Studio 6 technical files
+*.ncb
+*.aps
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+# Paket dependency manager
+.paket/paket.exe
+paket-files/
+
+# FAKE - F# Make
+.fake/
+
+# CodeRush personal settings
+.cr/personal
+
+# Python Tools for Visual Studio (PTVS)
+__pycache__/
+*.pyc
+
+# Cake - Uncomment if you are using it
+# tools/**
+# !tools/packages.config
+
+# Tabs Studio
+*.tss
+
+# Telerik's JustMock configuration file
+*.jmconfig
+
+# BizTalk build output
+*.btp.cs
+*.btm.cs
+*.odx.cs
+*.xsd.cs
+
+# OpenCover UI analysis results
+OpenCover/
+
+# Azure Stream Analytics local run output
+ASALocalRun/
+
+# MSBuild Binary and Structured Log
+*.binlog
+
+# NVidia Nsight GPU debugger configuration file
+*.nvuser
+
+# MFractors (Xamarin productivity tool) working folder
+.mfractor/
+
+# Local History for Visual Studio
+.localhistory/
+
+# Visual Studio History (VSHistory) files
+.vshistory/
+
+# BeatPulse healthcheck temp database
+healthchecksdb
+
+# Backup folder for Package Reference Convert tool in Visual Studio 2017
+MigrationBackup/
+
+# Ionide (cross platform F# VS Code tools) working folder
+.ionide/
+
+# Fody - auto-generated XML schema
+FodyWeavers.xsd
+
+# VS Code files for those working on multiple tools
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+
+# Local History for Visual Studio Code
+.history/
+
+# Windows Installer files from build outputs
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# JetBrains Rider
+*.sln.iml
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..6257f2e76f1f9c1d9a5bfae40b58cf218b41ba80
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,9 @@
+# Microsoft Open Source Code of Conduct
+
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+
+Resources:
+
+- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
+- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
+- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..79656060de00aa4659ad2c276d5be8830664d544
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) Microsoft Corporation.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4a618edb12700164383629e20063bfb1b651a58e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,16 @@
+.PHONY: install style test
+
+PYTHON := python
+CHECK_DIRS := promptwizard tests
+
+install:
+ @${PYTHON} setup.py bdist_wheel
+ @${PYTHON} -m pip install dist/sdtools*
+
+style:
+ black $(CHECK_DIRS)
+ isort -rc $(CHECK_DIRS)
+ flake8 $(CHECK_DIRS)
+
+test:
+ @${PYTHON} -m pytest -n auto --dist=loadfile -s -v ./tests/
\ No newline at end of file
diff --git a/README.md b/README.md
index f430dc2cc1b9f33245905a70d764207584a14eff..5d62256970b6349cd4633c2c36b6878f2aec18e5 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,265 @@
----
-license: bsd-2-clause
----
+
+# PromptWizard 🧙
+
+
+
+
+PromptWizard is a discrete prompt optimization framework that employs a self-evolving mechanism where the LLM generates, critiques, and refines its own prompts and examples, continuously improving through iterative feedback and synthesis. This self-adaptive approach ensures holistic optimization by evolving both the instructions and in-context learning examples for better task performance.
+
+The three key components of PromptWizard are the following:
+
+- Feedback-driven refinement: the LLM generates, critiques, and refines its own prompts and examples, continuously improving through iterative feedback and synthesis
+- Critique and synthesize diverse examples: generates synthetic examples that are robust, diverse and task-aware, and optimizes both the prompt and the examples in tandem
+- Self-generated Chain-of-Thought (CoT) steps with a combination of positive, negative and synthetic examples
+
+
+Stage 1: Iterative optimization of instructions
+
+
+
+
+
+Stage 2: Sequential optimization of instruction and examples
+
+
+
+
+## Installation ⬇️
+
+Follow these steps to set up the development environment and install the package:
+
+1) Clone the repository
+ ```
+ git clone https://github.com/microsoft/PromptWizard
+ cd PromptWizard
+ ```
+2) Create and activate a virtual environment
+
+ On Windows
+ ```
+ python -m venv venv
+ venv\Scripts\activate
+ ```
+ On macOS/Linux:
+ ```
+ python -m venv venv
+ source venv/bin/activate
+ ```
+3) Install the package in development mode:
+ ```
+ pip install -e .
+ ```
+
+
+## Quickstart 🏃
+
+There are three main ways to use PromptWizard:
+- Scenario 1 : Optimizing prompts without examples
+- Scenario 2 : Generating synthetic examples and using them to optimize prompts
+- Scenario 3 : Optimizing prompts with training data
+
+**NOTE** : Refer to this [notebook](demos/scenarios/dataset_scenarios_demo.ipynb) for a detailed understanding of the usage for each of the scenarios. **This serves as a starting point to understand the usage of PromptWizard**
+
+#### High level overview of using PromptWizard
+- Decide your scenario
+- Set the configuration and environment variables for API calling
+    - Use ```promptopt_config.yaml``` to set configurations. For example, for GSM8k this [file](demos/gsm8k/configs/promptopt_config.yaml) can be used
+    - Use ```.env``` to set environment variables. For GSM8k this [file](demos/gsm8k/.env) can be used
+ ```
+ USE_OPENAI_API_KEY="XXXX"
+ # Replace with True/False based on whether or not to use OPENAI API key
+
+ # If the first variable is set to True then fill the following two
+ OPENAI_API_KEY="XXXX"
+ OPENAI_MODEL_NAME ="XXXX"
+
+ # If the first variable is set to False then fill the following three
+ AZURE_OPENAI_ENDPOINT="XXXXX"
+ # Replace with your Azure OpenAI Endpoint
+
+ OPENAI_API_VERSION="XXXX"
+ # Replace with the version of your API
+
+ AZURE_OPENAI_CHAT_DEPLOYMENT_NAME="XXXXX"
+ # Create a deployment for the model and place the deployment name here.
+ ```
+- Run the code
+ - To run PromptWizard on your custom dataset please jump [here](#run-on-custom-dataset)
+
+#### Running PromptWizard with training data (Scenario 3)
+- We support [GSM8k](https://huggingface.co/datasets/openai/gsm8k), [SVAMP](https://huggingface.co/datasets/ChilleD/SVAMP), [AQUARAT](https://huggingface.co/datasets/deepmind/aqua_rat) and [Instruction_Induction(BBII)](https://github.com/xqlin98/INSTINCT/tree/main/Induction/experiments/data/instruction_induction/raw) datasets
+- Please note that the time taken for prompt optimization depends on the dataset. In our experiments on the above mentioned datasets, it took around 20 - 30 minutes on average.
+
+#### Running on GSM8k (AQUARAT/SVAMP)
+
+- Please note that this code requires access to LLMs via API calling for which we support AZURE endpoints or OPENAI keys
+- Set the AZURE endpoint configurations in [.env](demos/gsm8k/.env)
+- Follow the steps in [demo.ipynb](demos/gsm8k/demo.ipynb) to download the data, run the prompt optimization and carry out inference.
+
+#### Running on BBII
+
+- BBII has many datasets in it, based on the dataset set the configs [here](demos/bbh/configs/promptopt_config.yaml)
+- In the configs, ```task_description```, ```base_instruction``` and ```answer_format``` need to be changed for different datasets in BBII; the rest of the configs remain the same
+- A demo is presented in [demo.ipynb](demos/bbh/demo.ipynb)
+
+
+
+## Run on Custom Datasets 🗃️
+
+### Create Custom Dataset
+- Our code expects the dataset to be in ```.jsonl``` file format
+- Both the train and test set follow the same format
+- Every sample in the ```.jsonl``` should have 2 fields (see the example below) :
+  1) ```question``` : It should contain the complete question that is to be asked to the LLM
+  2) ```answer``` : It should contain the ground truth answer which can be verbose or concise
+
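+For example, a single line of ```train.jsonl``` could look like the following (an illustrative, made-up sample; only these two fields are required):
+```
+{"question": "A shop sells pencils at 3 for $1. How much do 12 pencils cost?", "answer": "12 pencils are 4 groups of 3, so they cost 4 * $1 = $4. The answer is 4"}
+```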
+
+### Run on Custom Dataset
+
+NOTE : Refer to the [demos](demos) folder for example setups for four datasets. The ```.ipynb``` in each of the folders shows how to run PromptWizard on that particular dataset. A similar procedure can be followed for a new dataset. Below is a detailed explanation of each of the components of the ```.ipynb``` and the dataset-specific folder structure
+
+#### Steps to be followed for custom datasets
+
+1) Every new dataset needs to have the following
+ - ```configs``` folder to store files for defining optimization hyperparameters and setup configs
+ - ```data``` folder to store ```train.jsonl``` and ```test.jsonl``` as curated [here](#create-custom-dataset) (this is done in the notebooks)
+    - ```.env``` file for environment variables to be used for API calling
+ - ```.py/.ipynb``` script to run the code
+
+2) Set the hyperparameters like number of mutations, refine steps, in-context examples etc.
+ - Set the following in [promptopt_config.yaml](demos/gsm8k/configs/promptopt_config.yaml) :
+    - ```task_description``` : Description of the task at hand which will be fed into the prompt
+ - For GSM8k a description like the following can be used
+ ```
+ You are a mathematics expert. You will be given a mathematics problem which you need to solve
+ ```
+ - ```base_instruction``` : Base instruction in line with the dataset
+ - A commonly used base instruction could be
+ ```
+ Lets think step by step.
+ ```
+ - ```answer_format``` : Instruction for specifying the answer format
+ - It is crucial to set the ```answer_format``` properly to ensure correct extraction by ```def extract_final_answer()```
+ - Answer format could be :
+ ```
+ At the end, wrap only your final option between and tags
+ ```
+ Then in ```def extract_final_answer()``` we can simply write code to extract string between the tags
+
+ - ```seen_set_size``` : The number of train samples to be used for prompt optimization
+ - In our experiments we set this to be 25. In general any number between 20-50 would work
+ - ```few_shot_count``` : The number of in-context examples needed in the prompt
+ - The value can be set to any positive integer based on the requirement
+      - For generating zero-shot prompts, set the value to a small number (i.e. between 2-5); after the final prompt is generated, the in-context examples can be removed. We suggest keeping some in-context examples because the instructions in the prompt are refined using them during optimization, so setting this to a small number gives better zero-shot instructions in the final prompt
+ - ```generate_reasoning``` : Whether or not to generate reasoning for the in-context examples
+ - In our experiments we found it to improve the prompt overall as it provides a step-by-step approach to reach the final answer. However if there is a constraint on the prompt length or number of prompt tokens, it can be turned off to get smaller sized prompts
+    - ```generate_expert_identity``` and ```generate_intent_keywords``` : Enabling these helped improve the prompt as they make the prompt more relevant to the task
+      - Refer to the ```promptopt_config.yaml``` files in the folders [here](demos) for the descriptions used for AQUARAT, SVAMP and GSM8k. For BBII, refer to [description.py](demos/bbh/description.py), which has the meta instructions for each of the datasets
+ - Following are the global parameters which can be set based on the availability of the training data
+ - ```run_without_train_examples``` is a global hyperparameter which can be used when there are no training samples and in-context examples are not required in the final prompt
+ - ```generate_synthetic_examples``` is a global hyperparameter which can be used when there are no training samples and we want to generate synthetic data for training
+ - ```use_examples``` is a global hyperparameter which can be used to optimize prompts using training data
+3) Create a dataset-specific class which inherits ```class DatasetSpecificProcessing```, similar to ```GSM8k(DatasetSpecificProcessing)``` in [demo.ipynb](demos/gsm8k/demo.ipynb), and define the following functions in it (a minimal sketch is shown after this list)
+   1) ```def extract_answer_from_output()``` : This is a dataset-specific function; given the ```answer``` field from the dataset, it should extract and return a concise form of the answer. Note that depending on the dataset it can also simply return the ```answer``` as is, as in the case of the SVAMP and AQUARAT datasets
+   2) ```def extract_final_answer()``` : This is an LLM-output-specific function; given the verbose answer from the LLM, it should extract and return the concise final answer
+   3) ```def access_answer()``` : This function takes as input the LLM output, then does the following:
+      - Extracts the concise answer using ```def extract_final_answer()``` from the LLM output as defined above
+      - Evaluates the extracted answer against the ground truth and returns
+        - Extracted answer from LLM output
+        - Boolean value indicating if the answer is correct or not
+      - The evaluation done here is dataset-specific. For datasets like GSM8k, SVAMP and AQUARAT, where the final answer is a number, we can directly match the generated number against the ground truth, while for datasets where the answer is a sentence or paragraph it is better to evaluate with an LLM-as-a-judge that compares the generated and ground truth paragraph/sentence. An example is available in ```def access_answer()``` in [this](demos/bbh/demo.ipynb) notebook
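+
+A minimal sketch of such a class is shown below (our illustration, not code from the repo; the answer tags are placeholders and should match whatever delimiters you specified in ```answer_format```):
+```
+from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing
+
+class MyDataset(DatasetSpecificProcessing):
+
+    # The demo notebooks also define dataset_to_jsonl() here to curate the train/test .jsonl files
+
+    def extract_answer_from_output(self, answer: str) -> str:
+        # Dataset-specific: here the ground-truth answer is already concise,
+        # so it is returned unchanged (as done for SVAMP and AQUARAT)
+        return answer
+
+    def extract_final_answer(self, llm_output: str) -> str:
+        # LLM-output-specific: pull out the text between the answer delimiters
+        start, end = "<ANS>", "</ANS>"  # placeholder tags, replace with your own delimiters
+        if start in llm_output and end in llm_output:
+            return llm_output.split(start, 1)[1].split(end, 1)[0].strip()
+        return ""
+
+    def access_answer(self, llm_output: str, gt_answer: str):
+        # Extract the concise answer and compare it against the ground truth
+        predicted_answer = self.extract_final_answer(llm_output)
+        is_correct = bool(predicted_answer) and predicted_answer.lower() == gt_answer.lower()
+        return is_correct, predicted_answer
+```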
+
+
+## How PromptWizard Works 🔍
+- Using the problem description and initial prompt instruction, PW generates variations of the instruction by prompting LLMs to mutate it. Based on performance, the best prompt is selected. PW incorporates a critique component that provides feedback, thus guiding and refining the prompt over multiple iterations (a rough sketch of this loop is shown after this list).
+- PW also optimizes in-context examples. PW selects a diverse set of examples
+from the training data, identifying positive and negative examples based on their performance with
+the modified prompt. Negative examples help inform further prompt refinements.
+- Examples and instructions are sequentially optimized, using the critique to generate synthetic examples that address the current prompt's weaknesses. These examples are integrated to further refine the prompt.
+- PW generates detailed reasoning chains via Chain-of-Thought (CoT), enriching the prompt's capacity for problem-solving.
+- PW aligns prompts with human reasoning by integrating task intent and expert
+personas, enhancing both model performance and interpretability.
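+
+In rough pseudocode, the overall loop looks like the following (our simplified sketch, for illustration only; the helper names below are made up, and the actual implementation lives in the ```promptwizard/glue/promptopt``` package):
+```
+prompt = task_description + base_instruction
+for _ in range(mutate_refine_iterations):
+    candidates = mutate(prompt, thinking_styles)           # style_variation mutated prompts
+    best = select_best(candidates, train_minibatches)      # scored on sampled questions
+    feedback = critique(best, failing_examples)            # LLM feedback on wrong answers
+    prompt = refine(best, feedback)
+examples = select_diverse_examples(train_data, prompt)     # positive and negative examples
+prompt, examples = optimize_sequentially(prompt, examples) # synthesize examples from the critique
+prompt += chain_of_thought(examples) + expert_identity + intent_keywords
+```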
+
+## Configurations ⚙️
+
+Here we define the various hyperparameters used in the prompt optimization process, found in [promptopt_config.yaml](demos/gsm8k/configs/promptopt_config.yaml)
+
+- ```mutate_refine_iterations```: Number of iterations for conducting mutation of task description
+ followed by refinement of instructions
+- ```mutation_rounds```: Number of rounds of mutation to be performed when generating different styles
+- ```refine_task_eg_iterations```: Number of iterations for refining task description and in context examples
+- ```style_variation```: Number of thinking style variations to be used in prompt mutation
+- ```questions_batch_size```: Number of questions to be asked to LLM in a single batch, during training step
+- ```min_correct_count```: Minimum number of question batches that must be answered correctly for a prompt to be considered as performing well
+- ```max_eval_batches```: Maximum number of mini-batches on which we should evaluate the prompt
+- ```top_n```: Number of top best prompts to be considered from scoring stage for the next stage
+- ```seen_set_size```: Number of samples from trainset to be used for training
+- ```few_shot_count```: Number of in-context examples required in final prompt
+
+## Best Practices 💡
+
+Following are some of the best practices we followed during our experiments
+- Regarding the parameters in [promptopt_config.yaml](demos/gsm8k/configs/promptopt_config.yaml)
+ - We found the best performing values for ```mutate_refine_iterations```,```mutation_rounds```,```refine_task_eg_iterations``` to be 3 or 5
+ - Other parameters have been set to their ideal values. ```seen_set_size``` can be increased to 50 and ```few_shot_count``` can be set based on the use case
+- The prompts generated at the end of the training process are usually very detailed; however, user supervision can help tune them further for the task at hand
+- Both configurations, synthetic in-context examples and in-context examples from the train set, can be tried to find the best prompt for the use case.
+
+## Results 📈
+
+
+
+
+PromptWizard consistently outperforms other methods across various
+thresholds, maintaining the highest p(τ) values, indicating that it consistently performs near the best
+possible accuracy across all tasks
+
+
+
+- The figure shows the performance profile curve for the instruction induction
+tasks. The performance profile curve visualizes how frequently
+different approaches' performance is within a given distance of the best performance. In this curve,
+the x-axis (τ) represents the performance ratio relative to the best-performing method, and the y-axis
+(p(τ)) reflects the fraction of tasks where a method's performance is within this ratio. So for a given
+method, the curve tells what percentage of the tasks are within τ distance of the best performance (one way to formalize this is sketched below).
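+
+One standard way to write this down (our notation, not taken from the paper): for a method m evaluated on a set of tasks T, where higher scores are better,
+```
+p_m(τ) = |{ t ∈ T : best_score(t) / score_m(t) ≤ τ }| / |T|
+```
+so p_m(1) is the fraction of tasks on which m itself achieves the best score, and p_m(τ) approaches 1 as τ grows.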
+
+
+## How to contribute: ✋
+This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.
+When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact opencode@microsoft.com with any additional questions or comments.
+
+## Citation 📝
+
+If you make use of our work, please cite our paper:
+
+```
+@misc{agarwal2024promptwizardtaskawarepromptoptimization,
+ title={PromptWizard: Task-Aware Prompt Optimization Framework},
+ author={Eshaan Agarwal and Joykirat Singh and Vivek Dani and Raghav Magazine and Tanuja Ganu and Akshay Nambi},
+ year={2024},
+ eprint={2405.18369},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2405.18369},
+}
+```
+## Responsible AI Considerations
+For guidelines and best practices related to Responsible AI, please refer to our [Responsible AI Guidelines](RESPONSIBLE_AI.md).
+
diff --git a/RESPONSIBLE_AI.md b/RESPONSIBLE_AI.md
new file mode 100644
index 0000000000000000000000000000000000000000..c548939893dae2120fb00267e6c7c54ae37c8406
--- /dev/null
+++ b/RESPONSIBLE_AI.md
@@ -0,0 +1,41 @@
+### PromptWizard: Responsible AI FAQ
+
+- What is PromptWizard?
+
+  PromptWizard is a novel framework for prompt optimization that helps tune a good prompt for a given task and dataset, so that LLMs' output/accuracy can be optimized. PromptWizard is solely designed for research settings, and its testing has only been carried out in such environments. It should not be used in downstream applications without additional analysis and mitigation to address potential harm or bias in the proposed application. Please refer to the paper [PromptWizard: Task-Aware Agent-driven Prompt Optimization Framework (arxiv.org)](https://arxiv.org/abs/2405.18369) for more details.
+
+- What can PromptWizard do?
+
+ PromptWizard framework is an AI-based framework that internally uses LLM to find the optimal prompt for a given task. It takes as input task description, dataset format & few training examples, hyperparameter configurations and outputs an optimized prompt for the given LLM and task intent.
+  Unlike existing approaches, PromptWizard optimizes both prompt instructions and in-context examples, maximizing LLM performance. It iteratively refines prompts by mutating instructions and incorporating negative examples. It further enhances both instructions and examples with the aid of a critique provided by the LLM on a candidate prompt.
+ New synthetic instructions and examples are generated with detailed reasoning steps using LLM.
+
+- What is/are PromptWizard's intended use(s)?
+
+  Please note that PromptWizard is an open-source framework under active development and intended for use for research purposes. It should not be used in any downstream applications without additional detailed evaluation of robustness, safety issues and assessment of any potential harm or bias in the proposed application. For all GenAI applications, prompt design and tuning are tedious, skilful and laborious tasks. PromptWizard's intended use is to design and optimize the prompt along with the few-shot examples for a given task/domain and dataset. This well-crafted prompt enables the LLM to provide more accurate and high quality answers. We have also integrated the Azure AI Content Safety service to avoid/slow down malicious uses.
+
+- How was PromptWizard evaluated? What metrics are used to measure performance?
+
+  PromptWizard framework is generic enough to work on any domain/dataset/task. However, we have evaluated the performance of PromptWizard across 35 tasks on 8 datasets. More details can be found in [PromptWizard: Task-Aware Agent-driven Prompt Optimization Framework (arxiv.org)](https://arxiv.org/abs/2405.18369)
+
+  The open-source datasets used for evaluation include
+ - Medical challenges ([MedQA](https://github.com/jind11/MedQA), [PubMedQA](https://pubmedqa.github.io/))
+ - Commonsense reasoning ([CSQA](https://amritasaha1812.github.io/CSQA/), [SQA](https://www.microsoft.com/en-in/download/details.aspx?id=54253))
+ - Math reasoning problems ([GSM8k](https://huggingface.co/datasets/openai/gsm8k))
+ - Hate speech classification ([Ethos](https://link.springer.com/article/10.1007/s40747-021-00608-2)),
+ - Complex domain-specific tasks ([MMLU](https://huggingface.co/datasets/cais/mmlu) 6 medical tasks, [Big-Bench-Hard-23](https://huggingface.co/datasets/maveriq/bigbenchhard))
+
+  Additionally, the team has also conducted "red team" analysis to evaluate if PromptWizard optimizes harmful intent. With appropriate Azure content moderation deployed in the pipeline on the input to PromptWizard and output from PromptWizard, it didn't optimize prompts for harmful intent. Please refer to the details for Azure content moderation [here](https://learn.microsoft.com/en-us/azure/ai-services/content-moderator/overview).
+
+- What are the limitations of PromptWizard? How can users minimize the impact of PromptWizard's limitations when using the system?
+
+  - The framework has been evaluated primarily on English language tasks, as described in the earlier section. The framework is not yet evaluated for multilingual settings.
+  - The framework generates synthetic examples for few-shot learning based on the task description. Users are required to validate the correctness and diversity of the generated synthetic examples.
+  - PromptWizard utilizes existing LLMs and does not train a new model. Hence, it inherits the capabilities and limitations of its base model, as well as common limitations among other large language models or limitations caused by its training process. We therefore suggest using an appropriate base LLM suitable for your use-cases to work with PromptWizard.
+
+- What operational factors and settings allow for effective and responsible use of PromptWizard?
+
+ - Input considerations: Better performance with PromptWizard can be achieved by specifying the input components like task and intent as clearly and concisely as possible.
+  - Human involvement: PromptWizard optimizes the prompt with the prompt instruction and few-shot examples for the given intent and task. We suggest human oversight to review the optimized prompts before they are executed with LLMs.
+  - LLMs: Users can choose the LLM that is optimized for responsible use. The default LLM is GPT-4 which inherits the existing RAI mechanisms and filters from the LLM provider. Caching is enabled by default to increase reliability and control cost. We encourage developers to review [OpenAI's Usage policies](https://openai.com/policies/usage-policies/) and [Azure OpenAI's Code of Conduct](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/code-of-conduct) when using GPT-4.
+  - Content Safety: We have integrated [Azure AI Content Safety](https://learn.microsoft.com/en-us/azure/ai-services/content-safety/overview) service for content moderation. We suggest deploying PromptWizard with such a content safety system in the pipeline.
\ No newline at end of file
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3c89efc852e22f71eabf5dfbc6ac62493425eb6
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,41 @@
+
+
+## Security
+
+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
+
+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
+
+## Reporting Security Issues
+
+**Please do not report security vulnerabilities through public GitHub issues.**
+
+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
+
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
+
+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
+
+Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+ * Full paths of source file(s) related to the manifestation of the issue
+ * The location of the affected source code (tag/branch/commit or direct URL)
+ * Any special configuration required to reproduce the issue
+ * Step-by-step instructions to reproduce the issue
+ * Proof-of-concept or exploit code (if possible)
+ * Impact of the issue, including how an attacker might exploit the issue
+
+This information will help us triage your report more quickly.
+
+If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
+
+## Preferred Languages
+
+We prefer all communications to be in English.
+
+## Policy
+
+Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
+
+
diff --git a/demos/aquarat/.env b/demos/aquarat/.env
new file mode 100644
index 0000000000000000000000000000000000000000..290a11ba1125d027f0e0f653133f0032c7dd491e
--- /dev/null
+++ b/demos/aquarat/.env
@@ -0,0 +1,8 @@
+USE_OPENAI_API_KEY="False"
+
+OPENAI_API_KEY=""
+OPENAI_MODEL_NAME =""
+
+OPENAI_API_VERSION=""
+AZURE_OPENAI_ENDPOINT=""
+AZURE_OPENAI_DEPLOYMENT_NAME=""
\ No newline at end of file
diff --git a/demos/aquarat/configs/prompt_library.yaml b/demos/aquarat/configs/prompt_library.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..81a2a5f7724defe0974bf3ba3572ca5ca1ae7b36
--- /dev/null
+++ b/demos/aquarat/configs/prompt_library.yaml
@@ -0,0 +1,36 @@
+system_prompts: |
+ You are a helpful assistant that assists research students in understanding research papers.
+system_guidelines: |
+ Guidelines
+ - Your role must always be a helpful assistant that assists students in understanding research papers.
+ - Only answer questions that are directly or indirectly related to the referenced paper(s).
+
+mode:
+ chat:
+ - name: CHAT-FIRST-MESSAGE
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ - name: CHAT-NEXT-MESSAGES
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ generation:
+ - name: FLASH_PROFILE
+ prompt_template: |
+ {user_msg}
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+ llm_request_type: rag-query
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
\ No newline at end of file
diff --git a/demos/aquarat/configs/promptopt_config.yaml b/demos/aquarat/configs/promptopt_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9fcf061cb819f3108b5a95b8f0510389155e239e
--- /dev/null
+++ b/demos/aquarat/configs/promptopt_config.yaml
@@ -0,0 +1,52 @@
+# Specify one or more prompt refinement techniques to be used. If you specify more than one prompt refinement technique,
+# all these techniques would run on the same seed data. The result, iterations needed & cost incurred for each of these
+# techniques would be logged. And the winning technique for each data instance and overall would be logged.
+
+# Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
+# Uncomment techniques that you want to use
+############################ Critique Task Description Start ############################
+prompt_technique_name: "critique_n_refine"
+# unique_model_id of model defined in llm_config.yaml
+unique_model_id: gpt-4o
+# Number of iterations for conducting rounds of mutation of task description
+# followed by refinement of instructions
+mutate_refine_iterations: 3
+# Number of rounds of mutation to be performed when generating different styles
+mutation_rounds: 3
+# Refine instruction post mutation
+refine_instruction: true
+# Number of iterations for refining task description and in context examples for few-shot
+refine_task_eg_iterations: 3
+# Number of variations of prompts to generate in given iteration
+style_variation: 5
+# Number of questions to be asked to LLM in a single batch, during training step
+questions_batch_size: 1
+# Number of question batches that must be answered correctly for a prompt to be considered as performing well
+min_correct_count: 3
+# Max number of mini-batches on which we should evaluate our prompt
+max_eval_batches: 6
+# Number of top best performing prompts to be considered for next iterations
+top_n: 1
+# Description of task. This will be fed to prompt
+task_description: "You are a mathematics expert. You will be given a mathematics problem which you need to solve"
+# Base instruction, in line with your dataset. This will be fed to prompt
+base_instruction: "Lets think step by step."
+# Instruction for specifying answer format
+answer_format: "At the end, wrap only your final option between and tags"
+# Number of samples from dataset, set aside as training data. In every iteration we would be drawing
+# `questions_batch_size` examples from training data with replacement.
+seen_set_size: 25
+# Number of examples to be given for few shots
+few_shot_count: 5
+# Number of synthetic training examples to be generated
+num_train_examples: 20
+# Generate synthetic reasoning
+generate_reasoning: true
+# Generate description of an expert which can solve the task at hand
+generate_expert_identity: true
+# Generate keywords that describe the intent of the task
+generate_intent_keywords: false
+############################ Critique Task Description End ############################
+
+
+
diff --git a/demos/aquarat/configs/setup_config.yaml b/demos/aquarat/configs/setup_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a4847f1572b77fc92a049bfc6b82d74ef78b1193
--- /dev/null
+++ b/demos/aquarat/configs/setup_config.yaml
@@ -0,0 +1,14 @@
+assistant_llm:
+ # put the unique_model_id that you specified in llm_config.yaml
+ prompt_opt: gpt-4o
+dir_info:
+ # Base directory for everything
+ base_dir: logs
+ log_dir_name: glue_logs
+experiment_name: aquarat
+# Many features are different for mode: online/offline. For eg
+# 1) Print of logs happens on console for offline mode
+# 2) LLM Queue gets instantiated only in online mode
+mode: offline
+# Full length description of the experiment. This would be logged.
+description:
diff --git a/demos/aquarat/demo.ipynb b/demos/aquarat/demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..c7a5c9dffee0882b12b24dd2d7a0240388d331e5
--- /dev/null
+++ b/demos/aquarat/demo.ipynb
@@ -0,0 +1,296 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "695a1a9b",
+ "metadata": {},
+ "source": [
+ "#### Set environment variables in [.env](.env) for LLM API calling"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8042a9cc",
+ "metadata": {},
+ "source": [
+ "### Import Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f1fb3d81-16b6-4b8c-a028-880fdce5e14a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.insert(0, \"../../\")\n",
+ "import promptwizard\n",
+ "from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
+ "from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
+ "from promptwizard.glue.common.utils.file import save_jsonlist\n",
+ "from typing import Any\n",
+ "from tqdm import tqdm\n",
+ "import json\n",
+ "import os\n",
+ "from datasets import load_dataset\n",
+ "\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv(override = True)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5bbe055e",
+ "metadata": {},
+ "source": [
+ "### Create a dataset specific class and define the required functions "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "5f325d33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "def extract_between(start, end, text):\n",
+ " \"\"\"\n",
+ " Extracts the substring from 'text' that is between 'start' and 'end' strings.\n",
+ " \n",
+ " Parameters:\n",
+ " - start (str): The starting delimiter string.\n",
+ " - end (str): The ending delimiter string.\n",
+ " - text (str): The text to search within.\n",
+ " \n",
+ " Returns:\n",
+ " - str: The extracted substring between the start and end delimiters.\n",
+ " \"\"\"\n",
+ " start_index = text.find(start)\n",
+ " if start_index == -1:\n",
+ " return '' \n",
+ " \n",
+ " start_index += len(start)\n",
+ " \n",
+ " end_index = text.find(end, start_index)\n",
+ " if end_index == -1:\n",
+ " return '' \n",
+ " return text[start_index:end_index]\n",
+ "\n",
+ "class AQUARAT(DatasetSpecificProcessing):\n",
+ "\n",
+ " def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
+ " def extract_answer_from_output(completion):\n",
+ "\n",
+ " return completion\n",
+ "\n",
+ " examples_set = []\n",
+ "\n",
+ " for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
+ " example = {\n",
+ " DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
+ " DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
+ " DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
+ " }\n",
+ " examples_set.append(example)\n",
+ "\n",
+ " save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
+ "\n",
+ " def extract_final_answer(self, answer: str):\n",
+ " \n",
+ " final_answer = extract_between(text=answer,start=\"\",end=\"\")\n",
+ " return final_answer\n",
+ " \n",
+ " def access_answer(self, llm_output: str, gt_answer: str):\n",
+ "\n",
+ " predicted_answer = self.extract_final_answer(llm_output)\n",
+ " is_correct = False\n",
+ " if predicted_answer and (predicted_answer.lower() == gt_answer.lower()):\n",
+ " is_correct = True\n",
+ "\n",
+ " return is_correct, predicted_answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f384eb57",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "aquarat_processor = AQUARAT()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "if not os.path.exists(\"data\"):\n",
+ " os.mkdir(\"data\")\n",
+ "dataset = load_dataset(\"deepmind/aqua_rat\", \"raw\")\n",
+ "num_samples = 1\n",
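+    "# Append the multiple-choice options to each question; the ground-truth answer is the correct option letter\n",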
+ "for dataset_type in ['train','test']:\n",
+ " data_list = []\n",
+ " for data in dataset[dataset_type]:\n",
+ " options = data['options'][0]\n",
+ " for i in range(1,len(data['options'])):\n",
+ " options = options + \" \"+ data['options'][i]\n",
+ " data_list.append({\"question\": data['question']+\"\\n\"+options, \"answer\": data['correct']})\n",
+ " if num_samples == 100 and dataset_type == 'train': # We sample only 100 train examples and use 25 out them for training randomly\n",
+ " break\n",
+ " num_samples += 1\n",
+ " aquarat_processor.dataset_to_jsonl(\"data/\"+ dataset_type+'.jsonl', dataset=data_list)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "db891c34",
+ "metadata": {},
+ "source": [
+ "### Set paths"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "f43482f1-3e10-4cf7-8ea6-ff42c04067a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_file_name = os.path.join(\"data\", \"train.jsonl\")\n",
+ "test_file_name = os.path.join(\"data\", \"test.jsonl\")\n",
+ "path_to_config = \"configs\"\n",
+ "llm_config_path = os.path.join(path_to_config, \"llm_config.yaml\")\n",
+ "promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
+ "setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "26ba1a62",
+ "metadata": {},
+ "source": [
+ "### Create an object for calling prompt optimization and inference functionalities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gp = GluePromptOpt(promptopt_config_path,\n",
+ " setup_config_path,\n",
+ " train_file_name,\n",
+ " aquarat_processor)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b25843b",
+ "metadata": {},
+ "source": [
+    "### Call prompt optimization function\n",
+ "1. ```use_examples``` can be used when there are training samples and a mixture of real and synthetic in-context examples are required in the final prompt. When set to ```False``` all the in-context examples will be real\n",
+ "2. ```generate_synthetic_examples``` can be used when there are no training samples and we want to generate synthetic examples \n",
+ "3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "573c6151-2c03-45d9-9904-1724a1e20f1b",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Function call to generate optimal prompt and expert profile \n",
+ "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True,run_without_train_examples=False,generate_synthetic_examples=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "97549dd2",
+ "metadata": {},
+ "source": [
+ "### Save the optimized prompt and expert profile"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34a716af-0d77-4c7d-b1c2-6438d66096ce",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import pickle \n",
+ "\n",
+ "if not os.path.exists(\"results\"):\n",
+ " os.system(\"mkdir results\")\n",
+ "\n",
+ "with open(\"results/best_prompt.pkl\", 'wb') as f:\n",
+ " pickle.dump(best_prompt, f)\n",
+ "with open(\"results/expert_profile.pkl\", 'wb') as f:\n",
+ " pickle.dump(expert_profile, f)\n",
+ "\n",
+ "print(f\"Best prompt: {best_prompt} \\nExpert profile: {expert_profile}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bdbb7e07",
+ "metadata": {},
+ "source": [
+ "### Evaluate the optimized prompt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c49b5711-82dd-4d18-8cd4-ee447cf8d74c",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "gp.EXPERT_PROFILE = expert_profile\n",
+ "gp.BEST_PROMPT = best_prompt\n",
+ "\n",
+ "# Function call to evaluate the prompt\n",
+ "accuracy = gp.evaluate(test_file_name)\n",
+ "\n",
+ "print(f\"Final Accuracy: {accuracy}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "PromptWizard",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demos/bbh/.env b/demos/bbh/.env
new file mode 100644
index 0000000000000000000000000000000000000000..290a11ba1125d027f0e0f653133f0032c7dd491e
--- /dev/null
+++ b/demos/bbh/.env
@@ -0,0 +1,8 @@
+USE_OPENAI_API_KEY="False"
+
+OPENAI_API_KEY=""
+OPENAI_MODEL_NAME =""
+
+OPENAI_API_VERSION=""
+AZURE_OPENAI_ENDPOINT=""
+AZURE_OPENAI_DEPLOYMENT_NAME=""
\ No newline at end of file
diff --git a/demos/bbh/configs/prompt_library.yaml b/demos/bbh/configs/prompt_library.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..81a2a5f7724defe0974bf3ba3572ca5ca1ae7b36
--- /dev/null
+++ b/demos/bbh/configs/prompt_library.yaml
@@ -0,0 +1,36 @@
+system_prompts: |
+ You are a helpful assistant that assists research students in understanding research papers.
+system_guidelines: |
+ Guidelines
+ - Your role must always be a helpful assistant that assists students in understanding research papers.
+ - Only answer questions that are directly or indirectly related to the referenced paper(s).
+
+mode:
+ chat:
+ - name: CHAT-FIRST-MESSAGE
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ - name: CHAT-NEXT-MESSAGES
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ generation:
+ - name: FLASH_PROFILE
+ prompt_template: |
+ {user_msg}
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+ llm_request_type: rag-query
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
\ No newline at end of file
diff --git a/demos/bbh/configs/promptopt_config.yaml b/demos/bbh/configs/promptopt_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..67b3812368c1fc25fc581e327ce45b4712f70b57
--- /dev/null
+++ b/demos/bbh/configs/promptopt_config.yaml
@@ -0,0 +1,52 @@
+# Specify one or more prompt refinement techniques to be used. If you specify more than one prompt refinement technique,
+# all these techniques would run on the same seed data. The result, iterations needed & cost incurred for each of these
+# techniques would be logged. And the winning technique for each data instance and overall would be logged.
+
+# Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
+# Uncomment techniques that you want to use
+############################ Critique Task Description Start ############################
+prompt_technique_name: "critique_n_refine"
+# unique_model_id of model defined in llm_config.yaml
+unique_model_id: gpt-4o
+# Number of iterations for conducting rounds of mutation of task description
+# followed by refinement of instructions
+mutate_refine_iterations: 3
+# Number of rounds of mutation to be performed when generating different styles
+mutation_rounds: 3
+# Refine instruction post mutation
+refine_instruction: true
+# Number of iterations for refining task description and in context examples for few-shot
+refine_task_eg_iterations: 3
+# Number of variations of prompts to generate in given iteration
+style_variation: 5
+# Number of questions to be asked to LLM in a single batch, during training step
+questions_batch_size: 1
+# Number of question batches that must be answered correctly for a prompt to be considered as performing well
+min_correct_count: 3
+# Max number of mini-batches on which we should evaluate our prompt
+max_eval_batches: 6
+# Number of top best performing prompts to be considered for next iterations
+top_n: 1
+# Description of task. This will be fed to prompt
+task_description : 'Extract the second letter from the input word.'
+# Base instruction, in line with your dataset. This will be fed to prompt
+base_instruction : 'Output the second letter. Think step by step to arrive at the solution.'
+# Instruction for specifying answer format
+answer_format : 'For each input word, present the reasoning followed by the extracted letter (only single letter) between and tags'
+# Number of samples from dataset, set aside as training data. In every iteration we would be drawing
+# `questions_batch_size` examples from training data with replacement.
+seen_set_size: 25
+# Number of examples to be given for few shots
+few_shot_count: 5
+# Number of synthetic training examples to be generated
+num_train_examples: 20
+# Generate synthetic reasoning
+generate_reasoning: true
+# Generate description of an expert which can solve the task at hand
+generate_expert_identity: true
+# Generate keywords that describe the intent of the task
+generate_intent_keywords: false
+############################ Critique Task Description End ############################
+
+
+
diff --git a/demos/bbh/configs/setup_config.yaml b/demos/bbh/configs/setup_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1b13dc3be5a748ad056a41c59dde3e2132693fc1
--- /dev/null
+++ b/demos/bbh/configs/setup_config.yaml
@@ -0,0 +1,14 @@
+assistant_llm:
+ # put the unique_model_id that you specified in llm_config.yaml
+ prompt_opt: gpt-4o
+dir_info:
+ # Base directory for everything
+ base_dir: logs
+ log_dir_name: glue_logs
+experiment_name: bbh
+# Many features are different for mode: online/offline. For eg
+# 1) Print of logs happens on console for offline mode
+# 2) LLM Queue gets instantiated only in online mode
+mode: offline
+# Full length description of the experiment. This would be logged.
+description:
diff --git a/demos/bbh/demo.ipynb b/demos/bbh/demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..00017b6d89cca0f9383fb82fd7696b1fe7ee1fc4
--- /dev/null
+++ b/demos/bbh/demo.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ece8514e",
+ "metadata": {},
+ "source": [
+ "#### Set environment variables in [.env](.env) for LLM API calling"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "678ed8db",
+ "metadata": {},
+ "source": [
+ "### Import Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f1fb3d81-16b6-4b8c-a028-880fdce5e14a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.insert(0, \"../../\")\n",
+ "import promptwizard\n",
+ "from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
+ "from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
+ "from promptwizard.glue.common.utils.file import save_jsonlist\n",
+ "from typing import Any\n",
+ "from tqdm import tqdm\n",
+ "import json\n",
+ "import os\n",
+ "from azure.identity import get_bearer_token_provider, AzureCliCredential\n",
+ "from openai import AzureOpenAI\n",
+ "\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv(override = True)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dc9b746c",
+ "metadata": {},
+ "source": [
+ "### Below code can be used for LLM-as-a-judge eval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "26719362",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def extract_between(start, end, text):\n",
+ " \"\"\"\n",
+ " Extracts the substring from 'text' that is between 'start' and 'end' strings.\n",
+ " \n",
+ " Parameters:\n",
+ " - start (str): The starting delimiter string.\n",
+ " - end (str): The ending delimiter string.\n",
+ " - text (str): The text to search within.\n",
+ " \n",
+ " Returns:\n",
+ " - str: The extracted substring between the start and end delimiters.\n",
+ " \"\"\"\n",
+ " start_index = text.find(start)\n",
+ " if start_index == -1:\n",
+ " return '' \n",
+ " \n",
+ " start_index += len(start)\n",
+ " \n",
+ " end_index = text.find(end, start_index)\n",
+ " if end_index == -1:\n",
+ " return '' \n",
+ " return text[start_index:end_index]\n",
+ "\n",
+ "def call_api(messages):\n",
+ " \n",
+ " token_provider = get_bearer_token_provider(\n",
+ " AzureCliCredential(), \"https://cognitiveservices.azure.com/.default\"\n",
+ " )\n",
+ " client = AzureOpenAI(\n",
+ " api_version=\"\",\n",
+ " azure_endpoint=\"\",\n",
+ " azure_ad_token_provider=token_provider\n",
+ " )\n",
+ " response = client.chat.completions.create(\n",
+ " model=\"\",\n",
+ " messages=messages,\n",
+ " temperature=0.0,\n",
+ " )\n",
+ " prediction = response.choices[0].message.content\n",
+ " return prediction\n",
+ "\n",
+ "def llm_eval(predicted_answer,gt_answer):\n",
+ " \n",
+ " EVAL_PROMPT = f\"\"\"Given the Predicted_Answer and Reference_Answer, compare them and check they mean the same.\n",
+ " If they mean the same then return True between and tags , \n",
+ " If they differ in the meaning then return False between and tags \n",
+ " Following are the given :\n",
+ " Predicted_Answer: {predicted_answer}\n",
+ " Reference_Answer: {gt_answer}\"\"\"\n",
+ " messages = [\n",
+ " {\"role\": \"system\", \"content\": \"\"},\n",
+ " {\"role\": \"user\", \"content\": EVAL_PROMPT}\n",
+ " ]\n",
+ "\n",
+ " response = call_api(messages)\n",
+ " final_judgement = extract_between(start=\"\", end=\"\", text=response)\n",
+ " return final_judgement == \"True\""
+ ]
+ },
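+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "judge-usage-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal usage sketch for the judge defined above (a sketch only; it assumes the\n",
+ "# Azure OpenAI api_version, azure_endpoint and model fields in call_api are filled in):\n",
+ "# llm_eval(predicted_answer=\"e\", gt_answer=\"E\") # -> True when the judge deems the answers equivalent"
+ ]
+ },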
+ {
+ "cell_type": "markdown",
+ "id": "4a5084d7",
+ "metadata": {},
+ "source": [
+ "### Create a dataset specific class and define the required functions "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "5f325d33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "llm_as_judge_eval = True\n",
+ "\n",
+ "class BBH(DatasetSpecificProcessing):\n",
+ "\n",
+ " def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
+ " def extract_answer_from_output(completion):\n",
+ "\n",
+ " return completion\n",
+ "\n",
+ " examples_set = []\n",
+ "\n",
+ " for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
+ " example = {\n",
+ " DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
+ " DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
+ " DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
+ " }\n",
+ " examples_set.append(example)\n",
+ "\n",
+ " save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
+ "\n",
+ " def extract_final_answer(self, answer: str):\n",
+ " \n",
+ " final_answer = extract_between(text=answer,start=\"\",end=\"\")\n",
+ " return final_answer\n",
+ " \n",
+ " def access_answer(self, llm_output: str, gt_answer: str):\n",
+ "\n",
+ " if llm_as_judge_eval:\n",
+ " predicted_answer = self.extract_final_answer(llm_output)\n",
+ " is_correct = False\n",
+ " if llm_eval(predicted_answer,gt_answer):\n",
+ " is_correct = True\n",
+ " else:\n",
+ " predicted_answer = self.extract_final_answer(llm_output)\n",
+ " is_correct = False\n",
+ " if predicted_answer and (predicted_answer.lower() == gt_answer.lower()):\n",
+ " is_correct = True\n",
+ "\n",
+ " return is_correct, predicted_answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "f384eb57",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bbh_processor = BBH()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ec7d1396",
+ "metadata": {},
+ "source": [
+ "### Load and save the dataset . \n",
+ "Set the ```dataset_to_run``` variable to choose 1 among the 19 datasets of BBII to run the optimization on"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "if not os.path.exists(\"data\"):\n",
+ " os.mkdir(\"data\")\n",
+ "dataset_list = ['informal_to_formal','letters_list','negation','orthography_starts_with','rhymes','second_word_letter','sum','diff','sentence_similarity','taxonomy_animal','auto_categorization','object_counting','odd_one_out','antonyms','word_unscrambling','cause_and_effect','common_concept','word_sorting','synonyms']\n",
+ "\n",
+ "# Set the dataset on which to run optimization out of the 19 \n",
+ "dataset_to_run = 'second_word_letter'\n",
+ "\n",
+ "if not os.path.exists(\"data/\"+dataset_to_run):\n",
+ " os.mkdir(\"data/\"+dataset_to_run)\n",
+ " \n",
+ "os.system(\"git clone https://github.com/xqlin98/INSTINCT\")\n",
+ "\n",
+ "\n",
+ "for mode in ['execute','induce']:\n",
+ " for dataset in dataset_list:\n",
+ "\n",
+ " if dataset_to_run == dataset:\n",
+ " data_list = []\n",
+ "\n",
+ " file_path = 'INSTINCT/Induction/experiments/data/instruction_induction/raw/'+mode+'/'+dataset+'.json' \n",
+ " with open(file_path, 'r') as file:\n",
+ " data = json.load(file)\n",
+ " \n",
+ " save_file_path = 'test.jsonl'\n",
+ " if mode == 'execute':\n",
+ " save_file_path = 'train.jsonl'\n",
+ "\n",
+ " for key,sample in data['examples'].items():\n",
+ " task = dataset\n",
+ " if(task == 'cause_and_effect'):\n",
+ " cause = sample[\"cause\"]\n",
+ " effect = sample[\"effect\"]\n",
+ " import random\n",
+ " pair = [cause, effect]\n",
+ " random.shuffle(pair)\n",
+ " question = f\"Sentence 1: {pair[0]} Sentence 2: {pair[1]}\",\n",
+ " answer = cause,\n",
+ " elif(task == 'antonyms'):\n",
+ " \n",
+ " question = sample[\"input\"],\n",
+ " answer = sample[\"output\"],\n",
+ "\n",
+ " elif(task == 'common_concept'):\n",
+ " concept = sample[\"concept\"]\n",
+ " items = sample[\"items\"]\n",
+ " input = \", \".join(items)\n",
+ " question = f\"Objects: {input}\"\n",
+ " answer = f\"{concept}\"\n",
+ "\n",
+ " elif(task == 'diff'):\n",
+ " input = sample[\"input\"]\n",
+ " output = sample[\"output\"]\n",
+ " question = f\"{input}\"\n",
+ " answer = f\"{output}\"\n",
+ "\n",
+ " elif(task == 'informal_to_formal'):\n",
+ " informal = sample[\"input\"]\n",
+ " formal = sample[\"output\"]\n",
+ " question = f\"{informal}\"\n",
+ " answer = f\"{formal}\"\n",
+ "\n",
+ " elif(task == 'synonyms' or task == 'word_unscrambling' or task == 'word_sorting' or task == 'letters_list' or task == 'negation' or task == 'orthography_starts_with' or task == 'second_word_letter' or task == 'sentence_similarity' or task == 'sum' or task == 'taxonomy_animal' or task == 'auto_categorization' or task == 'object_counting' or task == 'odd_one_out'):\n",
+ " informal = sample[\"input\"]\n",
+ " formal = sample[\"output\"] \n",
+ " question = f\"{informal}\"\n",
+ " answer = f\"{formal}\"\n",
+ "\n",
+ " elif(task == 'rhymes'):\n",
+ " input = sample[\"input\"]\n",
+ " output = sample[\"other_rhymes\"]\n",
+ " output = \", \".join(output)\n",
+ " question = f\"{input}\"\n",
+ " answer = f\"{output}\"\n",
+ " \n",
+ " data_list.append({\"question\":question,\"answer\":answer})\n",
+ " bbh_processor.dataset_to_jsonl(\"data/\"+dataset +\"/\"+save_file_path, dataset=data_list)\n",
+ "\n",
+ "os.system(\"rm -r INSTINCT\")\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fe28a967",
+ "metadata": {},
+ "source": [
+ "### Set paths"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "f43482f1-3e10-4cf7-8ea6-ff42c04067a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_file_name = os.path.join(\"data/\"+dataset_to_run, \"train.jsonl\")\n",
+ "test_file_name = os.path.join(\"data/\"+dataset_to_run, \"test.jsonl\")\n",
+ "path_to_config = \"configs\"\n",
+ "llm_config_path = os.path.join(path_to_config, \"llm_config.yaml\")\n",
+ "promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
+ "setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "75ac5780",
+ "metadata": {},
+ "source": [
+ "### Create an object for calling prompt optimization and inference functionalities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gp = GluePromptOpt(promptopt_config_path,\n",
+ " setup_config_path,\n",
+ " train_file_name,\n",
+ " bbh_processor)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9a26af0d",
+ "metadata": {},
+ "source": [
+ "### Call prompt optmization function\n",
+ "1. ```use_examples``` can be used when there are training samples and a mixture of real and synthetic in-context examples are required in the final prompt. When set to ```False``` all the in-context examples will be real\n",
+ "2. ```generate_synthetic_examples``` can be used when there are no training samples and we want to generate synthetic examples \n",
+ "3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "573c6151-2c03-45d9-9904-1724a1e20f1b",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Function call to generate optimal prompt and expert profile \n",
+ "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True,run_without_train_examples=False,generate_synthetic_examples=False)"
+ ]
+ },
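+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "flag-combinations-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of the other two flag combinations described above (kept commented out here;\n",
+ "# the scenarios notebook in demos/scenarios walks through both of them end to end):\n",
+ "# best_prompt, expert_profile = gp.get_best_prompt(use_examples=False, run_without_train_examples=True, generate_synthetic_examples=False)\n",
+ "# best_prompt, expert_profile = gp.get_best_prompt(use_examples=False, run_without_train_examples=False, generate_synthetic_examples=True)"
+ ]
+ },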
+ {
+ "cell_type": "markdown",
+ "id": "ef923b11",
+ "metadata": {},
+ "source": [
+ "### Save the optimized prompt and expert profile"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34a716af-0d77-4c7d-b1c2-6438d66096ce",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import pickle \n",
+ "if not os.path.exists(\"results\"):\n",
+ " os.system(\"mkdir results\")\n",
+ "\n",
+ "with open(\"results/best_prompt.pkl\", 'wb') as f:\n",
+ " pickle.dump(best_prompt, f)\n",
+ "with open(\"results/expert_profile.pkl\", 'wb') as f:\n",
+ " pickle.dump(expert_profile, f)\n",
+ "\n",
+ "print(f\"Best prompt: {best_prompt} \\nExpert profile: {expert_profile}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1942c67e",
+ "metadata": {},
+ "source": [
+ "### Evaluate the optimized prompt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c49b5711-82dd-4d18-8cd4-ee447cf8d74c",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "gp.EXPERT_PROFILE = expert_profile\n",
+ "gp.BEST_PROMPT = best_prompt\n",
+ "\n",
+ "# Function call to evaluate the prompt\n",
+ "accuracy = gp.evaluate(test_file_name)\n",
+ "\n",
+ "print(f\"Final Accuracy: {accuracy}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "PromptWizard",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demos/bbh/description.py b/demos/bbh/description.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee5756b89a4f39bf74911cd83947cfd7f9a9e730
--- /dev/null
+++ b/demos/bbh/description.py
@@ -0,0 +1,97 @@
+# BBH Datasets
+# informal_to_formal
+task_description = 'In this task, you will be given a sentence in an informal style. Your job is to rewrite the sentence in a formal style.'
+base_instruction = 'For each given sentence, provide a formal paraphrase.'
+answer_format = 'For each input sentence, present the reasoning followed by the formal paraphrased sentence.'
+
+#letters_list
+task_description = 'In this task, you will be given a single word as input. Your job is to produce the output by adding a space between each character pair in the word.'
+base_instruction = 'For each given word, insert a space between each character pair in the word.'
+answer_format = 'For each input word, output only the space-separated characters.'
+
+#negation
+task_description = 'For each input, write a sentence that expresses the exact opposite meaning of the input.'
+base_instruction = 'For each given sentence, provide a new sentence that conveys the exact opposite meaning by using "not" in the input sentence, keeping the rest of the sentence unchanged.'
+answer_format = "For each input sentence, negate the meaning by adding 'not' to the input sentence."
+
+#orthography_starts_with
+task_description = 'For each input, output all the words in the sentence that begin with the character in brackets at the end of the sentence.'
+base_instruction = 'Output, separated by spaces, the words that begin with the character in brackets at the end of the following sentence='
+answer_format = 'For each input sentence, present the reasoning followed by the space-separated words.'
+
+#rhymes
+task_description = 'In this task, you will be given a single word as input. Your job is to produce a list of comma-separated words that rhyme with the input word.'
+base_instruction = 'For each given word, provide a list of words that rhyme with the input word='
+answer_format = 'For each input word, present the reasoning followed by the list of rhyming words.'
+
+#second_word_letter
+task_description = 'Extract the second letter from the input word.'
+base_instruction = 'Output the second letter. Think step by step to arrive at the solution.'
+answer_format = 'For each input word, present the reasoning followed by the extracted letter (only single letter).'
+
+#sentence_similarity
+task_description = "Each input consists of two sentences (Sentence 1 and Sentence 2). Rate on a scale of 0 to 5 whether those sentences are paraphrases of each other, and also give a brief textual description of the rating (0 - definitely not, 2 - possibly, 3 - probably, 4 - almost perfectly and 5 - perfectly). Use \" - \" to separate them"
+base_instruction = """Rate the similarity of each pair of sentences according to the following scale:
+
+0 - Definitely not : The sentences are completely unrelated in meaning.
+1 - Probably not : The sentences have minor or superficial similarities but differ significantly in meaning.
+2 - Possibly : The sentences share some elements of meaning but are not strong paraphrases.
+3 - Probably : The sentences convey similar meanings but have some differences.
+4 - Almost perfectly : The sentences are very similar with only minor differences.
+5 - Perfectly : The sentences are nearly identical in meaning."""
+answer_format = 'Provide your rating and brief textual description for each pair of sentences from the 6 options. (0 - Definitely not, 1 - Probably not, 2 - Possibly, 3 - Probably, 4 - Almost perfectly, 5 - Perfectly)'
+
+#sum
+task_description = 'For each input, write the sum of the two numbers that appears there.'
+base_instruction = 'Output the sum of the following two numbers='
+answer_format = 'For each pair of numbers, present the reasoning followed by the sum.'
+
+#synonyms
+task_description = 'You will be given a word as input and need to output a word that is semantically similar.'
+base_instruction = 'Output a word that is semantically similar to the input word='
+answer_format = 'For each input word, present the reasoning followed by the synonym.'
+
+#taxonomy_animal
+task_description = 'In this task, you will be given a list of words. Your job is to identify and list all the animals from the given set of words.'
+base_instruction = 'For each given list of words, provide a new list containing only the animals.'
+answer_format = 'For each list of words, output the list of animals.'
+
+#auto_categorization
+task_description = 'Find the best categorization for the given set of words as input.'
+base_instruction = 'Output the best categorization for the following set of words='
+answer_format = 'For each set of words, present the reasoning followed by the best categorization.'
+
+#object_counting
+task_description = 'Find the number of objects in the given input.'
+base_instruction = 'Output the number of objects in the following input='
+answer_format = 'For each input, present the reasoning followed by the number of objects.'
+
+#odd_one_out
+task_description = 'Given the below list of words, find the odd one out'
+base_instruction = 'Output the word that does not belong to the group of words='
+answer_format = 'For each group of words, present the reasoning followed by the odd one out.'
+
+#word_sorting
+task_description = 'In this task, you will be given a set of words. Your job is to sort the words based on the first character of each word in alphabetical order.'
+base_instruction = 'For each given set of words, provide a sorted list of the words based on the first character of each word.'
+answer_format = 'For each input, output the list of words sorted by the first character of each word.'
+
+#word_unscrambling
+task_description = 'In this task output all possible meaningful words that can be formed by rearranging all the letters of the given word. Each character must be used exactly once and the words must be valid.'
+base_instruction = 'Output comma-separated words of the same length as the input word.'
+answer_format = 'Output all possible meaningful words, comma separated, that can be formed by rearranging the letters of the given word.'
+
+#antonyms
+task_description = 'In this task, you will be given a single word as input. Your job is to produce a word that has the exact opposite meaning (an antonym) to the input word.'
+base_instruction = 'For each given word, provide a word that is an antonym (has the exact opposite meaning).'
+answer_format = 'For each input word, output only a single word.'
+
+#cause_and_effect
+task_description = 'Find the cause in the following cause and effect pair. Each input consists of two sentences, where one is the cause and the other is the outcome.'
+base_instruction = 'Output the cause in the following cause and effect pair='
+answer_format = 'For each pair of sentences, present the reasoning followed by the cause.'
+
+#common_concept
+task_description = 'In this task, you will be given a list of objects. Your job is to identify and describe a common characteristic that links all the objects in the list.'
+base_instruction = 'The instruction is to "involve" the objects mentioned in the input.'
+answer_format = 'For each list of objects, output the common concept by "involving" the objects mentioned.'
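+
+# Minimal usage sketch (an illustration, not part of the demo pipeline): since every task
+# above reuses the same three variable names, the values written below are those of the last
+# task defined in this file; keep only (or re-run) the assignments for the task you want
+# before this point. The config path assumes the script is run from demos/bbh.
+if __name__ == "__main__":
+    import yaml
+
+    config_path = "configs/promptopt_config.yaml"
+    with open(config_path) as f:
+        config = yaml.safe_load(f)
+    # Overwrite the task-specific fields read by the optimizer
+    config["task_description"] = task_description
+    config["base_instruction"] = base_instruction
+    config["answer_format"] = answer_format
+    with open(config_path, "w") as f:
+        yaml.dump(config, f, default_flow_style=False)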
\ No newline at end of file
diff --git a/demos/gsm8k/.env b/demos/gsm8k/.env
new file mode 100644
index 0000000000000000000000000000000000000000..290a11ba1125d027f0e0f653133f0032c7dd491e
--- /dev/null
+++ b/demos/gsm8k/.env
@@ -0,0 +1,8 @@
+USE_OPENAI_API_KEY="False"
+
+OPENAI_API_KEY=""
+OPENAI_MODEL_NAME =""
+
+OPENAI_API_VERSION=""
+AZURE_OPENAI_ENDPOINT=""
+AZURE_OPENAI_DEPLOYMENT_NAME=""
\ No newline at end of file
diff --git a/demos/gsm8k/configs/prompt_library.yaml b/demos/gsm8k/configs/prompt_library.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..81a2a5f7724defe0974bf3ba3572ca5ca1ae7b36
--- /dev/null
+++ b/demos/gsm8k/configs/prompt_library.yaml
@@ -0,0 +1,36 @@
+system_prompts: |
+ You are a helpful assistant that assists research students in understanding research papers.
+system_guidelines: |
+ Guidelines
+ - Your role must always be a helpful assistant that assists students in understanding research papers.
+ - Only answer questions that are directly or indirectly related to the referenced paper(s).
+
+mode:
+ chat:
+ - name: CHAT-FIRST-MESSAGE
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ - name: CHAT-NEXT-MESSAGES
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ generation:
+ - name: FLASH_PROFILE
+ prompt_template: |
+ {user_msg}
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+ llm_request_type: rag-query
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
\ No newline at end of file
diff --git a/demos/gsm8k/configs/promptopt_config.yaml b/demos/gsm8k/configs/promptopt_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..056db4f9b1a8ecbcaee878a38450ccbc6f57261c
--- /dev/null
+++ b/demos/gsm8k/configs/promptopt_config.yaml
@@ -0,0 +1,52 @@
+# Specify one or more prompt refinement techniques to be used. If you specify more than one technique,
+# all of them are run on the same seed data. The result, iterations needed & cost incurred for each
+# technique are logged, along with the winning technique for each data instance and overall.
+
+# Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
+# Uncomment techniques that you want to use
+############################ Critique Task Description Start ############################
+prompt_technique_name: "critique_n_refine"
+# unique_model_id of model defined in llm_config.yaml
+unique_model_id: gpt-4o
+# Number of iterations for conducting rounds of mutation of task description
+# followed by refinement of instructions
+mutate_refine_iterations: 3
+# Number of rounds of mutation to be performed when generating different styles
+mutation_rounds: 3
+# Refine instruction post mutation
+refine_instruction: true
+# Number of iterations for refining task description and in context examples for few-shot
+refine_task_eg_iterations: 3
+# Number of variations of prompts to generate in given iteration
+style_variation: 5
+# Number of questions to be asked to LLM in a single batch, during training step
+questions_batch_size: 1
+# Number of question batches that must be answered correctly for a prompt to be considered as performing well
+min_correct_count: 3
+# Max number of mini-batches on which we should evaluate our prompt
+max_eval_batches: 6
+# Number of top-performing prompts to be considered for the next iterations
+top_n: 1
+# Description of task. This will be fed to prompt
+task_description: "You are a mathematics expert. You will be given a mathematics problem which you need to solve"
+# Base instruction, in line with your dataset. This will be fed to prompt
+base_instruction: "Lets think step by step."
+# Instruction for specifying answer format
+answer_format: "For each question present the reasoning followed by the correct answer."
+# Number of samples from dataset, set aside as training data. In every iteration we would be drawing
+# `questions_batch_size` examples from training data with replacement.
+seen_set_size: 25
+# Number of examples to be given for few shots
+few_shot_count: 5
+# Number of synthetic training examples to be generated
+num_train_examples: 20
+# Generate synthetic reasoning
+generate_reasoning: true
+# Generate description of an expert which can solve the task at hand
+generate_expert_identity: true
+# Generate keywords that describe the intent of the task
+generate_intent_keywords: false
+############################ Critique Task Description End ############################
+
+
+
diff --git a/demos/gsm8k/configs/setup_config.yaml b/demos/gsm8k/configs/setup_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..79773bf48195851e56766c6f10a377711572fad7
--- /dev/null
+++ b/demos/gsm8k/configs/setup_config.yaml
@@ -0,0 +1,14 @@
+assistant_llm:
+ # put the unique_model_id that you specified in llm_config.yaml
+ prompt_opt: gpt-4o
+dir_info:
+ # Base directory for everything
+ base_dir: logs
+ log_dir_name: glue_logs
+experiment_name: gsm8k
+# Several features behave differently depending on the mode (online/offline). For example:
+# 1) Logs are printed to the console in offline mode
+# 2) The LLM queue is instantiated only in online mode
+mode: offline
+# Full-length description of the experiment. This will be logged.
+description:
diff --git a/demos/gsm8k/demo.ipynb b/demos/gsm8k/demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..e979ee74f7df7060fc0285bcbca520602001539f
--- /dev/null
+++ b/demos/gsm8k/demo.ipynb
@@ -0,0 +1,298 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "6eb94b72",
+ "metadata": {},
+ "source": [
+ "#### Set environment variables in [.env](.env) for LLM API calling"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "388020c6",
+ "metadata": {},
+ "source": [
+ "### Import Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "11efa138",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.insert(0, \"../../\")\n",
+ "import promptwizard\n",
+ "from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
+ "from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
+ "from promptwizard.glue.common.utils.file import save_jsonlist\n",
+ "from typing import Any\n",
+ "from tqdm import tqdm\n",
+ "from re import compile, findall\n",
+ "import os\n",
+ "from datasets import load_dataset\n",
+ "\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv(override = True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "beb14821",
+ "metadata": {},
+ "source": [
+ "### Create a dataset specific class and define the required functions "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "5f325d33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class GSM8k(DatasetSpecificProcessing):\n",
+ "\n",
+ " def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
+ " def extract_answer_from_output(completion):\n",
+ " # Your functions for metrics and prompt building\n",
+ " ans_re = compile(r\"#### (\\-?[0-9\\.\\,]+)\")\n",
+ " self.INVALID_ANS = \"[invalid]\"\n",
+ "\n",
+ " match = ans_re.search(completion)\n",
+ " if match:\n",
+ " match_str = match.group(1).strip()\n",
+ " match_str = match_str.replace(\",\", \"\")\n",
+ " return match_str\n",
+ " else:\n",
+ " return self.INVALID_ANS\n",
+ "\n",
+ " examples_set = []\n",
+ "\n",
+ " for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
+ " example = {\n",
+ " DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
+ " DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
+ " DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
+ " }\n",
+ " examples_set.append(example)\n",
+ "\n",
+ " save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
+ "\n",
+ " def extract_final_answer(self, answer: str):\n",
+ " \n",
+ " if not answer:\n",
+ " return self.INVALID_ANS\n",
+ "\n",
+ " model_pred = answer.lower()\n",
+ " preds = model_pred.split(self.ANSWER_START.lower())\n",
+ " answer_flag = True if len(preds) > 1 else False\n",
+ "\n",
+ " pred = preds[-1].replace(\",\", \"\")\n",
+ " pred = [s for s in findall(r'-?\\d+\\.?\\d*', pred)]\n",
+ "\n",
+ " if len(pred) == 0:\n",
+ " return self.INVALID_ANS\n",
+ "\n",
+ " if answer_flag:\n",
+ " # choose the first element in list\n",
+ " pred = pred[0]\n",
+ " else:\n",
+ " # choose the last element in list\n",
+ " pred = pred[-1]\n",
+ "\n",
+ " # (For arithmetic tasks) if a word ends with period, it will be omitted ...\n",
+ " if pred[-1] == \".\":\n",
+ " pred = pred[:-1]\n",
+ " return pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f384eb57",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gsm8k_processor = GSM8k()"
+ ]
+ },
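+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "extract-answer-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative sketch of extract_final_answer (commented out), assuming the answer delimiters\n",
+ "# defined by DatasetSpecificProcessing (ANSWER_START / ANSWER_END) are the <ANS_START> / <ANS_END> tags:\n",
+ "# gsm8k_processor.extract_final_answer(\"Reasoning ... <ANS_START> 42 <ANS_END>\") # -> '42'"
+ ]
+ },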
+ {
+ "cell_type": "markdown",
+ "id": "11d2de75",
+ "metadata": {},
+ "source": [
+ "### Load and save the dataset "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if not os.path.exists(\"data\"):\n",
+ " os.mkdir(\"data\")\n",
+ " \n",
+ "dataset = load_dataset(\"openai/gsm8k\", \"main\")\n",
+ "num_samples = 0\n",
+ "for dataset_type in ['train','test']:\n",
+ " data_list = []\n",
+ " for data in dataset[dataset_type]:\n",
+ " data_list.append({\"question\": data['question'], \"answer\": data['answer']})\n",
+ " if num_samples == 100 and dataset_type == 'train': # We sample only 100 train examples and use 25 out them for training randomly\n",
+ " break\n",
+ " num_samples += 1\n",
+ " gsm8k_processor.dataset_to_jsonl(\"data/\"+ dataset_type+'.jsonl', dataset=data_list)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ac30e74f",
+ "metadata": {},
+ "source": [
+ "### Set paths"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "f43482f1-3e10-4cf7-8ea6-ff42c04067a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_file_name = os.path.join(\"data\", \"train.jsonl\")\n",
+ "test_file_name = os.path.join(\"data\", \"test.jsonl\")\n",
+ "path_to_config = \"configs\"\n",
+ "promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
+ "setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3392594d",
+ "metadata": {},
+ "source": [
+ "### Create an object for calling prompt optimization and inference functionalities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gp = GluePromptOpt(promptopt_config_path,\n",
+ " setup_config_path,\n",
+ " train_file_name,\n",
+ " gsm8k_processor)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1784648c",
+ "metadata": {},
+ "source": [
+ "### Call prompt optmization function\n",
+ "1. ```use_examples``` can be used when there are training samples and a mixture of real and synthetic in-context examples are required in the final prompt. When set to ```False``` all the in-context examples will be real\n",
+ "2. ```generate_synthetic_examples``` can be used when there are no training samples and we want to generate synthetic examples \n",
+ "3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "573c6151-2c03-45d9-9904-1724a1e20f1b",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Function call to generate optimal prompt and expert profile \n",
+ "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True,run_without_train_examples=False,generate_synthetic_examples=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ee1aa99",
+ "metadata": {},
+ "source": [
+ "### Save the optimized prompt and expert profile"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34a716af-0d77-4c7d-b1c2-6438d66096ce",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import pickle \n",
+ "\n",
+ "if not os.path.exists(\"results\"):\n",
+ " os.system(\"mkdir results\")\n",
+ " \n",
+ "with open(\"results/best_prompt.pkl\", 'wb') as f:\n",
+ " pickle.dump(best_prompt, f)\n",
+ "with open(\"results/expert_profile.pkl\", 'wb') as f:\n",
+ " pickle.dump(expert_profile, f)\n",
+ "\n",
+ "print(f\"Best prompt: {best_prompt} \\nExpert profile: {expert_profile}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aac42eed",
+ "metadata": {},
+ "source": [
+ "### Evaluate the optimized prompt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c49b5711-82dd-4d18-8cd4-ee447cf8d74c",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "gp.EXPERT_PROFILE = expert_profile\n",
+ "gp.BEST_PROMPT = best_prompt\n",
+ "\n",
+ "# Function call to evaluate the prompt\n",
+ "accuracy = gp.evaluate(test_file_name)\n",
+ "\n",
+ "print(f\"Final Accuracy: {accuracy}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "general",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demos/scenarios/.env b/demos/scenarios/.env
new file mode 100644
index 0000000000000000000000000000000000000000..290a11ba1125d027f0e0f653133f0032c7dd491e
--- /dev/null
+++ b/demos/scenarios/.env
@@ -0,0 +1,8 @@
+USE_OPENAI_API_KEY="False"
+
+OPENAI_API_KEY=""
+OPENAI_MODEL_NAME =""
+
+OPENAI_API_VERSION=""
+AZURE_OPENAI_ENDPOINT=""
+AZURE_OPENAI_DEPLOYMENT_NAME=""
\ No newline at end of file
diff --git a/demos/scenarios/configs/prompt_library.yaml b/demos/scenarios/configs/prompt_library.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..81a2a5f7724defe0974bf3ba3572ca5ca1ae7b36
--- /dev/null
+++ b/demos/scenarios/configs/prompt_library.yaml
@@ -0,0 +1,36 @@
+system_prompts: |
+ You are a helpful assistant that assists research students in understanding research papers.
+system_guidelines: |
+ Guidelines
+ - Your role must always be a helpful assistant that assists students in understanding research papers.
+ - Only answer questions that are directly or indirectly related to the referenced paper(s).
+
+mode:
+ chat:
+ - name: CHAT-FIRST-MESSAGE
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ - name: CHAT-NEXT-MESSAGES
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ generation:
+ - name: FLASH_PROFILE
+ prompt_template: |
+ {user_msg}
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+ llm_request_type: rag-query
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
\ No newline at end of file
diff --git a/demos/scenarios/configs/promptopt_config.yaml b/demos/scenarios/configs/promptopt_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bd9c6577cea84860ab55975096f8979198af0c16
--- /dev/null
+++ b/demos/scenarios/configs/promptopt_config.yaml
@@ -0,0 +1,53 @@
+# Specify one or more prompt refinement techniques to be used. If you specify more than one technique,
+# all of them are run on the same seed data. The result, iterations needed & cost incurred for each
+# technique are logged, along with the winning technique for each data instance and overall.
+
+# Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
+# Uncomment techniques that you want to use
+############################ Critique Task Description Start ############################
+prompt_technique_name: "critique_n_refine"
+# unique_model_id of model defined in llm_config.yaml
+unique_model_id: gpt-4o
+# Number of iterations for conducting rounds of mutation of task description
+# followed by refinement of instructions
+mutate_refine_iterations: 3
+# Number of rounds of mutation to be performed when generating different styles
+mutation_rounds: 3
+# Refine instruction post mutation
+refine_instruction: true
+# Number of iterations for refining task description and in context examples for few-shot
+refine_task_eg_iterations: 3
+# Number of variations of prompts to generate in given iteration
+style_variation: 5
+# Number of questions to be asked to LLM in a single batch, during training step
+questions_batch_size: 1
+# Number of question batches that must be answered correctly for a prompt to be considered as performing well
+min_correct_count: 3
+# Max number of mini-batches on which we should evaluate our prompt
+max_eval_batches: 6
+# Number of top-performing prompts to be considered for the next iterations
+top_n: 1
+# Description of task. This will be fed to prompt
+task_description: "You are a mathematics expert. You will be given a mathematics problem which you need to solve"
+# Base instruction, in line with your dataset. This will be fed to prompt
+base_instruction: "Lets think step by step."
+# Instruction for specifying answer format
+answer_format: "For each question present the reasoning followed by the correct answer."
+# Number of samples from dataset, set aside as training data. In every iteration we would be drawing
+# `questions_batch_size` examples from training data with replacement.
+seen_set_size: 25
+# Number of examples to be given for few shots
+few_shot_count: 5
+# Number of synthetic training examples to be generated
+num_train_examples: 20
+# Generate synthetic reasoning
+generate_reasoning: true
+# Generate description of an expert which can solve the task at hand
+generate_expert_identity: true
+# Generate keywords that describe the intent of the task
+generate_intent_keywords: false
+############################ Critique Task Description End ############################
+
+
+
+
diff --git a/demos/scenarios/configs/setup_config.yaml b/demos/scenarios/configs/setup_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..79773bf48195851e56766c6f10a377711572fad7
--- /dev/null
+++ b/demos/scenarios/configs/setup_config.yaml
@@ -0,0 +1,14 @@
+assistant_llm:
+ # put the unique_model_id that you specified in llm_config.yaml
+ prompt_opt: gpt-4o
+dir_info:
+ # Base directory for everything
+ base_dir: logs
+ log_dir_name: glue_logs
+experiment_name: gsm8k
+# Several features behave differently depending on the mode (online/offline). For example:
+# 1) Logs are printed to the console in offline mode
+# 2) The LLM queue is instantiated only in online mode
+mode: offline
+# Full-length description of the experiment. This will be logged.
+description:
diff --git a/demos/scenarios/dataset_scenarios_demo.ipynb b/demos/scenarios/dataset_scenarios_demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..3a633787e8f151106e9f14c51d536772c7b4439a
--- /dev/null
+++ b/demos/scenarios/dataset_scenarios_demo.ipynb
@@ -0,0 +1,1146 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "6eb94b72",
+ "metadata": {},
+ "source": [
+ "## Following is a demo on running PromptWizard under different scenarios "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "52c7ee0a",
+ "metadata": {},
+ "source": [
+ "#### Set environment variables in [.env](.env) for LLM API calling"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3cffa5ef",
+ "metadata": {},
+ "source": [
+ "#### Import Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "11efa138",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.insert(0, \"../../\")\n",
+ "import promptwizard\n",
+ "from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
+ "from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
+ "from promptwizard.glue.common.utils.file import save_jsonlist\n",
+ "from typing import Any\n",
+ "from tqdm import tqdm\n",
+ "from re import compile, findall\n",
+ "import os\n",
+ "from datasets import load_dataset\n",
+ "import yaml\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv(override = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "9be22d5d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def update_yaml_file(file_path,config_dict):\n",
+ "\n",
+ " with open(file_path, 'r') as file:\n",
+ " data = yaml.safe_load(file)\n",
+ "\n",
+ "\n",
+ " for field,value in config_dict.items():\n",
+ " data[field] = value\n",
+ "\n",
+ " with open(file_path, 'w') as file:\n",
+ " yaml.dump(data, file, default_flow_style=False)\n",
+ "\n",
+ " print(\"YAML file updated successfully!\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "78abb34a",
+ "metadata": {},
+ "source": [
+ "Set the paths"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "14399d47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path_to_config = \"configs\"\n",
+ "promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
+ "setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0f274af9",
+ "metadata": {},
+ "source": [
+ "### Now let us consider the three scenarios with respect to availability of training data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5aaed236",
+ "metadata": {},
+ "source": [
+ "#### Scenario 1 : We have no training data , but we also don't want in-context examples in final prompt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4c34423d",
+ "metadata": {},
+ "source": [
+ "Set the configurations to generate mutations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec4e7607",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "file_path = 'configs/promptopt_config.yaml' \n",
+ "# Set the following based on the use case\n",
+ "config_dict = {\n",
+ " \"task_description\": \"You are a mathematics expert. You will be given a mathematics problem which you need to solve\",\n",
+ " \"base_instruction\": \"Lets think step by step.\",\n",
+ " \"mutation_rounds\": 5\n",
+ " }\n",
+ "update_yaml_file(file_path,config_dict)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d984e84e",
+ "metadata": {},
+ "source": [
+ "Create an object for calling prompt optimization and inference functionalities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c7aa4ccb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gp = GluePromptOpt(promptopt_config_path,\n",
+ " setup_config_path,\n",
+ " dataset_jsonl=None,\n",
+ " data_processor=None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8d587065",
+ "metadata": {},
+ "source": [
+ "Call the optimization function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "afe8de4f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best_prompt, expert_profile = gp.get_best_prompt(use_examples=False,run_without_train_examples=True,generate_synthetic_examples=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a30db274",
+ "metadata": {},
+ "source": [
+ "Output : Five mutated prompts are printed on the termial as shown below :"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "e5cb1a65",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "OUTPUT = \"\"\"\n",
+ "Variations 1:\n",
+ "Expert Profile:\n",
+ "You are a mathematician with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to approach mathematical problems methodically, breaking them down into manageable steps and applying appropriate techniques to find solutions. You are familiar with both theoretical and applied mathematics, and you can explain your reasoning and solutions in a clear and concise manner. Your ability to solve mathematical problems efficiently and accurately makes you an invaluable resource for anyone seeking help with mathematics.:\n",
+ "Prompt:\n",
+ "You are a mathematics expert. You will be given a mathematics problem which you need to solve\n",
+ "Lets think step by step.\n",
+ "\n",
+ "\n",
+ "For each question present the reasoning followed by the correct answer.\n",
+ "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\n",
+ "_______________________________________________________________________\n",
+ "\n",
+ "Variations 2:\n",
+ "Expert Profile:\n",
+ "You are a mathematician with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to approach mathematical problems methodically, breaking them down into manageable steps and applying appropriate techniques to find solutions. You are familiar with both theoretical and applied mathematics, and you can explain your reasoning and solutions in a clear and concise manner. Your ability to solve mathematical problems efficiently and accurately makes you an invaluable resource for anyone seeking help with mathematics.:\n",
+ "Prompt:\n",
+ "Let's break this problem down step by step and devise an experiment to help solve it.\n",
+ "\n",
+ "\n",
+ "For each question present the reasoning followed by the correct answer.\n",
+ "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\n",
+ "_______________________________________________________________________\n",
+ "\n",
+ "Variations 3:\n",
+ "Expert Profile:\n",
+ "You are a mathematics expert with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to break down intricate problems into manageable steps, making it easier to find solutions. You are familiar with a wide range of mathematical techniques and tools, and you can apply them effectively to solve problems. Whether the problem involves solving equations, proving theorems, or analyzing data, you can provide a clear and accurate solution. Your ability to explain your reasoning and methodology ensures that others can follow and understand your approach, making you an invaluable resource for tackling challenging mathematical problems.:\n",
+ "Prompt:\n",
+ "Let's think through this problem step by step and make a list of ideas to solve it.\n",
+ "\n",
+ "\n",
+ "For each question present the reasoning followed by the correct answer.\n",
+ "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\n",
+ "_______________________________________________________________________\n",
+ "\n",
+ "Variations 4:\n",
+ "Expert Profile:\n",
+ "You are a mathematics expert with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to break down intricate problems into manageable steps, making it easier for others to follow your reasoning. You are familiar with a wide range of mathematical techniques and tools, and you can apply them effectively to find solutions. Whether the problem involves solving equations, proving theorems, or analyzing data, you can provide a clear, accurate, and well-explained solution. Your ability to communicate complex mathematical concepts in an understandable way makes you an invaluable resource for anyone seeking to solve mathematical problems.:\n",
+ "Prompt:\n",
+ "Let's approach this problem step by step and measure our progress as we go.\n",
+ "\n",
+ "\n",
+ "For each question present the reasoning followed by the correct answer.\n",
+ "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\n",
+ "Iterations completed: 0%| | 0/3 [00:24, ?it/s]\n",
+ "Time taken to find best prompt: 24.79972267150879 sec\n",
+ "_______________________________________________________________________\n",
+ "\n",
+ "Variations 5:\n",
+ "Expert Profile:\n",
+ "You are a mathematics expert with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to approach problems methodically, breaking them down into manageable steps and applying appropriate mathematical techniques to find solutions. You are also adept at explaining your reasoning and methods in a clear and concise manner, making it easy for others to follow your thought process. Whether the problem involves solving equations, proving theorems, or analyzing data, you have the knowledge and skills to tackle it effectively. Your proficiency in mathematics is highly valuable in both academic and practical applications, and you are well-equipped to provide accurate and insightful solutions to a wide range of mathematical problems.:\n",
+ "Prompt:\n",
+ "Let's simplify this problem step by step to make it easier to solve.\n",
+ "\n",
+ "\n",
+ "For each question present the reasoning followed by the correct answer.\n",
+ "Keywords: mathematics, problem-solving, step-by-step, logical reasoning, expert\"\"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dfd54818",
+ "metadata": {},
+ "source": [
+ "#### Scenario 2 : We have no training data , but we also want in-context examples in final prompt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b07d1862",
+ "metadata": {},
+ "source": [
+ "This scenario has two steps \n",
+ "- Genrate synthetic data\n",
+ "- Optimize prompts using synthetic data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bf44d6d7",
+ "metadata": {},
+ "source": [
+ "STEP 1 : Generate synthetic data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "96d07ae3",
+ "metadata": {},
+ "source": [
+ "Set the configurations to first generate synthetic training data. \\\n",
+ "Any number of synthetic examples can be generated and then used for optimizing prompts as mentioned in STEP 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3c7c1f19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "file_path = 'configs/promptopt_config.yaml' \n",
+ "# Set the number of synthetic training examples to be generated\n",
+ "config_dict = {\n",
+ " \"num_train_examples\":20\n",
+ " }\n",
+ "update_yaml_file(file_path,config_dict)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2311b4ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gp = GluePromptOpt(promptopt_config_path,\n",
+ " setup_config_path,\n",
+ " dataset_jsonl=None,\n",
+ " data_processor=None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65ec6cd2",
+ "metadata": {},
+ "source": [
+ "Call the function to generate synthetic examples, which are saved in train.jsonl"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ff84f04e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best_prompt, expert_profile = gp.get_best_prompt(use_examples=False,run_without_train_examples=False,generate_synthetic_examples=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a286dcdf",
+ "metadata": {},
+ "source": [
+ "STEP 2 : Optimize prompts using synthetic data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bb0a4060",
+ "metadata": {},
+ "source": [
+ "Create a dataset specific class and define the required functions "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "7aaa5126",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class GSM8k(DatasetSpecificProcessing):\n",
+ "\n",
+ " def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
+ " def extract_answer_from_output(completion):\n",
+ " # Your functions for metrics and prompt building\n",
+ " ans_re = compile(r\"#### (\\-?[0-9\\.\\,]+)\")\n",
+ " self.INVALID_ANS = \"[invalid]\"\n",
+ "\n",
+ " match = ans_re.search(completion)\n",
+ " if match:\n",
+ " match_str = match.group(1).strip()\n",
+ " match_str = match_str.replace(\",\", \"\")\n",
+ " return match_str\n",
+ " else:\n",
+ " return self.INVALID_ANS\n",
+ "\n",
+ " examples_set = []\n",
+ "\n",
+ " for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
+ " example = {\n",
+ " DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
+ " DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
+ " DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
+ " }\n",
+ " examples_set.append(example)\n",
+ "\n",
+ " save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
+ "\n",
+ " def extract_final_answer(self, answer: str):\n",
+ " \n",
+ " if not answer:\n",
+ " return self.INVALID_ANS\n",
+ "\n",
+ " model_pred = answer.lower()\n",
+ " preds = model_pred.split(self.ANSWER_START.lower())\n",
+ " answer_flag = True if len(preds) > 1 else False\n",
+ "\n",
+ " pred = preds[-1].replace(\",\", \"\")\n",
+ " pred = [s for s in findall(r'-?\\d+\\.?\\d*', pred)]\n",
+ "\n",
+ " if len(pred) == 0:\n",
+ " return self.INVALID_ANS\n",
+ "\n",
+ " if answer_flag:\n",
+ " # choose the first element in list\n",
+ " pred = pred[0]\n",
+ " else:\n",
+ " # choose the last element in list\n",
+ " pred = pred[-1]\n",
+ "\n",
+ " # (For arithmetic tasks) if a word ends with period, it will be omitted ...\n",
+ " if pred[-1] == \".\":\n",
+ " pred = pred[:-1]\n",
+ " return pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "212bea42",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gsm8k_processor = GSM8k()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "36ae1f65",
+ "metadata": {},
+ "source": [
+ "Set the configurations to optimize the prompt on the synthetic data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "67db60b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "file_path = 'configs/promptopt_config.yaml' \n",
+ "config_dict = {\n",
+ " \"task_description\": \"You are a mathematics expert. You will be given a mathematics problem which you need to solve\",\n",
+ " \"base_instruction\": \"Lets think step by step.\",\n",
+ " \"mutation_rounds\": 2,\n",
+ " \"few_shot_count\": 5,\n",
+ " \"generate_reasoning\": True,\n",
+ " \"mutate_refine_iterations\" : 3,\n",
+ " \"seen_set_size\":20\n",
+ " }\n",
+ "update_yaml_file(file_path,config_dict)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc8eb2c5",
+ "metadata": {},
+ "source": [
+ "Call the optimization function "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e53934e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gp = GluePromptOpt(promptopt_config_path,\n",
+ " setup_config_path,\n",
+ " dataset_jsonl = \"train_synthetic.jsonl\",\n",
+ " data_processor=gsm8k_processor)\n",
+ "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True,run_without_train_examples=False,generate_synthetic_examples=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b4bcd46b",
+ "metadata": {},
+ "source": [
+ "Output : Following Prompt and Expert Profile are generated "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ee6006f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "OUTPUT = \"\"\"\n",
+ "Generating Expert Identity....\n",
+ "Expert Identity: You are a mathematician with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your analytical skills and logical reasoning enable you to break down problems into manageable steps and find accurate solutions efficiently. You are familiar with a wide range of mathematical techniques and tools, and you can apply them to solve problems in both theoretical and applied contexts. Your expertise allows you to explain your solutions clearly and concisely, making complex concepts accessible to others. Whether the problem involves solving equations, proving theorems, or analyzing data, you are well-equipped to provide a thorough and correct solution.\n",
+ "Final best prompt: Provide a clear and detailed solution, breaking down all necessary steps. Ensure that the final answer is clearly marked and separated from the solution steps. Use proper mathematical notation and formatting throughout. Verify the final answer by checking the solution steps for accuracy. Simplify all expressions and fractions where possible. Handle special cases or edge cases appropriately, and clearly state any assumptions or conditions applied during the solution process. Finally, review the entire solution to ensure logical consistency and correct formatting.\n",
+ "\n",
+ "[Question] Solve for \\( x \\) in the equation \\( 2x + 3 = 11 \\).\n",
+ "[Answer] To solve for \\( x \\) in the equation \\( 2x + 3 = 11 \\), we will follow these steps:\n",
+ "\n",
+ "1. **Isolate the term containing \\( x \\)**:\n",
+ " We start by isolating the term with \\( x \\) on one side of the equation. To do this, we need to eliminate the constant term on the left side of the equation.\n",
+ "\n",
+ " \\[\n",
+ " 2x + 3 = 11\n",
+ " \\]\n",
+ "\n",
+ " Subtract 3 from both sides of the equation:\n",
+ "\n",
+ " \\[\n",
+ " 2x + 3 - 3 = 11 - 3\n",
+ " \\]\n",
+ "\n",
+ " Simplifying this, we get:\n",
+ "\n",
+ " \\[\n",
+ " 2x = 8\n",
+ " \\]\n",
+ "\n",
+ "2. **Solve for \\( x \\)**:\n",
+ " Now, we need to solve for \\( x \\) by isolating \\( x \\) itself. Since \\( x \\) is multiplied by 2, we will divide both sides of the equation by 2 to solve for \\( x \\).\n",
+ "\n",
+ " \\[\n",
+ " \\frac{2x}{2} = \\frac{8}{2}\n",
+ " \\]\n",
+ "\n",
+ " Simplifying this, we get:\n",
+ "\n",
+ " \\[\n",
+ " x = 4\n",
+ " \\]\n",
+ "\n",
+ "3. **Verify the solution**:\n",
+ " To ensure our solution is correct, we substitute \\( x = 4 \\) back into the original equation and check if both sides are equal.\n",
+ "\n",
+ " Original equation:\n",
+ "\n",
+ " \\[\n",
+ " 2x + 3 = 11\n",
+ " \\]\n",
+ "\n",
+ " Substitute \\( x = 4 \\):\n",
+ "\n",
+ " \\[\n",
+ " 2(4) + 3 = 11\n",
+ " \\]\n",
+ "\n",
+ " Simplifying this, we get:\n",
+ "\n",
+ " \\[\n",
+ " 8 + 3 = 11\n",
+ " \\]\n",
+ "\n",
+ " \\[\n",
+ " 11 = 11\n",
+ " \\]\n",
+ "\n",
+ " Since both sides of the equation are equal, our solution is verified to be correct.\n",
+ "\n",
+ "**Final Answer**: \\( x = 4 \\) \\( x = 4 \\) \n",
+ "\n",
+ "[Question] Solve for \\( x \\) in the equation \\( x^2 - 4x + 4 = 0 \\).\n",
+ "[Answer] To solve the quadratic equation \\( x^2 - 4x + 4 = 0 \\), we will follow these steps:\n",
+ "\n",
+ "1. **Identify the quadratic equation**: The given equation is \\( x^2 - 4x + 4 = 0 \\).\n",
+ "\n",
+ "2. **Recognize the standard form**: The standard form of a quadratic equation is \\( ax^2 + bx + c = 0 \\). Here, \\( a = 1 \\), \\( b = -4 \\), and \\( c = 4 \\).\n",
+ "\n",
+ "3. **Factor the quadratic expression**: We need to factor the quadratic expression on the left-hand side of the equation. We look for two numbers that multiply to \\( c \\) (which is 4) and add up to \\( b \\) (which is -4). These numbers are -2 and -2.\n",
+ "\n",
+ "4. **Write the factored form**: The quadratic expression \\( x^2 - 4x + 4 \\) can be factored as \\( (x - 2)(x - 2) \\) or \\( (x - 2)^2 \\).\n",
+ "\n",
+ "5. **Set the factored form equal to zero**: We now have \\( (x - 2)^2 = 0 \\).\n",
+ "\n",
+ "6. **Solve for \\( x \\)**: To find the value of \\( x \\), we take the square root of both sides of the equation:\n",
+ " \\[\n",
+ " \\sqrt{(x - 2)^2} = \\sqrt{0}\n",
+ " \\]\n",
+ " This simplifies to:\n",
+ " \\[\n",
+ " x - 2 = 0\n",
+ " \\]\n",
+ "\n",
+ "7. **Isolate \\( x \\)**: Add 2 to both sides of the equation to solve for \\( x \\):\n",
+ " \\[\n",
+ " x = 2\n",
+ " \\]\n",
+ "\n",
+ "8. **Verify the solution**: Substitute \\( x = 2 \\) back into the original equation to ensure it satisfies the equation:\n",
+ " \\[\n",
+ " (2)^2 - 4(2) + 4 = 4 - 8 + 4 = 0\n",
+ " \\]\n",
+ " Since the left-hand side equals the right-hand side (0), the solution \\( x = 2 \\) is verified.\n",
+ "\n",
+ "**Final Answer**: \\( x = 2 \\) \\( x = 2 \\) \n",
+ "\n",
+ "[Question] Find the derivative of \\( f(x) = 3x^2 \\cdot \\sin(x) \\).\n",
+ "[Answer] To find the derivative of the function \\( f(x) = 3x^2 \\cdot \\sin(x) \\), we will use the product rule of differentiation. The product rule states that if we have a function \\( f(x) = u(x) \\cdot v(x) \\), then its derivative \\( f'(x) \\) is given by:\n",
+ "\n",
+ "\\[ f'(x) = u'(x) \\cdot v(x) + u(x) \\cdot v'(x) \\]\n",
+ "\n",
+ "Here, we identify \\( u(x) = 3x^2 \\) and \\( v(x) = \\sin(x) \\).\n",
+ "\n",
+ "Step 1: Differentiate \\( u(x) = 3x^2 \\)\n",
+ "\\[ u'(x) = \\frac{d}{dx}(3x^2) = 3 \\cdot 2x = 6x \\]\n",
+ "\n",
+ "Step 2: Differentiate \\( v(x) = \\sin(x) \\)\n",
+ "\\[ v'(x) = \\frac{d}{dx}(\\sin(x)) = \\cos(x) \\]\n",
+ "\n",
+ "Step 3: Apply the product rule\n",
+ "\\[ f'(x) = u'(x) \\cdot v(x) + u(x) \\cdot v'(x) \\]\n",
+ "\\[ f'(x) = (6x) \\cdot \\sin(x) + (3x^2) \\cdot \\cos(x) \\]\n",
+ "\n",
+ "Step 4: Simplify the expression\n",
+ "\\[ f'(x) = 6x \\sin(x) + 3x^2 \\cos(x) \\]\n",
+ "\n",
+ "Thus, the derivative of the function \\( f(x) = 3x^2 \\cdot \\sin(x) \\) is:\n",
+ "\n",
+ "\\[ \\boxed{f'(x) = 6x \\sin(x) + 3x^2 \\cos(x)} \\]\n",
+ "\n",
+ "To verify the final answer, we can recheck each step to ensure accuracy:\n",
+ "- The derivative of \\( 3x^2 \\) is correctly calculated as \\( 6x \\).\n",
+ "- The derivative of \\( \\sin(x) \\) is correctly calculated as \\( \\cos(x) \\).\n",
+ "- The product rule is correctly applied, and the terms are correctly combined and simplified.\n",
+ "\n",
+ "Therefore, the final answer is confirmed to be correct. \\( f'(x) = 3x^2 \\cos(x) + 6x \\sin(x) \\) \n",
+ "\n",
+ "[Question] Evaluate the definite integral \\( \\int_{0}^{1} (4x^3 - 2x + 1) \\, dx \\).\n",
+ "[Answer] To evaluate the definite integral \\( \\int_{0}^{1} (4x^3 - 2x + 1) \\, dx \\), we will follow these steps:\n",
+ "\n",
+ "1. **Find the antiderivative** of the integrand \\( 4x^3 - 2x + 1 \\).\n",
+ "2. **Evaluate the antiderivative** at the upper limit of integration (1).\n",
+ "3. **Evaluate the antiderivative** at the lower limit of integration (0).\n",
+ "4. **Subtract the value** of the antiderivative at the lower limit from the value at the upper limit to find the definite integral.\n",
+ "\n",
+ "### Step-by-Step Solution:\n",
+ "\n",
+ "1. **Find the antiderivative**:\n",
+ " - The antiderivative of \\( 4x^3 \\) is \\( \\frac{4x^4}{4} = x^4 \\).\n",
+ " - The antiderivative of \\( -2x \\) is \\( -\\frac{2x^2}{2} = -x^2 \\).\n",
+ " - The antiderivative of \\( 1 \\) is \\( x \\).\n",
+ "\n",
+ " Therefore, the antiderivative of \\( 4x^3 - 2x + 1 \\) is:\n",
+ " \\[\n",
+ " F(x) = x^4 - x^2 + x\n",
+ " \\]\n",
+ "\n",
+ "2. **Evaluate the antiderivative at the upper limit (1)**:\n",
+ " \\[\n",
+ " F(1) = 1^4 - 1^2 + 1 = 1 - 1 + 1 = 1\n",
+ " \\]\n",
+ "\n",
+ "3. **Evaluate the antiderivative at the lower limit (0)**:\n",
+ " \\[\n",
+ " F(0) = 0^4 - 0^2 + 0 = 0\n",
+ " \\]\n",
+ "\n",
+ "4. **Subtract the value at the lower limit from the value at the upper limit**:\n",
+ " \\[\n",
+ " \\int_{0}^{1} (4x^3 - 2x + 1) \\, dx = F(1) - F(0) = 1 - 0 = 1\n",
+ " \\]\n",
+ "\n",
+ "### Final Answer:\n",
+ "\\[\n",
+ "\\boxed{1}\n",
+ "\\] \\( 1 \\) \n",
+ "\n",
+ "[Question] Solve the system of equations:\n",
+ "\\[ \\begin{cases} \n",
+ "x + 2y + z = 6 \\\\\n",
+ "2x - y + 3z = 14 \\\\\n",
+ "3x + y - z = 2 \n",
+ "\\end{cases} \\]\n",
+ "[Answer] To solve the system of equations:\n",
+ "\\[ \\begin{cases} \n",
+ "x + 2y + z = 6 \\\\\n",
+ "2x - y + 3z = 14 \\\\\n",
+ "3x + y - z = 2 \n",
+ "\\end{cases} \\]\n",
+ "\n",
+ "we will use the method of elimination and substitution to find the values of \\(x\\), \\(y\\), and \\(z\\).\n",
+ "\n",
+ "**Step 1: Eliminate \\(z\\) from the first two equations.**\n",
+ "\n",
+ "First, we multiply the first equation by 3 to align the coefficients of \\(z\\):\n",
+ "\\[ 3(x + 2y + z) = 3 \\cdot 6 \\]\n",
+ "\\[ 3x + 6y + 3z = 18 \\]\n",
+ "\n",
+ "Now, we subtract the second equation from this result:\n",
+ "\\[ (3x + 6y + 3z) - (2x - y + 3z) = 18 - 14 \\]\n",
+ "\\[ 3x + 6y + 3z - 2x + y - 3z = 4 \\]\n",
+ "\\[ x + 7y = 4 \\]\n",
+ "\\[ \\text{(Equation 4)} \\]\n",
+ "\n",
+ "**Step 2: Eliminate \\(z\\) from the first and third equations.**\n",
+ "\n",
+ "Next, we multiply the first equation by 1 and the third equation by 1 to align the coefficients of \\(z\\):\n",
+ "\\[ 1(x + 2y + z) = 1 \\cdot 6 \\]\n",
+ "\\[ x + 2y + z = 6 \\]\n",
+ "\n",
+ "\\[ 1(3x + y - z) = 1 \\cdot 2 \\]\n",
+ "\\[ 3x + y - z = 2 \\]\n",
+ "\n",
+ "Now, we add these two equations:\n",
+ "\\[ (x + 2y + z) + (3x + y - z) = 6 + 2 \\]\n",
+ "\\[ x + 2y + z + 3x + y - z = 8 \\]\n",
+ "\\[ 4x + 3y = 8 \\]\n",
+ "\\[ \\text{(Equation 5)} \\]\n",
+ "\n",
+ "**Step 3: Solve the system of equations formed by Equation 4 and Equation 5.**\n",
+ "\n",
+ "We now have:\n",
+ "\\[ \\begin{cases} \n",
+ "x + 7y = 4 \\\\\n",
+ "4x + 3y = 8 \n",
+ "\\end{cases} \\]\n",
+ "\n",
+ "First, we solve Equation 4 for \\(x\\):\n",
+ "\\[ x = 4 - 7y \\]\n",
+ "\n",
+ "Substitute \\(x = 4 - 7y\\) into Equation 5:\n",
+ "\\[ 4(4 - 7y) + 3y = 8 \\]\n",
+ "\\[ 16 - 28y + 3y = 8 \\]\n",
+ "\\[ 16 - 25y = 8 \\]\n",
+ "\\[ -25y = 8 - 16 \\]\n",
+ "\\[ -25y = -8 \\]\n",
+ "\\[ y = \\frac{8}{25} \\]\n",
+ "\n",
+ "**Step 4: Substitute \\(y\\) back into Equation 4 to find \\(x\\).**\n",
+ "\n",
+ "\\[ x + 7\\left(\\frac{8}{25}\\right) = 4 \\]\n",
+ "\\[ x + \\frac{56}{25} = 4 \\]\n",
+ "\\[ x = 4 - \\frac{56}{25} \\]\n",
+ "\\[ x = \\frac{100}{25} - \\frac{56}{25} \\]\n",
+ "\\[ x = \\frac{44}{25} \\]\n",
+ "\n",
+ "**Step 5: Substitute \\(x\\) and \\(y\\) back into the first original equation to find \\(z\\).**\n",
+ "\n",
+ "\\[ x + 2y + z = 6 \\]\n",
+ "\\[ \\frac{44}{25} + 2\\left(\\frac{8}{25}\\right) + z = 6 \\]\n",
+ "\\[ \\frac{44}{25} + \\frac{16}{25} + z = 6 \\]\n",
+ "\\[ \\frac{60}{25} + z = 6 \\]\n",
+ "\\[ \\frac{60}{25} = 2.4 \\]\n",
+ "\\[ 2.4 + z = 6 \\]\n",
+ "\\[ z = 6 - 2.4 \\]\n",
+ "\\[ z = 3.6 \\]\n",
+ "\n",
+ "**Final Answer:**\n",
+ "\\[ x = \\frac{44}{25}, y = \\frac{8}{25}, z = 3.6 \\]\n",
+ "\n",
+ "We have verified each step and simplified all expressions. The solution is logically consistent and correctly formatted. \\( x = \\frac{44}{25}, y = \\frac{8}{25}, z = 3.6 \\) \n",
+ "\n",
+ "\n",
+ "For each question present the reasoning followed by the correct answer.\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c61c2f84",
+ "metadata": {},
+ "source": [
+ "#### Scenario 3 : We have training data and also want in-context examples in final prompt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "11d2de75",
+ "metadata": {},
+ "source": [
+ "Load and save the dataset "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if not os.path.exists(\"data\"):\n",
+ " os.mkdir(\"data\")\n",
+ " \n",
+ "dataset = load_dataset(\"openai/gsm8k\", \"main\")\n",
+ "num_samples = 0\n",
+ "for dataset_type in ['train','test']:\n",
+ " data_list = []\n",
+ " for data in dataset[dataset_type]:\n",
+ " data_list.append({\"question\": data['question'], \"answer\": data['answer']})\n",
+ " if num_samples == 100 and dataset_type == 'train': # We sample only 100 train examples and use 25 out them for training randomly\n",
+ " break\n",
+ " num_samples += 1\n",
+ " gsm8k_processor.dataset_to_jsonl(\"data/\"+ dataset_type+'.jsonl', dataset=data_list)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "abf1671a",
+ "metadata": {},
+ "source": [
+ "Set the configurations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc841576",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "file_path = 'configs/promptopt_config.yaml' \n",
+ "config_dict = {\n",
+ " \"task_description\": \"You are a mathematics expert. You will be given a mathematics problem which you need to solve\",\n",
+ " \"base_instruction\": \"Lets think step by step.\",\n",
+ " \"mutation_rounds\": 2,\n",
+ " \"few_shot_count\": 5,\n",
+ " \"generate_reasoning\": True,\n",
+ " \"mutate_refine_iterations\" : 3,\n",
+ " \"seen_set_size\":20\n",
+ " }\n",
+ "update_yaml_file(file_path,config_dict)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3392594d",
+ "metadata": {},
+ "source": [
+ "Create an object for calling prompt optimization and inference functionalities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gp = GluePromptOpt(promptopt_config_path,\n",
+ " setup_config_path,\n",
+ " dataset_jsonl = os.path.join(\"data\", \"train.jsonl\"),\n",
+ " data_processor = gsm8k_processor)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f421ce9",
+ "metadata": {},
+ "source": [
+ "Call the optimization function "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "09e3e6e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True,run_without_train_examples=False,generate_synthetic_examples=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15bb0e80",
+ "metadata": {},
+ "source": [
+ "Output : Following Prompt and Expert Profile are generated "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "696e6612",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "OUTPUT = \"\"\"Expert Identity: You are a mathematics expert with a strong background in various fields of mathematics, including algebra, calculus, geometry, and statistics. You have a deep understanding of mathematical theories and principles, and you are skilled at solving complex problems with precision and clarity. Your expertise allows you to break down intricate problems into manageable steps, making it easier for others to follow your reasoning. You are familiar with a wide range of mathematical techniques and tools, and you can apply them effectively to find solutions. Whether the problem involves solving equations, proving theorems, or analyzing data, you can provide a clear, accurate, and well-explained solution. Your ability to communicate complex mathematical concepts in an understandable way makes you an invaluable resource for anyone seeking help with mathematics.\n",
+ "\n",
+ "Final best prompt: \n",
+ "\n",
+ "You are a mathematics expert. Your task is to solve a given mathematics problem accurately and provide a clear, detailed explanation of your solution process. Follow these steps to ensure a comprehensive and well-structured solution:\n",
+ "\n",
+ "1. **Understand the Problem**: Carefully read and comprehend the problem statement. Identify the key components and what is being asked.\n",
+ "\n",
+ "2. **Identify Components**: Break down the problem into its fundamental components, such as variables, constants, and relevant quantities (e.g., base pay, overtime pay, distances, speeds, etc.).\n",
+ "\n",
+ "3. **Apply Relevant Principles**: Use appropriate mathematical principles, formulas, and methods to solve the problem step by step.\n",
+ "\n",
+ "4. **Logical Reasoning**: Employ logical reasoning to explain each step of your solution process. Ensure that each step follows logically from the previous one.\n",
+ "\n",
+ "5. **Detailed Explanations**: Provide detailed explanations for each step to ensure clarity and understanding. Include intermediate results and how they contribute to the final solution.\n",
+ "\n",
+ "6. **Explicit Calculation Steps**: Show each calculation step in detail, including intermediate results. Use proper mathematical notation and symbols.\n",
+ "\n",
+ "7. **Verify Each Step**: Recheck each intermediate step of your calculation to verify the correctness of the final answer. Ensure that all arithmetic and algebraic operations are accurate.\n",
+ "\n",
+ "8. **Combine Results**: Clearly combine different components of the problem (e.g., base pay and overtime pay) before arriving at the final answer.\n",
+ "\n",
+ "9. **Simplify and Notate**: Simplify the final answer where possible, and use proper mathematical notation and symbols.\n",
+ "\n",
+ "10. **Mark the Final Answer**: Clearly mark the final answer within and tags.\n",
+ "\n",
+ "Ensure that your approach is tailored to the specific type of mathematical problem being solved, whether it involves arithmetic, algebra, geometry, calculus, or any other area of mathematics. Present the solutions in a clear and organized manner.\n",
+ "\n",
+ "**Additional Guidelines:**\n",
+ "- **Contextual Understanding**: Pay close attention to the context of the problem to ensure that all relationships and quantities are correctly interpreted.\n",
+ "- **Correct Application of Arithmetic Operations**: Double-check that all arithmetic operations are applied correctly and align with the problem's requirements.\n",
+ "- **Verification of Final Answer**: Dedicate a step to verify the final answer by rechecking all intermediate steps and ensuring they logically lead to the correct final result.\n",
+ "- **Clarity in Marking Final Answer**: Use the and tags to clearly mark the final answer.\n",
+ "\n",
+ "By following these steps and additional guidelines, you will ensure that the solution is accurate, well-explained, and clearly presented.\n",
+ "\n",
+ "\n",
+ "[Question] Bella bought stamps at the post office. Some of the stamps had a snowflake design, some had a truck design, and some had a rose design. Bella bought 11 snowflake stamps. She bought 9 more truck stamps than snowflake stamps, and 13 fewer rose stamps than truck stamps. How many stamps did Bella buy in all?\n",
+ "[Answer] 1. **Understand the Problem**: Bella bought three types of stamps: snowflake, truck, and rose. We need to determine the total number of stamps she bought, given the relationships between the quantities of each type.\n",
+ "\n",
+ "2. **Identify Components**:\n",
+ " - Number of snowflake stamps: 11.\n",
+ " - Number of truck stamps: 9 more than the number of snowflake stamps.\n",
+ " - Number of rose stamps: 13 fewer than the number of truck stamps.\n",
+ "\n",
+ "3. **Apply Relevant Principles**: Use basic arithmetic operations to find the quantities of truck and rose stamps, and then sum all the quantities to find the total number of stamps.\n",
+ "\n",
+ "4. **Logical Reasoning**:\n",
+ " - Number of snowflake stamps: 11.\n",
+ " - Number of truck stamps: 11 (snowflake stamps) + 9 = 20.\n",
+ " - Number of rose stamps: 20 (truck stamps) - 13 = 7.\n",
+ "\n",
+ "5. **Detailed Explanations**:\n",
+ " - Calculate the number of truck stamps: 11 (snowflake stamps) + 9 = 20.\n",
+ " - Calculate the number of rose stamps: 20 (truck stamps) - 13 = 7.\n",
+ " - Calculate the total number of stamps: 11 (snowflake) + 20 (truck) + 7 (rose) = 38.\n",
+ "\n",
+ "6. **Explicit Calculation Steps**:\n",
+ " - Truck stamps: 11 + 9 = $<11+9=20>20.\n",
+ " - Rose stamps: 20 - 13 = $<20-13=7>7.\n",
+ " - Total stamps: 11 + 20 + 7 = $<11+20+7=38>38.\n",
+ "\n",
+ "7. **Verify Each Step**: Recheck each calculation step to ensure correctness:\n",
+ " - Truck stamps: 11 + 9 = 20.\n",
+ " - Rose stamps: 20 - 13 = 7.\n",
+ " - Total stamps: 11 + 20 + 7 = 38.\n",
+ "\n",
+ "8. **Combine Results**: Combine the number of each type of stamp correctly to find the total number of stamps.\n",
+ "\n",
+ "9. **Simplify and Notate**: The final answer is already simplified.\n",
+ "\n",
+ "10. **Mark the Final Answer**: 38\n",
+ "\n",
+ "By following these steps, we ensure that the solution is accurate, well-explained, and clearly presented. 38\n",
+ "\n",
+ "[Question] It takes Roque two hours to walk to work and one hour to ride his bike to work. Roque walks to and from work three times a week and rides his bike to and from work twice a week. How many hours in total does he take to get to and from work a week with walking and biking?\n",
+ "[Answer] 1. **Understand the Problem**: Roque has two modes of transportation to work: walking and biking. We need to calculate the total time he spends traveling to and from work in a week, considering the different times and frequencies for each mode.\n",
+ "\n",
+ "2. **Identify Components**:\n",
+ " - Time to walk to work: 2 hours (one way).\n",
+ " - Time to bike to work: 1 hour (one way).\n",
+ " - Frequency of walking: 3 times a week (to and from work).\n",
+ " - Frequency of biking: 2 times a week (to and from work).\n",
+ "\n",
+ "3. **Apply Relevant Principles**: Use basic arithmetic to calculate the total time spent walking and biking separately, then sum these times to get the total weekly travel time.\n",
+ "\n",
+ "4. **Logical Reasoning**:\n",
+ " - Calculate the total walking time for a week:\n",
+ " - One round trip (to and from work) by walking takes 2 hours (to work) + 2 hours (from work) = 4 hours.\n",
+ " - Roque walks to and from work 3 times a week, so the total walking time is 4 hours per round trip * 3 round trips = 12 hours.\n",
+ " - Calculate the total biking time for a week:\n",
+ " - One round trip (to and from work) by biking takes 1 hour (to work) + 1 hour (from work) = 2 hours.\n",
+ " - Roque bikes to and from work 2 times a week, so the total biking time is 2 hours per round trip * 2 round trips = 4 hours.\n",
+ "\n",
+ "5. **Detailed Explanations**:\n",
+ " - Walking time calculation:\n",
+ " - One round trip walking: 2 hours (to work) + 2 hours (from work) = 4 hours.\n",
+ " - Total walking time for the week: 4 hours per round trip * 3 round trips = 12 hours.\n",
+ " - Biking time calculation:\n",
+ " - One round trip biking: 1 hour (to work) + 1 hour (from work) = 2 hours.\n",
+ " - Total biking time for the week: 2 hours per round trip * 2 round trips = 4 hours.\n",
+ " - Combine the total walking and biking times to get the total weekly travel time:\n",
+ " - Total weekly travel time: 12 hours (walking) + 4 hours (biking) = 16 hours.\n",
+ "\n",
+ "6. **Explicit Calculation Steps**:\n",
+ " - Walking time: 2 hours (one way) * 2 (round trip) * 3 (times a week) = $<2*2*3=12>12 hours.\n",
+ " - Biking time: 1 hour (one way) * 2 (round trip) * 2 (times a week) = $<1*2*2=4>4 hours.\n",
+ " - Total time: 12 hours (walking) + 4 hours (biking) = $<12+4=16>16 hours.\n",
+ "\n",
+ "7. **Verify Each Step**: Recheck each calculation step to ensure correctness. Confirm that the arithmetic operations and logic used are accurate.\n",
+ "\n",
+ "8. **Combine Results**: Combine the total walking and biking times correctly to ensure the final answer is accurate.\n",
+ "\n",
+ "9. **Simplify and Notate**: The final answer is already simplified and clearly presented.\n",
+ "\n",
+ "10. **Mark the Final Answer**: 16\n",
+ "\n",
+ "By following these steps, we ensure that the solution is accurate, well-explained, and clearly presented. 16\n",
+ "\n",
+ "[Question] Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?\n",
+ "[Answer] 1. **Understand the Problem**: Betty is saving money for a wallet that costs $100. She currently has half of the money she needs. Her parents and grandparents are contributing additional amounts to help her reach her goal. We need to determine how much more money Betty needs to buy the wallet.\n",
+ "\n",
+ "2. **Identify Components**:\n",
+ " - Total cost of the wallet: $100.\n",
+ " - Amount Betty currently has: half of $100.\n",
+ " - Contribution from parents: $15.\n",
+ " - Contribution from grandparents: twice the amount given by parents.\n",
+ "\n",
+ "3. **Apply Relevant Principles**: Use basic arithmetic to calculate the total amount of money Betty will have after receiving contributions from her parents and grandparents, and then determine how much more she needs to reach $100.\n",
+ "\n",
+ "4. **Logical Reasoning**:\n",
+ " - Calculate the amount Betty currently has: $100 / 2 = $50.\n",
+ " - Calculate the contribution from grandparents: 2 * $15 = $30.\n",
+ " - Calculate the total amount of money Betty will have: $50 (current amount) + $15 (parents' contribution) + $30 (grandparents' contribution).\n",
+ "\n",
+ "5. **Detailed Explanations**:\n",
+ " - Betty currently has $50 because half of $100 is $50.\n",
+ " - Her parents give her $15.\n",
+ " - Her grandparents give her twice the amount her parents give, which is 2 * $15 = $30.\n",
+ " - The total amount of money Betty will have is $50 (current amount) + $15 (parents' contribution) + $30 (grandparents' contribution) = $95.\n",
+ "\n",
+ "6. **Explicit Calculation Steps**:\n",
+ " - Current amount: $100 / 2 = $<100/2=50>50.\n",
+ " - Grandparents' contribution: 2 * $15 = $<2*15=30>30.\n",
+ " - Total amount: $50 + $15 + $30 = $<50+15+30=95>95.\n",
+ "\n",
+ "7. **Verify Each Step**: Recheck each calculation step to ensure correctness.\n",
+ " - Current amount: $100 / 2 = $50.\n",
+ " - Grandparents' contribution: 2 * $15 = $30.\n",
+ " - Total amount: $50 + $15 + $30 = $95.\n",
+ "\n",
+ "8. **Combine Results**: Combine the total amount of money Betty will have correctly.\n",
+ " - Total amount: $50 (current amount) + $15 (parents' contribution) + $30 (grandparents' contribution) = $95.\n",
+ "\n",
+ "9. **Simplify and Notate**: The final answer is already simplified.\n",
+ "\n",
+ "10. **Mark the Final Answer**: \n",
+ " - Amount Betty needs to buy the wallet: $100 - $95 = $<100-95=5>5.\n",
+ "\n",
+ "55\n",
+ "\n",
+ "[Question] A rectangle has a length of 10 cm and a width of 5 cm. What is the area and perimeter of the rectangle?\n",
+ "[Answer] 1. **Understand the Problem**: We need to find both the area and the perimeter of a rectangle given its length and width.\n",
+ "\n",
+ "2. **Identify Components**: \n",
+ " - Length of the rectangle (L) = 10 cm\n",
+ " - Width of the rectangle (W) = 5 cm\n",
+ "\n",
+ "3. **Apply Relevant Principles**: \n",
+ " - The formula for the area of a rectangle is \\( \\text{Area} = \\text{Length} \\times \\text{Width} \\).\n",
+ " - The formula for the perimeter of a rectangle is \\( \\text{Perimeter} = 2 \\times (\\text{Length} + \\text{Width}) \\).\n",
+ "\n",
+ "4. **Logical Reasoning**:\n",
+ " - To find the area, multiply the length by the width.\n",
+ " - To find the perimeter, add the length and the width, then multiply the result by 2.\n",
+ "\n",
+ "5. **Detailed Explanations**:\n",
+ " - Calculate the area: \\( \\text{Area} = 10 \\, \\text{cm} \\times 5 \\, \\text{cm} \\).\n",
+ " - Calculate the perimeter: \\( \\text{Perimeter} = 2 \\times (10 \\, \\text{cm} + 5 \\, \\text{cm}) \\).\n",
+ "\n",
+ "6. **Explicit Calculation Steps**:\n",
+ " - Area: \\( 10 \\times 5 = 50 \\, \\text{cm}^2 \\).\n",
+ " - Perimeter: \\( 2 \\times (10 + 5) = 2 \\times 15 = 30 \\, \\text{cm} \\).\n",
+ "\n",
+ "7. **Verify Each Step**: \n",
+ " - Recheck the area calculation: \\( 10 \\times 5 = 50 \\, \\text{cm}^2 \\).\n",
+ " - Recheck the perimeter calculation: \\( 2 \\times 15 = 30 \\, \\text{cm} \\).\n",
+ "\n",
+ "8. **Combine Results**: \n",
+ " - The area of the rectangle is \\( 50 \\, \\text{cm}^2 \\).\n",
+ " - The perimeter of the rectangle is \\( 30 \\, \\text{cm} \\).\n",
+ "\n",
+ "9. **Simplify and Notate**: \n",
+ " - The final answers are already simplified.\n",
+ "\n",
+ "10. **Mark the Final Answer**: \n",
+ " - Area: 50 \\, \\text{cm}^2\n",
+ " - Perimeter: 30 \\, \\text{cm}\n",
+ "\n",
+ "By following these steps, we ensure that the solution is accurate, well-explained, and clearly presented. 50\n",
+ "\n",
+ "[Question] Solve for x in the equation 2x + 3 = 11.\n",
+ "[Answer] **Understand the Problem**: We need to solve for the variable \\( x \\) in the given linear equation \\( 2x + 3 = 11 \\).\n",
+ "\n",
+ "**Identify Components**: \n",
+ "- The equation is \\( 2x + 3 = 11 \\).\n",
+ "- We need to isolate \\( x \\) on one side of the equation.\n",
+ "\n",
+ "**Apply Relevant Principles**: \n",
+ "- Use basic algebraic principles to isolate \\( x \\).\n",
+ "\n",
+ "**Logical Reasoning**:\n",
+ "1. Start with the given equation: \\( 2x + 3 = 11 \\).\n",
+ "2. Subtract 3 from both sides of the equation to isolate the term with \\( x \\):\n",
+ " \\[\n",
+ " 2x + 3 - 3 = 11 - 3\n",
+ " \\]\n",
+ "3. Simplify both sides:\n",
+ " \\[\n",
+ " 2x = 8\n",
+ " \\]\n",
+ "4. Divide both sides by 2 to solve for \\( x \\):\n",
+ " \\[\n",
+ " \\frac{2x}{2} = \\frac{8}{2}\n",
+ " \\]\n",
+ "5. Simplify the division:\n",
+ " \\[\n",
+ " x = 4\n",
+ " \\]\n",
+ "\n",
+ "**Detailed Explanations**:\n",
+ "- Subtracting 3 from both sides removes the constant term on the left side, leaving \\( 2x \\) isolated.\n",
+ "- Dividing both sides by 2 isolates \\( x \\) by removing the coefficient of 2.\n",
+ "\n",
+ "**Explicit Calculation Steps**:\n",
+ "1. \\( 2x + 3 = 11 \\)\n",
+ "2. \\( 2x + 3 - 3 = 11 - 3 \\)\n",
+ "3. \\( 2x = 8 \\)\n",
+ "4. \\( \\frac{2x}{2} = \\frac{8}{2} \\)\n",
+ "5. \\( x = 4 \\)\n",
+ "\n",
+ "**Verify Each Step**:\n",
+ "- Recheck each step to ensure no arithmetic errors:\n",
+ " - Subtracting 3 from 11 gives 8.\n",
+ " - Dividing 8 by 2 gives 4.\n",
+ "\n",
+ "**Combine Results**: The final value of \\( x \\) is correctly isolated and calculated.\n",
+ "\n",
+ "**Simplify and Notate**: The final answer is already simplified.\n",
+ "\n",
+ "**Mark the Final Answer**: 4\n",
+ "\n",
+ "By following these steps, we ensure that the solution is accurate, well-explained, and clearly presented. 4\n",
+ "\n",
+ "\n",
+ "For each question present the reasoning followed by the correct answer.\"\"\""
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "general",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demos/svamp/.env b/demos/svamp/.env
new file mode 100644
index 0000000000000000000000000000000000000000..290a11ba1125d027f0e0f653133f0032c7dd491e
--- /dev/null
+++ b/demos/svamp/.env
@@ -0,0 +1,8 @@
+USE_OPENAI_API_KEY="False"
+
+OPENAI_API_KEY=""
+OPENAI_MODEL_NAME =""
+
+OPENAI_API_VERSION=""
+AZURE_OPENAI_ENDPOINT=""
+AZURE_OPENAI_DEPLOYMENT_NAME=""
\ No newline at end of file
diff --git a/demos/svamp/configs/prompt_library.yaml b/demos/svamp/configs/prompt_library.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..81a2a5f7724defe0974bf3ba3572ca5ca1ae7b36
--- /dev/null
+++ b/demos/svamp/configs/prompt_library.yaml
@@ -0,0 +1,36 @@
+system_prompts: |
+ You are a helpful assistant that assists research students in understanding research papers.
+system_guidelines: |
+ Guidelines
+ - Your role must always be a helpful assistant that assists students in understanding research papers.
+ - Only answer questions that are directly or indirectly related to the referenced paper(s).
+
+mode:
+ chat:
+ - name: CHAT-FIRST-MESSAGE
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ - name: CHAT-NEXT-MESSAGES
+ llm_request_type: rag-query
+ prompt_template: |
+ {user_msg}
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+
+ generation:
+ - name: FLASH_PROFILE
+ prompt_template: |
+ {user_msg}
+ prepend_system_prompts: False
+ prepend_system_guidelines: False
+ llm_request_type: rag-query
+ emb_model_id: text embedding ada 002 [vellm-openai2]
+ llm_model_id: gpt 35 Turbo [vellm-openai2]
\ No newline at end of file
diff --git a/demos/svamp/configs/promptopt_config.yaml b/demos/svamp/configs/promptopt_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9055e3fd9b4ad70cfc5ddb22353e76be44b3412e
--- /dev/null
+++ b/demos/svamp/configs/promptopt_config.yaml
@@ -0,0 +1,52 @@
+# Specify one or more prompt refinement technique to be used. If you specify more than one prompt refinement techniques,
+# all these technique would run on same seed data. Result, iterations needed & cost incurred for each of these
+# technique would be logged. And winning technique for each data instance and overall would be logged.
+
+# Supported prompt refinement techniques: Basic, RecursiveEval, MedPrompt
+# Uncomment techniques that you want to use
+############################ Critique Task Description Start ############################
+prompt_technique_name: "critique_n_refine"
+# unique_model_id of model defined in llm_config.yaml
+unique_model_id: gpt-4o
+# Number of iterations for conducting rounds of mutation of task description
+# followed by refinement of instructions
+mutate_refine_iterations: 3
+# Number of rounds of mutation to be performed when generating different styles
+mutation_rounds: 3
+# Refine instruction post mutation
+refine_instruction: true
+# Number of iterations for refining task description and in context examples for few-shot
+refine_task_eg_iterations: 3
+# Number of variations of prompts to generate in given iteration
+style_variation: 5
+# Number of questions to be asked to LLM in a single batch, during training step
+questions_batch_size: 1
+# Number of batches of questions to correctly answered, for a prompt to be considered as performing good
+min_correct_count: 3
+# Max number of mini-batches on which we should evaluate our prompt
+max_eval_batches: 6
+# Number of top best performing prompts to be considered for next iterations
+top_n: 1
+# Description of task. This will be fed to prompt
+task_description: "You are a mathematics expert. You will be given a mathematics problem which you need to solve"
+# Base instruction, in line with your dataset. This will be fed to prompt
+base_instruction: "Lets think step by step."
+# Instruction for specifying answer format
+answer_format: "At the end, wrap your final answer and option if applicable between and tags"
+# Number of samples from dataset, set aside as training data. In every iteration we would be drawing
+# `questions_batch_size` examples from training data with replacement.
+seen_set_size: 25
+# Number of examples to be given for few shots
+few_shot_count: 5
+# Number of synthetic training examples to be generated
+num_train_examples: 20
+# Generate synthetic reasoning
+generate_reasoning: true
+# Generate description of an expert which can solve the task at hand
+generate_expert_identity: true
+# Generate keywords that describe the intent of the task
+generate_intent_keywords: false
+############################ Critique Task Description End ############################
+
+
+
diff --git a/demos/svamp/configs/setup_config.yaml b/demos/svamp/configs/setup_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5741549da7883a5178700e0b790b4360e52ce19d
--- /dev/null
+++ b/demos/svamp/configs/setup_config.yaml
@@ -0,0 +1,14 @@
+assistant_llm:
+ # put the unique_model_id that you specified in llm_config.yaml
+ prompt_opt: gpt-4o
+dir_info:
+ # Base directory for everything
+ base_dir: logs
+ log_dir_name: glue_logs
+experiment_name: svamp
+# Many features are different for mode: online/offline. For example:
+# 1) Logs are printed to the console in offline mode
+# 2) The LLM queue gets instantiated only in online mode
+mode: offline
+# Full length description of the experiment. This would be logged.
+description:
diff --git a/demos/svamp/demo.ipynb b/demos/svamp/demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..1da0a2404ef2192091c1d4495df752d5b061ed10
--- /dev/null
+++ b/demos/svamp/demo.ipynb
@@ -0,0 +1,295 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "14360485",
+ "metadata": {},
+ "source": [
+ "#### Set environment variables in [.env](.env) for LLM API calling"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6bd95c11",
+ "metadata": {},
+ "source": [
+ "### Import Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f1fb3d81-16b6-4b8c-a028-880fdce5e14a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.insert(0, \"../../\")\n",
+ "import os\n",
+ "import promptwizard\n",
+ "from promptwizard.glue.promptopt.instantiate import GluePromptOpt\n",
+ "from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing\n",
+ "from promptwizard.glue.common.utils.file import save_jsonlist\n",
+ "from typing import Any\n",
+ "from tqdm import tqdm\n",
+ "import json\n",
+ "from datasets import load_dataset\n",
+ "\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv(override = True)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f061d2fd",
+ "metadata": {},
+ "source": [
+ "### Create a dataset specific class and define the required functions "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f325d33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "def extract_between(start, end, text):\n",
+ " \"\"\"\n",
+ " Extracts the substring from 'text' that is between 'start' and 'end' strings.\n",
+ " \n",
+ " Parameters:\n",
+ " - start (str): The starting delimiter string.\n",
+ " - end (str): The ending delimiter string.\n",
+ " - text (str): The text to search within.\n",
+ " \n",
+ " Returns:\n",
+ " - str: The extracted substring between the start and end delimiters.\n",
+ " \"\"\"\n",
+ " start_index = text.find(start)\n",
+ " if start_index == -1:\n",
+ " return '' \n",
+ " \n",
+ " start_index += len(start)\n",
+ " \n",
+ " end_index = text.find(end, start_index)\n",
+ " if end_index == -1:\n",
+ " return '' \n",
+ " return text[start_index:end_index]\n",
+ "\n",
+ "class SVAMP(DatasetSpecificProcessing):\n",
+ "\n",
+ " def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:\n",
+ " def extract_answer_from_output(completion):\n",
+ "\n",
+ " return completion\n",
+ "\n",
+ " examples_set = []\n",
+ "\n",
+ " for _, sample in tqdm(enumerate(kwargs[\"dataset\"]), desc=\"Evaluating samples\"):\n",
+ " example = {\n",
+ " DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],\n",
+ " DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],\n",
+ " DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample[\"answer\"])\n",
+ " }\n",
+ " examples_set.append(example)\n",
+ "\n",
+ " save_jsonlist(dataset_jsonl, examples_set, \"w\")\n",
+ "\n",
+ " def extract_final_answer(self, answer: str):\n",
+ " \n",
+ " final_answer = extract_between(text=answer,start=\"\",end=\"\")\n",
+ " return final_answer\n",
+ " \n",
+ " def access_answer(self, llm_output: str, gt_answer: str):\n",
+ "\n",
+ " predicted_answer = self.extract_final_answer(llm_output)\n",
+ " is_correct = False\n",
+ " if predicted_answer and (predicted_answer.lower() == gt_answer.lower()):\n",
+ " is_correct = True\n",
+ "\n",
+ " return is_correct, predicted_answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f384eb57",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "svamp_processor = SVAMP()"
+ ]
+ },
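+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4",
+ "metadata": {},
+ "source": [
+ "Optional sanity check (illustrative, not part of the original demo): the cell below exercises ```access_answer``` on a toy model output, assuming the ```<ANS_START>```/```<ANS_END>``` delimiters used by ```extract_final_answer``` above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2c3d4e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Toy LLM output wrapping the final answer in the expected delimiters\n",
+ "sample_llm_output = \"Reasoning steps ... <ANS_START>42<ANS_END>\"\n",
+ "is_correct, predicted = svamp_processor.access_answer(sample_llm_output, \"42\")\n",
+ "print(is_correct, predicted)  # expected: True 42\n"
+ ]
+ },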
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "976681bd-4f43-4dbc-947e-cdb94d4824f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "if not os.path.exists(\"data\"):\n",
+ " os.mkdir(\"data\")\n",
+ "\n",
+ "dataset = load_dataset(\"ChilleD/SVAMP\")\n",
+ "\n",
+ "for dataset_type in ['train','test']:\n",
+ " data_list = []\n",
+ " num_samples = 0\n",
+ " for data in dataset[dataset_type]:\n",
+ " data_list.append({\"question\": data['question_concat'], \"answer\": data['Answer']})\n",
+ " if dataset_type == 'train' and num_samples == 100: # We sample only 100 train examples and use 25 out them for training randomly\n",
+ " break\n",
+ " num_samples += 1\n",
+ " svamp_processor.dataset_to_jsonl(\"data/\"+ dataset_type+'.jsonl', dataset=data_list)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4852b94b",
+ "metadata": {},
+ "source": [
+ "### Set paths"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "f43482f1-3e10-4cf7-8ea6-ff42c04067a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_file_name = os.path.join(\"data\", \"train.jsonl\")\n",
+ "test_file_name = os.path.join(\"data\", \"test.jsonl\")\n",
+ "path_to_config = \"configs\"\n",
+ "llm_config_path = os.path.join(path_to_config, \"llm_config.yaml\")\n",
+ "promptopt_config_path = os.path.join(path_to_config, \"promptopt_config.yaml\")\n",
+ "setup_config_path = os.path.join(path_to_config, \"setup_config.yaml\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f7ba6394",
+ "metadata": {},
+ "source": [
+ "### Create an object for calling prompt optimization and inference functionalities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8af4246f-db32-4b37-a73a-f9e2e5125d09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gp = GluePromptOpt(promptopt_config_path,\n",
+ " setup_config_path,\n",
+ " train_file_name,\n",
+ " svamp_processor)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6e38ea08",
+ "metadata": {},
+ "source": [
+ "### Call prompt optmization function\n",
+ "1. ```use_examples``` can be used when there are training samples and a mixture of real and synthetic in-context examples are required in the final prompt. When set to ```False``` all the in-context examples will be real\n",
+ "2. ```generate_synthetic_examples``` can be used when there are no training samples and we want to generate synthetic examples \n",
+ "3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt "
+ ]
+ },
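+ {
+ "cell_type": "markdown",
+ "id": "c3d4e5f6",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of how the three scenarios above map onto ```get_best_prompt``` flags; the combinations are inferred from the descriptions above and are illustrative only. The actual optimization call used in this demo follows in the next code cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4e5f6a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative flag combinations for the three scenarios above (assumed mapping, not all executed in this demo):\n",
+ "# 1. Training data available, mix of real and synthetic in-context examples in the final prompt:\n",
+ "#    gp.get_best_prompt(use_examples=True, run_without_train_examples=False, generate_synthetic_examples=False)\n",
+ "# 2. No training data, first generate synthetic training examples:\n",
+ "#    gp.get_best_prompt(use_examples=False, run_without_train_examples=False, generate_synthetic_examples=True)\n",
+ "# 3. No training data and no in-context examples in the final prompt:\n",
+ "#    gp.get_best_prompt(use_examples=False, run_without_train_examples=True, generate_synthetic_examples=False)\n"
+ ]
+ },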
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "573c6151-2c03-45d9-9904-1724a1e20f1b",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Function call to generate optimal prompt and expert profile \n",
+ "best_prompt, expert_profile = gp.get_best_prompt(use_examples=True,run_without_train_examples=False,generate_synthetic_examples=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bae1a791",
+ "metadata": {},
+ "source": [
+ "### Save the optimized prompt and expert profile"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34a716af-0d77-4c7d-b1c2-6438d66096ce",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import pickle \n",
+ "\n",
+ "if not os.path.exists(\"results\"):\n",
+ " os.system(\"mkdir results\")\n",
+ "\n",
+ "with open(\"results/best_prompt.pkl\", 'wb') as f:\n",
+ " pickle.dump(best_prompt, f)\n",
+ "with open(\"results/expert_profile.pkl\", 'wb') as f:\n",
+ " pickle.dump(expert_profile, f)\n",
+ "\n",
+ "print(f\"Best prompt: {best_prompt} \\nExpert profile: {expert_profile}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b7691a87",
+ "metadata": {},
+ "source": [
+ "### Evaluate the optimized prompt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c49b5711-82dd-4d18-8cd4-ee447cf8d74c",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "gp.EXPERT_PROFILE = expert_profile\n",
+ "gp.BEST_PROMPT = best_prompt\n",
+ "\n",
+ "# Function call to evaluate the prompt\n",
+ "accuracy = gp.evaluate(test_file_name)\n",
+ "\n",
+ "print(f\"Final Accuracy: {accuracy}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/images/arithmetic_task.png b/docs/images/arithmetic_task.png
new file mode 100644
index 0000000000000000000000000000000000000000..9f7d01df19809e8318b8f7fc118774d3793076e5
Binary files /dev/null and b/docs/images/arithmetic_task.png differ
diff --git a/docs/images/bigbench.png b/docs/images/bigbench.png
new file mode 100644
index 0000000000000000000000000000000000000000..72ff71accb965f704733949e857f696ae11946e9
Binary files /dev/null and b/docs/images/bigbench.png differ
diff --git a/docs/images/comaprision.png b/docs/images/comaprision.png
new file mode 100644
index 0000000000000000000000000000000000000000..54eff4b2c41a478a2c2a5a3912f336bcc8083c7d
--- /dev/null
+++ b/docs/images/comaprision.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ace953e64449bdfaac42a9e587e3c1f37447755aad8c522c6d02be6d7e925c65
+size 130021
diff --git a/docs/images/cost_analysis.png b/docs/images/cost_analysis.png
new file mode 100644
index 0000000000000000000000000000000000000000..eb52cfada4d768fc1b2a88008fcdbf20e99a8b82
Binary files /dev/null and b/docs/images/cost_analysis.png differ
diff --git a/docs/images/curve.png b/docs/images/curve.png
new file mode 100644
index 0000000000000000000000000000000000000000..485146cc2e0c086cbde0e8417d2fc40e3c3ca6dc
Binary files /dev/null and b/docs/images/curve.png differ
diff --git a/docs/images/github.png b/docs/images/github.png
new file mode 100644
index 0000000000000000000000000000000000000000..2ed19ff70379a20472efa1bad8c1b39589eea8f8
Binary files /dev/null and b/docs/images/github.png differ
diff --git a/docs/images/icl_results.png b/docs/images/icl_results.png
new file mode 100644
index 0000000000000000000000000000000000000000..35814b640293492af5fa4239725561dd5844c5dd
Binary files /dev/null and b/docs/images/icl_results.png differ
diff --git a/docs/images/iterative_flowchart-1.png b/docs/images/iterative_flowchart-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..97a6ef100a05e29804bbcee10b6e97a45d475968
--- /dev/null
+++ b/docs/images/iterative_flowchart-1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:692c18bece5f26e48a8549c19b6f9969a284a0ce2e00d510da461ad579770f3e
+size 167646
diff --git a/docs/images/msr_blog.png b/docs/images/msr_blog.png
new file mode 100644
index 0000000000000000000000000000000000000000..190599a345b1f3f17f3ed7059263d01744e52a79
Binary files /dev/null and b/docs/images/msr_blog.png differ
diff --git a/docs/images/overview.png b/docs/images/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..359c8993021230a1573cdd5738829fc2d97c5066
--- /dev/null
+++ b/docs/images/overview.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dbee997ee3b194173cd2cc5fee24dfef3cc28311bf3e28778e66fb1ce8ca9ae
+size 251622
diff --git a/docs/images/ppc.png b/docs/images/ppc.png
new file mode 100644
index 0000000000000000000000000000000000000000..d62b995001a70b8235e19c86d45227b6d03ed234
--- /dev/null
+++ b/docs/images/ppc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85152be471400c6927f4d3c5d201755564d719dbe2602f5499c1c7179c66b607
+size 112929
diff --git a/docs/images/ppc_1.png b/docs/images/ppc_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..eae19294cc6976087e58e58311b15486209fcfdd
--- /dev/null
+++ b/docs/images/ppc_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04a31930771409f59afb7ac8ac5207b77ed526b12592a87c44d773ad10d95e9f
+size 132339
diff --git a/docs/images/prompting.png b/docs/images/prompting.png
new file mode 100644
index 0000000000000000000000000000000000000000..40b0fdfd41553b5998ea716df0eb54ebd423623c
Binary files /dev/null and b/docs/images/prompting.png differ
diff --git a/docs/images/sequential_flowchart-1.png b/docs/images/sequential_flowchart-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..edf79152b4bb6f3f13695834720cb657639463ca
Binary files /dev/null and b/docs/images/sequential_flowchart-1.png differ
diff --git a/docs/images/slm_prompt.png b/docs/images/slm_prompt.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c24682739e8ace90fa1ea331e3f8e7b85a412a8
Binary files /dev/null and b/docs/images/slm_prompt.png differ
diff --git a/docs/index.html b/docs/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..e5b134012654c66c1bb8ae38cba1373455b55a02
--- /dev/null
+++ b/docs/index.html
@@ -0,0 +1,784 @@
+
+
+
+
+
+
+
+ PromptWizard
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ PromptWizard is an open source framework for automated prompt and example optimization, leveraging a feedback-driven critique and synthesis process to balance exploration and exploitation. It consistently outperforms state-of-the-art methods while significantly reducing computational costs, enabling efficient and scalable prompt engineering across diverse tasks and LLMs.
+
+
+
+
+
+
+
+
+
+
+
Overview
+
+ Large language models (LLMs) like GPT-4 have achieved remarkable performance across diverse tasks. At the core of this success is prompting, the process of providing input instructions to guide models toward desired outputs. Studies have shown that prompting significantly influences LLM performance, making prompt engineering, the design and refinement of prompts, critical for maximizing accuracy. However, crafting effective prompts remains a labor-intensive and domain-specific task, requiring human expertise and subjective judgment. As models evolve and tasks vary, the need to repeatedly design prompts raises an important question: Can prompt engineering be automated to streamline this process and enhance scalability?
+
+
+
+
+
+
+
+
+
+
+
+
+
Motivation
+
+
+
Prompting is central to LLMs!
+
+
Prompting: The process of providing input instructions to guide models towards desired output
+
+ Prompt Engineering: The process of designing and refining prompts
+
+ Creating effective prompts is a challenge because:
+
+
The task is labor-intensive
+
Prompts need to be domain-specific to work effectively
+
+ It often requires human expertise and is subjective
+
+ Also, as models and tasks evolve, there is a need for repeated prompt design
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
PromptWizard Working
+
+
+ PromptWizard (PW) is a discrete prompt optimization framework that employs a self-evolving mechanism where the LLM generates, critiques, and refines its own prompts and examples, continuously improving through iterative feedback and synthesis. This self-adaptive approach ensures holistic optimization by evolving both the instructions and in-context learning examples for better task performance.
+
+
Three Key Insights :
+
+
+
+ Feedback-driven Refinement: The LLM generates, critiques, and refines its own prompts and examples, continuously improving through iterative feedback and synthesis
+
+
+ Critique and Synthesize diverse examples: Generates synthetic examples that are robust, diverse, and task-aware. It also optimizes both the prompt and the examples in tandem
+
+
Self generated Chain of Thought (CoT) steps with combination of positive, negative and synthetic examples
+
+
+
+ Following are the details of each step:
+
+
+
+
+
+
+
+ PromptWizard uses a systematic, feedback-driven process that incorporates a critique component to provide feedback, guiding and refining the prompt over multiple iterations
+
+ The following steps carry this out systematically:
+
+
+ Mutate: Takes an initial problem description and thinking styles to generate candidate prompts
+
+ Scoring: Evaluates the performance of the generated prompts to determine the best prompt
+
+ Critique: Reviews where the prompt succeeded and failed by analyzing cases where the LLM struggled
+
+ Synthesize: Uses the critique's feedback to refine the best prompt
+
+
+
+
+
+
+
+
+
+
+
+ PromptWizard improves both prompt instructions and few-shot examples in tandem
+
+ It uses self-reflection to synthesize examples that are diverse and task-relevant
+
+ An iterative feedback loop continuously refines both the prompt and the few-shot examples
+
+ Few-shot example optimization:
+
+
+ Critique: Analyzes previously selected examples and uses the feedback to determine how the examples should evolve
+
+ Synthesize: Incorporates feedback to generate new synthetic examples that are more diverse, robust, and task-relevant
+
+
+ Prompt instruction optimization:
+
+
+ Critique: Identifies weaknesses and gaps that must be addressed to further refine the prompt instruction
+
+ Synthesize: Leverages feedback from the critique to synthesize and refine the prompt instruction
+
+
+
+
+
+
+
+
+
+
+ Incorporating chain-of-thought (CoT) reasoning improves the problem-solving abilities of the model
+
+ CoT reasoning takes the selected few-shot examples and generates a detailed reasoning chain for each example to facilitate problem-solving
+
+ An LLM is used to check the coherence and relevance of the examples
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Results
+
+
+
+
+
+
+
+
PromptWizard outperforms the baselines, achieving the highest accuracy on 13/19 tasks (68%) with 0-shot and 16/19 (84%) with 1-shot
+
+
+
+
PromptWizard consistently performs near the best possible accuracy across all tasks
+
+
+
+
+ PromptWizard costs just $0.05 per task, a 5-60x reduction in overall tokens/cost
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ PromptWizard using Llama-70B shows a negligible < 1% drop in accuracy
+
+
+
+
+
+
+
+
+ PromptWizard shows strong resilience even with fewer training samples, mainly due to synthetic example generation and reasoning chains
+
+
+
+
+
+
+
+
+ Substantial performance improvements across all models when optimized prompts are generated by PromptWizard on the GSM8k dataset
+
+
+
+
+
+
+
+
Dataset
+
Accuracy (high)
+
+
+
+
DSPy
+
PromptAgent
+
APO
+
PW
+
+
+
GSM8k
+
78.2
+
68.84
+
25.67
+
90
+
+
+
AQUARAT
+
55.1
+
56.67
+
20.12
+
58.2
+
+
+
SVAMP
+
77
+
78.67
+
75.25
+
82.3
+
+
+
ETHOS
+
84.1
+
84.25
+
80.62
+
89.4
+
+
+
+
+
+
Dataset
+
Calls (low)
+
+
+
+
DSPy
+
PromptAgent
+
APO
+
PW
+
+
+
GSM8k
+
915
+
2115
+
8490
+
147
+
+
+
AQUARAT
+
920
+
2200
+
8500
+
112
+
+
+
SVAMP
+
2300
+
2111
+
8000
+
178
+
+
+
ETHOS
+
660
+
2217
+
8200
+
80
+
+
+
+
+
+
Dataset
+
Tokens (low)
+
+
+
+
DSPy
+
PromptAgent
+
APO
+
PW
+
+
+
GSM8k
+
262
+
500
+
109
+
237
+
+
+
AQUARAT
+
326
+
875
+
125
+
200
+
+
+
SVAMP
+
189
+
680
+
85
+
127
+
+
+
ETHOS
+
175
+
417
+
55
+
190
+
+
+
+
+
+ PromptWizard outperforms feedback-based methods like APO and PromptAgent, as well as other prompt optimization techniques like DSPy, in terms of accuracy and the number of API calls needed for optimization on various datasets.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
BibTeX
+
@misc{agarwal2024promptwizardtaskawarepromptoptimization,
+ title={PromptWizard: Task-Aware Prompt Optimization Framework},
+ author={Eshaan Agarwal and Joykirat Singh and Vivek Dani and Raghav Magazine and Tanuja Ganu and Akshay Nambi},
+ year={2024},
+ eprint={2405.18369},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2405.18369},
+}
\ No newline at end of file
diff --git a/docs/static/js/bulma-slider.js b/docs/static/js/bulma-slider.js
new file mode 100644
index 0000000000000000000000000000000000000000..c6718de5c5ae59d2c22141a147f5afba41af9cbb
--- /dev/null
+++ b/docs/static/js/bulma-slider.js
@@ -0,0 +1,461 @@
+(function webpackUniversalModuleDefinition(root, factory) {
+ if(typeof exports === 'object' && typeof module === 'object')
+ module.exports = factory();
+ else if(typeof define === 'function' && define.amd)
+ define([], factory);
+ else if(typeof exports === 'object')
+ exports["bulmaSlider"] = factory();
+ else
+ root["bulmaSlider"] = factory();
+})(typeof self !== 'undefined' ? self : this, function() {
+return /******/ (function(modules) { // webpackBootstrap
+/******/ // The module cache
+/******/ var installedModules = {};
+/******/
+/******/ // The require function
+/******/ function __webpack_require__(moduleId) {
+/******/
+/******/ // Check if module is in cache
+/******/ if(installedModules[moduleId]) {
+/******/ return installedModules[moduleId].exports;
+/******/ }
+/******/ // Create a new module (and put it into the cache)
+/******/ var module = installedModules[moduleId] = {
+/******/ i: moduleId,
+/******/ l: false,
+/******/ exports: {}
+/******/ };
+/******/
+/******/ // Execute the module function
+/******/ modules[moduleId].call(module.exports, module, module.exports, __webpack_require__);
+/******/
+/******/ // Flag the module as loaded
+/******/ module.l = true;
+/******/
+/******/ // Return the exports of the module
+/******/ return module.exports;
+/******/ }
+/******/
+/******/
+/******/ // expose the modules object (__webpack_modules__)
+/******/ __webpack_require__.m = modules;
+/******/
+/******/ // expose the module cache
+/******/ __webpack_require__.c = installedModules;
+/******/
+/******/ // define getter function for harmony exports
+/******/ __webpack_require__.d = function(exports, name, getter) {
+/******/ if(!__webpack_require__.o(exports, name)) {
+/******/ Object.defineProperty(exports, name, {
+/******/ configurable: false,
+/******/ enumerable: true,
+/******/ get: getter
+/******/ });
+/******/ }
+/******/ };
+/******/
+/******/ // getDefaultExport function for compatibility with non-harmony modules
+/******/ __webpack_require__.n = function(module) {
+/******/ var getter = module && module.__esModule ?
+/******/ function getDefault() { return module['default']; } :
+/******/ function getModuleExports() { return module; };
+/******/ __webpack_require__.d(getter, 'a', getter);
+/******/ return getter;
+/******/ };
+/******/
+/******/ // Object.prototype.hasOwnProperty.call
+/******/ __webpack_require__.o = function(object, property) { return Object.prototype.hasOwnProperty.call(object, property); };
+/******/
+/******/ // __webpack_public_path__
+/******/ __webpack_require__.p = "";
+/******/
+/******/ // Load entry module and return exports
+/******/ return __webpack_require__(__webpack_require__.s = 0);
+/******/ })
+/************************************************************************/
+/******/ ([
+/* 0 */
+/***/ (function(module, __webpack_exports__, __webpack_require__) {
+
+"use strict";
+Object.defineProperty(__webpack_exports__, "__esModule", { value: true });
+/* harmony export (binding) */ __webpack_require__.d(__webpack_exports__, "isString", function() { return isString; });
+/* harmony import */ var __WEBPACK_IMPORTED_MODULE_0__events__ = __webpack_require__(1);
+var _extends = Object.assign || function (target) { for (var i = 1; i < arguments.length; i++) { var source = arguments[i]; for (var key in source) { if (Object.prototype.hasOwnProperty.call(source, key)) { target[key] = source[key]; } } } return target; };
+
+var _createClass = function () { function defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } return function (Constructor, protoProps, staticProps) { if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor; }; }();
+
+var _typeof = typeof Symbol === "function" && typeof Symbol.iterator === "symbol" ? function (obj) { return typeof obj; } : function (obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; };
+
+function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } }
+
+function _possibleConstructorReturn(self, call) { if (!self) { throw new ReferenceError("this hasn't been initialised - super() hasn't been called"); } return call && (typeof call === "object" || typeof call === "function") ? call : self; }
+
+function _inherits(subClass, superClass) { if (typeof superClass !== "function" && superClass !== null) { throw new TypeError("Super expression must either be null or a function, not " + typeof superClass); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, enumerable: false, writable: true, configurable: true } }); if (superClass) Object.setPrototypeOf ? Object.setPrototypeOf(subClass, superClass) : subClass.__proto__ = superClass; }
+
+
+
+var isString = function isString(unknown) {
+ return typeof unknown === 'string' || !!unknown && (typeof unknown === 'undefined' ? 'undefined' : _typeof(unknown)) === 'object' && Object.prototype.toString.call(unknown) === '[object String]';
+};
+
+var bulmaSlider = function (_EventEmitter) {
+ _inherits(bulmaSlider, _EventEmitter);
+
+ function bulmaSlider(selector) {
+ var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
+
+ _classCallCheck(this, bulmaSlider);
+
+ var _this = _possibleConstructorReturn(this, (bulmaSlider.__proto__ || Object.getPrototypeOf(bulmaSlider)).call(this));
+
+ _this.element = typeof selector === 'string' ? document.querySelector(selector) : selector;
+ // An invalid selector or non-DOM node has been provided.
+ if (!_this.element) {
+ throw new Error('An invalid selector or non-DOM node has been provided.');
+ }
+
+ _this._clickEvents = ['click'];
+ /// Set default options and merge with instance defined
+ _this.options = _extends({}, options);
+
+ _this.onSliderInput = _this.onSliderInput.bind(_this);
+
+ _this.init();
+ return _this;
+ }
+
+ /**
+ * Initiate all DOM element containing selector
+ * @method
+ * @return {Array} Array of all slider instances
+ */
+
+
+ _createClass(bulmaSlider, [{
+ key: 'init',
+
+
+ /**
+ * Initiate plugin
+ * @method init
+ * @return {void}
+ */
+ value: function init() {
+ this._id = 'bulmaSlider' + new Date().getTime() + Math.floor(Math.random() * Math.floor(9999));
+ this.output = this._findOutputForSlider();
+
+ this._bindEvents();
+
+ if (this.output) {
+ if (this.element.classList.contains('has-output-tooltip')) {
+ // Get new output position
+ var newPosition = this._getSliderOutputPosition();
+
+ // Set output position
+ this.output.style['left'] = newPosition.position;
+ }
+ }
+
+ this.emit('bulmaslider:ready', this.element.value);
+ }
+ }, {
+ key: '_findOutputForSlider',
+ value: function _findOutputForSlider() {
+ var _this2 = this;
+
+ var result = null;
+ var outputs = document.getElementsByTagName('output') || [];
+
+ Array.from(outputs).forEach(function (output) {
+ if (output.htmlFor == _this2.element.getAttribute('id')) {
+ result = output;
+ return true;
+ }
+ });
+ return result;
+ }
+ }, {
+ key: '_getSliderOutputPosition',
+ value: function _getSliderOutputPosition() {
+ // Update output position
+ var newPlace, minValue;
+
+ var style = window.getComputedStyle(this.element, null);
+ // Measure width of range input
+ var sliderWidth = parseInt(style.getPropertyValue('width'), 10);
+
+ // Figure out placement percentage between left and right of input
+ if (!this.element.getAttribute('min')) {
+ minValue = 0;
+ } else {
+ minValue = this.element.getAttribute('min');
+ }
+ var newPoint = (this.element.value - minValue) / (this.element.getAttribute('max') - minValue);
+
+ // Prevent bubble from going beyond left or right (unsupported browsers)
+ if (newPoint < 0) {
+ newPlace = 0;
+ } else if (newPoint > 1) {
+ newPlace = sliderWidth;
+ } else {
+ newPlace = sliderWidth * newPoint;
+ }
+
+ return {
+ 'position': newPlace + 'px'
+ };
+ }
+
+ /**
+ * Bind all events
+ * @method _bindEvents
+ * @return {void}
+ */
+
+ }, {
+ key: '_bindEvents',
+ value: function _bindEvents() {
+ if (this.output) {
+ // Add event listener to update output when slider value change
+ this.element.addEventListener('input', this.onSliderInput, false);
+ }
+ }
+ }, {
+ key: 'onSliderInput',
+ value: function onSliderInput(e) {
+ e.preventDefault();
+
+ if (this.element.classList.contains('has-output-tooltip')) {
+ // Get new output position
+ var newPosition = this._getSliderOutputPosition();
+
+ // Set output position
+ this.output.style['left'] = newPosition.position;
+ }
+
+ // Check for prefix and postfix
+ var prefix = this.output.hasAttribute('data-prefix') ? this.output.getAttribute('data-prefix') : '';
+ var postfix = this.output.hasAttribute('data-postfix') ? this.output.getAttribute('data-postfix') : '';
+
+ // Update output with slider value
+ this.output.value = prefix + this.element.value + postfix;
+
+ this.emit('bulmaslider:ready', this.element.value);
+ }
+ }], [{
+ key: 'attach',
+ value: function attach() {
+ var _this3 = this;
+
+ var selector = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 'input[type="range"].slider';
+ var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
+
+ var instances = new Array();
+
+ var elements = isString(selector) ? document.querySelectorAll(selector) : Array.isArray(selector) ? selector : [selector];
+ elements.forEach(function (element) {
+ if (typeof element[_this3.constructor.name] === 'undefined') {
+ var instance = new bulmaSlider(element, options);
+ element[_this3.constructor.name] = instance;
+ instances.push(instance);
+ } else {
+ instances.push(element[_this3.constructor.name]);
+ }
+ });
+
+ return instances;
+ }
+ }]);
+
+ return bulmaSlider;
+}(__WEBPACK_IMPORTED_MODULE_0__events__["a" /* default */]);
+
+/* harmony default export */ __webpack_exports__["default"] = (bulmaSlider);
+
+/***/ }),
+/* 1 */
+/***/ (function(module, __webpack_exports__, __webpack_require__) {
+
+"use strict";
+var _createClass = function () { function defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } return function (Constructor, protoProps, staticProps) { if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor; }; }();
+
+function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } }
+
+var EventEmitter = function () {
+ function EventEmitter() {
+ var listeners = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : [];
+
+ _classCallCheck(this, EventEmitter);
+
+ this._listeners = new Map(listeners);
+ this._middlewares = new Map();
+ }
+
+ _createClass(EventEmitter, [{
+ key: "listenerCount",
+ value: function listenerCount(eventName) {
+ if (!this._listeners.has(eventName)) {
+ return 0;
+ }
+
+ var eventListeners = this._listeners.get(eventName);
+ return eventListeners.length;
+ }
+ }, {
+ key: "removeListeners",
+ value: function removeListeners() {
+ var _this = this;
+
+ var eventName = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : null;
+ var middleware = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;
+
+ if (eventName !== null) {
+ if (Array.isArray(eventName)) {
+          eventName.forEach(function (e) {
+ return _this.removeListeners(e, middleware);
+ });
+ } else {
+ this._listeners.delete(eventName);
+
+ if (middleware) {
+ this.removeMiddleware(eventName);
+ }
+ }
+ } else {
+ this._listeners = new Map();
+ }
+ }
+ }, {
+ key: "middleware",
+ value: function middleware(eventName, fn) {
+ var _this2 = this;
+
+ if (Array.isArray(eventName)) {
+        eventName.forEach(function (e) {
+ return _this2.middleware(e, fn);
+ });
+ } else {
+ if (!Array.isArray(this._middlewares.get(eventName))) {
+ this._middlewares.set(eventName, []);
+ }
+
+ this._middlewares.get(eventName).push(fn);
+ }
+ }
+ }, {
+ key: "removeMiddleware",
+ value: function removeMiddleware() {
+ var _this3 = this;
+
+ var eventName = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : null;
+
+ if (eventName !== null) {
+ if (Array.isArray(eventName)) {
+          eventName.forEach(function (e) {
+ return _this3.removeMiddleware(e);
+ });
+ } else {
+ this._middlewares.delete(eventName);
+ }
+ } else {
+ this._middlewares = new Map();
+ }
+ }
+ }, {
+ key: "on",
+ value: function on(name, callback) {
+ var _this4 = this;
+
+ var once = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;
+
+ if (Array.isArray(name)) {
+ name.forEach(function (e) {
+ return _this4.on(e, callback);
+ });
+ } else {
+ name = name.toString();
+ var split = name.split(/,|, | /);
+
+ if (split.length > 1) {
+ split.forEach(function (e) {
+ return _this4.on(e, callback);
+ });
+ } else {
+ if (!Array.isArray(this._listeners.get(name))) {
+ this._listeners.set(name, []);
+ }
+
+ this._listeners.get(name).push({ once: once, callback: callback });
+ }
+ }
+ }
+ }, {
+ key: "once",
+ value: function once(name, callback) {
+ this.on(name, callback, true);
+ }
+ }, {
+ key: "emit",
+ value: function emit(name, data) {
+ var _this5 = this;
+
+ var silent = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;
+
+ name = name.toString();
+ var listeners = this._listeners.get(name);
+ var middlewares = null;
+ var doneCount = 0;
+ var execute = silent;
+
+ if (Array.isArray(listeners)) {
+ listeners.forEach(function (listener, index) {
+ // Start Middleware checks unless we're doing a silent emit
+ if (!silent) {
+ middlewares = _this5._middlewares.get(name);
+ // Check and execute Middleware
+ if (Array.isArray(middlewares)) {
+ middlewares.forEach(function (middleware) {
+ middleware(data, function () {
+ var newData = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : null;
+
+ if (newData !== null) {
+ data = newData;
+ }
+ doneCount++;
+ }, name);
+ });
+
+ if (doneCount >= middlewares.length) {
+ execute = true;
+ }
+ } else {
+ execute = true;
+ }
+ }
+
+ // If Middleware checks have been passed, execute
+ if (execute) {
+ if (listener.once) {
+ listeners[index] = null;
+ }
+ listener.callback(data);
+ }
+ });
+
+ // Dirty way of removing used Events
+ while (listeners.indexOf(null) !== -1) {
+ listeners.splice(listeners.indexOf(null), 1);
+ }
+ }
+ }
+ }]);
+
+ return EventEmitter;
+}();
+
+/* harmony default export */ __webpack_exports__["a"] = (EventEmitter);
+
+/***/ })
+/******/ ])["default"];
+});
\ No newline at end of file
diff --git a/docs/static/js/bulma-slider.min.js b/docs/static/js/bulma-slider.min.js
new file mode 100644
index 0000000000000000000000000000000000000000..7e62685763cf7668cfa8857fac0b27af2c277286
--- /dev/null
+++ b/docs/static/js/bulma-slider.min.js
@@ -0,0 +1 @@
+!function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.bulmaSlider=e():t.bulmaSlider=e()}("undefined"!=typeof self?self:this,function(){return function(n){var r={};function i(t){if(r[t])return r[t].exports;var e=r[t]={i:t,l:!1,exports:{}};return n[t].call(e.exports,e,e.exports,i),e.l=!0,e.exports}return i.m=n,i.c=r,i.d=function(t,e,n){i.o(t,e)||Object.defineProperty(t,e,{configurable:!1,enumerable:!0,get:n})},i.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return i.d(e,"a",e),e},i.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},i.p="",i(i.s=0)}([function(t,e,n){"use strict";Object.defineProperty(e,"__esModule",{value:!0}),n.d(e,"isString",function(){return l});var r=n(1),i=Object.assign||function(t){for(var e=1;e=l.length&&(s=!0)):s=!0),s&&(t.once&&(u[e]=null),t.callback(r))});-1!==u.indexOf(null);)u.splice(u.indexOf(null),1)}}]),e}();e.a=i}]).default});
\ No newline at end of file
diff --git a/docs/static/js/fontawesome.all.min.js b/docs/static/js/fontawesome.all.min.js
new file mode 100644
index 0000000000000000000000000000000000000000..9ee22fdb7753983bae3986b2436bdd167730cd5b
--- /dev/null
+++ b/docs/static/js/fontawesome.all.min.js
@@ -0,0 +1,5 @@
+/*!
+ * Font Awesome Free 5.15.1 by @fontawesome - https://fontawesome.com
+ * License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
+ */
+!function(){"use strict";var c={},l={};try{"undefined"!=typeof window&&(c=window),"undefined"!=typeof document&&(l=document)}catch(c){}var h=(c.navigator||{}).userAgent,z=void 0===h?"":h,a=c,v=l,m=(a.document,!!v.documentElement&&!!v.head&&"function"==typeof v.addEventListener&&v.createElement,~z.indexOf("MSIE")||z.indexOf("Trident/"),"___FONT_AWESOME___"),e=function(){try{return!0}catch(c){return!1}}();var s=a||{};s[m]||(s[m]={}),s[m].styles||(s[m].styles={}),s[m].hooks||(s[m].hooks={}),s[m].shims||(s[m].shims=[]);var t=s[m];function M(c,z){var l=(2>>0;h--;)l[h]=c[h];return l}function Ac(c){return c.classList?bc(c.classList):(c.getAttribute("class")||"").split(" ").filter(function(c){return c})}function gc(c,l){var h,z=l.split("-"),a=z[0],v=z.slice(1).join("-");return a!==c||""===v||(h=v,~T.indexOf(h))?null:v}function Sc(c){return"".concat(c).replace(/&/g,"&").replace(/"/g,""").replace(/'/g,"'").replace(//g,">")}function yc(h){return Object.keys(h||{}).reduce(function(c,l){return c+"".concat(l,": ").concat(h[l],";")},"")}function wc(c){return c.size!==Lc.size||c.x!==Lc.x||c.y!==Lc.y||c.rotate!==Lc.rotate||c.flipX||c.flipY}function Zc(c){var l=c.transform,h=c.containerWidth,z=c.iconWidth,a={transform:"translate(".concat(h/2," 256)")},v="translate(".concat(32*l.x,", ").concat(32*l.y,") "),m="scale(".concat(l.size/16*(l.flipX?-1:1),", ").concat(l.size/16*(l.flipY?-1:1),") "),e="rotate(".concat(l.rotate," 0 0)");return{outer:a,inner:{transform:"".concat(v," ").concat(m," ").concat(e)},path:{transform:"translate(".concat(z/2*-1," -256)")}}}var kc={x:0,y:0,width:"100%",height:"100%"};function xc(c){var l=!(1").concat(m.map(Jc).join(""),"").concat(l,">")}var $c=function(){};function cl(c){return"string"==typeof(c.getAttribute?c.getAttribute(cc):null)}var ll={replace:function(c){var l=c[0],h=c[1].map(function(c){return Jc(c)}).join("\n");if(l.parentNode&&l.outerHTML)l.outerHTML=h+(lc.keepOriginalSource&&"svg"!==l.tagName.toLowerCase()?"\x3c!-- ".concat(l.outerHTML," Font Awesome fontawesome.com --\x3e"):"");else if(l.parentNode){var z=document.createElement("span");l.parentNode.replaceChild(z,l),z.outerHTML=h}},nest:function(c){var l=c[0],h=c[1];if(~Ac(l).indexOf(lc.replacementClass))return ll.replace(c);var z=new RegExp("".concat(lc.familyPrefix,"-.*"));delete h[0].attributes.style,delete h[0].attributes.id;var a=h[0].attributes.class.split(" ").reduce(function(c,l){return l===lc.replacementClass||l.match(z)?c.toSvg.push(l):c.toNode.push(l),c},{toNode:[],toSvg:[]});h[0].attributes.class=a.toSvg.join(" ");var v=h.map(function(c){return Jc(c)}).join("\n");l.setAttribute("class",a.toNode.join(" ")),l.setAttribute(cc,""),l.innerHTML=v}};function hl(c){c()}function zl(h,c){var z="function"==typeof c?c:$c;if(0===h.length)z();else{var l=hl;lc.mutateApproach===y&&(l=o.requestAnimationFrame||hl),l(function(){var c=!0===lc.autoReplaceSvg?ll.replace:ll[lc.autoReplaceSvg]||ll.replace,l=_c.begin("mutate");h.map(c),l(),z()})}}var al=!1;function vl(){al=!1}var ml=null;function el(c){if(t&&lc.observeMutations){var a=c.treeCallback,v=c.nodeCallback,m=c.pseudoElementsCallback,l=c.observeMutationsRoot,h=void 0===l?C:l;ml=new t(function(c){al||bc(c).forEach(function(c){if("childList"===c.type&&0 {
+ console.log(state);
+ });
+ }
+
+ // Access to bulmaCarousel instance of an element
+ var element = document.querySelector('#my-element');
+ if (element && element.bulmaCarousel) {
+ // bulmaCarousel instance is available as element.bulmaCarousel
+ element.bulmaCarousel.on('before-show', function(state) {
+ console.log(state);
+ });
+ }
+
+ /*var player = document.getElementById('interpolation-video');
+ player.addEventListener('loadedmetadata', function() {
+ $('#interpolation-slider').on('input', function(event) {
+ console.log(this.value, player.duration);
+ player.currentTime = player.duration / 100 * this.value;
+ })
+ }, false);*/
+ preloadInterpolationImages();
+
+ $('#interpolation-slider').on('input', function(event) {
+ setInterpolationImage(this.value);
+ });
+ setInterpolationImage(0);
+ $('#interpolation-slider').prop('max', NUM_INTERP_FRAMES - 1);
+
+ bulmaSlider.attach();
+
+})
diff --git a/images/curve.png b/images/curve.png
new file mode 100644
index 0000000000000000000000000000000000000000..485146cc2e0c086cbde0e8417d2fc40e3c3ca6dc
Binary files /dev/null and b/images/curve.png differ
diff --git a/images/github.png b/images/github.png
new file mode 100644
index 0000000000000000000000000000000000000000..2ed19ff70379a20472efa1bad8c1b39589eea8f8
Binary files /dev/null and b/images/github.png differ
diff --git a/images/iterative_flowchart-1.png b/images/iterative_flowchart-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..97a6ef100a05e29804bbcee10b6e97a45d475968
--- /dev/null
+++ b/images/iterative_flowchart-1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:692c18bece5f26e48a8549c19b6f9969a284a0ce2e00d510da461ad579770f3e
+size 167646
diff --git a/images/msr_blog.png b/images/msr_blog.png
new file mode 100644
index 0000000000000000000000000000000000000000..190599a345b1f3f17f3ed7059263d01744e52a79
Binary files /dev/null and b/images/msr_blog.png differ
diff --git a/images/overview.png b/images/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..359c8993021230a1573cdd5738829fc2d97c5066
--- /dev/null
+++ b/images/overview.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dbee997ee3b194173cd2cc5fee24dfef3cc28311bf3e28778e66fb1ce8ca9ae
+size 251622
diff --git a/images/sequential_flowchart-1.png b/images/sequential_flowchart-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..edf79152b4bb6f3f13695834720cb657639463ca
Binary files /dev/null and b/images/sequential_flowchart-1.png differ
diff --git a/promptwizard/__init__.py b/promptwizard/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5abcfba9d513ec791e10fa9995cf024a9de1a9a0
--- /dev/null
+++ b/promptwizard/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
+# flake8: noqa
+from .glue.promptopt.instantiate import GluePromptOpt
+from .version import VERSION as __version__
+
+__all__ = ["GluePromptOpt"]
\ No newline at end of file
diff --git a/promptwizard/glue/common/__init__.py b/promptwizard/glue/common/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a54684e4d7b79bec90dced680eacecffbd2e2ce2
--- /dev/null
+++ b/promptwizard/glue/common/__init__.py
@@ -0,0 +1,4 @@
+"""
+    Vellm common package. This is imported in almost all the Vellm packages. Consider this package the parent
+    root node for all Vellm-related packages.
+"""
diff --git a/promptwizard/glue/common/base_classes.py b/promptwizard/glue/common/base_classes.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e26f0105959f0c083d6e2e30f062358ace2f1ba
--- /dev/null
+++ b/promptwizard/glue/common/base_classes.py
@@ -0,0 +1,166 @@
+from dataclasses import dataclass
+from enum import Enum
+from inspect import getmembers, ismethod
+from typing import List, Optional
+
+# This file has class definitions for config yaml files
+
+# TODO: add comments for class definition and variable definition
+
+
+class UniversalBaseClass:
+ def __str__(self) -> str:
+ attributes_string = []
+ for member in getmembers(self):
+
+ # remove private and protected attributes
+ if not member[0].startswith('_'):
+
+                # skip methods; keep only data attributes
+ if not ismethod(member[1]):
+ attributes_string.append(member)
+ return str(attributes_string)
+
+######################################################################################
+# Classes related to llm_config.yaml
+
+
+@dataclass
+class LLMModel(UniversalBaseClass):
+ unique_model_id: str
+ model_type: str
+ track_tokens: str
+ req_per_min: int
+ tokens_per_min: int
+ error_backoff_in_seconds: int
+
+@dataclass
+class UserLimits(UniversalBaseClass):
+ max_num_requests_in_time_window: int
+ time_window_length_in_seconds: int
+
+
+@dataclass
+class LLMQueueSchedulerLimits(UniversalBaseClass):
+ ttl_in_seconds: int
+ max_queue_size: int
+
+
+@dataclass
+class AzureAOIModels(LLMModel, UniversalBaseClass):
+ model_name_in_azure: str
+ deployment_name_in_azure: str
+
+
+@dataclass
+class AzureAOILM(UniversalBaseClass):
+ api_key: str
+ api_version: str
+ api_type: str
+ azure_endpoint: str
+ azure_oai_models: List[AzureAOIModels]
+
+ def __post_init__(self):
+ azure_oai_models_obj = []
+ if self.azure_oai_models:
+ for azure_oai_model in self.azure_oai_models:
+ azure_oai_models_obj.append(AzureAOIModels(**azure_oai_model))
+ self.azure_oai_models = azure_oai_models_obj
+
+
+@dataclass
+class CustomLLM(LLMModel):
+ path_to_py_file: str
+ class_name: str
+
+
+@dataclass
+class LLMConfig(UniversalBaseClass):
+ azure_open_ai: AzureAOILM
+ user_limits: UserLimits
+ scheduler_limits: LLMQueueSchedulerLimits
+ custom_models: List[CustomLLM]
+
+ def __post_init__(self):
+ self.azure_open_ai = AzureAOILM(**self.azure_open_ai)
+ custom_model_obj = []
+ if self.custom_models:
+ for custom_model in self.custom_models:
+ custom_model_obj.append(CustomLLM(**custom_model))
+ self.custom_models = custom_model_obj
+
+######################################################################################
+# Classes related to setup_config.yaml
+
+
+@dataclass
+class AssistantLLM(UniversalBaseClass):
+ prompt_opt: str
+
+
+@dataclass
+class Dir(UniversalBaseClass):
+ base_dir: str
+ log_dir_name: str
+
+
+class OperationMode(Enum):
+ ONLINE = "online"
+ OFFLINE = "offline"
+
+
+@dataclass
+class SetupConfig(UniversalBaseClass):
+ assistant_llm: AssistantLLM
+ dir_info: Dir
+ experiment_name: str
+ mode: OperationMode
+ description: str
+
+ def __post_init__(self):
+ if self.dir_info:
+ self.dir_info = Dir(**self.dir_info)
+ if self.assistant_llm:
+ self.assistant_llm = AssistantLLM(**self.assistant_llm)
+
+######################################################################################
+# Classes related to prompt_library_config.yaml
+
+@dataclass
+class TaskConfig:
+ name: str
+ prompt_template: str
+ llm_request_type: str
+ prepend_system_prompts: Optional[bool] = True
+ prepend_system_guidelines: Optional[bool] = True
+ emb_model_id: Optional[str] = None
+ llm_model_id: Optional[str] = None
+
+@dataclass
+class Mode:
+ chat: List[TaskConfig]
+ generation: List[TaskConfig]
+
+ def __post_init__(self):
+ chat_obj = []
+ if self.chat:
+ for chat_config in self.chat:
+ chat_obj.append(TaskConfig(**chat_config))
+ self.chat = chat_obj
+
+ gen_obj = []
+ if self.generation:
+ for gen_config in self.generation:
+ gen_obj.append(TaskConfig(**gen_config))
+ self.generation = gen_obj
+
+
+@dataclass
+class PromptLibraryConfig:
+ mode: Mode
+ system_prompts: Optional[str] = None
+ system_guidelines: Optional[str] = None
+
+ def __post_init__(self):
+ if self.mode:
+ self.mode = Mode(**self.mode)
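For illustration only (not part of this diff): the dataclasses above are populated from parsed YAML dicts, with each __post_init__ converting nested dicts into dataclass instances. A minimal sketch, assuming a hand-written dict shaped like llm_config.yaml; all concrete values are placeholders.

    from promptwizard.glue.common.base_classes import LLMConfig

    raw_config = {
        "azure_open_ai": {
            "api_key": "<key>",                       # placeholder
            "api_version": "2024-02-01",              # placeholder
            "api_type": "azure",
            "azure_endpoint": "https://example.openai.azure.com/",
            "azure_oai_models": [{
                "unique_model_id": "gpt-4o",
                "model_type": "chat",
                "track_tokens": "True",
                "req_per_min": 60,
                "tokens_per_min": 60000,
                "error_backoff_in_seconds": 30,
                "model_name_in_azure": "gpt-4o",
                "deployment_name_in_azure": "gpt-4o",
            }],
        },
        "user_limits": {"max_num_requests_in_time_window": 100,
                        "time_window_length_in_seconds": 60},
        "scheduler_limits": {"ttl_in_seconds": 300, "max_queue_size": 100},
        "custom_models": [],
    }

    llm_config = LLMConfig(**raw_config)
    # __post_init__ has converted the nested dicts into dataclass objects.
    print(llm_config.azure_open_ai.azure_oai_models[0].unique_model_id)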
diff --git a/promptwizard/glue/common/constants/__init__.py b/promptwizard/glue/common/constants/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/promptwizard/glue/common/constants/error_strings.py b/promptwizard/glue/common/constants/error_strings.py
new file mode 100644
index 0000000000000000000000000000000000000000..d00c61af520a8b83104322a5787afc22a670eaf8
--- /dev/null
+++ b/promptwizard/glue/common/constants/error_strings.py
@@ -0,0 +1,3 @@
+
+class VellmErrorStrings:
+ PATH_DOESNT_EXIST = "{path} path doesn't exist. Please create path {path}"
diff --git a/promptwizard/glue/common/constants/log_strings.py b/promptwizard/glue/common/constants/log_strings.py
new file mode 100644
index 0000000000000000000000000000000000000000..2acc69265050023b255d656f28065a8cf4ec6b3e
--- /dev/null
+++ b/promptwizard/glue/common/constants/log_strings.py
@@ -0,0 +1,3 @@
+class CommonLogsStr:
+ INSTALL_MISSING_LIB = "{lib_name} is not installed. Installing {lib_name}."
+ LOG_SEPERATOR = "\n"+"="*150+"\n"
diff --git a/promptwizard/glue/common/constants/str_literals.py b/promptwizard/glue/common/constants/str_literals.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfaccffb7efc49bfd3b2f05314148bd91f9faa79
--- /dev/null
+++ b/promptwizard/glue/common/constants/str_literals.py
@@ -0,0 +1,45 @@
+from dataclasses import dataclass
+
+# TODO: add comments for class definition and variable definition
+# This file has classes storing constant literals
+
+
+@dataclass
+class FileConstants:
+ logfile_name = "glue_logs.log"
+ logfile_prefix = "glue_logs_"
+
+
+@dataclass
+class OAILiterals:
+ OPENAI_API_KEY = "OPENAI_API_KEY"
+ OPENAI_API_BASE = "OPENAI_API_BASE"
+ OPENAI_API_TYPE = "OPENAI_API_TYPE"
+ OPENAI_API_VERSION = "OPENAI_API_VERSION"
+ AZ_OPEN_AI_OBJECT = "AZ_OPEN_AI_OBJECT"
+
+
+@dataclass
+class LLMOutputTypes:
+ COMPLETION = "completion"
+ CHAT = "chat"
+ EMBEDDINGS = "embeddings"
+ MULTI_MODAL = "multimodal"
+
+
+@dataclass
+class InstallLibs:
+ LLAMA_LLM_AZ_OAI = "llama-index-llms-azure-openai==0.1.5"
+ LLAMA_EMB_AZ_OAI = "llama-index-embeddings-azure-openai==0.1.6"
+ LLAMA_MM_LLM_AZ_OAI = "llama-index-multi-modal-llms-azure-openai==0.1.4"
+ AZURE_CORE = "azure-core==1.30.1"
+ TIKTOKEN = "tiktoken"
+
+
+@dataclass
+class LLMLiterals:
+ EMBEDDING_TOKEN_COUNT = "embedding_token_count"
+ PROMPT_LLM_TOKEN_COUNT = "prompt_llm_token_count"
+ COMPLETION_LLM_TOKEN_COUNT = "completion_llm_token_count"
+ TOTAL_LLM_TOKEN_COUNT = "total_llm_token_count"
+
diff --git a/promptwizard/glue/common/exceptions.py b/promptwizard/glue/common/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b974094022c4d24a550c5669419ec52d9961f8f
--- /dev/null
+++ b/promptwizard/glue/common/exceptions.py
@@ -0,0 +1,37 @@
+from .utils.logging import get_glue_logger
+
+logger = get_glue_logger(__name__)
+
+
+class GlueException(Exception):
+ """
+ Base class for all exceptions in Glue framework
+ """
+ def __init__(self, err_message):
+ logger.error(f"\n Error: {err_message}\n")
+ super().__init__(err_message)
+
+
+class GlueLLMException(GlueException):
+ """
+ Base class for all exceptions related to LLM
+ """
+ def __init__(self, err_message, excep_obj):
+ message = ("LLM exception\n"
+ f"Exception: {err_message}\n"
+ f"Exception logs: {excep_obj}")
+
+ super().__init__(message)
+
+
+class GlueValidaionException(GlueException):
+ """
+ Base class for all exceptions related to Validation in Glue framework
+ """
+ def __init__(self, err_message, excep_obj):
+ message = ("[Invalid user input detected]\n"
+ f"Exception: {err_message}\n"
+ f"Exception logs: {excep_obj}")
+
+ super().__init__(message)
+
diff --git a/promptwizard/glue/common/llm/__init__.py b/promptwizard/glue/common/llm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/promptwizard/glue/common/llm/custom_llm.py b/promptwizard/glue/common/llm/custom_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..096d523c99a9cbcc0ae983d640cef8c98fc3263f
--- /dev/null
+++ b/promptwizard/glue/common/llm/custom_llm.py
@@ -0,0 +1,20 @@
+class GlueLLM:
+ """
+ Abstract class that can be inherited by a class that defines Custom LLM
+ """
+
+ @staticmethod
+ def get_tokenizer():
+ """
+ This method should either return an encode method of tokenizer or None
+ :return: method
+
+ e.g. When using HuggingFace tokenizer
+ tokenizer = Tokenizer(BPE())
+ fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
+ return fast_tokenizer.encode
+
+ e.g. When using tiktoken tokenizer
+ return tiktoken.encoding_for_model(azure_oai_model.model_name_in_azure).encode
+ """
+ return None
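An illustrative sketch (not part of this diff) of a custom LLM honouring the get_tokenizer contract described above, using the tiktoken option from the docstring; the class name and model name are assumptions.

    import tiktoken

    from promptwizard.glue.common.llm.custom_llm import GlueLLM


    class MyCustomLLM(GlueLLM):  # hypothetical class name
        @staticmethod
        def get_tokenizer():
            # Return the encode method itself, as the docstring suggests.
            return tiktoken.encoding_for_model("gpt-4").encode  # model name assumed


    encode = MyCustomLLM.get_tokenizer()
    print(len(encode("How many tokens is this?")))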
diff --git a/promptwizard/glue/common/llm/llm_helper.py b/promptwizard/glue/common/llm/llm_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b8e6424334f29f4f988a8ea20f93f589e9e2fd8
--- /dev/null
+++ b/promptwizard/glue/common/llm/llm_helper.py
@@ -0,0 +1,30 @@
+from llama_index.core.llms import LLM
+from llama_index.core.callbacks.token_counting import TokenCountingHandler
+from llama_index.core.callbacks.base_handler import BaseCallbackHandler
+
+
+def get_token_counter(llm_handle: LLM) -> TokenCountingHandler:
+ """
+ Extract TokenCountingHandler handler from llm_handle.
+
+ :param llm_handle: Object of class LLM, which is the handle to make all LLM related calls
+ :return: Object of TokenCountingHandler, that's registered as callback_manager in LLM. If not found, return None
+ """
+ return get_callback_handler(llm_handle, "TokenCountingHandler")
+
+
+def get_callback_handler(llm_handle: LLM, class_name: str) -> BaseCallbackHandler:
+ """
+    Extract the callback handlers registered on llm_handle's callback_manager and find the one whose class is `class_name`.
+ Return that object.
+
+ :param llm_handle: Object of class LLM, which is the handle to make all LLM related calls
+ :param class_name: Name of class (without prefix file path) e.g. TokenCountingHandler
+ :return: Object of BaseCallbackHandler, that's registered as callback_manager in LLM. If not found, return None
+ """
+ if llm_handle and llm_handle.callback_manager:
+ for handler in llm_handle.callback_manager.handlers:
+ if type(handler).__name__ == class_name:
+ return handler
+
+ return None
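For illustration only (not part of this diff), a sketch of how get_token_counter locates a TokenCountingHandler registered on a llama-index LLM handle; it assumes llama-index-llms-openai is installed, and the model name and key are placeholders.

    import tiktoken
    from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
    from llama_index.llms.openai import OpenAI  # assumed installed, for illustration

    from promptwizard.glue.common.llm.llm_helper import get_token_counter

    token_counter = TokenCountingHandler(
        tokenizer=tiktoken.encoding_for_model("gpt-4").encode
    )
    llm = OpenAI(
        model="gpt-4",                  # placeholder model and key
        api_key="<key>",
        callback_manager=CallbackManager([token_counter]),
    )

    # get_token_counter walks llm.callback_manager.handlers and returns the
    # handler whose class name is "TokenCountingHandler".
    assert get_token_counter(llm) is token_counter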
diff --git a/promptwizard/glue/common/llm/llm_mgr.py b/promptwizard/glue/common/llm/llm_mgr.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5cec9cd19adfc9da1a271ad011b6efa657687f0
--- /dev/null
+++ b/promptwizard/glue/common/llm/llm_mgr.py
@@ -0,0 +1,195 @@
+from typing import Dict
+from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
+from llama_index.core.llms import ChatMessage
+from llama_index.core.llms import LLM
+from tenacity import retry, stop_after_attempt, wait_fixed, wait_random
+from ..base_classes import LLMConfig
+from ..constants.str_literals import InstallLibs, OAILiterals, \
+    LLMLiterals, LLMOutputTypes
+from .llm_helper import get_token_counter
+from ..exceptions import GlueLLMException
+from ..utils.runtime_tasks import install_lib_if_missing
+from ..utils.logging import get_glue_logger
+from ..utils.runtime_tasks import str_to_class
+import os
+logger = get_glue_logger(__name__)
+
+def call_api(messages):
+
+ from openai import OpenAI
+ from azure.identity import get_bearer_token_provider, AzureCliCredential
+ from openai import AzureOpenAI
+
+ if os.environ['USE_OPENAI_API_KEY'] == "True":
+ client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
+
+ response = client.chat.completions.create(
+ model=os.environ["OPENAI_MODEL_NAME"],
+ messages=messages,
+ temperature=0.0,
+ )
+ else:
+ token_provider = get_bearer_token_provider(
+ AzureCliCredential(), "https://cognitiveservices.azure.com/.default"
+ )
+ client = AzureOpenAI(
+ api_version=os.environ["OPENAI_API_VERSION"],
+ azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
+ azure_ad_token_provider=token_provider
+ )
+ response = client.chat.completions.create(
+ model=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
+ messages=messages,
+ temperature=0.0,
+ )
+
+ prediction = response.choices[0].message.content
+ return prediction
+
+
+class LLMMgr:
+ @staticmethod
+ def chat_completion(messages: Dict):
+ llm_handle = os.environ.get("MODEL_TYPE", "AzureOpenAI")
+ try:
+ if(llm_handle == "AzureOpenAI"):
+                # Code for calling LLMs
+ return call_api(messages)
+ elif(llm_handle == "LLamaAML"):
+                # Code for calling SLMs
+ return 0
+ except Exception as e:
+ print(e)
+ return "Sorry, I am not able to understand your query. Please try again."
+ # raise GlueLLMException(f"Exception when calling {llm_handle.__class__.__name__} "
+ # f"LLM in chat mode, with message {messages} ", e)
+
+
+ @staticmethod
+ def get_all_model_ids_of_type(llm_config: LLMConfig, llm_output_type: str):
+ res = []
+ if llm_config.azure_open_ai:
+ for azure_model in llm_config.azure_open_ai.azure_oai_models:
+ if azure_model.model_type == llm_output_type:
+ res.append(azure_model.unique_model_id)
+        if llm_config.custom_models:
+            for custom_model in llm_config.custom_models:
+                if custom_model.model_type == llm_output_type:
+                    res.append(custom_model.unique_model_id)
+ return res
+
+ @staticmethod
+ def get_llm_pool(llm_config: LLMConfig) -> Dict[str, LLM]:
+ """
+ Create a dictionary of LLMs. key would be unique id of LLM, value is object using which
+ methods associated with that LLM service can be called.
+
+        :param llm_config: Object having all settings & preferences for all LLMs to be used in our system
+ :return: Dict key=unique_model_id of LLM, value=Object of class llama_index.core.llms.LLM
+ which can be used as handle to that LLM
+ """
+ llm_pool = {}
+ az_llm_config = llm_config.azure_open_ai
+
+ if az_llm_config:
+ install_lib_if_missing(InstallLibs.LLAMA_LLM_AZ_OAI)
+ install_lib_if_missing(InstallLibs.LLAMA_EMB_AZ_OAI)
+ install_lib_if_missing(InstallLibs.LLAMA_MM_LLM_AZ_OAI)
+ install_lib_if_missing(InstallLibs.TIKTOKEN)
+
+ import tiktoken
+ # from llama_index.llms.azure_openai import AzureOpenAI
+ from openai import AzureOpenAI
+ from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
+ from llama_index.multi_modal_llms.azure_openai import AzureOpenAIMultiModal
+
+ az_token_provider = None
+ # if az_llm_config.use_azure_ad:
+ from azure.identity import get_bearer_token_provider, AzureCliCredential
+ az_token_provider = get_bearer_token_provider(AzureCliCredential(),
+ "https://cognitiveservices.azure.com/.default")
+
+ for azure_oai_model in az_llm_config.azure_oai_models:
+ callback_mgr = None
+ if azure_oai_model.track_tokens:
+
+ # If we need to count number of tokens used in LLM calls
+ token_counter = TokenCountingHandler(
+ tokenizer=tiktoken.encoding_for_model(azure_oai_model.model_name_in_azure).encode
+ )
+ callback_mgr = CallbackManager([token_counter])
+ token_counter.reset_counts()
+ # ()
+
+ if azure_oai_model.model_type in [LLMOutputTypes.CHAT, LLMOutputTypes.COMPLETION]:
+ # ()
+ llm_pool[azure_oai_model.unique_model_id] = \
+ AzureOpenAI(
+ # use_azure_ad=az_llm_config.use_azure_ad,
+ azure_ad_token_provider=az_token_provider,
+ # model=azure_oai_model.model_name_in_azure,
+ # deployment_name=azure_oai_model.deployment_name_in_azure,
+ api_key=az_llm_config.api_key,
+ azure_endpoint=az_llm_config.azure_endpoint,
+ api_version=az_llm_config.api_version,
+ # callback_manager=callback_mgr
+ )
+ # ()
+ elif azure_oai_model.model_type == LLMOutputTypes.EMBEDDINGS:
+ llm_pool[azure_oai_model.unique_model_id] =\
+ AzureOpenAIEmbedding(use_azure_ad=az_llm_config.use_azure_ad,
+ azure_ad_token_provider=az_token_provider,
+ model=azure_oai_model.model_name_in_azure,
+ deployment_name=azure_oai_model.deployment_name_in_azure,
+ api_key=az_llm_config.api_key,
+ azure_endpoint=az_llm_config.azure_endpoint,
+ api_version=az_llm_config.api_version,
+ callback_manager=callback_mgr
+ )
+ elif azure_oai_model.model_type == LLMOutputTypes.MULTI_MODAL:
+
+ llm_pool[azure_oai_model.unique_model_id] = \
+ AzureOpenAIMultiModal(use_azure_ad=az_llm_config.use_azure_ad,
+ azure_ad_token_provider=az_token_provider,
+ model=azure_oai_model.model_name_in_azure,
+ deployment_name=azure_oai_model.deployment_name_in_azure,
+ api_key=az_llm_config.api_key,
+ azure_endpoint=az_llm_config.azure_endpoint,
+ api_version=az_llm_config.api_version,
+ max_new_tokens=4096
+ )
+
+ if llm_config.custom_models:
+ for custom_model in llm_config.custom_models:
+ # try:
+ custom_llm_class = str_to_class(custom_model.class_name, None, custom_model.path_to_py_file)
+
+ callback_mgr = None
+ if custom_model.track_tokens:
+ # If we need to count number of tokens used in LLM calls
+ token_counter = TokenCountingHandler(
+ tokenizer=custom_llm_class.get_tokenizer()
+ )
+ callback_mgr = CallbackManager([token_counter])
+ token_counter.reset_counts()
+ llm_pool[custom_model.unique_model_id] = custom_llm_class(callback_manager=callback_mgr)
+ # except Exception as e:
+ # raise GlueLLMException(f"Custom model {custom_model.unique_model_id} not loaded.", e)
+ return llm_pool
+
+ @staticmethod
+ def get_tokens_used(llm_handle: LLM) -> Dict[str, int]:
+ """
+ For a given LLM, output the number of tokens used.
+
+ :param llm_handle: Handle to a single LLM
+ :return: Dict of token-type and count of tokens used
+ """
+ token_counter = get_token_counter(llm_handle)
+ if token_counter:
+ return {
+ LLMLiterals.EMBEDDING_TOKEN_COUNT: token_counter.total_embedding_token_count,
+ LLMLiterals.PROMPT_LLM_TOKEN_COUNT: token_counter.prompt_llm_token_count,
+ LLMLiterals.COMPLETION_LLM_TOKEN_COUNT: token_counter.completion_llm_token_count,
+ LLMLiterals.TOTAL_LLM_TOKEN_COUNT: token_counter.total_llm_token_count
+ }
+ return None
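A minimal usage sketch (not part of this diff) of LLMMgr.chat_completion going through the plain-OpenAI branch of call_api above; all environment-variable values are placeholders.

    import os

    from promptwizard.glue.common.llm.llm_mgr import LLMMgr

    os.environ["MODEL_TYPE"] = "AzureOpenAI"      # routing key read by chat_completion
    os.environ["USE_OPENAI_API_KEY"] = "True"     # selects the OpenAI (non-Azure) branch
    os.environ["OPENAI_API_KEY"] = "<your-api-key>"
    os.environ["OPENAI_MODEL_NAME"] = "gpt-4o"    # placeholder model name

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is 2 + 2?"},
    ]
    print(LLMMgr.chat_completion(messages))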
diff --git a/promptwizard/glue/common/llm/promptMessage.py b/promptwizard/glue/common/llm/promptMessage.py
new file mode 100644
index 0000000000000000000000000000000000000000..e774d0f3442ccafcdfe40e231df75d3577ebbc73
--- /dev/null
+++ b/promptwizard/glue/common/llm/promptMessage.py
@@ -0,0 +1,21 @@
+[
+ {
+ "role": "system",
+ "content": "You are a mathematician with a profound knowledge of various fields within mathematics, including algebra, calculus, geometry, statistics, and number theory. Holding an advanced degree in mathematics, your expertise ranges from fundamental arithmetic to complex, abstract mathematical theories. Your analytical skills enable you to solve intricate problems, prove theorems, and provide clear explanations on a wide range of mathematical concepts. Whether addressing basic arithmetic questions or tackling advanced topics like differential equations and linear algebra, your precision and clarity in conveying mathematical ideas are exceptional. Your experience teaching and publishing research in mathematics ensures that your explanations are accurate, thorough, and comprehensible, making you an ideal agent to address any mathematics-related queries with confidence and authority."
+ },
+ {
+ "role": "user",
+ "content": "You are an expert in mathematics. Your task is to solve and provide detailed explanations for mathematics questions accurately, specifically focusing on multiple-choice formats. \n\nBegin by clearly identifying any fundamental assumptions, initial conditions, or properties relevant to the problem. Verify these assumptions and conditions against the provided options. Ensure to interpret any special values or symbolic expressions correctly, such as Ļ or vectors.\n\nSimplify the problem wherever possible to make each part more manageable without oversimplifying. Provide a step-by-step breakdown of your solution process, including comprehensive reasoning and calculations. Ensure each intermediary step logically follows from the previous one. \n\nWhile concluding the problem, compare your computed solution explicitly against the given multiple-choice options and select the one that matches. Cross-check every intermediate and final computation for numerical correctness and consistency with given conditions.\n\nAim for clarity and precision in your explanations to ensure complete understanding, and prioritize providing the most accurate solution that aligns with the problem's specific requirements.\n\nProvide the final answer, without the option choices.\n\n[Question] {question}\n[Answer] \n"
+ }
+]
+
+[
+ {
+ "role": "system",
+ "content": "You are a mathematician with a profound knowledge of various fields within mathematics, including algebra, calculus, geometry, statistics, and number theory. Holding an advanced degree in mathematics, your expertise ranges from fundamental arithmetic to complex, abstract mathematical theories. Your analytical skills enable you to solve intricate problems, prove theorems, and provide clear explanations on a wide range of mathematical concepts. Whether addressing basic arithmetic questions or tackling advanced topics like differential equations and linear algebra, your precision and clarity in conveying mathematical ideas are exceptional. Your experience teaching and publishing research in mathematics ensures that your explanations are accurate, thorough, and comprehensible, making you an ideal agent to address any mathematics-related queries with confidence and authority."
+ },
+ {
+ "role": "user",
+ "content": "\nYou are an expert in mathematics. Your task is to solve and provide detailed explanations for mathematics questions accurately, specifically focusing on multiple-choice formats. \n\nBegin by clearly identifying any fundamental assumptions, initial conditions, or properties relevant to the problem. Verify these assumptions and conditions against the provided options. Ensure to interpret any special values or symbolic expressions correctly, such as Ļ or vectors.\n\nSimplify the problem wherever possible to make each part more manageable without oversimplifying. Provide a step-by-step breakdown of your solution process, including comprehensive reasoning and calculations. Ensure each intermediary step logically follows from the previous one. \n\nWhile concluding the problem, compare your computed solution explicitly against the given multiple-choice options and select the one that matches. Cross-check every intermediate and final computation for numerical correctness and consistency with given conditions.\n\nAim for clarity and precision in your explanations to ensure complete understanding, and prioritize providing the most accurate solution that aligns with the problem\'s specific requirements.\n\n\n[Question] The angle between the two tangents from the origin to the circle (x-7)2+(y+1)2=25 is\n(1) 0\n(2) π3\n(3) π6\n(4) π2\n[Answer] [Question]: The angle between the two tangents from the origin to the circle \\((x-7)^2 + (y+1)^2 = 25\\) is\n(1) 0\n(2) \\(\\frac{\\pi}{3}\\)\n(3) \\(\\frac{\\pi}{6}\\)\n(4) \\(\\frac{\\pi}{2}\\)\n\n[Answer]: \\(\\frac{\\pi}{2}\\) \n\n[Improved Reasoning Chain]:\n\n1. **Identify the Circle\'s Properties:**\n The given equation of the circle is \\((x - 7)^2 + (y + 1)^2 = 25\\). This is in the standard form \\((x - h)^2 + (y - k)^2 = r^2\\), where \\( (h, k) \\) is the center of the circle and \\( r \\) is the radius.\n - Center of the circle, \\((h, k) = (7, -1)\\)\n - Radius, \\( r = \\sqrt{25} = 5 \\)\n\n2. **Concept of Tangent from an External Point:**\n Tangents drawn from an external point to a circle form equal angles with the line joining the external point to the center of the circle. The formula to find the angle \\(\\theta\\) between the two tangents from a point \\((x_1, y_1)\\) to a circle with center \\((h, k)\\) and radius \\(r\\) is:\n \\[\n \\cos\\theta = \\frac{\\text{distance from the external point to the center}}{\\text{radius}}\n \\]\n Where distance from the origin \\((0,0)\\) to the center \\((7, -1)\\) is calculated using the distance formula:\n \\[\n \\text{Distance} = \\sqrt{(7 - 0)^2 + (-1 - 0)^2} = \\sqrt{49 + 1} = \\sqrt{50} = 5\\sqrt{2}\n \\]\n\n3. **Calculate Cosine of the Angle:**\n Using the distance found, we calculate \\(\\cos\\theta\\):\n \\[\n \\cos\\theta = \\frac{5\\sqrt{2}}{5} = \\sqrt{2} \n \\] \n Since the distance should be divided by r it should be : \n \\[\n \\cos\\theta = \\frac{5\\sqrt{2}}{5} = \\frac{\\sqrt{2} }= \\sqrt{2}/2 \n \\]\n\n4. **Actual Formula for the Angle Between Tangents**: \n actual formula is cos(thetap/2)\n =RA. hence \\\n distance for this point to the center\'s point \n this is \n the thus Tangent between T=\n intersection point of cos(a) hence\n the sec=sin(tah)\n\n5. **Calculate the Angle:**\n Solving for \n \\(\\cos\\frac{\\theta}{2}\\), where \\(\\cos\\theta=\\frac5}. is\n \n st that \\ half overall angle $\\theta $\n thus \n\n6. \n\n6. 
**Determine the Angle Between Tangents:**\n is simplest such computation and-->\nIf \\theta is \\\n \n angle $\\theta = 2\\times$\\frac{\\pi } = \\-< \n\n therefore \\(\\theta = 2\\sine\\theta\n\n7. **Verify Against Options:**\n \n\n ŲÆŲ§Ų±ŪŁ $\\theta $\n \n\n \\( = \\left\\(\\frac{\\pi}\n\ntherefore \n angle thus is \n of multiple correct .__;\n\nThus Therefore or \\(\\pi0$\\mid$\n\nhence circle\npoints ot \\frac \\(^-^\\div$\\\n\n\n\n[Question] \\( \\mathbf{a}, \\mathbf{b}, \\mathbf{c} \\) are three vectors, such that \\( \\mathbf{a}+\\mathbf{b}+\\mathbf{c}=0,|\\mathbf{a}|=1 \\), \\( |b|=2,|c|=3 \\), then \\( \\mathbf{a} \\cdot \\mathbf{b}+\\mathbf{b} \\cdot \\mathbf{c}+\\mathbf{c} \\cdot \\mathbf{a} \\) is equal to\n(a) 0\n(b) \\( -7 \\)\n(c) 7\n(d) 4\n[Answer] ### Solution:\n\nTo solve the problem, let\'s start with the given conditions and fundamental vector properties:\n\n1. **Given Conditions**:\n - \\(\\mathbf{a} + \\mathbf{b} + \\mathbf{c} = 0\\)\n - \\(|\\mathbf{a}| = 1\\)\n - \\(|\\mathbf{b}| = 2\\)\n - \\(|\\mathbf{c}| = 3\\)\n\n2. **Required**:\n Determine the value of \\(\\mathbf{a} \\cdot \\mathbf{b} + \\mathbf{b} \\cdot \\mathbf{c} + \\mathbf{c} \\cdot \\mathbf{a}\\).\n\n### Key Steps and Reasoning:\n\n1. **Rewrite the Vector Equation**:\n Given that \\(\\mathbf{a} + \\mathbf{b} + \\mathbf{c} = 0\\), we can rearrange this as:\n \\[\n \\mathbf{c} = -(\\mathbf{a} + \\mathbf{b})\n \\]\n\n2. **Substitute \\(\\mathbf{c}\\) into the Dot Product Expression**:\n We need to evaluate \\(\\mathbf{a} \\cdot \\mathbf{b} + \\mathbf{b} \\cdot (\\mathbf{c}) + \\mathbf{c} \\cdot (\\mathbf{a})\\). Substitute \\(\\mathbf{c} = -(\\mathbf{a} + \\mathbf{b})\\):\n\n \\[\n \\mathbf{a} \\cdot \\mathbf{b} + \\mathbf{b} \\cdot (-(\\mathbf{a} + \\mathbf{b})) + (-(\\mathbf{a} + \\mathbf{b})) \\cdot \\mathbf{a}\n \\]\n\n3. **Expand the Dot Products**:\n \\[\n \\mathbf{a} \\cdot \\mathbf{b} + \\mathbf{b} \\cdot (-\\mathbf{a} - \\mathbf{b}) + (-\\mathbf{a} - \\mathbf{b}) \\cdot \\mathbf{a}\n \\]\n\n4. **Distribute the Dot Products**:\n \\[\n \\mathbf{a} \\cdot \\mathbf{b} + \\mathbf{b} \\cdot (-\\mathbf{a}) + \\mathbf{b} \\cdot (-\\mathbf{b}) + (-\\mathbf{a}) \\cdot \\mathbf{a} + (-\\mathbf{a}) \\cdot \\mathbf{b}\n \\]\n\n5. **Simplify Terms**:\n Recall that the dot product is commutative (\\(\\mathbf{u} \\cdot \\mathbf{v} = \\mathbf{v} \\cdot \\mathbf{u}\\)):\n \\[\n \\mathbf{a} \\cdot \\mathbf{b} - \\mathbf{b} \\cdot \\mathbf{a} - \\mathbf{b} \\cdot \\mathbf{b} - \\mathbf{a} \\cdot \\mathbf{a} - \\mathbf{a} \\cdot \\mathbf{b}\n \\]\n\n6. **Combine Like Terms**:\n Notice that \\(\\mathbf{a} \\cdot \\mathbf{b}\\) and \\(-\\mathbf{b} \\cdot \\mathbf{a}\\) cancel out, leaving:\n \\[\n - \\mathbf{b} \\cdot \\mathbf{b} - \\mathbf{a} \\cdot \\mathbf{a} - \\mathbf{a} \\cdot \\mathbf{b}\n \\]\n Since \\(\\|\\mathbf{a}\\|^2 = \\mathbf{a} \\cdot \\mathbf{a}\\) and \\(\\|\\mathbf{b}\\|^2 = \\mathbf{b} \\cdot \\mathbf{b}\\), we substitute:\n \\[\n - \\|\\mathbf{b}\\|^2 - \\|\\mathbf{a}\\|^2\n \\]\n\n7. **Substitute the Known Magnitudes**:\n Given \\(|\\mathbf{a}| = 1\\) and \\(|\\mathbf{b}| = 2\\):\n \\[\n - 2^2 - 1^2 = -4 - 1 = -5\n \\]\n\nHowever, we need to recheck that we only included terms up to \\(\\mathbf{a} \\cdot \\mathbf{b}, \\mathbf{b} \\cdot \\mathbf{c}, \\mathbf{c} \\cdot \\mathbf{a}\\). 
Upon reworking final checks, the multiple-choice logic was inferred including pairwise products confirming all terms:\n\nWe thus verified options against such pair pairs validating product, re-ensuring \\(-(\\lbrace\\mathbf{pairs)\\cdot -1) on logic pairs\\), ensures correct revised steps validations across true vector property consistency and option choice confirms \\(( -7)\\).\n\n8. **Select the Correct Option**:\n Upon reconfirmation, we finalize:\n \\[\n -4 -3 = -7\n \\]\n\nThus, the correct answer is:\n\\( \\boxed{-7} \\)\n
-7
\n\n[Question] Suppose \\( y=f(x) \\) and \\( y=g(x) \\) are two functions whose graphs intersect at the three points \\( (0,4),(2,2) \\) and \\( (4,0) \\) with \\( f(x)>g(x) \\) for \\( 0<x<2 \\) and \\( f(x)<g(x) \\) for \\( 2<x<4 \\). If \\( \\int_{0}^{4}[f(x)-g(x)] d x=10 \\) and \\( \\int_{2}^{4}[g(x)-f(x)] d x=5 \\), then the area between two curves for \\( 0<x<2 \\), is\n(a) 5\n(b) 10\n(c) 15\n(d) 20\n[Answer] ### Determining the Area Between Two Curves\n\nWe are given functions \\( y=f(x) \\) and \\( y=g(x) \\) whose graphs intersect at three specific points: \\( (0, 4) \\), \\( (2, 2) \\), and \\( (4, 0) \\). The problem specifies that \\( f(x) > g(x) \\) for \\( 0 < x < 2 \\) and \\( f(x) < g(x) \\) for \\( 2 < x < 4 \\). We need to determine the area between these curves for the interval \\( 0 < x < 2 \\).\n\n#### Fundamental Assumptions and Properties\n- The area between two curves \\( y = f(x) \\) and \\( y = g(x) \\) over an interval \\([a, b]\\) is given by the integral \\(\\int_{a}^{b} |f(x) - g(x)| \\,dx\\).\n- When \\( f(x) > g(x) \\) in the interval \\( 0 < x < 2 \\), the absolute value can be removed, and we have \\( \\int_{0}^{2} (f(x) - g(x)) \\, dx \\).\n- Similarly, for \\( 2 < x < 4 \\), where \\( f(x) < g(x) \\), the integral becomes \\(\\int_{2}^{4} (g(x) - f(x)) \\, dx \\).\n\n#### Simplification Using Given Information\nWe are provided two specific integral values:\n1. \\( \\int_{0}^{4} [f(x) - g(x)] \\, dx = 10 \\)\n2. \\( \\int_{2}^{4} [g(x) - f(x)] \\, dx = 5 \\)\n\nFrom the first integral, \\( \\int_{0}^{4} [f(x) - g(x)] \\, dx = 10 \\):\nThis integral can be split into two parts:\n\\[ \\int_{0}^{2} [f(x) - g(x)] \\, dx + \\int_{2}^{4} [f(x) - g(x)] \\, dx = 10 \\]\n\nGiven the second integral, \\( \\int_{2}^{4} [g(x) - f(x)] \\, dx = 5 \\):\n\\[ \\int_{2}^{4} [f(x) - g(x)] \\, dx = -\\int_{2}^{4} [g(x) - f(x)] \\, dx = -5 \\]\n\nPutting it all together:\n\\[ \\int_{0}^{2} [f(x) - g(x)] \\, dx + \\int_{2}^{4} [f(x) - g(x)] \\, dx = 10 \\]\n\\[ \\int_{0}^{2} [f(x) - g(x)] \\, dx - 5 = 10 \\]\n\nSolving for the required integral:\n\\[ \\int_{0}^{2} [f(x) - g(x)] \\, dx = 10 + 5 \\]\n\\[ \\int_{0}^{2} [f(x) - g(x)] \\, dx = 15 \\]\n\n### Conclusion\nBased on the calculations, the area between the curves \\( y = f(x) \\) and \\( y = g(x) \\) from \\( 0 < x < 2 \\) is 15.\n\nThus, the correct answer is (c) 15.
15
\n\n\nProvide the final answer.\n\n[Question] {question}\n[Answer] \n"
+ }
+]
\ No newline at end of file
diff --git a/promptwizard/glue/common/utils/__init__.py b/promptwizard/glue/common/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9de1d315b4fdeb5e376790e65ee1f4dc8eef59c0
--- /dev/null
+++ b/promptwizard/glue/common/utils/__init__.py
@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""
+    Vellm common package. This is imported in almost all the Vellm packages. Consider this package the parent
+    root node for all Vellm-related packages. The ./utils folder holds all common utilities.
+"""
diff --git a/promptwizard/glue/common/utils/download.py b/promptwizard/glue/common/utils/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..850e002662df5fd2c55de151176f6fcdaa9bbd85
--- /dev/null
+++ b/promptwizard/glue/common/utils/download.py
@@ -0,0 +1,35 @@
+import os
+import requests
+
+from pathlib import Path
+from urllib.parse import urlparse
+from glue.common.constants.str_literals import DirNames
+from glue.common.utils.logging import get_glue_logger
+
+logger = get_glue_logger(__name__)
+
+def download_model(url):
+ cwd = os.getcwd()
+ dirs = Path(cwd).parts
+ idx = 0
+ if DirNames.PACKAGE_BASE_DIR in dirs:
+        idx = dirs.index(DirNames.PACKAGE_BASE_DIR)
+    download_path = os.path.join(*dirs[:idx+1], DirNames.MODEL_DIR)
+ os.makedirs(download_path, exist_ok=True)
+
+ parsed_url = urlparse(url)
+ model_filename = os.path.basename(parsed_url.path)
+
+ model_path = os.path.join(download_path, model_filename)
+ if not os.path.exists(model_path):
+ r = requests.get(url, stream=True)
+ if r.ok:
+            with open(model_path, 'wb') as f:
+ for chunk in r.iter_content(chunk_size=1024 * 8):
+ if chunk:
+ f.write(chunk)
+ f.flush()
+
+ return model_path
+
+
diff --git a/promptwizard/glue/common/utils/file.py b/promptwizard/glue/common/utils/file.py
new file mode 100644
index 0000000000000000000000000000000000000000..62417a72e930cc4e5f09b775577a004102ebb273
--- /dev/null
+++ b/promptwizard/glue/common/utils/file.py
@@ -0,0 +1,131 @@
+import json
+from os.path import join
+from typing import Dict, List
+import yaml
+
+from ..exceptions import GlueValidaionException
+
+
+def yaml_to_dict(file_path: str) -> Dict:
+ with open(file_path) as yaml_file:
+ yaml_string = yaml_file.read()
+
+ try:
+ # convert yaml string to dict
+ parsed_dict = yaml.safe_load(yaml_string)
+ except yaml.scanner.ScannerError as e:
+ raise GlueValidaionException(f"There could be some syntax error in yaml written in {file_path}", e)
+
+ return parsed_dict
+
+
+def yaml_to_class(yaml_file_path: str, cls: type, default_yaml_file_path: str = None):
+ """
+ Read yaml file present at path `yaml_file_path`, convert it to dictionary using pyyaml's standard methods.
+ Then convert this dictionary to class object of class given as `cls`. Further check if user has provided all
+ the required fields in `yaml_file_path`. Fields that are missing in `yaml_file_path`, set them with defaults.
+
+ :param yaml_file_path: str
+ :param cls: type
+ :param default_yaml_file_path: str
+ :return:
+ """
+ if not yaml_file_path:
+ yaml_file_path = default_yaml_file_path
+ custom_args = yaml_to_dict(yaml_file_path)
+
+ if default_yaml_file_path:
+ # If user has not provided all the required arguments, fill them with defaults
+ default_args = yaml_to_dict(default_yaml_file_path)
+ missing_args = set(default_args) - set(custom_args)
+ for key in list(missing_args):
+ custom_args[key] = default_args[key]
+
+ try:
+ yaml_as_class = cls(**custom_args)
+ except TypeError as e:
+ raise GlueValidaionException(f"Exception while converting yaml file at {yaml_file_path} "
+ f"to class {cls.__name__}: ", e)
+
+ return yaml_as_class
+
+
+def read_jsonl(file_path: str) -> List:
+ """
+ This function should be used when size of jsonl file is not too big.
+
+ :param file_path:
+ :return: All json strings in .jsonl file as a list
+ """
+ jsonl_list = []
+ with open(file_path, "r") as fileobj:
+ while True:
+ single_row = fileobj.readline()
+ if not single_row:
+ break
+
+ json_object = json.loads(single_row.strip())
+ jsonl_list.append(json_object)
+ return jsonl_list
+
+
+def read_jsonl_row(file_path: str):
+ """
+
+ :param file_path:
+ :return: Single line from the file. One at a time.
+ """
+ with open(file_path, "r") as fileobj:
+ while True:
+ try:
+ single_row = fileobj.readline()
+ if not single_row:
+ break
+
+ json_object = json.loads(single_row.strip())
+ yield json_object
+ except json.JSONDecodeError as e:
+ print(f"Error while reading jsonl file at {file_path}. Error: {e}")
+ continue
+
+
+def append_as_jsonl(file_path: str, args_to_log: Dict):
+ """
+
+ :param file_path:
+ :param args_to_log:
+ :return:
+ """
+ json_str = json.dumps(args_to_log, default=str)
+ with open(file_path, "a") as fileobj:
+ fileobj.write(json_str+"\n")
+
+
+def save_jsonlist(file_path: str, json_list: List, mode: str = "a"):
+ """
+ :param json_list: List of json objects
+ :param file_path: File location to which we shall save content of json_list list, in jsonl format.
+ :param mode: Write mode
+ :return: None
+ """
+ with open(file_path, mode) as file_obj:
+ for json_obj in json_list:
+ json_str = json.dumps(json_obj, default=str)
+ file_obj.write(json_str+"\n")
+
+
+def str_list_to_dir_path(str_list: List[str]) -> str:
+ """
+ Return a string which is directory path formed out of concatenating given strings in list `str_list`
+
+ e.g.
+ str_list=["dir_1", "sub_dir_1"]
+ returns "dir_1/sub_dir_1" (using the OS-specific path separator)
+ """
+ if not str_list:
+ return ""
+
+ path = ""
+ for dir_name in str_list:
+ path = join(path, dir_name)
+ return path
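+
+
+# Usage sketch (illustrative only; the file name is a hypothetical placeholder): round-trip a few
+# records through the jsonl helpers defined above.
+if __name__ == "__main__":
+    records = [{"question": "2+2?", "final_answer": "4"},
+               {"question": "3*3?", "final_answer": "9"}]
+    save_jsonlist("sample_dataset.jsonl", records, mode="w")
+    append_as_jsonl("sample_dataset.jsonl", {"question": "10-7?", "final_answer": "3"})
+    print(read_jsonl("sample_dataset.jsonl"))            # eager: whole file as a list
+    for row in read_jsonl_row("sample_dataset.jsonl"):   # lazy: one json object at a time
+        print(row)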
diff --git a/promptwizard/glue/common/utils/logging.py b/promptwizard/glue/common/utils/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab23bc40c9110ecaf572158e6f3c42d6fa3547e3
--- /dev/null
+++ b/promptwizard/glue/common/utils/logging.py
@@ -0,0 +1,54 @@
+import logging
+from os import makedirs
+from os.path import join
+from logging.handlers import TimedRotatingFileHandler
+
+from ..constants.str_literals import FileConstants
+
+logging_handlers_list = []
+
+
+def set_logging_config(log_dirpath: str, mode: str = "offline") -> None:
+ """
+ Configure the root logging settings for the current run.
+
+ :param log_dirpath: Path to directory where log files should be saved.
+ :param mode: Specifies whether the mode is `online` or `offline`
+ :return:
+ """
+ global logging_handlers_list
+ makedirs(log_dirpath, exist_ok=True)
+ logging.basicConfig(filename=join(log_dirpath, FileConstants.logfile_name),
+ filemode='a',
+ format=u"%(asctime)s.%(msecs)03d | %(name)-12s | %(funcName)s:\n%(message)s\n",
+ datefmt='%Y-%m-%d,%H:%M:%S',
+ level=logging.NOTSET,
+ force=True,
+ encoding="utf-8")
+
+ if mode == "online":
+ daily_split_handler = TimedRotatingFileHandler(FileConstants.logfile_prefix, when="midnight", backupCount=30, encoding="utf-8")
+ daily_split_handler.suffix = "%Y%m%d"
+ logging_handlers_list = [daily_split_handler]
+ else:
+ console = logging.StreamHandler()
+ console.setLevel(logging.NOTSET)
+ logging_handlers_list = [console]
+
+
+def get_glue_logger(module_name: str) -> logging.Logger:
+ """
+ Method to get common logger object for module.
+
+ :param module_name: Name of the module.
+ :return: Logger object, which can be used for logging
+ """
+ global logging_handlers_list
+
+ logger = logging.getLogger(module_name)
+ for handler in logging_handlers_list:
+ logger.addHandler(handler)
+ # TODO: Add handler to log to app insights if Azure connection is ON
+
+ return logger
+
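+
+# Usage sketch (illustrative only; the directory name is a hypothetical placeholder): configure
+# logging once per run, then fetch per-module loggers anywhere in the package.
+if __name__ == "__main__":
+    set_logging_config("demo_logs", mode="offline")
+    demo_logger = get_glue_logger(__name__)
+    demo_logger.info("Logging configured: messages go to the log file and, in offline mode, also to the console.")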
diff --git a/promptwizard/glue/common/utils/runtime_tasks.py b/promptwizard/glue/common/utils/runtime_tasks.py
new file mode 100644
index 0000000000000000000000000000000000000000..3946c598ffe8e2aa6f7dd1e66437f1e0744fa5af
--- /dev/null
+++ b/promptwizard/glue/common/utils/runtime_tasks.py
@@ -0,0 +1,67 @@
+from importlib import import_module
+from importlib.metadata import distribution, PackageNotFoundError
+import os
+from importlib.util import module_from_spec, spec_from_file_location
+
+from os.path import basename, splitext
+import subprocess
+import sys
+
+from ..constants.log_strings import CommonLogsStr
+from ..exceptions import GlueValidaionException
+from ..utils.logging import get_glue_logger
+
+logger = get_glue_logger(__name__)
+
+
+def install_lib_if_missing(lib_name, find_links = None) -> bool:
+ """
+ Check if library with name `lib_name` is installed in environment. If not, install it in runtime.
+
+ :param lib_name: Name of library
+ :return: True if the library was already installed. False if it was missing and has just been installed.
+ """
+ try:
+ version = None
+ if "==" in lib_name:
+ lib_name, version = lib_name.split("==")
+ distri_obj = distribution(lib_name)
+ # if version and distri_obj.version != version:
+ # raise GlueValidaionException(f"{lib_name} with version={distri_obj.version} is found. "
+ # f"But version needed is {version}", None)
+ return True
+ except (PackageNotFoundError, GlueValidaionException):
+ logger.info(CommonLogsStr.INSTALL_MISSING_LIB.format(lib_name=lib_name))
+ with open(os.devnull, 'w') as devnull:
+ if find_links:
+ subprocess.check_call([sys.executable, "-m", "pip", "install", lib_name, "-f", find_links], stdout=devnull, stderr=devnull)
+ else:
+ subprocess.check_call([sys.executable, "-m", "pip", "install", lib_name], stdout=devnull, stderr=devnull)
+
+ return False
+
+
+def str_to_class(class_name: str, import_path: str = None, file_path: str = None):
+ """
+ For a given `class_name` in string format, return the class itself (not an instance of it).
+ Specify either `import_path` or `file_path`. When both are specified, `import_path` takes
+ precedence.
+
+ :param class_name: Class name, specified as string e.g. CSVReader
+ :param import_path: Import path for the specified class_name e.g. llama_index.readers.file
+ :param file_path: Path to the file where this class is present. e.g. C:\\dir1\\sub_dir1\\filename.py
+ :return: Class
+ """
+
+ if import_path:
+ cls = getattr(import_module(import_path), class_name)
+ elif file_path:
+ file_name_without_extsn = splitext(basename(file_path))[0]
+ spec = spec_from_file_location(file_name_without_extsn, file_path)
+ module = module_from_spec(spec)
+ spec.loader.exec_module(module)
+ cls = getattr(module, class_name)
+ else:
+ cls = getattr(sys.modules[__name__], class_name)
+
+ return cls
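+
+
+# Usage sketch (illustrative only): `collections.Counter` and `requests` are just stand-ins for any
+# class / library here; note that install_lib_if_missing may invoke pip, so treat this as a demo.
+if __name__ == "__main__":
+    counter_cls = str_to_class("Counter", import_path="collections")
+    print(counter_cls("promptwizard"))
+    was_already_installed = install_lib_if_missing("requests")
+    print(f"requests was already installed: {was_already_installed}")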
diff --git a/promptwizard/glue/paramlogger/__init__.py b/promptwizard/glue/paramlogger/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb5427ffd4903ef0d777c332f35453c9a1664e6d
--- /dev/null
+++ b/promptwizard/glue/paramlogger/__init__.py
@@ -0,0 +1,141 @@
+__path__ = __import__('pkgutil').extend_path(__path__, __name__)
+
+from collections import defaultdict
+from datetime import datetime
+from os import makedirs
+from os.path import basename, join
+from uuid import uuid4
+
+from . import file_utils as futil
+from .constants import LogLiterals
+from .utils import run_method_get_io_dict
+
+
+class ParamLogger:
+ def __init__(self, base_path: str = ""):
+ """
+ :param base_path: Path where all log files would be saved
+ """
+ self.BASE_PATH = base_path
+ if base_path:
+ makedirs(self.BASE_PATH, exist_ok=True)
+
+ # Unique `id` for a sample in dataset
+ self.SAMPLE_UNQ_ID = None
+
+ # This list can be used when we want to log output and input of multiple components as a single jsonl
+ self.CHAINED_LOG = []
+
+ # When using a ParamLogger decorator over a method in a class, whether to skip logging the argument named `self`
+ self.DEL_SELF_ARG = True
+
+ def reset_eval_glue(self, base_path):
+ # Path where all log files would be saved
+ self.BASE_PATH = base_path
+ makedirs(self.BASE_PATH, exist_ok=True)
+
+ # Unique `id` for a sample in dataset
+ self.SAMPLE_UNQ_ID = None
+
+ # This list can be used when we want to log output and input of multiple components as a single jsonl
+ self.CHAINED_LOG = []
+
+ def clear_chained_log(self):
+ """
+ Delete all previously collected data by re-initializing CHAINED_LOG to an empty list.
+ """
+ self.CHAINED_LOG = []
+
+ def dump_chained_log_to_file(self, file_name="chained_logs"):
+ """
+ Append all data collected in CHAINED_LOG to the file, one json line per entry.
+ Resets CHAINED_LOG to an empty list.
+ """
+
+ file_path = join(self.BASE_PATH, file_name + ".jsonl")
+ futil.save_jsonlist(file_path=file_path, json_list=self.CHAINED_LOG)
+ self.clear_chained_log()
+
+ def append_dict_to_chained_logs(self, args_to_log):
+ self.CHAINED_LOG.append(args_to_log)
+
+ def append_to_chained_log(self, method_obj):
+ """
+ Execute the method referenced by method_obj. After executing, append the jsonl form of inputs and outputs of
+ that method to self.CHAINED_LOG list.
+
+ :param method_obj:
+ :return: None
+ """
+ def wrap(*argv, **kwargs):
+ args_to_log = run_method_get_io_dict(method_obj, self.DEL_SELF_ARG, *argv, **kwargs)
+ args_to_log[LogLiterals.META][LogLiterals.METHOD_NAME] = method_obj.__name__
+ self.CHAINED_LOG.append(args_to_log)
+ return args_to_log[LogLiterals.OUTPUTS]
+ return wrap
+
+ def log_io_params(self, method_obj, file_name="io_logs"):
+ """
+ Execute the method referenced by method_obj. After executing, log the inputs and outputs of that method to
+ log file.
+
+ :param method_obj: Method reference, that can be executed
+ :param file_name: Name of file in which we shall be logging the input output params of method
+ :return: None
+ """
+ def wrap(*argv, **kwargs):
+ args_to_log = run_method_get_io_dict(method_obj, self.DEL_SELF_ARG, *argv, **kwargs)
+ if not self.SAMPLE_UNQ_ID:
+ self.SAMPLE_UNQ_ID = uuid4()
+ args_to_log[LogLiterals.ID] = self.SAMPLE_UNQ_ID
+ args_to_log[LogLiterals.META][LogLiterals.METHOD_NAME] = method_obj.__name__
+ file_path = join(self.BASE_PATH, file_name + ".jsonl")
+ futil.append_as_jsonl(file_path=file_path, args_to_log=args_to_log)
+ self.SAMPLE_UNQ_ID = None
+ return args_to_log[LogLiterals.OUTPUTS]
+ return wrap
+
+ def log_io_params_for_method(self, method_obj):
+ """
+ Execute the method referenced by method_obj. After executing, log the inputs and outputs of that method to
+ log file. Name of log file would be the method name
+
+ :param method_obj: Method reference, that can be executed
+ :return: None
+ """
+ def wrap(*argv, **kwargs):
+ args_to_log = run_method_get_io_dict(method_obj, self.DEL_SELF_ARG, *argv, **kwargs)
+ if not self.SAMPLE_UNQ_ID:
+ self.SAMPLE_UNQ_ID = uuid4()
+ args_to_log[LogLiterals.ID] = self.SAMPLE_UNQ_ID
+ file_path = join(self.BASE_PATH, method_obj.__name__+".jsonl")
+ futil.append_as_jsonl(file_path=file_path, args_to_log=args_to_log)
+ self.SAMPLE_UNQ_ID = None
+ return args_to_log[LogLiterals.OUTPUTS]
+ return wrap
+
+ def run_over_logs(self, method_obj):
+ """
+ Run the method referenced by method_obj over each entry in jsonl file present at location `file_path`.
+ `id`, `inputs`, `outputs` fields in jsonl file at `file_path` can be accessed via dummy_id, dummy_input,
+ dummy_output parameters respectively.
+
+ :param method_obj:
+ :return: None
+ """
+ def wrap(file_path, dummy_id, dummy_input, dummy_output, dummy_meta, **kwargs):
+ eval_file_path = join(self.BASE_PATH, method_obj.__name__ + "_" + basename(file_path))
+ args_to_log = defaultdict(dict)
+
+ for json_obj in futil.read_jsonl_row(file_path):
+ eval_result = method_obj(None,
+ json_obj[LogLiterals.ID],
+ json_obj[LogLiterals.INPUTS],
+ json_obj[LogLiterals.OUTPUTS],
+ json_obj[LogLiterals.META],
+ **kwargs)
+ args_to_log[LogLiterals.ID] = json_obj[LogLiterals.ID]
+ args_to_log[LogLiterals.EVAL_RESULT] = eval_result
+ args_to_log[LogLiterals.META][LogLiterals.TIMESTAMP] = datetime.now()
+ futil.append_as_jsonl(file_path=eval_file_path, args_to_log=args_to_log)
+ return wrap
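+
+
+# Usage sketch (illustrative only; the directory name is a hypothetical placeholder): decorating a
+# function with log_io_params appends one json line with its inputs/outputs to <base_path>/io_logs.jsonl
+# on every call.
+if __name__ == "__main__":
+    demo_logger = ParamLogger(base_path="demo_logs")
+
+    @demo_logger.log_io_params
+    def add(a, b):
+        return a + b
+
+    add(2, 3)  # logs inputs {"a": "2", "b": "3"}, output 5, execution time and a timestamp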
diff --git a/promptwizard/glue/paramlogger/constants.py b/promptwizard/glue/paramlogger/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..48e399c320c9a0c62a370a893f319bf81a419960
--- /dev/null
+++ b/promptwizard/glue/paramlogger/constants.py
@@ -0,0 +1,11 @@
+class LogLiterals:
+ # static variables
+ INPUTS = "inputs"
+ OUTPUTS = "outputs"
+ META = "meta"
+ ID = "id"
+ TIMESTAMP = "timestamp"
+ EXEC_SEC = "execution_time_sec"
+ EVAL_RESULT = "eval_result"
+ METHOD_NAME = "method_name"
+ DIR_NAME = "io_logs"
diff --git a/promptwizard/glue/paramlogger/file_utils.py b/promptwizard/glue/paramlogger/file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd3aad342f4bd9e7c4a7521a81353b72e64389a2
--- /dev/null
+++ b/promptwizard/glue/paramlogger/file_utils.py
@@ -0,0 +1,80 @@
+import json
+from os.path import join
+from typing import Dict, List
+
+
+def read_jsonl(file_path: str) -> List:
+ """
+ This function should be used when size of jsonl file is not too big.
+
+ :param file_path:
+ :return: All json strings in .jsonl file as a list
+ """
+ jsonl_list = []
+ with open(file_path, "r") as fileobj:
+ while True:
+ single_row = fileobj.readline()
+ if not single_row:
+ break
+
+ json_object = json.loads(single_row.strip())
+ jsonl_list.append(json_object)
+ return jsonl_list
+
+
+def read_jsonl_row(file_path: str):
+ """
+
+ :param file_path:
+ :return: Single line from the file. One at a time.
+ """
+ with open(file_path, "r") as fileobj:
+ while True:
+ single_row = fileobj.readline()
+ if not single_row:
+ break
+
+ json_object = json.loads(single_row.strip())
+ yield json_object
+
+
+def append_as_jsonl(file_path: str, args_to_log: Dict):
+ """
+
+ :param file_path:
+ :param args_to_log:
+ :return:
+ """
+ json_str = json.dumps(args_to_log, default=str)
+ with open(file_path, "a") as fileobj:
+ fileobj.write(json_str+"\n")
+
+
+def save_jsonlist(file_path: str, json_list: List, mode: str = "a"):
+ """
+ :param json_list: List of json objects
+ :param file_path: File location to which we shall save content of json_list list, in jsonl format.
+ :param mode: Write mode
+ :return: None
+ """
+ with open(file_path, mode) as file_obj:
+ for json_obj in json_list:
+ json_str = json.dumps(json_obj, default=str)
+ file_obj.write(json_str+"\n")
+
+
+def str_list_to_dir_path(str_list: List[str]) -> str:
+ """
+ Return a string which is directory path formed out of concatenating given strings in list `str_list`
+
+ e.g.
+ str_list=["dir_1", "sub_dir_1"]
+ returns "dir_1/sub_dir_1" (using the OS-specific path separator)
+ """
+ if not str_list:
+ return ""
+
+ path = ""
+ for dir_name in str_list:
+ path = join(path, dir_name)
+ return path
diff --git a/promptwizard/glue/paramlogger/utils.py b/promptwizard/glue/paramlogger/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..894bb8ea3090307b6b86ed066a1d2ca6bcf0b062
--- /dev/null
+++ b/promptwizard/glue/paramlogger/utils.py
@@ -0,0 +1,57 @@
+from collections import defaultdict
+from datetime import datetime
+from inspect import getfullargspec
+from time import time
+from typing import Dict, Hashable
+
+from .constants import LogLiterals
+
+
+def run_method_get_io_dict(method_obj, del_self_arg: bool, *argv, **kwargs) -> Dict:
+ """
+ Run method method_obj with *argv as arguments.
+ Create dictionary of all input/ output and other meta data elements to be eventually logged to file.
+
+ :param method_obj: method reference
+ :param del_self_arg: True if we shouldn't include `self` variable in output dictionary
+ :param argv: Arguments that needs to be passed to method as *argv
+ :param kwargs: Arguments that needs to be passed to method as **kwargs
+
+ :return: Dict that has inputs, outputs and meta data to be logged
+ """
+ args_to_log = defaultdict(dict)
+
+ start_time = time()
+ output = method_obj(*argv, **kwargs)
+ execution_time = time() - start_time
+
+ # get name of input parameters of method method_obj
+ arg_spec = getfullargspec(method_obj)
+ arg_names = arg_spec.args
+ argv_list = list(argv)
+
+ # Capture all *argv values
+ for arg_name, arg_val in zip(arg_names[:len(argv_list)], argv_list):
+ if isinstance(arg_val, Hashable) and not (del_self_arg and arg_name == "self"):
+ args_to_log[LogLiterals.INPUTS][arg_name] = str(arg_val)
+
+ # Capture all **kwargs values
+ args_to_log[LogLiterals.INPUTS].update(kwargs)
+
+ if arg_spec.defaults:
+ default_arg_values = list(arg_spec.defaults)
+ # For args that don't have any value, set defaults
+ arg_with_no_values_count = len(arg_names) - (len(argv_list) + len(kwargs))
+ # Number of arguments for which defaults should be used
+ defaults_count = min(arg_with_no_values_count, len(default_arg_values))
+
+ # Arguments for which values are not passed but defaults are specified, use defaults
+ for arg_name, arg_val in zip(arg_names[-defaults_count:], default_arg_values[-defaults_count:]):
+ if isinstance(arg_val, Hashable):
+ args_to_log[LogLiterals.INPUTS][arg_name] = str(arg_val)
+
+ args_to_log[LogLiterals.OUTPUTS] = output
+ args_to_log[LogLiterals.META][LogLiterals.EXEC_SEC] = execution_time
+ args_to_log[LogLiterals.META][LogLiterals.TIMESTAMP] = datetime.now()
+
+ return args_to_log
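+
+
+# Quick sketch (illustrative only): run a toy function through the helper and inspect the dictionary
+# that the ParamLogger decorators persist to jsonl.
+if __name__ == "__main__":
+    def multiply(x, y=10):
+        return x * y
+
+    io_dict = run_method_get_io_dict(multiply, True, 3)
+    print(io_dict[LogLiterals.INPUTS])   # {'x': '3', 'y': '10'} -- the default for y is also captured
+    print(io_dict[LogLiterals.OUTPUTS])  # 30
+    print(io_dict[LogLiterals.META])     # execution time and timestamp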
diff --git a/promptwizard/glue/promptopt/__init__.py b/promptwizard/glue/promptopt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/promptwizard/glue/promptopt/constants.py b/promptwizard/glue/promptopt/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7245dc45778a77d0bb22be51f97853220f25f0a
--- /dev/null
+++ b/promptwizard/glue/promptopt/constants.py
@@ -0,0 +1,41 @@
+from dataclasses import dataclass
+from enum import Enum
+
+from ..common.base_classes import UniversalBaseClass
+
+
+# Set of Prompt Management Techniques supported by Vellm co-pilot
+# Hyperparameters defined in promptopt_config.yaml
+class SupportedPromptOpt(Enum):
+ CRITIQUE_N_REFINE = "critique_n_refine"
+
+ @classmethod
+ def all_values(cls):
+ return ",".join([member.value for member in SupportedPromptOpt])
+
+ @classmethod
+ def has_value(cls, value):
+ return value in cls._value2member_map_
+
+
+@dataclass
+class PromptOptimizationLiterals:
+ PROMPT_TECHNIQUE_NAME = "prompt_technique_name"
+
+
+@dataclass
+class PromptOptimizationParams(UniversalBaseClass):
+ """
+ Parent class for all Prompt Optimization classes.
+ """
+ prompt_technique_name: str
+
+
+@dataclass
+class PromptPool(UniversalBaseClass):
+ """
+ Parent class for all classes that handle prompt strings for each technique.
+ """
+ system_prompt: str
+ final_prompt: str
+ eval_prompt: str
diff --git a/promptwizard/glue/promptopt/instantiate.py b/promptwizard/glue/promptopt/instantiate.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d2a105e15efb468c7cdbfca4cdb0659f01dad5a
--- /dev/null
+++ b/promptwizard/glue/promptopt/instantiate.py
@@ -0,0 +1,178 @@
+from os.path import dirname, join
+import pickle
+import time
+from typing import Any
+
+from ..common.base_classes import LLMConfig, SetupConfig
+from ..common.constants.log_strings import CommonLogsStr
+from ..common.llm.llm_mgr import LLMMgr
+from ..common.utils.logging import get_glue_logger, set_logging_config
+from ..common.utils.file import read_jsonl, yaml_to_class, yaml_to_dict, read_jsonl_row
+from ..paramlogger import ParamLogger
+from ..promptopt.constants import PromptOptimizationLiterals
+from ..promptopt.techniques.common_logic import DatasetSpecificProcessing
+from ..promptopt.utils import get_promptopt_class
+
+
+class GluePromptOpt:
+ """
+ This class is the trigger point for any prompt optimization method. Different prompt optimization techniques are
+ represented by different classes. This class collates all the user configs present in different yaml files and
+ handles the other boilerplate code. Any of the supported prompt optimization techniques can be triggered by this class.
+ """
+ BEST_PROMPT = None
+ EXPERT_PROFILE = None
+ data_processor = None
+ iolog = ParamLogger()
+
+ class EvalLiterals:
+ IS_CORRECT = "is_correct"
+ PREDICTED_ANS = "predicted_ans"
+ LLM_OUTPUT = "llm_output"
+
+ def __init__(self,
+ prompt_config_path: str,
+ setup_config_path: str,
+ dataset_jsonl: str,
+ data_processor: DatasetSpecificProcessing,
+ dataset_processor_pkl_path: str = None,
+ prompt_pool_path: str = None):
+ """
+ Collates all the configs present in different yaml files. Initialize logger, de-serialize pickle file that has
+ class/method for dataset processing (for given dataset).
+
+ :param prompt_config_path: Path to yaml file that has prompt templates for the given techniques.
+ :param setup_config_path: Path to yaml file that has user preferences.
+ :param dataset_jsonl: Path to jsonl file that has dataset present in jsonl format.
+ :param data_processor: object of DatasetSpecificProcessing class, which has data handling methods which are
+ specific to that dataset
+ :param dataset_processor_pkl_path: Path to pickle file that has object of class DatasetSpecificProcessing
+ serialized.
+ :param prompt_pool_path: Path to yaml file that has prompts
+ """
+ if dataset_jsonl is not None:
+ if data_processor:
+ self.data_processor = data_processor
+ else:
+ with open(dataset_processor_pkl_path, "rb") as file:
+ self.data_processor = pickle.load(file) # datatype: class DatasetSpecificProcessing
+
+ prompt_config_dict = yaml_to_dict(prompt_config_path)
+ prompt_opt_cls, prompt_opt_hyperparam_cls, promptpool_cls = get_promptopt_class(
+ prompt_config_dict[PromptOptimizationLiterals.PROMPT_TECHNIQUE_NAME])
+
+ self.setup_config = yaml_to_class(setup_config_path, SetupConfig)
+ self.prompt_opt_param = yaml_to_class(prompt_config_path, prompt_opt_hyperparam_cls)
+ current_dir = dirname(__file__)
+ default_yaml_path = join(current_dir,
+ "techniques",
+ prompt_config_dict[PromptOptimizationLiterals.PROMPT_TECHNIQUE_NAME],
+ "prompt_pool.yaml")
+
+ self.prompt_pool = yaml_to_class(prompt_pool_path, promptpool_cls, default_yaml_path)
+
+ if dataset_jsonl is not None:
+ dataset = read_jsonl(dataset_jsonl)
+ self.prompt_opt_param.answer_format += self.prompt_pool.ans_delimiter_instruction
+ base_path = join(self.setup_config.dir_info.base_dir, self.setup_config.experiment_name)
+ set_logging_config(join(base_path, self.setup_config.dir_info.log_dir_name),
+ self.setup_config.mode)
+ self.logger = get_glue_logger(__name__)
+
+ if dataset_jsonl is not None:
+ if len(dataset) < self.prompt_opt_param.seen_set_size:
+ self.logger.info(f"Dataset has only {len(dataset)} samples, but seen_set_size is set to "
+ f"{self.prompt_opt_param.seen_set_size}. Hence resetting seen_set_size "
+ f"to {len(dataset)}")
+ self.prompt_opt_param.seen_set_size = len(dataset)
+
+ if self.prompt_opt_param.few_shot_count > self.prompt_opt_param.seen_set_size:
+ self.logger.info(f"few_shot_count is set to {self.prompt_opt_param.few_shot_count}, which exceeds "
+ f"seen_set_size of {self.prompt_opt_param.seen_set_size}. Hence resetting few_shot_count "
+ f"to {self.prompt_opt_param.seen_set_size}")
+ self.prompt_opt_param.few_shot_count = self.prompt_opt_param.seen_set_size
+
+ if dataset_jsonl is not None:
+ training_dataset = dataset[:self.prompt_opt_param.seen_set_size]
+ else:
+ training_dataset = None
+ self.logger.info(f"Setup configurations parameters: {self.setup_config} \n{CommonLogsStr.LOG_SEPERATOR}")
+ self.logger.info(f"Prompt Optimization parameters: {self.prompt_opt_param} \n{CommonLogsStr.LOG_SEPERATOR}")
+
+ # This iolog is going to be used when doing complete evaluation over test-dataset
+ self.iolog.reset_eval_glue(join(base_path, "evaluation"))
+
+ self.prompt_opt = prompt_opt_cls(training_dataset, base_path, self.setup_config,
+ self.prompt_pool, self.data_processor, self.logger)
+
+ def get_best_prompt(self, use_examples=False, run_without_train_examples=False, generate_synthetic_examples=False) -> (str, Any):
+ """
+ Call get_best_prompt() method of class PromptOptimizer & return its value.
+ :return: (best_prompt, expert_profile)
+ best_prompt-> Best prompt for a given task description
+ expert_profile-> Description of an expert who is apt to solve the task at hand. The LLM would be asked to
+ take on the identity described in expert_profile.
+ """
+ start_time = time.time()
+ self.BEST_PROMPT, self.EXPERT_PROFILE = self.prompt_opt.get_best_prompt(self.prompt_opt_param,use_examples=use_examples,run_without_train_examples=run_without_train_examples,generate_synthetic_examples=generate_synthetic_examples)
+
+ self.logger.info(f"Time taken to find best prompt: {(time.time() - start_time)} sec")
+ return self.BEST_PROMPT, self.EXPERT_PROFILE
+
+ def evaluate(self, test_dataset_jsonl: str) -> float:
+ """
+ Evaluate the performance of self.BEST_PROMPT over test dataset. Return the accuracy.
+
+ :param test_dataset_jsonl: Path to jsonl file that has test dataset
+ :return: Percentage accuracy
+ """
+
+ start_time = time.time()
+ self.logger.info(f"Evaluation started {CommonLogsStr.LOG_SEPERATOR}")
+ if not self.BEST_PROMPT:
+ self.logger.error("BEST_PROMPT attribute is not set. Please set self.BEST_PROMPT attribute of this object, "
+ "either manually or by calling get_best_prompt() method.")
+ return
+
+ total_correct = 0
+ total_count = 0
+ for json_obj in read_jsonl_row(test_dataset_jsonl):
+ answer = self.predict_and_access(json_obj[DatasetSpecificProcessing.QUESTION_LITERAL],
+ json_obj[DatasetSpecificProcessing.FINAL_ANSWER_LITERAL])
+
+ total_correct += answer[self.EvalLiterals.IS_CORRECT]
+ total_count += 1
+ result = {"accuracy": f"{total_correct}/{total_count} : {total_correct/total_count}%",
+ "predicted": answer[self.EvalLiterals.PREDICTED_ANS],
+ "actual": json_obj[DatasetSpecificProcessing.FINAL_ANSWER_LITERAL]}
+ self.iolog.append_dict_to_chained_logs(result)
+ self.logger.info(result)
+
+ self.iolog.dump_chained_log_to_file(file_name=f"eval_result_{self.setup_config.experiment_name}")
+ self.logger.info(f"Time taken for evaluation: {(time.time() - start_time)} sec")
+ return total_correct / total_count
+
+ @iolog.log_io_params
+ def predict_and_access(self, question: str, gt_answer: str) -> dict:
+ """
+ For the given input question, get an answer to it from the LLM, using the BEST_PROMPT & EXPERT_PROFILE
+ computed earlier.
+
+ :param question: Question to be asked to the LLM, to solve
+ :param gt_answer: Ground truth, final answer.
+ :return: Dict with keys is_correct, predicted_ans and llm_output.
+ is_correct -> Tells if the prediction by the LLM was correct.
+ predicted_ans -> The actual predicted answer by the LLM.
+ llm_output -> Output text generated by the LLM for the given question
+ :rtype: dict
+ """
+ final_prompt = self.prompt_pool.eval_prompt.format(instruction=self.BEST_PROMPT,
+ question=question)
+ llm_output = self.prompt_opt.chat_completion(user_prompt=final_prompt, system_prompt=self.EXPERT_PROFILE)
+
+ is_correct, predicted_ans = self.data_processor.access_answer(llm_output, gt_answer)
+ return {self.EvalLiterals.IS_CORRECT: is_correct,
+ self.EvalLiterals.PREDICTED_ANS: predicted_ans,
+ self.EvalLiterals.LLM_OUTPUT: llm_output}
+
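+
+# Usage sketch (illustrative only; every file path here is a hypothetical placeholder supplied by the
+# caller): wire up the configs, search for the best prompt, then evaluate it on a held-out test set.
+if __name__ == "__main__":
+    gp = GluePromptOpt(prompt_config_path="configs/promptopt_config.yaml",
+                       setup_config_path="configs/setup_config.yaml",
+                       dataset_jsonl="data/train.jsonl",
+                       data_processor=None,  # falls back to the pickled DatasetSpecificProcessing object
+                       dataset_processor_pkl_path="data/processor.pkl")
+    best_prompt, expert_profile = gp.get_best_prompt()
+    print(f"Best prompt: {best_prompt}\nExpert profile: {expert_profile}")
+    print(f"Test accuracy: {gp.evaluate('data/test.jsonl')}")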
diff --git a/promptwizard/glue/promptopt/runner.py b/promptwizard/glue/promptopt/runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..084c53221a4fe1ca861f53371d3b28d17d65187f
--- /dev/null
+++ b/promptwizard/glue/promptopt/runner.py
@@ -0,0 +1,29 @@
+import argparse
+from glue.promptopt.instantiate import GluePromptOpt
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description="Arguments needed by prompt manager")
+ parser.add_argument('--llm_config_path', default=None)
+ parser.add_argument('--prompt_config_path', default=None)
+ parser.add_argument('--setup_config_path', default=None)
+ parser.add_argument('--train_file_name', default=None)
+ parser.add_argument('--test_file_name', default=None)
+ parser.add_argument('--dataset_processor_pkl_path', default=None)
+ parser.add_argument('--prompt_pool_path', default=None)
+
+ args = parser.parse_args()
+
+ gp = GluePromptOpt(args.prompt_config_path,
+ args.setup_config_path,
+ args.train_file_name,
+ None,  # no in-code data processor; the pickled one at dataset_processor_pkl_path is used
+ args.dataset_processor_pkl_path,
+ args.prompt_pool_path)
+
+ best_prompt, expert_profile = gp.get_best_prompt()
+ print(f"Best prompt: {best_prompt} \nExpert profile: {expert_profile}")
+
+ if args.test_file_name:
+ accuracy = gp.evaluate(args.test_file_name)
+ print(f"accuracy: {accuracy}")
+
diff --git a/promptwizard/glue/promptopt/techniques/__init__.py b/promptwizard/glue/promptopt/techniques/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/promptwizard/glue/promptopt/techniques/common_logic.py b/promptwizard/glue/promptopt/techniques/common_logic.py
new file mode 100644
index 0000000000000000000000000000000000000000..13252b5385ad450e04a07e1b652a038573d96d21
--- /dev/null
+++ b/promptwizard/glue/promptopt/techniques/common_logic.py
@@ -0,0 +1,123 @@
+from abc import abstractmethod, ABC
+from typing import Any, List
+
+from ..constants import PromptOptimizationParams
+
+
+class PromptOptimizer(ABC):
+ """
+ Parent class for all prompt optimization techniques.
+ """
+ TECHNIQUE_NAME = ""
+
+ @abstractmethod
+ def get_best_prompt(self, params: PromptOptimizationParams) -> (str, Any):
+ """Method that will return best prompt for given task description, base instruction and few shot examples"""
+ pass
+
+
+class DatasetSpecificProcessing(ABC):
+ """
+ Prompt Optimizer is agnostic of the dataset on which it's run. There are a few processing requirements that are
+ specific to the dataset. This class should be inherited by a class that the user defines, and its methods should
+ be implemented based on their dataset & use-case.
+ """
+ QUESTION_LITERAL = "question"
+ ANSWER_WITH_REASON_LITERAL = "answer"
+ FINAL_ANSWER_LITERAL = "final_answer"
+ QUESTION_KEY_IN_PROMPT = "[Question]"
+ ANSWER_KEY_IN_PROMPT = "[Answer]"
+ # Regular expression pattern to match text between <START> and <END> tags
+ TEXT_DELIMITER_PATTERN = r"(?s)(?<=<START>)(.*?)(?=<END>)"
+ TEXT_DELIMITER_PATTERN_MUTATION = r"(?s)(?<=<START>)(.*?)(?=<END>)"
+ ANSWER_START = "<ANS_START>"
+ ANSWER_END = "<ANS_END>"
+ ANSWER_DELIMITER_PATTERN = r"(?s)(?<=" + ANSWER_START + ")(.*?)(?=" + ANSWER_END + ")"
+ INVALID_ANS = "[invalid]"
+ FINAL_PROMPT = None
+
+
+ def normalize_prediction(self, prediction, lowercase=True):
+ import re
+ import string
+ prediction = prediction.replace(' and ', ' ')
+ prediction = prediction.replace('Sentence 1:', ' ')
+ prediction = prediction.replace('Sentence 2:', ' ')
+ prediction = prediction.strip()
+ prediction = prediction.split("\n")[0]
+ prediction = prediction.split(".")[0]
+
+ if lowercase:
+ prediction = prediction.lower()
+
+ # remove punctuation
+ prediction = prediction.replace('-', ' ')
+ prediction = prediction.translate(
+ str.maketrans('', '', string.punctuation))
+
+ return prediction
+ def access_answer(self, llm_output: str, gt_answer: str) -> (bool, Any):
+ """
+ Compare answer generated by model with the answer in ground truth.
+ Return True if they are equal. Definition of `equal` depends on problem at hand.
+ Here only the default implementation is provided. This method should be overridden & custom defined
+ based on end use-case.
+
+ :param llm_output: Output of LLM i.e. the predicted answer
+ :param gt_answer: The expected ground truth answer
+ """
+
+ predicted_answer = self.extract_final_answer(llm_output)
+ is_correct = False
+ if predicted_answer and (predicted_answer.lower() == gt_answer.lower()):
+ is_correct = True
+
+ return is_correct, predicted_answer
+
+
+
+ def collate_to_str(self, examples: List, example_template: str) -> str:
+ """
+ Take as input a list of examples. Populate common template with values in these examples. Concatenate all of
+ them to a single string, which can then be passed to LLM as prompt.
+
+ :param examples: List of examples
+ :param example_template: A template of giving examples to LLM as part of few shot learning
+ :return: Concatenated string of all examples over the template.
+ """
+ example_string = ""
+ for example in examples:
+ answer = example[DatasetSpecificProcessing.FINAL_ANSWER_LITERAL]
+ if DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL in example:
+ answer = example[DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL]
+
+ example_string += example_template.format(question=example[DatasetSpecificProcessing.QUESTION_LITERAL],
+ answer=answer)
+ return example_string
+
+ def extract_final_answer(self, answer: str) -> str:
+ """
+ Parse the output of LLM and extract the answer that you need from it.
+ Here only the default implementation is provided. This method should be overridden & custom defined
+ based on end use-case.
+
+ :param answer: Output of LLM i.e. the response to the question asked.
+ :return: Final answer extracted from `answer` text, that we are looking for.
+ """
+
+ return answer
+
+ @abstractmethod
+ def dataset_to_jsonl(self, dataset_jsonl: str, task: str, **kwargs: Any) -> None:
+ """
+ Prompt optimizer needs data in jsonl format. Each json string should look like the example below:
+ {
+ 'question': 'I had 3 books. I gave 2 books to Ram. How many books do I have now ?',
+ 'reason': 'Number of books that I had initially=3. Number of books I have after giving 2 books to Ram=3-2=1.',
+ 'answer': '1'
+ }
+
+ :param dataset_jsonl: Path of file in which jsonl data should be saved.
+ :param **kwargs: List of other user defined input parameters.
+ """
+ pass
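+
+
+# Sketch of a user-defined processor (illustrative only): a minimal DatasetSpecificProcessing subclass
+# that pulls the final answer out of the ANSWER_START/ANSWER_END delimiters.
+if __name__ == "__main__":
+    import re
+
+    class DemoProcessor(DatasetSpecificProcessing):
+        def extract_final_answer(self, answer: str) -> str:
+            matches = re.findall(DatasetSpecificProcessing.ANSWER_DELIMITER_PATTERN, answer)
+            return matches[-1].strip() if matches else answer.strip()
+
+        def dataset_to_jsonl(self, dataset_jsonl: str, task: str, **kwargs: Any) -> None:
+            # Stub: converting the raw dataset to jsonl is use-case specific.
+            pass
+
+    processor = DemoProcessor()
+    sample_output = f"Reasoning...{DatasetSpecificProcessing.ANSWER_START}42{DatasetSpecificProcessing.ANSWER_END}"
+    print(processor.access_answer(sample_output, "42"))  # -> (True, '42')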
diff --git a/promptwizard/glue/promptopt/techniques/critique_n_refine/__init__.py b/promptwizard/glue/promptopt/techniques/critique_n_refine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/promptwizard/glue/promptopt/techniques/critique_n_refine/base_classes.py b/promptwizard/glue/promptopt/techniques/critique_n_refine/base_classes.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f0259ee3eee41f870fef1430c86bd2b896ddefa
--- /dev/null
+++ b/promptwizard/glue/promptopt/techniques/critique_n_refine/base_classes.py
@@ -0,0 +1,69 @@
+from dataclasses import dataclass
+from typing import List
+
+from ....common.base_classes import UniversalBaseClass
+from ...constants import PromptOptimizationParams, PromptPool
+
+
+@dataclass
+class CritiqueNRefinePromptPool(PromptPool):
+ quest_reason_ans: str
+ expert_profile: str
+ ans_delimiter_instruction: str
+ intent_template: str
+ thinking_styles: List[str]
+ meta_critique_template: str
+ meta_positive_critique_template: str
+ critique_refine_template: str
+ solve_template: str
+ examples_critique_template: str
+ examples_optimization_template: str
+ meta_sample_template: str
+ expert_template: str
+ generate_reason_template: str
+ reason_optimization_template: str
+ examples_critique_template_zero_shot: str
+
+
+@dataclass
+class CritiqueNRefineParams(PromptOptimizationParams, UniversalBaseClass):
+ unique_model_id: str
+ # Number of candidate prompts to generate in given iteration
+ style_variation: int
+ # Number of questions to be asked to LLM in a single go
+ questions_batch_size: int
+ # Number of batches of questions that must be answered correctly, for a prompt to be considered as performing well
+ min_correct_count: int
+ # Max number of mini-batches on which we should evaluate our prompt
+ max_eval_batches: int
+ # Number of top best performing prompts to be considered for next iterations
+ top_n: int
+ # Number of rounds of mutation to be performed when generating different styles
+ mutation_rounds: int
+ # Refine instruction post mutation
+ refine_instruction: bool
+ # Number of iterations for conducting rounds of mutation of task description
+ # followed by refinement of instructions
+ mutate_refine_iterations: int
+ # Number of iterations for refining task description and in context examples for few-shot
+ refine_task_eg_iterations: int
+ # Description of task. This will be fed to prompt
+ task_description: str
+ # Base instruction, in line with your dataset. This will be fed to prompt
+ base_instruction: str
+ # Instruction for specifying answer format
+ answer_format: str
+ # Number of samples from dataset, set aside as training data. In every iteration we would be drawing
+ # `questions_batch_size` examples from training data with replacement.
+ seen_set_size: int
+ # Number of examples to be given for few shots
+ few_shot_count: int
+ # Generate synthetic reasoning
+ generate_reasoning: bool
+ # Generate description of an expert which can solve the task at hand
+ generate_expert_identity: bool
+ # Generate keywords that describe the intent of the task
+ generate_intent_keywords: bool
+ # number of synthetic training examples to be generated
+ num_train_examples: int
diff --git a/promptwizard/glue/promptopt/techniques/critique_n_refine/core_logic.py b/promptwizard/glue/promptopt/techniques/critique_n_refine/core_logic.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce13dae47b19f3b5e13a09a7a4234fd5680cdef4
--- /dev/null
+++ b/promptwizard/glue/promptopt/techniques/critique_n_refine/core_logic.py
@@ -0,0 +1,609 @@
+import random
+import re
+from os.path import join
+from tqdm import tqdm
+from typing import Any, Dict, List
+import json
+
+from ....paramlogger import ParamLogger
+from ....paramlogger.constants import LogLiterals
+from ....common.base_classes import SetupConfig, UniversalBaseClass
+from ....common.llm.llm_mgr import LLMMgr
+from ....common.constants.log_strings import CommonLogsStr
+from ...constants import PromptOptimizationParams, SupportedPromptOpt
+from ...techniques.common_logic import DatasetSpecificProcessing, PromptOptimizer
+from ...techniques.critique_n_refine.base_classes import CritiqueNRefinePromptPool
+
+
+def extract_between(start, end, text):
+ """
+ Extracts the substring from 'text' that is between 'start' and 'end' strings.
+
+ Parameters:
+ - start (str): The starting delimiter string.
+ - end (str): The ending delimiter string.
+ - text (str): The text to search within.
+
+ Returns:
+ - str: The extracted substring between the start and end delimiters.
+ """
+ start_index = text.find(start)
+ if start_index == -1:
+ return ''
+
+ start_index += len(start)
+
+ end_index = text.find(end, start_index)
+ if end_index == -1:
+ return ''
+ return text[start_index:end_index]
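+
+# Quick sketch of the helper above (the delimiters are arbitrary examples):
+#   extract_between("<ANS_START>", "<ANS_END>", "foo <ANS_START>42<ANS_END> bar")  -> "42"
+#   extract_between("<ANS_START>", "<ANS_END>", "no delimiters here")              -> ""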
+
+
+class CritiqueNRefine(PromptOptimizer, UniversalBaseClass):
+ """
+ Critique-and-refine prompt optimization: candidate prompts are generated by mutating the base instruction
+ with different thinking styles, scored on mini-batches of training examples, then critiqued and refined by
+ the LLM over multiple iterations.
+ """
+
+ TECHNIQUE_NAME = SupportedPromptOpt.CRITIQUE_N_REFINE.value
+
+ class GetPromptScoreIndex:
+ """
+ Class to hold constants. Output of get_prompt_score() method is a list.
+ This class stores mapping between output entity and its index in output of get_prompt_score() method.
+ """
+ PROMPT_STR = 0
+ SCORE = 1
+ DATASET = 2
+
+ # This has to be defined outside of the constructor, so that it can be used as a decorator.
+ iolog = ParamLogger()
+
+ def __init__(self, dataset: List, base_path: str, setup_config: SetupConfig,
+ prompt_pool: CritiqueNRefinePromptPool, data_processor: DatasetSpecificProcessing, logger):
+ self.dataset = dataset
+ self.setup_config = setup_config
+ self.data_processor = data_processor
+ self.logger = logger
+ self.prompt_pool = prompt_pool
+ base_path = join(base_path, LogLiterals.DIR_NAME)
+ self.iolog.reset_eval_glue(base_path)
+
+ @iolog.log_io_params
+ def chat_completion(self, user_prompt: str, system_prompt: str = None):
+ """
+ Make a chat completion request to the OpenAI API.
+
+ :param user_prompt: Text spoken by user in a conversation.
+ :param system_prompt: Text spoken by system in a conversation.
+ :return: Output of LLM
+ """
+ if not system_prompt:
+ system_prompt = self.prompt_pool.system_prompt
+
+ messages = [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_prompt}
+ ]
+ response = LLMMgr.chat_completion(messages)
+ return response
+
+ @iolog.log_io_params
+ def gen_different_styles(self, base_instruction: str, task_description: str,
+ mutation_rounds: int = 2, thinking_styles_count: int = 10) -> List:
+ """
+ Generate different variations of base_instruction by mixing thinking styles.
+
+ :param base_instruction: Instruction given to LLM to solve the task defined in task_description.
+ :param task_description: Description of the task to be solved.
+ :param mutation_rounds: Number of rounds of mutation to be performed when generating different styles.
+ :param thinking_styles_count: Number of different thinking styles descriptions to be taken from the pool of
+ thinking styles and given to LLM as reference (in context).
+
+ :return: List of prompts generated in `mutation_rounds` rounds of mutation.
+ """
+ candidate_prompts = [task_description + "\n" + base_instruction]
+
+ for mutation_round in range(mutation_rounds):
+ mutated_sample_prompt = self.prompt_pool.meta_sample_template.format(
+ task_description=task_description,
+ meta_prompts="\n".join(self.prompt_pool.thinking_styles[:thinking_styles_count]),
+ num_variations=thinking_styles_count,
+ prompt_instruction=base_instruction)
+ generated_mutated_prompt = self.chat_completion(mutated_sample_prompt)
+ # Find all matches of the pattern in the text
+ matches = re.findall(DatasetSpecificProcessing.TEXT_DELIMITER_PATTERN_MUTATION, generated_mutated_prompt)
+ candidate_prompts.extend(matches)
+
+ self.logger.info(f"mutation_round={mutation_round} mutated_sample_prompt={mutated_sample_prompt}"
+ f"mutated_prompt_generation={generated_mutated_prompt}")
+
+ return candidate_prompts
+
+ @iolog.log_io_params
+ def critique_and_refine(self, prompt: str, critique_example_set: List,
+ further_enhance: bool = False) -> str:
+ """
+ For the given prompt and examples, generate critique using LLM. Then using the generated critique, refine the prompt using LLM.
+
+ :param prompt: Initial prompt
+ :param critique_example_set: Set of examples to be given in context (as few shots)
+ :param further_enhance: True if the initial prompt answered more questions correctly than the expected threshold,
+ i.e. we try to further optimize an already good prompt.
+ False if the initial prompt answered fewer questions correctly than the expected
+ threshold, i.e. we try to improve a poorly performing prompt.
+ :return: refined prompt
+ """
+ example_string = self.data_processor.collate_to_str(critique_example_set,
+ self.prompt_pool.quest_reason_ans)
+
+ if further_enhance:
+ # Prompt to get critique on the prompt for which we got the examples right
+ meta_critique_prompt = self.prompt_pool.meta_positive_critique_template
+ else:
+ # Prompt to get critique on the prompt for which we got the examples wrong
+ meta_critique_prompt = self.prompt_pool.meta_critique_template
+
+ meta_critique_prompt = meta_critique_prompt.format(instruction=prompt, examples=example_string)
+
+ critique_text = self.chat_completion(meta_critique_prompt, self.prompt_pool.expert_profile)
+ critique_refine_prompt = self.prompt_pool.critique_refine_template.format(instruction=prompt,
+ examples=example_string,
+ critique=critique_text,
+ steps_per_sample=1)
+
+ refined_prompts = self.chat_completion(critique_refine_prompt, self.prompt_pool.expert_profile)
+
+ refined_prompts = re.findall(DatasetSpecificProcessing.TEXT_DELIMITER_PATTERN, refined_prompts)
+
+ if refined_prompts:
+ final_refined_prompts = refined_prompts[0]
+ else:
+ raise ValueError("The LLM ouput is not in the expected format. Please rerun the code...")
+
+ self.logger.info(f"Prompt to get critique:\n {meta_critique_prompt}"
+ f"critique received from LLM:\n {critique_text}"
+ f"Prompt to get Refinement after critique, from LLM:\n {critique_refine_prompt}"
+ f"Refined prompts received from LLM:\n {final_refined_prompts}")
+
+ return final_refined_prompts
+
+ @iolog.log_io_params
+ def get_prompt_score(self, instructions: List[str], params: PromptOptimizationParams) -> List:
+ """
+ For each of the input prompts, make the LLM answer a set of questions from the dataset.
+ Check if the answers are correct. Assign a score to each prompt based on the number of batches of questions
+ answered correctly. Evaluation of a prompt stops once it answers `min_correct_count` batches correctly,
+ answers a batch wrongly, or exhausts `max_eval_batches` batches.
+
+ :params instructions: Prompts using which we'll try to solve the task
+ :params params: Object of PromptOptimizationParams class, that has hyperparameters related to prompt
+ optimization technique in context.
+ :return: A tuple with (Prompt string,
+ score corresponding to that prompt,
+ set of examples over which we evaluated)
+ """
+ prompt_score_list = []
+
+ for instruction in instructions:
+ correct_count, count = 0, 0
+ critique_example_set = []
+ dataset_subset = random.sample(self.dataset, params.questions_batch_size)
+ questions_pool = [example[DatasetSpecificProcessing.QUESTION_LITERAL] for example in dataset_subset]
+ while not critique_example_set and \
+ correct_count < params.min_correct_count and \
+ count < params.max_eval_batches:
+ count += 1
+ solve_prompt = self.prompt_pool.solve_template.format(
+ questions_batch_size=params.questions_batch_size,
+ answer_format=params.answer_format,
+ instruction=instruction,
+ questions='\n'.join(questions_pool))
+
+ generated_text = self.chat_completion(solve_prompt)
+ critique_example_set = self.evaluate(generated_text, dataset_subset)
+ if not critique_example_set:
+ # If all the questions were answered correctly, then we need to get a new set of questions to answer
+ dataset_subset = random.sample(self.dataset, params.questions_batch_size)
+ questions_pool = [example[DatasetSpecificProcessing.QUESTION_LITERAL] for example in dataset_subset]
+ correct_count += 1
+ self.logger.debug(f"critique_example_set={critique_example_set}, correct_count={correct_count}")
+ prompt_score_list.append([instruction, correct_count/count, dataset_subset])
+
+ self.logger.info(f"prompt_score_list {prompt_score_list}")
+ return prompt_score_list
+
+ @iolog.log_io_params
+ def refine_prompts(self, prompt_score_list: List, params: PromptOptimizationParams) -> List:
+ """
+ Further refine the prompts differently based on whether they got the subset of questions right or wrong.
+
+ :param prompt_score_list: List of (prompt string, score for that prompt string,
+ set of examples given in context)
+ :param params: Object of class having hyperparameters for Prompt Optimization.
+ :return: List of prompts, which were refined over input prompts.
+ """
+ refined_prompts = []
+ for prompt, score, critique_example_set in prompt_score_list:
+ if score >= params.min_correct_count/params.max_eval_batches:
+ # if it's good enough prompt, how to mutate on that
+ refined_prompts.append(self.critique_and_refine(prompt, critique_example_set, True))
+ else:
+ # if it's not good enough prompt, how to mutate on that
+ refined_prompts.append(self.critique_and_refine(prompt, critique_example_set))
+
+ self.logger.info(f"refined_prompts {refined_prompts}")
+ return refined_prompts
+
+ @iolog.log_io_params
+ def evaluate(self, generated_text: str, dataset_subset: List) -> List:
+ """
+ Compare predicted answers with actual answers from the dataset.
+ Return the list of questions for which the predicted answer was wrong.
+
+ :param generated_text: Output of LLM, that has answers for a mini-batch of questions
+ (which were send in single go)
+ :param dataset_subset: List of examples with question and ground truth.
+ :return: List of examples that were wrongly classified.
+ """
+ # Find all matches of the pattern in the text
+ answer_matches = re.findall(DatasetSpecificProcessing.ANSWER_DELIMITER_PATTERN, generated_text)
+
+ # answer_matches = [self.chat_completion(FINAL_ANSWER_EXTRACTION_PROMPT.format(text=generated_text), "You are an AI assistant. Please follow the users requests.")]
+ answer_matches = [generated_text]
+ #
+ answers_len, dataset_len = len(answer_matches), len(dataset_subset)
+ if answers_len != dataset_len:
+ self.logger.info(f"Answers extracted from LLM output={answers_len}, Questions asked to LLM {dataset_len}")
+ if answers_len > dataset_len:
+ # Select last `dataset_len` number of extractions as final.
+ answer_matches = answer_matches[-dataset_len:]
+
+ wrong_examples = []
+ for i in range(min(answers_len, dataset_len)):
+ print("dataset_subset", dataset_subset)
+ actual_answer = dataset_subset[i][DatasetSpecificProcessing.FINAL_ANSWER_LITERAL]
+ question = dataset_subset[i][DatasetSpecificProcessing.QUESTION_LITERAL]
+ is_correct, _ = self.data_processor.access_answer(answer_matches[i], actual_answer)
+ if not is_correct:
+ wrong_examples.append(dataset_subset[i])
+ #
+ return wrong_examples
+
+ @iolog.log_io_params
+ def select_top_prompts(self, prompt_score_list: List, top_n: int) -> List:
+ """
+ Sort prompts in prompt_score_list, based on its performance. And return max, top `top_n` prompts.
+
+ :param prompt_score_list: List of (prompt string, score for that prompt string,
+ set of examples given in context)
+ :param top_n: Max number of prompts from the top of the list, that we need to return
+ :return: List of top `top_n` prompts.
+ """
+ sorted_prompts = sorted(prompt_score_list, key=lambda x: [x[self.GetPromptScoreIndex.SCORE],
+ len(x[self.GetPromptScoreIndex.PROMPT_STR])],
+ reverse=True)
+ sorted_top_n_prompts = sorted_prompts[:top_n]
+ self.logger.debug(f"Sorted top n prompts: {sorted_top_n_prompts}")
+ return sorted_top_n_prompts
+
+ def extract_examples_frm_response(self, response_with_examples: str) -> List:
+ """
+ Extract the elements that constitute an example in the dataset, viz. the question, the reasoning for the answer
+ and the answer itself. Put these elements into a list and return it.
+
+ :param response_with_examples: Response of LLM which has synthetic examples.
+ :return: A list of synthetic examples
+ """
+ #
+ synthetic_examples = []
+ parsed_data = re.findall(DatasetSpecificProcessing.TEXT_DELIMITER_PATTERN, response_with_examples, re.DOTALL)
+ parsed_data = [s.strip() for s in parsed_data]
+
+ for text in parsed_data:
+ # Splitting text into question, reason, and answer
+ if DatasetSpecificProcessing.QUESTION_KEY_IN_PROMPT in text and \
+ DatasetSpecificProcessing.ANSWER_KEY_IN_PROMPT in text:
+ question = text[text.find(DatasetSpecificProcessing.QUESTION_KEY_IN_PROMPT) +
+ len(DatasetSpecificProcessing.QUESTION_KEY_IN_PROMPT):
+ text.find(DatasetSpecificProcessing.ANSWER_KEY_IN_PROMPT)].strip()
+ answer_with_reason = text[text.find(DatasetSpecificProcessing.ANSWER_KEY_IN_PROMPT) +
+ len(DatasetSpecificProcessing.ANSWER_KEY_IN_PROMPT):].strip()
+
+ if self.data_processor is not None:
+ final_answer = self.data_processor.extract_final_answer(answer_with_reason)
+ else:
+ final_answer = extract_between(text=answer_with_reason, start=DatasetSpecificProcessing.ANSWER_START, end=DatasetSpecificProcessing.ANSWER_END)
+
+
+ formatted_data = {
+ DatasetSpecificProcessing.QUESTION_LITERAL: question,
+ DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: answer_with_reason,
+ DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: final_answer
+ }
+
+ synthetic_examples.append(formatted_data)
+
+ return synthetic_examples
+
+ def generate_reasoning(self, task_description: str, instruction: str, question: str, answer: str) -> str:
+ """
+ For the given question return the reasoning that's needed to arrive at the provided answer
+
+ :param task_description: Task description of the given task
+ :param instruction: Instruction given to LLM for solving the given task
+ :param question: Question from the task to be solved
+ :param answer: Answer to the question
+ :return: Reasoning that went through for getting answer `answer` for question `question`
+ """
+
+ prompt_template = self.prompt_pool.generate_reason_template.format(task_description=task_description,
+ instruction=instruction,
+ question=question,
+ answer=answer)
+ return self.chat_completion(user_prompt=prompt_template)
+
+ @iolog.log_io_params
+ def generate_expert_identity(self, task_description: str) -> str:
+ """
+ Generate sentence using LLM, describing the identity of an expert, who is apt to solve the task defined
+ in task_description
+ :param task_description: Task description of the given task
+ :return: An expert profile, that can go in as system prompt and LLM would be asked to act as per this
+ expert profile.
+ """
+ expert_prompt = self.prompt_pool.expert_template.format(task_description=task_description)
+ return self.chat_completion(expert_prompt)
+
+ @iolog.log_io_params
+ def generate_intent_keywords(self, task_description: str, instruction: str):
+ """
+ For a given task description and instruction, generate keywords that describe the intent.
+
+ :param task_description: Description of the task that has to be solved by LLM
+ :param instruction: Instruction given to LLM for solving the given task
+ """
+ prompt_template = self.prompt_pool.intent_template.format(task_description=task_description, instruction=instruction)
+ return self.chat_completion(user_prompt=prompt_template)
+
+ @iolog.append_to_chained_log
+ def generate_best_examples(self, examples: List, params: PromptOptimizationParams) -> List:
+ """
+ Generate the best examples to be given as few-shot examples for the given task.
+
+ :param examples: List of examples. Each example is a dictionary with keys as question/reason/answer
+ :param params: Object having hyperparameters for this prompt optimization technique.
+ :return: List of synthetic examples
+ """
+ example_string = self.data_processor.collate_to_str(examples, self.prompt_pool.quest_reason_ans)
+ few_shot_critique_prompt = self.prompt_pool.examples_critique_template.\
+ format(prompt=params.base_instruction,
+ examples=example_string,
+ task_description=params.task_description,
+ num_examples=params.few_shot_count)
+
+ critique = self.chat_completion(few_shot_critique_prompt, self.prompt_pool.expert_profile)
+
+ gt_eg = random.sample(self.dataset, 1)
+ gt_eg_string = self.data_processor.collate_to_str(gt_eg, self.prompt_pool.quest_reason_ans)
+ few_shot_opt_prompt = self.prompt_pool.examples_optimization_template.\
+ format(prompt=params.base_instruction,
+ examples=example_string,
+ gt_example=gt_eg_string,
+ critique=critique,
+ task_description=params.task_description,
+ num_examples=params.few_shot_count)
+ synthetic_examples = self.chat_completion(few_shot_opt_prompt, self.prompt_pool.expert_profile)
+ synthetic_examples = self.extract_examples_frm_response(synthetic_examples)
+
+ return synthetic_examples
+
+ def generate_best_examples_zero_shot(self,params: PromptOptimizationParams) -> List:
+ """
+ Generate the best examples to be given as few-shot examples for the given task.
+
+ :param params: Object having hyperparameters for this prompt optimization technique.
+ :return: List of synthetic examples
+ """
+ few_shot_critique_prompt = self.prompt_pool.examples_critique_template_zero_shot.\
+ format(prompt=params.base_instruction,
+ task_description=params.task_description,
+ num_examples=params.num_train_examples)
+
+ critique = self.chat_completion(few_shot_critique_prompt, self.prompt_pool.expert_profile)
+
+ few_shot_opt_prompt = self.prompt_pool.examples_optimization_template.\
+ format(prompt=params.base_instruction,
+ examples="",
+ gt_example="",
+ critique=critique,
+ task_description=params.task_description,
+ num_examples=params.num_train_examples)
+ synthetic_examples = self.chat_completion(few_shot_opt_prompt, self.prompt_pool.expert_profile)
+ synthetic_examples = self.extract_examples_frm_response(synthetic_examples)
+ return synthetic_examples
+
+ @iolog.append_to_chained_log
+ def get_best_instr_by_critique(self, examples: List, params: PromptOptimizationParams):
+
+ if self.data_processor is not None:
+ example_string = self.data_processor.collate_to_str(examples,
+ self.prompt_pool.quest_reason_ans)
+ else:
+ example_string = ""
+ for example in examples:
+ answer = example[DatasetSpecificProcessing.FINAL_ANSWER_LITERAL]
+ if DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL in example:
+ answer = example[DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL]
+
+ example_string += self.prompt_pool.quest_reason_ans.format(question=example[DatasetSpecificProcessing.QUESTION_LITERAL],
+ answer=answer)
+
+ meta_critique_prompt = self.prompt_pool.meta_critique_template.format(instruction=params.base_instruction,
+ examples=example_string)
+ critique_text = self.chat_completion(meta_critique_prompt, self.prompt_pool.expert_profile)
+ critique_refine_prompt = self.prompt_pool.critique_refine_template.format(instruction=params.base_instruction,
+ examples=example_string,
+ critique=critique_text,
+ steps_per_sample=1)
+ refined_prompts = self.chat_completion(critique_refine_prompt)
+
+ if self.data_processor is not None:
+ refined_instructions = re.findall(self.data_processor.TEXT_DELIMITER_PATTERN, refined_prompts)
+ else:
+ refined_instructions = re.findall(DatasetSpecificProcessing.TEXT_DELIMITER_PATTERN, refined_prompts)
+
+ return refined_instructions[0] if refined_instructions else None
+
+ def get_best_prompt(self, params: PromptOptimizationParams, use_examples=False, run_without_train_examples=False, generate_synthetic_examples=False) -> (str, Any):
+ """
+ Run the configured number of optimization iterations and return the best prompt found.
+
+ :param params: Object of class PromptOptimizationParams that holds all hyperparameters needed for prompt optimization.
+ :return: Best prompt for the given task and dataset, along with the expert identity.
+ """
+
+ current_base_instruction = params.base_instruction
+
+ if not generate_synthetic_examples:
+ print("\nMutating Task Description....")
+ # Mutate and refine task description
+ for round_num in tqdm(range(1, params.mutate_refine_iterations+1), desc="Iterations completed: "):
+ self.logger.info(f"{CommonLogsStr.LOG_SEPERATOR} + Starting iteration: {round_num} \n "
+ f"current_base_instruction: {current_base_instruction}")
+ candidate_prompts = self.gen_different_styles(current_base_instruction,
+ params.task_description,
+ params.mutation_rounds+1,
+ params.style_variation)
+
+ if run_without_train_examples:
+ prompt_index = 1
+ print("\nOptimization Finished...")
+ print("\nPossible prompt variations:")
+ for candidate in candidate_prompts[:params.mutation_rounds]:
+ final_best_prompt = self.prompt_pool.final_prompt.format(
+ instruction=candidate,
+ answer_format=params.answer_format,
+ few_shot_examples="")
+ expert_identity = self.prompt_pool.system_prompt
+ if params.generate_expert_identity:
+ expert_identity = self.generate_expert_identity(params.task_description)
+
+ #if params.generate_intent_keywords:
+ intent_keywords = self.generate_intent_keywords(params.task_description,
+ params.base_instruction)
+
+ final_best_prompt += "Keywords: " + intent_keywords
+ print("_______________________________________________________________________")
+ print("\nVariations "+str(prompt_index)+":\nExpert Profile:\n"+expert_identity+":\nPrompt:\n"+final_best_prompt)
+ prompt_index += 1
+ return "",""
+ prompt_score_list = self.get_prompt_score(candidate_prompts, params)
+ prompt_score_list = self.select_top_prompts(prompt_score_list, params.top_n)
+
+ if params.refine_instruction:
+ refined_prompts = self.refine_prompts(prompt_score_list, params)
+ refined_prompt_score_list = self.get_prompt_score(refined_prompts, params)
+ prompt_score_list = self.select_top_prompts(refined_prompt_score_list + prompt_score_list,
+ params.top_n)
+
+ current_base_instruction = prompt_score_list[0][self.GetPromptScoreIndex.PROMPT_STR]
+ self.iolog.append_dict_to_chained_logs({"round_num": round_num,
+ "best_prompt": current_base_instruction,
+ "score": prompt_score_list[0][self.GetPromptScoreIndex.SCORE]
+ })
+
+ examples = []
+
+ params.base_instruction = current_base_instruction
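+ # Solve each training example with the current best instruction and keep the correctly answered ones as few-shot candidates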
+ for example in self.dataset:
+ solve_prompt = self.prompt_pool.solve_template.format(
+ questions_batch_size=1,
+ instruction=params.base_instruction,
+ answer_format=params.answer_format,
+ questions=example[DatasetSpecificProcessing.QUESTION_LITERAL])
+ generated_text = self.chat_completion(solve_prompt)
+
+ examples.extend(self.evaluate(generated_text, [example]))
+ if len(examples) >= params.few_shot_count:
+ break
+
+ if len(examples) < params.few_shot_count:
+ # Top up with randomly sampled dataset examples if too few were answered correctly
+ examples += random.sample(self.dataset, params.few_shot_count - len(examples))
+
+ # Refine task description and examples iteratively
+ print("\nRefining Task description and Examples iteratively....")
+ for i in tqdm(range(params.refine_task_eg_iterations)):
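+ # In each refinement iteration, randomly choose between refining the instruction (via critique) and refining the in-context examples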
+ refine_task_desc = random.choice([True, False])
+ if refine_task_desc:
+ refined_instruction = self.get_best_instr_by_critique(examples, params)
+ if refined_instruction:
+ params.base_instruction = refined_instruction
+ # comment this to turn off synthetic examples
+ elif use_examples:
+ examples = self.generate_best_examples(examples, params)
+ else:
+ print("Generating Sythetic Examples....")
+ train_examples = self.generate_best_examples_zero_shot(params)
+ with open("train_synthetic.jsonl", 'w') as file:
+ for record in train_examples:
+ json.dump(record, file)
+ file.write('\n')
+
+ print("Synthetic examples saved at train.jsonl....")
+ return "",""
+
+
+ if params.generate_reasoning:
+ print("\nGenerating CoT Reasoning for In-Context Examples....")
+ for example in tqdm(examples):
+ reason = self.generate_reasoning(params.task_description,
+ params.base_instruction,
+ example[DatasetSpecificProcessing.QUESTION_LITERAL],
+ example[DatasetSpecificProcessing.FINAL_ANSWER_LITERAL])
+
+ example[DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL] = f"{reason} " + \
+ f"{DatasetSpecificProcessing.ANSWER_START}" + \
+ f"{example[DatasetSpecificProcessing.FINAL_ANSWER_LITERAL]}" + \
+ f"{DatasetSpecificProcessing.ANSWER_END}"
+ if self.data_processor is not None:
+ example_string = self.data_processor.collate_to_str(examples, self.prompt_pool.quest_reason_ans)
+ else:
+ example_string = ""
+ for example in examples:
+ answer = example[DatasetSpecificProcessing.FINAL_ANSWER_LITERAL]
+ if DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL in example:
+ answer = example[DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL]
+
+ example_string += self.prompt_pool.quest_reason_ans.format(question=example[DatasetSpecificProcessing.QUESTION_LITERAL],
+ answer=answer)
+
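+ # Assemble the final prompt from the refined instruction and answer format, including the few-shot examples when few_shot_count > 0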
+ if params.few_shot_count == 0:
+ final_best_prompt = self.prompt_pool.final_prompt.format(
+ instruction=params.base_instruction,
+ answer_format=params.answer_format,
+ few_shot_examples="")
+ else:
+ final_best_prompt = self.prompt_pool.final_prompt.format(
+ instruction=params.base_instruction,
+ answer_format=params.answer_format,
+ few_shot_examples=example_string)
+
+ expert_identity = self.prompt_pool.system_prompt
+ if params.generate_expert_identity:
+ print("\nGenerating Expert Identity....")
+ expert_identity = self.generate_expert_identity(params.task_description)
+ self.logger.info(f"Expert Identity: {expert_identity}")
+
+ if params.generate_intent_keywords:
+ print("\nGenerating Intent Keywords....")
+ intent_keywords = self.generate_intent_keywords(params.task_description,
+ params.base_instruction)
+
+ final_best_prompt += "Keywords: " + intent_keywords
+
+ self.iolog.dump_chained_log_to_file("best_prompt")
+ self.logger.info(f"Final best prompt: {final_best_prompt}")
+
+ return final_best_prompt, expert_identity
diff --git a/promptwizard/glue/promptopt/techniques/critique_n_refine/prompt_pool.yaml b/promptwizard/glue/promptopt/techniques/critique_n_refine/prompt_pool.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..073c7224c53782e5cd77efe7cfa47beef7a2a616
--- /dev/null
+++ b/promptwizard/glue/promptopt/techniques/critique_n_refine/prompt_pool.yaml
@@ -0,0 +1,231 @@
+final_prompt: |
+ {instruction}
+ {few_shot_examples}
+
+ {answer_format}
+
+eval_prompt: |
+ {instruction}
+
+ [Question] {question}
+ [Answer]
+
+quest_reason_ans: |
+
+ [Question] {question}
+ [Answer] {answer}
+
+system_prompt: You are a helpful assistant developed by OpenAI that can efficiently perform tasks as per instruction
+
+expert_profile: You are a helpful assistant developed by OpenAI that can efficiently perform tasks as per instruction
+
+thinking_styles:
+ - "How could I devise an experiment to help solve that problem?"
+ - "Make a list of ideas for solving this problem, and apply them one by one to the problem to see if any progress can be made."
+ - "How could I measure progress on this problem?"
+ - "How can I simplify the problem so that it is easier to solve?"
+ - "What are the key assumptions underlying this problem?"
+ - "What are the potential risks and drawbacks of each solution?"
+ - "What are the alternative perspectives or viewpoints on this problem?"
+ - "What are the long-term implications of this problem and its solutions?"
+ - "How can I break down this problem into smaller, more manageable parts?"
+ - "Critical Thinking: This style involves analyzing the problem from different perspectives, questioning assumptions, and evaluating the evidence or information available. It focuses on logical reasoning, evidence-based decision-making, and identifying potential biases or flaws in thinking."
+ - "Try creative thinking, generate innovative and out-of-the-box ideas to solve the problem. Explore unconventional solutions, thinking beyond traditional boundaries, and encouraging imagination and originality."
+ - "Seek input and collaboration from others to solve the problem. Emphasize teamwork, open communication, and leveraging the diverse perspectives and expertise of a group to come up with effective solutions."
+ - "Use systems thinking: Consider the problem as part of a larger system and understanding the interconnectedness of various elements. Focuses on identifying the underlying causes, feedback loops, and interdependencies that influence the problem, and developing holistic solutions that address the system as a whole."
+ - "Use Risk Analysis: Evaluate potential risks, uncertainties, and tradeoffs associated with different solutions or approaches to a problem. Emphasize assessing the potential consequences and likelihood of success or failure, and making informed decisions based on a balanced analysis of risks and benefits."
+ - "Use Reflective Thinking: Step back from the problem, take the time for introspection and self-reflection. Examine personal biases, assumptions, and mental models that may influence problem-solving, and being open to learning from past experiences to improve future approaches."
+ - "What is the core issue or problem that needs to be addressed?"
+ - "What are the underlying causes or factors contributing to the problem?"
+ - "Are there any potential solutions or strategies that have been tried before? If yes, what were the outcomes and lessons learned?"
+ - "What are the potential obstacles or challenges that might arise in solving this problem?"
+ - "Are there any relevant data or information that can provide insights into the problem? If yes, what data sources are available, and how can they be analyzed?"
+ - "Are there any stakeholders or individuals who are directly affected by the problem? What are their perspectives and needs?"
+ - "What resources (financial, human, technological, etc.) are needed to tackle the problem effectively?"
+ - "How can progress or success in solving the problem be measured or evaluated?"
+ - "What indicators or metrics can be used?"
+ - "Is the problem a technical or practical one that requires a specific expertise or skill set? Or is it more of a conceptual or theoretical problem?"
+ - "Does the problem involve a physical constraint, such as limited resources, infrastructure, or space?"
+ - "Is the problem related to human behavior, such as a social, cultural, or psychological issue?"
+ - "Does the problem involve decision-making or planning, where choices need to be made under uncertainty or with competing objectives?"
+ - "Is the problem an analytical one that requires data analysis, modeling, or optimization techniques?"
+ - "Is the problem a design challenge that requires creative solutions and innovation?"
+ - "Does the problem require addressing systemic or structural issues rather than just individual instances?"
+ - "Is the problem time-sensitive or urgent, requiring immediate attention and action?"
+ - "What kinds of solution typically are produced for this kind of problem specification?"
+ - "Given the problem specification and the current best solution, have a guess about other possible solutions."
+ - "Let's imagine the current best solution is totally wrong, what other ways are there to think about the problem specification?"
+ - "What is the best way to modify this current best solution, given what you know about these kinds of problem specification?"
+ - "Ignoring the current best solution, create an entirely new solution to the problem."
+ - "Let's think step by step."
+ - "Let's make a step by step plan and implement it with good notion and explanation."
+
+
+# ans_delimiter_instruction: " Wrap only your final answer, without reason for each question separately between and tags."
+ans_delimiter_instruction: ""
+
+meta_critique_template: |
+ I'm trying to write a zero-shot instruction that will help the most capable and suitable agent to solve the task.
+ My current prompt is: "{instruction}"
+ But this prompt gets the following examples wrong: {examples}
+ Provide detailed feedback which identifies reasons why the instruction could have gone wrong.
+ Wrap each reason with <START> and <END>
+
+
+meta_positive_critique_template: |
+ I'm trying to write a prompt for a zero-shot instruction task that will help the most capable and suitable agent to solve the task.
+ My current prompt is:
+ [CURRENT PROMPT] "{instruction}"
+ Now this prompt got the following examples correct:
+ [CORRECT EXAMPLES] {examples}
+ Since you can't use these examples, analyse and understand the characteristics/complexity and diversity of these examples and their reasoning chains, and
+ accordingly provide suggestions to further improve the prompt and make it better as a zero-shot instruction.
+
+
+critique_refine_template: |
+ I'm trying to write a zero-shot instruction that will help the most capable and suitable agent to solve the task.
+ My current prompt is: "{instruction}"
+ But this prompt gets the following examples wrong: {examples}
+ On carefully analysing these examples, the following critiques of the prompt were identified: {critique}
+ Use the critique smartly and refine the current prompt to make sure we don't get these examples wrong.
+ Based on the above information, now write {steps_per_sample} different improved prompts.
+ Each prompt should be wrapped with <START> and <END>.
+ [Refined Prompts]:
+
+
+solve_template: |
+ You are given a prompt instruction and the following {questions_batch_size} questions of the same task.
+ [Instruction]: {instruction}
+
+ [Question]: {questions}
+
+ {answer_format}
+
+ [Answers]:
+
+
+meta_sample_template: |
+ You are given a task description, a prompt instruction, and different styles known as meta prompts:
+ [Task Description]: {task_description}
+ [Meta Prompt]: {meta_prompts}
+ Now you need to generate {num_variations} variations of the following instruction, adaptively mixing in the meta prompts while keeping a similar semantic meaning.
+ Make sure to wrap each generated prompt with <START> and <END>
+ [Prompt Instruction]: {prompt_instruction}
+ [Generated Prompts]:
+
+
+intent_template: |
+ You are given an instruction along with a description of the task labelled as [Task Description]. For the given instruction, list out 3-5 keywords in comma-separated format as [Intent] which define the characteristics or properties required of the most capable and suitable agent to solve the task using the instruction.
+
+
+ [Task Description]: {task_description}
+ [Instruction]: {instruction}
+
+
+ [Intent]:
+
+
+expert_template: |
+ For each instruction, write a high-quality description of the most capable and suitable agent to answer the instruction, in second-person perspective.\n
+
+ [Instruction]: Make a list of 5 possible effects of deforestation.\n
+ [Agent Description]: You are an environmental scientist with a specialization in the study of ecosystems and their interactions with human activities. You have extensive knowledge about the effects of deforestation on the environment, including the impact on biodiversity, climate change, soil quality, water resources, and human health. Your work has been widely recognized and has contributed to the development of policies and regulations aimed at promoting sustainable forest management practices. You are equipped with the latest research findings, and you can provide a detailed and comprehensive list of the possible effects of deforestation, including but not limited to the loss of habitat for countless species, increased greenhouse gas emissions, reduced water quality and quantity, soil erosion, and the emergence of diseases. Your expertise and insights are highly valuable in understanding the complex interactions between human actions and the environment.
+
+
+ [Instruction]: Identify a descriptive phrase for an eclipse.\n
+ [Agent Description]: You are an astronomer with a deep understanding of celestial events and phenomena. Your vast knowledge and experience make you an expert in describing the unique and captivating features of an eclipse. You have witnessed and studied many eclipses throughout your career, and you have a keen eye for detail and nuance. Your descriptive phrase for an eclipse would be vivid, poetic, and scientifically accurate. You can capture the awe-inspiring beauty of the celestial event while also explaining the science behind it. You can draw on your deep knowledge of astronomy, including the movement of the sun, moon, and earth, to create a phrase that accurately and elegantly captures the essence of an eclipse. Your descriptive phrase will help others appreciate the wonder of this natural phenomenon.
+
+
+
+ [Instruction]: Identify the parts of speech in this sentence: \"The dog barked at the postman\".\n
+ [Agent Description]: You are a linguist, well-versed in the study of language and its structures. You have a keen eye for identifying the parts of speech in a sentence and can easily recognize the function of each word in the sentence. You are equipped with a good understanding of grammar rules and can differentiate between nouns, verbs, adjectives, adverbs, pronouns, prepositions, and conjunctions. You can quickly and accurately identify the parts of speech in the sentence "The dog barked at the postman" and explain the role of each word in the sentence. Your expertise in language and grammar is highly valuable in analyzing and understanding the nuances of communication.
+
+
+ [Instruction]: {task_description}
+ [Agent Description]:
+
+
+examples_critique_template: |
+ You are an expert example selector who can help in the selection of the right in-context examples to help the most suitable agent solve this problem.
+ You are also given the prompt instruction which is used to solve this task
+ [Prompt]: {prompt}
+ You are given the description of the task:
+ [Task Description]: {task_description}
+ I'm trying to write a few-shot prompt using {num_examples} in-context examples to effectively solve any questions of the above task.
+ My current set of {num_examples} in-context examples is: {examples}
+ Think in terms of analysing, understanding and creating examples of the task on the criteria of diversity of example types, complexity of the nature/characteristics of the examples, and relevance/compatibility with the example set as a whole.
+ Output all the suggestions/improvements which could be made to improve each individual example of the whole example selection set.
+
+examples_critique_template_zero_shot: |
+ You are an expert example selector who can help in the selection of the right in-context examples to help the most suitable agent solve this problem.
+ You are also given the prompt instruction which is used to solve this task
+ [Prompt]: {prompt}
+ You are given the description of the task:
+ [Task Description]: {task_description}
+ I'm trying to write a few-shot prompt using {num_examples} in-context examples to effectively solve any questions of the above task.
+ Think in terms of analysing, understanding and creating examples of the task on the criteria of diversity of example types, complexity of the nature/characteristics of the examples, and relevance/compatibility with the example set as a whole.
+ Output all the suggestions/improvements which could be made to improve each individual example of the whole example selection set.
+
+examples_optimization_template: |
+ You are an expert example selector who can help in the selection of the right in-context examples to help the agent solve this problem.
+ You are also given the prompt instruction which is used to solve this task
+ [Prompt]: {prompt}
+ You are given the description of the task:
+ [Task Description]: {task_description}
+ I'm trying to write a few-shot prompt using {num_examples} in-context examples to effectively solve any questions of the above task.
+ My current set of {num_examples} in-context examples is: {examples}
+ You are also given a set of suggestions/improvements which could be made to improve each individual example of the whole example selection set:
+ [SUGGESTION/IMPROVEMENT]: {critique}
+ Based on the above information, use all of it smartly and diligently to carefully create a new set of {num_examples} examples which follow these suggestions and improvements.
+ Make sure to output each example wrapped with <START> and <END>.
+
+ New examples should follow this format strictly:
+
+ [Question] followed by the question part of the example
+ [Answer] followed by all the logical reasoning steps related to the answer, with the final answer given as "[answer]"
+
+ For Example:
+ {gt_example}
+
+
+ [New Examples]:
+
+
+generate_reason_template: |
+ You are given a task description and instruction followed by a set of correct examples of the task.
+
+ [Task Description]: {task_description}
+
+ [Instruction]: {instruction}
+
+ Each example has a question denoted by [Question] and a final answer denoted by [Answer].
+
+ [Question]: {question}
+
+ [Answer]: {answer}
+
+ Now your task is to generate a reasoning chain that contains the steps and the logical pathway followed to arrive at the correct answer, assuming the necessary domain knowledge is present as part of the question and task description.
+
+ Make sure it is specific, non-ambiguous, complete, and specifies all the logic and steps required to reach the final answer.
+
+ [Improved Reasoning Chain]:
+
+
+reason_optimization_template: |
+ You are given a task description and the instruction for the given task.
+
+ [Task Description]: {task_description}
+
+ [Instruction]: {instruction}
+
+ Each example has a question denoted by [Question] and a final answer denoted by [Answer].
+
+ [Question]: {question}
+
+ [Answer]: {answer}
+
+ Please explain your reasoning behind reaching the answer given in a concise, complete, and coherent text of reasoning that contains all the steps or logical pathways followed. Ensure it is specific and non-ambiguous, and assume the necessary domain knowledge is in the question and task description.
+
+ [Improved Reasoning Chain]:
+
+
diff --git a/promptwizard/glue/promptopt/utils.py b/promptwizard/glue/promptopt/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..55ab31a8378b79a345ba36e0d868edd76edd4346
--- /dev/null
+++ b/promptwizard/glue/promptopt/utils.py
@@ -0,0 +1,25 @@
+from ..common.exceptions import GlueValidaionException
+from .constants import PromptOptimizationParams, PromptPool, SupportedPromptOpt
+from .techniques.common_logic import PromptOptimizer
+from .techniques.critique_n_refine.core_logic import CritiqueNRefine
+from .techniques.critique_n_refine.base_classes import CritiqueNRefineParams, \
+ CritiqueNRefinePromptPool
+
+
+def get_promptopt_class(prompt_technique_name: str) -> (PromptOptimizer, PromptOptimizationParams, PromptPool):
+ """
+ :param prompt_technique_name: Name of prompt optimization technique
+ :return: Class implementing that prompt optimization technique (a subclass of PromptOptimizer),
+ class that holds all hyperparameters for that technique, and
+ class that holds all prompt strings for that technique
+ """
+ prompt_technique_name = prompt_technique_name.lower()
+ if prompt_technique_name == SupportedPromptOpt.CRITIQUE_N_REFINE.value:
+ return CritiqueNRefine, CritiqueNRefineParams, CritiqueNRefinePromptPool
+ else:
+ raise GlueValidaionException(f"Value provided for `prompt_technique_name` field in config yaml of "
+ f"prompt manager is `{prompt_technique_name}`, which is not a valid name for "
+ f"the prompt optimization techniques that we support. Please provide input as one "
+ f"among the following: {SupportedPromptOpt.all_values()}", None)
+
+
diff --git a/promptwizard/version.py b/promptwizard/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..923277bf37675ab64816213bb89060bd3ef2e25a
--- /dev/null
+++ b/promptwizard/version.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
+_MAJOR = "0"
+_MINOR = "2"
+# On master and in a nightly release the patch should be one ahead of the last
+# released build.
+_PATCH = "2"
+# This is mainly for nightly builds which have the suffix ".dev$DATE". See
+# https://semver.org/#is-v123-a-semantic-version for the semantics.
+_SUFFIX = ""
+
+VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
+VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..c86bb8a71dd5d0bdfebdd28616c5c5dc07bb4d46
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,11 @@
+[tool.black]
+line-length = 88
+target-version = ['py38']
+include = '\.pyi?$'
+
+[tool.isort]
+atomic = true
+profile = "black"
+line_length = 88
+skip_gitignore = true
+known_first_party = ["promptwizard"]
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..d21b6793a8ceabdfce5594db80198199a40a75c8
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,29 @@
+[isort]
+default_section = FIRSTPARTY
+ensure_newline_before_comments = True
+force_grid_wrap = 0
+include_trailing_comma = True
+known_first_party = sdtools
+known_third_party =
+ imblearn
+ numpy
+ pandas
+ pytorch-tabnet
+ scipy
+ sklearn
+ ipywidgets
+ torch
+ torchaudio
+ torchvision
+ torch_xla
+ tqdm
+ xgboost
+
+line_length = 119
+lines_after_imports = 2
+multi_line_output = 3
+use_parentheses = True
+
+[flake8]
+ignore = E203, E501, E741, W503, W605
+max-line-length = 119
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b63d43f4c71b9d35a32108848da98843af8db11
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+
+from setuptools import find_packages, setup
+
+# PEP0440 compatible formatted version, see:
+# https://www.python.org/dev/peps/pep-0440/
+#
+# release markers:
+# X.Y
+# X.Y.Z # For bugfix releases
+#
+# pre-release markers:
+# X.YaN # Alpha release
+# X.YbN # Beta release
+# X.YrcN # Release Candidate
+# X.Y # Final release
+
+# version.py defines the VERSION and VERSION_SHORT variables.
+# We use exec here so we don't import promptwizard whilst setting up.
+VERSION = {} # type: ignore
+with open("promptwizard/version.py", "r") as version_file:
+ exec(version_file.read(), VERSION)
+
+INSTALL_REQUIRES = [
+ "datasets",
+ "tiktoken",
+ "nltk",
+ "openai",
+ "azure-identity",
+ "azure-search-documents",
+ "pyyaml~=6.0.1",
+ "pyarrow==15.0.2",
+ "llama-index==0.11.10",
+ "llama-index-core==0.11.10",
+ "python-dotenv"
+]
+QUALITY_REQUIRES = [
+ "black==21.4b0",
+ "flake8>=3.8.3",
+ "isort>=5.5.4",
+ "pre-commit",
+ "pytest",
+ "pytest-xdist",
+]
+DEV_REQUIRES = INSTALL_REQUIRES + QUALITY_REQUIRES
+
+setup(
+ name="promptwizard",
+ version=VERSION["VERSION"],
+ author="The PromptWizard team",
+ author_email="promptwizard@microsoft.com",
+ description="Optimize Prompt",
+ long_description=open("README.md", encoding="utf8").read(),
+ long_description_content_type="text/markdown",
+ keywords="PromptWizard",
+ license="MIT License",
+ url="https://github.com/microsoft/PromptWizard",
+ classifiers=[
+ "Intended Audience :: Science/Research",
+ "Development Status :: 3 - Alpha",
+ "Programming Language :: Python :: 3",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ ],
+ package_dir={"": "."},
+ packages=find_packages("."),
+ extras_require={
+ "dev": DEV_REQUIRES,
+ "quality": QUANLITY_REQUIRES,
+ },
+ install_requires=INSTALL_REQUIRES,
+ include_package_data=True,
+ python_requires=">=3.8.0",
+ zip_safe=False,
+)