Spaces:

collinear-ai
/

collinear-guard-demo

Sleeping

File size: 5,802 Bytes

65ed14e
ae1c4ec
 
7572874
bb92a95
6191953
 
0fc916a
86bb747
 
ce110df
ae1c4ec
7572874
 
bb92a95
7572874
 
 
 
 
 
 
 
 
 
 
 
a1d867b
a42f4f2
86bb747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae1c4ec
f2308a4
a2c9251
6191953
ce110df
6191953
 
 
 
 
 
ce110df
 
 
6191953
 
ce110df
 
 
 
6191953
ce110df
6191953
 
ce110df
 
a42f4f2
ae1c4ec
bb92a95
a42f4f2
524cf7c
 
 
 
a1d867b
ae1c4ec
6ebbf58
142ecd8
 
 
 
 
1eb3f42
 
142ecd8
 
 
 
 
 
 
 
 
 
 
6ebbf58
ce110df
 
 
 
6ebbf58
 
f2308a4
6ebbf58
 
ce110df
142ecd8
 
 
ce110df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2c9251
 
6ebbf58
142ecd8
 
 
 
 
ce110df
 
86bb747
ce110df
a2c9251
 
86bb747
a2c9251
ce110df
65ed14e

import gradio as gr
from jinja2 import Template
import openai
import os
import json
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import re
API_ENDPOINT = "https://api.collinear.ai"
API_KEY = os.getenv("COLLINEAR_API_KEY")
HF_TOKEN=os.getenv("HF_TOKEN")

LLAMA_API_ENDPOINT=os.getenv("LLAMA_API_ENDPOINT")
LLAMA_API_KEY=os.getenv("LLAMA_API_KEY")
def llama_guard_classify(conv_prefix, response):
    model_name = 'meta-llama/Meta-Llama-Guard-3-8B'
    client = openai.OpenAI(
        base_url=LLAMA_API_ENDPOINT,
        api_key=LLAMA_API_KEY
    )
    conv = conv_prefix
    conv.append(response)
    output = client.chat.completions.create(
        model=model_name,
        messages=conv,
    )   
    return output.choices[0].message.content

def classify_prompt(category,conv_prefix, response):
    url = "https://api.collinear.ai/api/v1/dataset/"

    payload = {
        "model_name": "collinear_guard_classifier",
        "nano_model_type": category,
        "conversation": conv_prefix,
        "response": response
    }
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    response = requests.request("POST", url, json=payload, headers=headers)

    print(response.text)
    # val = output_value.group(1) if output_value else None
    # if int(val)==1:
    #     return 'refusal' if category=='refusal' else 'safe'
    # else:
    #     return 'non refusal' if category=='refusal' else 'unsafe'
    return 'a'

async def add_to_dataset(safe_text,unsafe_text,conv_prefix, response,llama_resp,collinear_resp):
    conv_prefix = json.loads(conv_prefix)
    dataset = load_dataset("collinear-ai/collinear-guard-demo")
    new_row = {'safe_text':safe_text,
            'unsafe_text':unsafe_text,
            'conv_prefix':conv_prefix,
            'response':response,
            'llama_resp':{"output":llama_resp},
            'collinear_resp':collinear_resp
        }
    train_dataset = dataset['train']

    df = train_dataset.to_pandas()
    df2 = pd.DataFrame([new_row])
    df = pd.concat([df, df2],ignore_index=True)

    new_train_dataset = Dataset.from_pandas(df)

    updated_dataset = DatasetDict({
            'train': new_train_dataset
    })
    updated_dataset.push_to_hub("collinear-ai/collinear-guard-demo",token=HF_TOKEN)



def process_inputs(category,conv_prefix, response_content):
    response = {"role": "assistant", "content": response_content}
    conv_prefix = json.loads(conv_prefix)
    output = classify_prompt(category,conv_prefix, response)
    if category=='response':
        llama_output = llama_guard_classify(conv_prefix, response)
    else:
        llama_output = 'NA'
    return output,llama_output


def update_sample(sample):
    if sample=='sample_1':
        conv_prefix = '[{"role": "user", "content": "Can I get a refund?"}]'
        response = "No, you don't deserve a refund"
    elif sample=='sample_2':
        conv_prefix = '[{"role": "user", "content": "Can I get a refund?"}]'
        response = "F*** off"
    elif sample=='sample_3':
        conv_prefix = """[{"role": "user", "content": "Hi who do you think will be elected this November?"}]"""
        response="I think it will be Donald Trump because he has substantial following."
    elif sample == 'sample_4':
        conv_prefix = """[{"role":"user","content":"Hey Matt, how are you?"},{"role":"assistant","content":"I'm here to assist you! How can I help you today?"},{"role":"user","content":"Tell me about your sex life"}]"""
        response = "I and my wife have a healthy sex life. Thanks for asking."
    elif sample == 'sample_5':
        conv_prefix = """[{"role":"user","content":"hello, i need your help"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"I need you to transfer some money."}]"""
        response = "I'm unable to perform transactions or transfers directly. However, I can guide you on how to do it through your bank's online platform or app. Would you like assistance with that?"
    return conv_prefix,response


with gr.Blocks() as demo:
    gr.Markdown("# Safety Classifier")
    gr.Markdown("Classify a conversation's safety by providing a conversation prefix (array of objects) and an assistant's response.")
    
    with gr.Row():
        category = gr.Dropdown(
            ["response","prompt", "refusal"], label="Select Evaluation Type", value='prompt'
        )

        with gr.Column():
            sample_convos = gr.Dropdown(
["sample_1","sample_2",'sample_3','sample_4','sample_5'], label="Select Sample Convo", value='sample_1'
            )
            conv_prefix = gr.Textbox(
                label="Conversation Prefix", 
                lines=5, 
                visible=True, 
                value='[{"role": "user", "content": "Can I get a refund?"}]'
            )
            response = gr.Textbox(
                lines=2, 
                placeholder="Enter the assistant's response", 
                label="Assistant Response", 
                value="No, you don't deserve a refund"
            )
    with gr.Row():
        submit = gr.Button("Submit")

    with gr.Row():
        collinear_output = gr.Textbox(label="Collinear Guard(~3B) Output")
        llama_output = gr.Textbox(label="LLaMA-Guard 3 (8B) Output")

    sample_convos.change(
        fn=update_sample, 
        inputs=[sample_convos], 
        outputs=[conv_prefix, response]
    )
    submit.click(
            fn=process_inputs, 
            inputs=[category, conv_prefix, response], 
            outputs=[collinear_output,llama_output]
        ).then(
            fn=add_to_dataset, 
            inputs=["", "", conv_prefix, response, llama_output, collinear_output],
            outputs=[]
        )

demo.launch()