import os
import openai
import tiktoken
import numpy as np
import ast
import pandas as pd
import matplotlib.pyplot as plt
import gradio as gr
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
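# The .env file is expected to define OPENAI_API_KEY (e.g. OPENAI_API_KEY=sk-...).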

# Get API keys from environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Initialize the tokenizer; text-embedding-3-small uses the cl100k_base encoding
tokenizer = tiktoken.get_encoding('cl100k_base')

def get_embedding(text, model='text-embedding-3-small', max_tokens=7000):
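    """Embed `text` with the OpenAI embeddings API, truncating the input to at
    most `max_tokens` tokens so it stays comfortably under text-embedding-3-small's
    8,191-token input limit."""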
    # Tokenize the text and truncate if necessary
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        text = tokenizer.decode(tokens)
    
    return client.embeddings.create(input=[text], model=model).data[0].embedding
    
data = pd.read_csv("ucdavis_health_embeddings.csv")
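# Expected columns: 'text' (page content), 'url' (page link), and 'embedding'
# (a stringified Python list of floats produced when the pages were embedded).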

# Handle NaN values and convert the 'embedding' column from strings to lists of floats
def safe_literal_eval(x):
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return []

data['embedding'] = data['embedding'].apply(safe_literal_eval)

# Ensure all embeddings are lists of floats and filter out empty embeddings
data['embedding'] = data['embedding'].apply(lambda x: [float(i) for i in x] if isinstance(x, list) else [])
data = data[data['embedding'].apply(lambda x: len(x) > 0)]


def query(question):
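    """Embed the question, rank every page by the dot product between its
    embedding and the question embedding (OpenAI embeddings are unit-length,
    so this equals cosine similarity), and answer from the four closest pages."""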
    question_embedding = get_embedding(question)
    
    def fn(page_embedding):
        return np.dot(page_embedding, question_embedding)
    
    distance_series = data['embedding'].apply(fn)
    
    ranked = distance_series.sort_values(ascending=False)
    top_four = ranked.index[0:4]

    context_series = data.loc[top_four]['text']
    context = " ".join(context_series)
    similarity_scores = ranked.iloc[0:4]
    links_series = data.loc[top_four]['url']
    links = "\n \n".join(links_series)
    link_list = links_series.tolist()
    
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant tasked to respond to users of UC Davos Health who are seeking information about their services"},
            {"role": "user", "content": question},
            {"role": "assistant", "content": f"Use this information from the UC Davis Health website and answer the user's question: {context}. Please stick to this context while answering the question. Include all important information relevant to what the user is seeking, also tell them things they should be mindful of while following instructions. Don't miss any details about timings or weekdays."}
        ],
        model="gpt-3.5-turbo"
    )

    return chat_completion.choices[0].message.content, links, similarity_scores.tolist(), link_list

def plot_bar_chart(similarity_scores, links_series):
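    """Draw a horizontal bar chart of the four similarity scores, labelled to
    match the numbered order of the links shown above it (Link 1 = most similar)."""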
    # Sort ascending so the most similar link ends up as the top bar of the horizontal chart
    sorted_pairs = sorted(zip(similarity_scores, links_series))
    sorted_scores, sorted_links = zip(*sorted_pairs)

    # Label the bars so "Link 1" is the most similar page, matching the numbered order of the displayed links
    link_labels = [f"Link {i+1}" for i in range(len(sorted_links) - 1, -1, -1)]
    
    plt.figure(figsize=(12, 8))  # Adjusting the figure size to make it larger
    bars = plt.barh(link_labels, sorted_scores, color='skyblue', edgecolor='black')
    plt.xlabel('Similarity Score')
    plt.ylabel('Links')
    plt.title('Similarity Scores for the Links Above (in the same order)')
    plt.xlim(0, 1)  # Set x-axis scale from 0 to 1
    plt.grid(True, axis='x')
    
    # Add labels for each bar
    for bar, score in zip(bars, sorted_scores):
        plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                 f'{score:.2f}', va='center', ha='left')

    plt.tight_layout()
    plt.savefig('bar_chart.png')
    plt.close()  # close the figure so repeated queries don't leave figures open in memory
    return 'bar_chart.png'

# Define the Gradio interface
def gradio_query(question):
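    """Gradio callback: run the retrieval-augmented query and render the similarity chart."""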
    answer, links, similarity_scores, link_list = query(question)
    bar_plot_path = plot_bar_chart(similarity_scores, link_list)
    return answer, links, bar_plot_path

interface = gr.Interface(
    fn=gradio_query,
    inputs=gr.Textbox(lines=2, placeholder="Enter your question here..."),
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="For more information, visit these links"),
        gr.Image(type="filepath", label="Similarity Scores Bar Chart", elem_id="bar_chart")
    ],
    title="UC Davis Health Query Assistant",
    description="Ask your questions about UC Davis Health services and get relevant information from their website.",
    css=".gradio-container #bar_chart img {width: 200%; height: auto;}"
)

# Launch the interface
interface.launch(share=True)
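# share=True asks Gradio to expose a temporary public URL in addition to the local server.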