Spaces:

ask-ANRG
/

Ask-ANRG

Sleeping

File size: 8,636 Bytes

import ast
import json
import os
from pathlib import Path

import openai
import pandas as pd
import numpy as np
from tqdm import tqdm

from annoy import AnnoyIndex

# from openai_function_utils.openai_function_interface import OPENAI_AVAILABLE_FUNCTIONS, OPENAI_FUNCTIONS_DEFINITIONS
# Module-wide switch consulted by debug_print(): diagnostic output is only
# emitted while this is truthy.
DEBUG_PRINT = False
# openai.api_key = OPENAI_KEY
# openai.organization = 'org-dsEkob5KeBBq3lbBLhnCXcJt'


def get_embeddings(input):
    """Embed *input* with OpenAI's ``text-embedding-ada-002`` model.

    Args:
        input: Payload forwarded unchanged as the ``input`` argument of
            ``openai.Embedding.create``. The parameter name shadows the
            builtin ``input`` but is kept so keyword callers keep working.

    Returns:
        The ``embedding`` value of the first entry in the response's
        ``data`` list.
    """
    api_result = openai.Embedding.create(input=input, model="text-embedding-ada-002")
    first_entry = api_result['data'][0]
    return first_entry['embedding']


def debug_print(*args, **kwargs):
    """Forward all arguments to ``print`` while ``DEBUG_PRINT`` is truthy.

    When the module-level ``DEBUG_PRINT`` flag is falsy the call does nothing.
    """
    if not DEBUG_PRINT:
        return
    print(*args, **kwargs)


def transform_user_question(question, model):
    """Ask a chat model to rephrase *question* into a clearer form.

    Args:
        question: The user's raw question; it is interpolated, wrapped in
            double quotes, into the user prompt.
        model: Model name passed to ``openai.ChatCompletion.create``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    system_prompt = "You are a helpful assistant for ChatGPT that will formulate user's input question to a version that is more understandable by ChatGPT for answering questions related to a research lab."
    # An earlier variant of this prompt additionally asked for a phrasing
    # suited to embedding retrieval; the shorter form below is the one in use.
    user_prompt = f"Formulate this question into a version that is more understandable by ChatGPT: \"{question}\""

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=200,
    )
    return completion["choices"][0]["message"].content


def search_document(user_question_embed: list, top_k: int = 1):
    """Return the text of the ``top_k`` documents closest to a query embedding.

    Embeddings are read from ``document_name_to_embedding.csv`` (columns
    ``original_filename`` and ``embedding``) and ranked by cosine similarity
    against ``user_question_embed``.

    Args:
        user_question_embed: Embedding vector of the query.
        top_k: Number of best-ranked documents to return. Values larger than
            the number of ranked documents are clamped; values <= 0 select
            nothing.

    Returns:
        The UTF-8 decoded contents of the selected documents, concatenated in
        rank order (best first); an empty string when nothing is selected.
        ``None`` if the CSV file does not exist (after printing a notice).
    """
    csv_filename = 'document_name_to_embedding.csv'
    if not os.path.exists(csv_filename):
        print("This won't happen!")
        return

    df = pd.read_csv(csv_filename)
    # The CSV stores each embedding as the string form of a Python list.
    df['embedding'] = df['embedding'].apply(ast.literal_eval).apply(np.array)

    # Cosine similarity of the query against every stored embedding. Keyed by
    # filename, so a filename listed twice keeps its last score.
    user_question_norm = np.linalg.norm(user_question_embed)
    similarities = {}
    for filename, embedding in zip(df['original_filename'], df['embedding']):
        dot_product = np.dot(user_question_embed, embedding)
        embedding_norm = np.linalg.norm(embedding)
        similarities[filename] = dot_product / (user_question_norm * embedding_norm)

    ranked_documents = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    debug_print("Ranked documents by similarity:", ranked_documents)

    # Slicing (instead of indexing by range(top_k)) tolerates a top_k that is
    # larger than the number of documents, zero, or negative.
    selected = ranked_documents[:max(top_k, 0)]

    # Collect every selected document. Previously each iteration overwrote
    # the result, so only the last (k-th ranked) document was returned.
    contents = []
    for best_document_filename, _ in selected:
        with open(best_document_filename, 'rb') as fin:
            contents.append(fin.read().decode('utf-8'))
    document_content = "".join(contents)

    debug_print("document_content: ", document_content)
    return document_content


def search_document_annoy(user_question_embed: list, top_k: int, metric):
    """Return the text of the ``top_k`` nearest documents using an Annoy index.

    Embeddings are read from ``document_name_to_embedding.csv`` (columns
    ``original_filename`` and ``embedding``), indexed with Annoy, and queried
    with ``user_question_embed``.

    Args:
        user_question_embed: Embedding vector of the query.
        top_k: Number of nearest neighbours to request from the index.
        metric: Metric name passed straight to ``AnnoyIndex``.

    Returns:
        The UTF-8 decoded contents of the neighbouring documents, concatenated
        in the order Annoy returned them; an empty string when the table is
        empty or no neighbour is returned. ``None`` if the CSV file does not
        exist (after printing a notice).

    Side effects:
        Writes the built index to ``test.ann`` in the working directory.
    """
    csv_filename = 'document_name_to_embedding.csv'
    if not os.path.exists(csv_filename):
        print("This won't happen!")
        return

    df = pd.read_csv(csv_filename, index_col=0)
    # The CSV stores each embedding as the string form of a Python list.
    df['embedding'] = df['embedding'].apply(ast.literal_eval).apply(np.array)

    # Work on plain lists so Annoy item ids are row *positions*, independent
    # of whatever labels the CSV index column carries.
    embeddings = df['embedding'].tolist()
    filenames = df['original_filename'].tolist()
    if not embeddings:
        return ""

    dimension = len(embeddings[0])  # Length of item vector that will be indexed

    index = AnnoyIndex(dimension, metric)
    for item_id, vector in enumerate(embeddings):
        index.add_item(item_id, vector)

    index.build(10)  # 10 trees
    index.save('test.ann')

    # Query through a freshly loaded copy of the saved index.
    loaded_index = AnnoyIndex(dimension, metric)
    loaded_index.load('test.ann')  # will just mmap the file
    neighbour_ids = loaded_index.get_nns_by_vector(user_question_embed, top_k)

    # Guard the debug lookup: its argument is evaluated even when debug
    # printing is off, so an empty result used to raise IndexError here.
    if neighbour_ids:
        debug_print(filenames[neighbour_ids[0]])

    contents = []
    for item_id in neighbour_ids:
        with open(filenames[item_id], 'rb') as fin:
            contents.append(fin.read().decode('utf-8'))
    document_content = "".join(contents)

    debug_print("document_content: ", document_content)
    return document_content


def get_document_embeddings(path: str, all_fns: list):
    """Embed every file in one directory and persist each vector as JSON.

    For each name in ``all_fns`` the file ``path/<name>`` is read as UTF-8,
    embedded via ``get_embeddings`` and written to
    ``database/embeddings/<sub dir>/<name up to its first dot>.json``, where
    ``<sub dir>`` is the part of ``path`` after
    ``database/original_documents/`` (empty if that prefix is absent).

    Processing is best-effort: a file that fails at any step is reported on
    stdout and skipped, and the remaining files are still processed.

    Args:
        path: Directory containing the files.
        all_fns: File names (not paths) inside ``path``.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully embedded file and
        the columns ``original_filename``, ``embedding_filename`` and
        ``embedding``.
    """
    all_embeddings = []
    all_embedding_fns = []
    all_original_filename = []

    # Mirror the sub-directory layout of the originals under the output root.
    output_sub_dir = path.split('database/original_documents/')
    output_sub_dir = '' if len(output_sub_dir) == 1 else output_sub_dir[1]

    output_dir = os.path.join('database/embeddings', output_sub_dir)

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    for fn in tqdm(all_fns):
        document_name = fn.split('.')[0]
        original_filename = os.path.join(path, fn)
        try:
            # Close the source file before the (slow) embedding request.
            with open(original_filename, 'rb') as fin:
                tmp_file = fin.read().decode('utf-8')
            embedding = get_embeddings(tmp_file)
            if embedding is not None:
                embedding_fn = os.path.join(output_dir, document_name + '.json')
                with open(embedding_fn, 'w') as fout:
                    json.dump(embedding, fout)
                all_original_filename.append(original_filename)
                all_embedding_fns.append(embedding_fn)
                all_embeddings.append(embedding)
        except Exception as exc:
            print(
                f"Error when obtaining embedding vector for {original_filename}. The model's maximum context length is 8192 tokens. Please make sure the file is valid and file length is not too long.")
            # The hint above is only a guess; also report what actually went
            # wrong (decode error, I/O error, API failure, ...).
            print(f"Underlying error: {type(exc).__name__}: {exc}")

    return pd.DataFrame({
        'original_filename': all_original_filename,
        'embedding_filename': all_embedding_fns,
        'embedding': all_embeddings
    })


def util():
    """Run a hard-coded question through a function-calling chat loop.

    The model is offered a single function, ``semantic_search``. Whenever its
    reply contains a function call, the call's ``query`` argument is embedded
    with ``get_embeddings``, looked up with ``search_document``, and the
    result is appended to the conversation as a ``function`` message. The
    loop ends with the first reply that carries no function call; that
    reply's content is printed.
    """
    model = "gpt-3.5-turbo"
    question = "Can you give me a paper about graph neural networks?"

    semantic_search_spec = {
        "name": "semantic_search",
        "description": "does a semantic search over the documents based on query",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query to search for",
                }
            },
            "required": ["query"],
        }
    }
    functions = [semantic_search_spec]

    system_prompt = (
        "You are a helpful assistant for ChatGPT that will answer the user's questions. "
        "In order to do so, you may use semantic_search to find relevant documents. "
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]

    while True:
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            max_tokens=200,
            functions=functions
        )
        reply = response["choices"][0]["message"]
        requested_call = reply.get("function_call")

        # Record the assistant turn so the next request sees the full history.
        messages.append({
            "role": "assistant",
            "content": reply.get("content"),
            "function_call": requested_call,
        })

        if not requested_call:
            # No further search requested: this reply is the final answer.
            print("Answering question")
            print(reply["content"])
            return

        call_args = json.loads(requested_call["arguments"])
        query_embedding = get_embeddings(call_args['query'])
        search_result = search_document(query_embedding)
        messages.append({
            "role": "function",
            "name": "semantic_search",
            "content": search_result
        })

def main():
    """Embed every file under ``database/original_documents`` and save a CSV.

    Walks the directory tree, calls ``get_document_embeddings`` once per
    directory, and writes the combined table to
    ``document_name_to_embedding.csv`` in the working directory.
    """
    # Collect the per-directory frames and concatenate once; concatenating
    # inside the loop re-copies the accumulated table for every directory.
    frames = [
        get_document_embeddings(path, fn_list)
        for path, _, fn_list in os.walk('database/original_documents')
    ]

    # pd.concat rejects an empty list (e.g. the root directory is missing),
    # so fall back to the empty frame the CSV was previously written from.
    if frames:
        final_df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        final_df = pd.DataFrame({})

    final_df.to_csv('document_name_to_embedding.csv')


def parse_downloads_to_title_to_info():
    """Re-key ``downloads.json`` by entry title and save the result.

    Reads ``database/original_documents/downloads.json`` (relative to the
    current working directory), a mapping whose values are lists of entry
    dicts, optionally wrapped in one extra single-element list. Every entry
    is stored under its ``'title'``; when a title occurs more than once the
    first occurrence wins. The resulting mapping is written to
    ``database/original_documents/parsed_downloads.json``.
    """
    download_fn = os.path.join(os.getcwd(), 'database/original_documents/downloads.json')
    with open(download_fn, 'r') as fin:
        all_download_info = json.load(fin)

    title_to_info = {}
    for entries in all_download_info.values():
        # Unwrap the [[entry, ...]] form only when the single element really
        # is a list. A flat list holding exactly one entry dict must be
        # iterated as-is; unwrapping it would iterate the dict's keys and
        # fail on entry['title'].
        if len(entries) == 1 and isinstance(entries[0], list):
            entries = entries[0]
        for entry in entries:
            title_to_info.setdefault(entry['title'], entry)

    parsed_fn = os.path.join(os.getcwd(), 'database/original_documents/parsed_downloads.json')
    with open(parsed_fn, 'w') as fout:
        json.dump(title_to_info, fout)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()