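# Gradio demo: Falcon-180B-chat reads arXiv titles/abstracts (August 2023 dump)
# and proposes key findings plus creative and "crazy" follow-up ideas.
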
import json
import os

import gradio as gr
from huggingface_hub import InferenceClient
HF_TOKEN = os.environ.get("HF_TOKEN", None)
API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-180B-chat"
BOT_NAME = "Falcon"

STOP_SEQUENCES = ["\nUser:", "<|endoftext|>", " User:", "###"]

EXAMPLES = [
    ["climate change"], ["2308.15699"], ["hallucination"], ["2308.00205"],
    ["large language model"], ["2308.05204"], ["2308.10873"], ["2308.06355"],
    ["2308.01684"], ["2308.00352"], ["2308.07773"],
]

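# Inference client for the hosted Falcon-180B-chat endpoint.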
client = InferenceClient(
    API_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
)

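# Load the bundled arXiv metadata (one JSON record per line, four files,
# ~15K August 2023 articles) into a dict keyed by arXiv ID.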
id_dict = {}
for i in range(4):
    fname = f"arxiv_2023_{i}"
    with open(fname, "r") as f:
        for line in f:
            D = json.loads(line)
            id_dict[D['id']] = D


def format_prompt_summarize(message, history, system_prompt, keyword):
    """Build a multi-article summarization prompt for a keyword query."""
    prompt = ""
    prompt += "System: You are a scholarly RESEARCH ASSISTANT who can read ARXIV scholarly articles.\n"
    prompt += "User: READ ALL THE TITLEs and ABSTRACTs of the various articles below\n"
    prompt += "Generate a SUMMARY of all the articles below relevant to research in the field of \"" + keyword + "\"\n"
    prompt += "SUGGEST FIVE IMPORTANT FINDINGS, ORIGINAL CONTRIBUTIONS or OBSERVATIONS for the field of \"" + keyword + "\" that summarize the work.\n"
    prompt += "Each BULLET POINT must be less than 15 WORDS.\n"
    prompt += "Output the FIVE KEY FINDINGS as BULLET POINTS with UNDERLINED OR BOLDENED KEY PHRASES.\n"
    prompt += "Propose ONE CREATIVE ACTIONABLE IDEA for a FUTURE extension of the RESEARCH. You MUST output the CREATIVE IDEA with a BULB OR IDEA OR THINKING emoji.\n"
    prompt += "Output ONE CREATIVE IDEA for a FUTURE extension with a RANDOM emoji\n"
    prompt += "Choose an UNRELATED or ORTHOGONAL field where the FINDINGS of the articles can be applied.\n"
    prompt += "In a new line, OUTPUT ONE CRAZY IDEA in 20 WORDS on how the KEY FINDINGS of the RESEARCH articles can be applied in an ORTHOGONAL or UNRELATED FIELD, with a CRAZY IDEA emoji\n"
    prompt += message + "\n"

    # NOTE: a chat-style prompt including the history is assembled below but is
    # currently unused; the instruction prompt above is what gets returned.
    mock_prompt = ""
    if system_prompt != "":
        mock_prompt += f"System: {system_prompt}\n"
    for user_prompt, bot_response in history:
        mock_prompt += f"User: {user_prompt}\n"
        mock_prompt += f"Falcon: {bot_response}\n"
    mock_prompt += f"User: {message}\nFalcon:"
    return prompt



def format_prompt(message, history, system_prompt):
    """Build a single-article summarization prompt from a title and abstract."""
    prompt = ""
    prompt += "System: You are a scholarly RESEARCH ASSISTANT who can read ARXIV scholarly articles.\n"
    prompt += "READ THE TITLE and ABSTRACT of the article below\n"
    prompt += "After understanding the ABSTRACT, SUGGEST 4 IMPORTANT FINDINGS, ORIGINAL CONTRIBUTIONS or OBSERVATIONS that summarize the work.\n"
    prompt += "Each BULLET POINT must be less than 15 WORDS.\n"
    prompt += "Output the FOUR KEY FINDINGS as BULLET POINTS with UNDERLINED OR BOLDENED KEY PHRASES.\n"
    prompt += "Propose ONE CREATIVE ACTIONABLE IDEA for a FUTURE extension of the RESEARCH. You MUST output the CREATIVE IDEA with a BULB OR IDEA OR THINKING emoji.\n"
    prompt += "Output ONE CREATIVE IDEA for a FUTURE extension with a RANDOM emoji\n"
    prompt += "Choose an UNRELATED or ORTHOGONAL field where the FINDINGS of the article can be applied.\n"
    prompt += "In a new line, OUTPUT ONE CRAZY IDEA in 20 WORDS on how the KEY FINDINGS of the RESEARCH article can be applied in an ORTHOGONAL or UNRELATED FIELD, with a CRAZY IDEA emoji\n"
    prompt += "User:" + message + "\n"

    # NOTE: a chat-style prompt including the history is assembled below but is
    # currently unused; the instruction prompt above is what gets returned.
    mock_prompt = ""
    if system_prompt != "":
        mock_prompt += f"System: {system_prompt}\n"
    for user_prompt, bot_response in history:
        mock_prompt += f"User: {user_prompt}\n"
        mock_prompt += f"Falcon: {bot_response}\n"
    mock_prompt += f"User: {message}\nFalcon:"
    return prompt

# Module-level seed, bumped once per request so repeated queries differ.
seed = 42

def generate(
    prompt, history, system_prompt="", temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
):
    """Stream a Falcon response for either an arXiv ID (single-article summary)
    or a free-text keyword (multi-article summary)."""
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)
    global seed
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop_sequences=STOP_SEQUENCES,
        do_sample=True,
        seed=seed,
    )
    seed = seed + 1

    title = "INPUT ARXI ID"
    abstract = ""
    if prompt in id_dict:
        title = id_dict[prompt]['title']
        abstract = id_dict[prompt]['abstract']
        prompt = f"TITLE: {title} ABSTRACT: {abstract}\n"
        output = f"<b>Title: </b> {title} \n <br>"
        formatted_prompt = format_prompt(prompt, history, system_prompt)
    else:
        keyword = prompt
        counter= 0
        for d in id_dict:
            title = id_dict[d]['title']
            abstract = id_dict[d]['abstract']
            if keyword in title or keyword in abstract:
               counter+=1## its a hit
               prompt += "ARTICLE " + str(counter) + "\n"
               prompt += f"TITLE: {title} ABSTRACT: {abstract}\n"
               if counter >= 4:
                   break

        prompt += "Keyword: " + keyword + "\n"
        formatted_prompt = format_prompt_summarize(prompt, history, system_prompt, keyword)
        output = "Articles related to the keyword " + keyword + "\n"

    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )

    # Stream tokens back to the UI, trimming any stop sequence that leaks through.
    for response in stream:
        output += response.token.text

        for stop_str in STOP_SEQUENCES:
            if output.endswith(stop_str):
                # Trim the stop sequence and finish the stream.
                output = output[: -len(stop_str)].rstrip()
                yield output
                return
        yield output
    return output


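# Generation controls shown in the ChatInterface "Additional Inputs" panel.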
additional_inputs=[
    gr.Textbox("", label="Optional system prompt"),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=8192,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )
]


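# Page layout: banner image and description on top, chat interface below.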
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=0.4):
            gr.Image("better_banner.jpeg", elem_id="banner-image", show_label=False)
        with gr.Column():
            gr.Markdown(
                """
                The idea is inspired by the CREATIVE WHACK PACK: https://apps.apple.com/us/app/creative-whack-pack/id307306326

                ## Researchers need INSPIRATION to come up with CREATIVE IDEAS
                ### We use Falcon 180B to
                    <br> - generate a <b>SUMMARY</b> of the arxiv articles (only August 2023 articles are supported)
                    <br> - generate a <b>CREATIVE IDEA</b> for a future extension
                    <br> - generate a <b>CRAZY IDEA</b> for application in an orthogonal field.

                This should hopefully CONNECT unrelated fields and inspire researchers to come up with CREATIVE IDEAS.
                ## Please input an ARXIV ID or a query; see examples below (limited to 15K articles from August 2023)
                ➡️️ **Intended Use**: this demo is intended to showcase how LLMs can be used to generate creative ideas for future extensions and applications in orthogonal fields.
                
                ⚠️ **Limitations**: the model can and will produce factually incorrect information, hallucinating facts and actions. As it has not undergone any advanced tuning/alignment, it can produce problematic outputs, especially if prompted to do so. Finally, this demo is limited to a session length of about 1,000 words.
                """
            )

    gr.ChatInterface(
        generate, 
        examples=EXAMPLES,
        additional_inputs=additional_inputs,
    ) 

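# Queue requests (up to 100 concurrent) and launch with API access disabled.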
demo.queue(concurrency_count=100, api_open=False).launch(show_api=False)