lamhieu committed
Commit ef88752 · Parent(s): 7a736e5

chore: support tools with search on internet

Files changed (3):
  1. README.md +2 -4
  2. app.py +247 -56
  3. requirements.txt +3 -1
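At a high level, this commit teaches the Space to treat the model's first reply as a possible tool call: when tool use is enabled, `generate()` advertises a `search_on_internet` function, parses the reply as JSON, runs a Google or Wikipedia search, and regenerates the answer with the results attached. A minimal sketch (not part of the commit) of the tool-call payload the new parsing code in `app.py` looks for; the field names are taken from the diff below, the keyword value is illustrative:

```python
import json

# Shape of the reply that generate_chat_responses() in app.py tries to parse
# with json.loads() when tools are enabled (field names from the diff below).
example_tool_call = {
    "type": "function",
    "name": "search_on_internet",
    "arguments": {
        "keyword": "Ghost 8B Beta language model",  # illustrative search query
        "type": "google",                           # or "wikipedia"
    },
}

print(json.dumps(example_tool_call, ensure_ascii=False))
```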
README.md CHANGED
@@ -31,11 +31,9 @@ tags:

### Notes

-The extension source code belongs to: "LLM Maybe LongLM: Self-Extend LLM Context Window Without Tuning".
+The extension source code belongs to: "LLM Maybe LongLM: Self-Extend LLM Context Window Without Tuning". See source code details [here](https://github.com/datamllab/LongLM).

-See source code details [here](https://github.com/datamllab/LongLM).
-
-```
+```tex
@misc{jin2024llm,
      title={LLM Maybe LongLM: Self-Extend LLM Context Window Without Tuning},
      author={Hongye Jin and Xiaotian Han and Jingfeng Yang and Zhimeng Jiang and Zirui Liu and Chia-Yuan Chang and Huiyuan Chen and Xia Hu},
app.py CHANGED
@@ -1,6 +1,8 @@
# pylint: skip-file

import subprocess
+import json
+import requests

subprocess.run(
    f"pip install flash-attn --no-build-isolation",

@@ -15,24 +17,27 @@ from typing import Iterator
import gradio as gr
import spaces
import torch
+import wikipedia
+import time
import SelfExtend
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from bs4 import BeautifulSoup
+from functools import lru_cache


-MAX_MAX_NEW_TOKENS = 4096
-DEFAULT_MAX_NEW_TOKENS = 1536
+MAX_MAX_NEW_TOKENS = 8192
+DEFAULT_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "123392"))

DESCRIPTION = """\
-# Playground with Ghost 8B Beta (β, 128k)
+# Playground with Ghost 8B Beta (β, 8k)

-**Ghost 8B Beta** is a large language model developed with goals that include excellent multilingual support, superior knowledge capabilities, and cost-effectiveness. The model comes in two context length versions, [8k](https://huggingface.co/spaces/lamhieu/ghost-8b-beta-8k) and [128k](https://huggingface.co/spaces/lamhieu/ghost-8b-beta-128k), along with multilingual function tools support by default.
-
-The Ghost 8B Beta model outperforms prominent models such as Llama 3 8B Instruct, GPT 3.5 Turbo in the lc_winrate score. In addition, it also outperforms Claude 3 Opus, Claude 3 Sonnet, GPT-4, and Mistral Large when comparing the winrate score of AlpacaEval 2.0, [*](https://ghost-x.org/docs/models/ghost-8b-beta/).
+**Ghost 8B Beta** model outperforms prominent models such as Llama 3 8B Instruct, GPT 3.5 Turbo in the lc_winrate score. In addition, it also outperforms Claude 3 Opus, Claude 3 Sonnet, GPT-4, and Mistral Large when comparing the winrate score of AlpacaEval 2.0, [*](https://ghost-x.org/docs/models/ghost-8b-beta/). The model comes in two context length versions, [8k](https://huggingface.co/spaces/lamhieu/ghost-8b-beta-8k) and [128k](https://huggingface.co/spaces/lamhieu/ghost-8b-beta-128k), along with multilingual function tools support by default.

The languages supported are 🇺🇸 English, 🇫🇷 French, 🇮🇹 Italian, 🇪🇸 Spanish, 🇵🇹 Portuguese, 🇩🇪 German, 🇻🇳 Vietnamese, 🇰🇷 Korean and 🇨🇳 Chinese.

-📋 Note: current model version is "disl-0x5" (10 Jul 2024), context length 128k (123392 tokens) and current status is "moderating / previewing". For detailed information about the model, see [here](https://ghost-x.org/docs/models/ghost-8b-beta/). Try to experience it the way you want!
+🗞️ **Updates**
+* Jul 23, 2024: added support for tools, now available to search for information on the internet.
"""


@@ -251,19 +256,19 @@ if not torch.cuda.is_available():

if torch.cuda.is_available():
    model_id = "ghost-x/ghost-8b-beta"
-    model_tk = os.getenv("HF_TOKEN", None)
+    hf_serect = os.getenv("HF_TOKEN", None)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        trust_remote_code=True,
-        token=model_tk,
+        token=hf_serect,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
-        token=model_tk,
+        token=hf_serect,
    )
    SelfExtend.apply(
        model,

@@ -274,73 +279,259 @@ if torch.cuda.is_available():
    )
    model.generation_config.max_length = 123392

+waiting_tools_timeout = 7.5
+supported_tools = json.dumps(
+    [
+        {
+            "type": "function",
+            "function": {
+                "name": "search_on_internet",
+                "description": "Use this tool to search online, only use it for information you don't know or are unsure of, don't abuse it.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "keyword": {
+                            "type": "string",
+                            "description": "Search keywords, rephrase to optimize search results based on questions suitable to the specified search type.",
+                            "required": True,
+                        },
+                        "type": {
+                            "type": "string",
+                            "description": "Search type, based on the question to determine whether to search for it in 'wikipedia' or 'google', prefer to use wikipedia for information about events, history and people.",
+                            "enum": ["wikipedia", "google"],
+                            "default": "google",
+                            "required": True,
+                        },
+                    },
+                },
+            },
+        }
+    ],
+    ensure_ascii=False,
+)
+
+
+@lru_cache(maxsize=128)
+def extract_text_from_webpage(html_content):
+    soup = BeautifulSoup(html_content, "html.parser")
+    for tag in soup(["script", "style", "header", "footer", "nav", "form", "svg"]):
+        tag.extract()
+    visible_text = soup.get_text(strip=True, separator=" ")
+    return visible_text
+
+
+def search_with_wikipedia(query: str):
+    all_results = []
+    try:
+        all_results.append(wikipedia.summary(query))
+    except Exception as e:
+        pass
+    return all_results
+
+
+def search_with_google(
+    query: str,
+    num_results: int = 3,
+    timeout: int = 5,
+    ssl_verify: bool = None,
+):
+    all_results = []
+    max_chars_per_page = 4096
+    with requests.Session() as session:
+        resp = session.get(
+            url="https://www.google.com/search",
+            headers={
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
+            },
+            params={
+                "q": query,
+                "num": num_results,
+                "udm": 14,
+            },
+            timeout=timeout,
+            verify=ssl_verify,
+        )
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.text, "html.parser")
+        result_block = soup.find_all("div", attrs={"class": "g"})
+        for result in result_block:
+            link = result.find("a", href=True)
+            if link:
+                link = link["href"]
+                try:
+                    webpage = session.get(
+                        link,
+                        headers={
+                            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
+                        },
+                    )
+                    webpage.raise_for_status()
+                    visible_text = extract_text_from_webpage(webpage.text)
+                    if len(visible_text) > max_chars_per_page:
+                        visible_text = visible_text[:max_chars_per_page]
+                    all_results.append({"link": link, "text": visible_text})
+                except requests.exceptions.RequestException as e:
+                    print(f"Error fetching or processing {link}: {e}")
+                    pass
+            else:
+                pass
+    return all_results
+
+
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=180)
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
-    system_prompt: str,
-    max_new_tokens: int = 1536,
+    allow_used_tools: bool = True,
+    system_prompt: str = "",
+    max_new_tokens: int = 2048,
    temperature: float = 0.4,
    top_p: float = 0.95,
    top_k: int = 50,
    repetition_penalty: float = 1.0,
) -> Iterator[str]:
-    conversation = []
-    if system_prompt:
-        conversation.append({"role": "system", "content": system_prompt})
-    for user, assistant in chat_history:
-        conversation.extend(
-            [
-                {"role": "user", "content": user},
-                {"role": "assistant", "content": assistant},
-            ]
-        )
-    conversation.append({"role": "user", "content": message})
-
-    input_ids = tokenizer.apply_chat_template(
-        conversation, add_generation_prompt=True, return_tensors="pt"
-    )
-    input_ids = input_ids.to(model.device)
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(
-            f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens."
-        )
-
-    streamer = TextIteratorStreamer(
-        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
-    )
-    generate_kwargs = dict(
-        input_ids=input_ids,
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        repetition_penalty=repetition_penalty,
-    )
-    if temperature == 0:
-        generate_kwargs["do_sample"] = False
-    else:
-        generate_kwargs["temperature"] = temperature
-        generate_kwargs["top_p"] = top_p
-        generate_kwargs["top_k"] = top_k
-
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs)
+    # print()
+    # print("allow_used_tools:\n", allow_used_tools)
+    # print("system_prompt:\n", system_prompt)
+    # print("max_new_tokens:\n", max_new_tokens)
+    # print("temperature:\n", temperature)
+
+    def build_input_ids(
+        apply_tools: bool = None,
+        references: list[str] = None,
+    ):
+        conversation = []
+        if system_prompt:
+            conversation.append({"role": "system", "content": system_prompt})
+        if apply_tools is True:
+            conversation.append({"role": "tools", "content": supported_tools})
+        if (
+            references is not None
+            and isinstance(references, list)
+            and len(references) > 0
+        ):
+            conversation.append(
+                {
+                    "role": "refs",
+                    "content": json.dumps(references, ensure_ascii=False),
+                }
+            )
+
+        for user, assistant in chat_history:
+            conversation.extend(
+                [
+                    {"role": "user", "content": user},
+                    {"role": "assistant", "content": assistant},
+                ]
+            )
+        conversation.append({"role": "user", "content": message})
+
+        input_ids = tokenizer.apply_chat_template(
+            conversation, add_generation_prompt=True, return_tensors="pt"
+        )
+        input_ids = input_ids.to(model.device)
+        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+            gr.Warning(
+                f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens."
+            )
+        return input_ids
+
+    def generate_chat_responses(
+        previous_response: str = None,
+    ):
+        document_references = []
+        if previous_response is not None:
+            scheduled_tools_runs = None
+            try:
+                scheduled_tools_runs = json.loads(previous_response)
+                if scheduled_tools_runs["type"] == "function" and scheduled_tools_runs[
+                    "name"
+                ] in ["search_on_internet"]:
+                    pass
+                else:
+                    scheduled_tools_runs = None
+            except Exception as e:
+                print(e)
+                pass
+
+            if (
+                scheduled_tools_runs is not None
+                and scheduled_tools_runs["name"] == "search_on_internet"
+            ):
+                keyword = scheduled_tools_runs["arguments"]["keyword"]
+                search_type = scheduled_tools_runs["arguments"]["type"]
+                if search_type == "wikipedia":
+                    gr.Info("Searching for information on the Wikipedia.")
+                    document_references = search_with_wikipedia(keyword)
+                else:
+                    gr.Info("Searching for information on the Google.")
+                    document_references = search_with_google(keyword)
+
+        input_ids = build_input_ids(
+            apply_tools=(
+                True
+                if allow_used_tools is True and previous_response is None
+                else False
+            ),
+            references=document_references,
+        )
+        streamer = TextIteratorStreamer(
+            tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+        )
+        generate_kwargs = dict(
+            input_ids=input_ids,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            repetition_penalty=repetition_penalty,
+        )
+        if temperature == 0:
+            generate_kwargs["do_sample"] = False
+        else:
+            generate_kwargs["temperature"] = temperature
+            generate_kwargs["top_p"] = top_p
+            generate_kwargs["top_k"] = top_k
+
+        t = Thread(target=model.generate, kwargs=generate_kwargs)
+        t.start()
+
+        state = {
+            "mark": None,
+            "respond": False,
+        }
+        outputs = []
+        for text in streamer:
+            if state["mark"] is None:
+                state["mark"] = time.time()
+            outputs.append(text)
+            if state["mark"] + waiting_tools_timeout < time.time():
+                state["respond"] = True
+                yield "".join(outputs)
+
+        if (
+            state["respond"] is False
+            and state["mark"] + waiting_tools_timeout > time.time()
+        ):
+            gr.Info("Searching for information on the internet.")
+            previous_response = "".join(outputs)
+            yield from generate_chat_responses(previous_response=previous_response)
+
+    yield from generate_chat_responses(previous_response=None)


-chatbot = gr.Chatbot(height=500, placeholder=PLACEHOLDER, label="Ghost 8B Beta")
+chatbot = gr.Chatbot(
+    height=500, placeholder=PLACEHOLDER, label="Ghost 8B Beta", show_copy_button=True
+)

chat_interface = gr.ChatInterface(
    fn=generate,
    chatbot=chatbot,
    fill_height=True,
    additional_inputs=[
+        gr.Checkbox(
+            label="Allow used tools (available: search on internet)", value=True
+        ),
        gr.Textbox(label="System prompt", lines=6),
        gr.Slider(
            label="Max new tokens",

@@ -382,6 +573,7 @@ chat_interface = gr.ChatInterface(
    cache_examples=False,
    examples=EXAMPLES,
    examples_per_page=9,
+    concurrency_limit=100,
)

with gr.Blocks(fill_height=True, css="style.css") as demo:

@@ -391,4 +583,3 @@ with gr.Blocks(fill_height=True, css="style.css") as demo:

if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True)
-    # demo.launch(share=True)
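For orientation, a hedged sketch (not part of the commit) of the two conversation payloads that `build_input_ids` assembles: the first pass advertises the tools under a `tools` role, and, if the model replies with a `search_on_internet` call before `waiting_tools_timeout` expires, the second pass feeds the search results back under a `refs` role. The system prompt, user question, and reference entry below are illustrative; the role names and reference shape come from the diff above.

```python
import json

# Pass 1: tools are advertised; the model may answer with the JSON tool call.
first_pass = [
    {"role": "system", "content": "You are a helpful assistant."},       # illustrative
    {"role": "tools", "content": "<supported_tools JSON from app.py>"},  # placeholder
    {"role": "user", "content": "Who won the UEFA Euro 2024 final?"},    # illustrative
]

# Pass 2: the search has run; results are injected as "refs" and the tools
# role is dropped, so the model answers in natural language.
references = [
    {"link": "https://example.org/match-report", "text": "extracted page text ..."}  # illustrative
]
second_pass = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "refs", "content": json.dumps(references, ensure_ascii=False)},
    {"role": "user", "content": "Who won the UEFA Euro 2024 final?"},
]
```

Both lists are passed straight to `tokenizer.apply_chat_template`, so the non-standard `tools` and `refs` roles assume the ghost-8b-beta chat template knows how to render them.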
 
requirements.txt CHANGED
@@ -1,8 +1,10 @@
accelerate==0.30.1
bitsandbytes==0.43.1
-gradio==4.37.2
+gradio==4.39.0
scipy==1.13.0
sentencepiece==0.2.0
spaces==0.28.3
torch==2.0.0
transformers==4.41.0
+beautifulsoup4>=4.9
+wikipedia==1.4.0
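The two new pins back the search helpers added to `app.py` (`wikipedia` for `search_with_wikipedia`, `beautifulsoup4` for `extract_text_from_webpage`). A quick local smoke test of those dependencies, as a sketch independent of the Space runtime (the article title and HTML snippet are illustrative, and the first call needs network access):

```python
import wikipedia
from bs4 import BeautifulSoup

# search_with_wikipedia() in app.py wraps wikipedia.summary().
print(wikipedia.summary("Alan Turing", sentences=2))  # requires network access

# extract_text_from_webpage() strips non-content tags and keeps visible text.
html = "<html><body><nav>menu</nav><p>Hello, world.</p><script>x()</script></body></html>"
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "header", "footer", "nav", "form", "svg"]):
    tag.extract()
print(soup.get_text(strip=True, separator=" "))  # -> Hello, world.
```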