Makhinur committed
Commit 265cb8e · verified · 1 Parent(s): 3f2eae3

Update app.py

Files changed (1): app.py (+50 -8)
app.py CHANGED
@@ -1,13 +1,25 @@
 import os
 from typing import Iterator
 import gradio as gr
-from model import run
+from text_generation import Client
 
 # Ensure the HF_TOKEN environment variable is set
 HF_TOKEN = os.environ.get("HF_TOKEN")
 if HF_TOKEN is None:
     raise ValueError("Please set the HF_TOKEN environment variable.")
 
+# Model and API setup
+model_id = 'codellama/CodeLlama-34b-Instruct-hf'
+API_URL = "https://api-inference.huggingface.co/models/" + model_id
+
+client = Client(
+    API_URL,
+    headers={"Authorization": f"Bearer {HF_TOKEN}"},
+)
+
+EOS_STRING = "</s>"
+EOT_STRING = "<EOT>"
+
 HF_PUBLIC = os.environ.get("HF_PUBLIC", False)
 
 DEFAULT_SYSTEM_PROMPT = """\
@@ -36,17 +48,52 @@ As a derivate work of Code Llama by Meta,
 this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/codellama-2-34b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/codellama-2-34b-chat/blob/main/USE_POLICY.md).
 """
 
+def get_prompt(message: str, chat_history: list[tuple[str, str]],
+               system_prompt: str) -> str:
+    texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
+    do_strip = False
+    for user_input, response in chat_history:
+        user_input = user_input.strip() if do_strip else user_input
+        do_strip = True
+        texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
+    message = message.strip() if do_strip else message
+    texts.append(f'{message} [/INST]')
+    return ''.join(texts)
+
+def run(message: str,
+        chat_history: list[tuple[str, str]],
+        system_prompt: str,
+        max_new_tokens: int = 1024,
+        temperature: float = 0.1,
+        top_p: float = 0.9,
+        top_k: int = 50) -> Iterator[str]:
+    prompt = get_prompt(message, chat_history, system_prompt)
+
+    generate_kwargs = dict(
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+    )
+    stream = client.generate_stream(prompt, **generate_kwargs)
+    output = ""
+    for response in stream:
+        if any([end_token in response.token.text for end_token in [EOS_STRING, EOT_STRING]]):
+            return output
+        else:
+            output += response.token.text
+            yield output
+    return output
 
 def clear_and_save_textbox(message: str) -> tuple[str, str]:
     return '', message
 
-
 def display_input(message: str,
                   history: list[tuple[str, str]]) -> list[tuple[str, str]]:
     history.append((message, ''))
     return history
 
-
 def delete_prev_fn(
         history: list[tuple[str, str]]) -> tuple[list[tuple[str, str]], str]:
     try:
@@ -77,20 +124,17 @@ def generate(
     for response in generator:
         yield history + [(message, response)]
 
-
 def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
     generator = generate(message, [], DEFAULT_SYSTEM_PROMPT, 1024, 1, 0.95, 50)
     for x in generator:
         pass
     return '', x
 
-
 def check_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> None:
     input_token_length = len(message) + len(chat_history)
    if input_token_length > MAX_INPUT_TOKEN_LENGTH:
        raise gr.Error(f'The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again.')
 
-
 with gr.Blocks(css='style.css') as demo:
     gr.Markdown(DESCRIPTION)
     gr.DuplicateButton(value='Duplicate Space for private use',
@@ -148,8 +192,6 @@ with gr.Blocks(css='style.css') as demo:
                 step=1,
                 value=10,
             )
-
-
 
     gr.Markdown(LICENSE)
 
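For reference, the get_prompt helper added above assembles the Llama-2 chat template ([INST] / <<SYS>> markers) that CodeLlama-34b-Instruct-hf expects. A minimal sketch of what it produces, with invented example messages and a stand-in system prompt; the function body is copied verbatim from the diff so the snippet runs on its own:

def get_prompt(message: str, chat_history: list[tuple[str, str]],
               system_prompt: str) -> str:
    # Copied from the commit above.
    texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
    do_strip = False
    for user_input, response in chat_history:
        user_input = user_input.strip() if do_strip else user_input
        do_strip = True
        texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
    message = message.strip() if do_strip else message
    texts.append(f'{message} [/INST]')
    return ''.join(texts)

# One completed turn plus a new user message (both invented for illustration):
prompt = get_prompt(
    message='And in Rust?',
    chat_history=[('Write hello world in C.', 'printf("hello world");')],
    system_prompt='Be concise.',
)
assert prompt == (
    '<s>[INST] <<SYS>>\nBe concise.\n<</SYS>>\n\n'
    'Write hello world in C. [/INST] printf("hello world"); </s><s>[INST] '
    'And in Rust? [/INST]'
)

Note that only the first user turn keeps its original whitespace (do_strip starts False); every later turn, including the new message, is stripped.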
 
 
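The rewired generation path can also be exercised outside Gradio. A rough sketch under stated assumptions: HF_TOKEN is set in the environment, the hosted Inference API endpoint for this model is reachable, and the text_generation client package is installed. The prompt string is an invented example in the format shown above:

import os
from text_generation import Client

# Same endpoint and auth header the commit sets up at module level.
API_URL = 'https://api-inference.huggingface.co/models/codellama/CodeLlama-34b-Instruct-hf'
client = Client(API_URL, headers={'Authorization': f"Bearer {os.environ['HF_TOKEN']}"})

prompt = '<s>[INST] <<SYS>>\nBe concise.\n<</SYS>>\n\nWrite hello world in Rust. [/INST]'

# Mirrors run() in the commit: stream token by token and stop as soon as an
# end-of-sequence marker appears in the token text.
output = ''
for response in client.generate_stream(prompt, max_new_tokens=256, do_sample=True,
                                       temperature=0.1, top_p=0.9, top_k=50):
    if any(t in response.token.text for t in ('</s>', '<EOT>')):
        break
    output += response.token.text
    print(response.token.text, end='', flush=True)

Streaming matters here because the Gradio generate() handler (see the @@ -77,20 +124,17 @@ hunk) yields the partial history on every token, so the chatbot updates incrementally instead of waiting for the full completion.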