Update app.py
app.py
CHANGED
@@ -100,33 +100,6 @@ with gr.Blocks() as demo:
             interactive=True,
             info="Higher values produce more diverse outputs",
         )
-        top_p = gr.Slider(
-            label="Top-p (nucleus) sampling",
-            value=0.40,
-            minimum=0.0,
-            maximum=1,
-            step=0.05,
-            interactive=True,
-            info="Higher values sample more low-probability tokens",
-        )
-        top_k = gr.Slider(
-            label="Top-k sampling",
-            value=20,
-            minimum=1,
-            maximum=100,
-            step=1,
-            interactive=True,
-            info="Sample from the top_k most likely tokens",
-        )
-        repetition_penalty = gr.Slider(
-            label="Repetition penalty",
-            value=1.2,
-            minimum=1.0,
-            maximum=2.0,
-            step=0.05,
-            interactive=True,
-            info="Penalize repeated tokens",
-        )

     # Generation inference
     def generate(
@@ -134,21 +107,14 @@ with gr.Blocks() as demo:
         history,
         max_new_tokens: int,
         temperature: float,
-        top_p: float,
-        top_k: int,
-        repetition_penalty: float,
     ):
         generation_config = {
             "max_new_tokens": max_new_tokens,
             "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
         }
         inference = pipe(sequences=message, streaming=True, **generation_config)
         history[-1][1] += message
         for token in inference:
-            print(token.generations[0].text)
             history[-1][1] += token.generations[0].text
             yield history
         print(pipe.timer_manager)
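For reference, the streaming generator after this change reduces to the sketch below, assembled from the hunk's context lines. The first parameter is not visible in the hunk; it is assumed to be message, since the function body reads it.

# Sketch of the post-change generator (not a verbatim copy of app.py).
# `pipe` is the DeepSparse text-generation pipeline created earlier in the file;
# the `message` parameter is an assumption, as the hunk starts one line below it.
def generate(
    message,
    history,
    max_new_tokens: int,
    temperature: float,
):
    generation_config = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
    }
    inference = pipe(sequences=message, streaming=True, **generation_config)
    history[-1][1] += message
    for token in inference:
        history[-1][1] += token.generations[0].text
        yield history
    print(pipe.timer_manager)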
@@ -172,9 +138,6 @@ with gr.Blocks() as demo:
             chatbot,
             max_new_tokens,
             temperature,
-            top_p,
-            top_k,
-            repetition_penalty,
         ],
         outputs=[chatbot],
         api_name=False,
@@ -199,9 +162,6 @@ with gr.Blocks() as demo:
             chatbot,
             max_new_tokens,
             temperature,
-            top_p,
-            top_k,
-            repetition_penalty,
         ],
         outputs=[chatbot],
         api_name=False,
@@ -226,9 +186,6 @@ with gr.Blocks() as demo:
             chatbot,
             max_new_tokens,
             temperature,
-            top_p,
-            top_k,
-            repetition_penalty,
         ],
         outputs=[chatbot],
         api_name=False,
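The three identical input-list edits above belong to Gradio event bindings whose opening lines fall outside the hunks. A minimal wiring sketch under that assumption, with illustrative component and event names (textbox, .submit) that are not taken from the file:

# Hypothetical wiring: the trimmed input list feeds the streaming generator,
# which yields the updated chat history back into the Chatbot component.
textbox.submit(
    fn=generate,
    inputs=[
        textbox,          # user message
        chatbot,          # chat history shown in the UI
        max_new_tokens,
        temperature,
    ],
    outputs=[chatbot],
    api_name=False,
)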