Spaces:

miracFence
/

Medical_QA_Chatbot

Runtime error

App Files Files Community

miracFence committed on Oct 2, 2024

Commit

d7db62d

verified ·

1 Parent(s): 3df4d21

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -6

app.py CHANGED Viewed

@@ -1,8 +1,12 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 import spaces
 # Define quantization configuration
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,  # Specify 4-bit quantization
@@ -14,6 +18,7 @@ quantization_config = BitsAndBytesConfig(
 # Load the tokenizer and quantized model from Hugging Face
 model_name = "llSourcell/medllama2_7b"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 # Load model with quantization
 model = AutoModelForCausalLM.from_pretrained(model_name,
@@ -29,24 +34,49 @@ def format_history(msg: str, history: list[list[str, str]], system_prompt: str):
     return chat_history
 @spaces.GPU(duration=90)
-def generate_response(msg: str, history: list[list[str, str]], system_prompt: str):
     chat_history = format_history(msg, history, system_prompt)
     # Tokenize the input prompt
     inputs = tokenizer(chat_history, return_tensors="pt").to("cuda")
     # Generate a response using the model
-    outputs = model.generate(inputs["input_ids"], max_length=500, pad_token_id=tokenizer.eos_token_id)
     # Decode the response back to a string
-    response = tokenizer.decode(outputs[:, inputs["input_ids"].shape[-1]:][0], skip_special_tokens=True)
     # Yield the generated response
-    yield response
 # Define the Gradio ChatInterface
 chatbot = gr.ChatInterface(
-                generate_response,
                 chatbot=gr.Chatbot(
                         height="64vh"
                     ),

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
 import torch
 import spaces
+import os
+from threading import Thread
+from typing import Iterator
 # Define quantization configuration
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,  # Specify 4-bit quantization
 # Load the tokenizer and quantized model from Hugging Face
 model_name = "llSourcell/medllama2_7b"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load model with quantization
 model = AutoModelForCausalLM.from_pretrained(model_name,
     return chat_history
 @spaces.GPU(duration=90)
+def generate(msg: str,
+            history: list[list[str, str]],
+            system_prompt: str,
+            max_new_tokens: int = 1024,
+            temperature: float = 0.6,
+            top_p: float = 0.9,
+            top_k: int = 50,
+            repetition_penalty: float = 1.2,) -> Iterator[str]:
+    """Stream the model's reply to ``msg``, yielding the text accumulated so far.
+
+    The prompt is whatever ``format_history(msg, history, system_prompt)``
+    returns. ``model.generate`` runs in a background thread and feeds a
+    ``TextIteratorStreamer``; every yield is the concatenation of all text
+    chunks received from the streamer up to that point (prompt excluded).
+
+    Args:
+        msg: The latest user message.
+        history: Previous chat turns, passed through to ``format_history``.
+        system_prompt: System prompt, passed through to ``format_history``.
+        max_new_tokens: Upper bound on the number of generated tokens.
+        temperature: Sampling temperature.
+        top_p: Nucleus-sampling probability mass.
+        top_k: Number of highest-probability tokens kept for sampling.
+        repetition_penalty: Penalty applied to repeated tokens.
+
+    Yields:
+        The reply generated so far, growing with each streamed chunk.
+    """
     chat_history = format_history(msg, history, system_prompt)
     # Tokenize the input prompt
     inputs = tokenizer(chat_history, return_tensors="pt").to("cuda")
+    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
     # Generate a response using the model
+    generate_kwargs = dict(
+        # Use the tokenized prompt; the bare name `input_ids` was never
+        # defined in this function and raised NameError on the first call.
+        {"input_ids": inputs["input_ids"]},
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
+    )
+    # Run generation in a worker thread so this generator can consume the
+    # streamer while tokens are still being produced.
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
     # Yield the generated response
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
 # Define the Gradio ChatInterface
 chatbot = gr.ChatInterface(
+                fn=generate,
                 chatbot=gr.Chatbot(
                         height="64vh"
                     ),