ccm committed
Commit 30d4d88 · verified · 1 Parent(s): 2ce9b97

Update main.py

Files changed (1)
  1. main.py +30 -15
main.py CHANGED
@@ -7,6 +7,7 @@ import pandas # to work with pandas
 import json # to work with JSON
 import datasets # to load the dataset
 import spaces # for GPU
+import threading
 
 # Load the dataset and convert to pandas
 full_data = datasets.load_dataset("ccm/publications")["train"].to_pandas()
@@ -60,12 +61,12 @@ def search(query: str, k: int) -> tuple[str]:
 
 
 # Create an LLM pipeline that we can send queries to
-pipe = transformers.pipeline(
-    "text-generation",
-    model="Qwen/Qwen2-0.5B-Instruct",
-    trust_remote_code=True,
-    max_new_tokens = 512,
-    device="cuda:0",
+tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+streamer = transformers.TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+chatmodel = transformers.AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen2-0.5B-Instruct",
+    torch_dtype="auto",
+    device_map="auto"
 )
 
 def preprocess(message: str) -> tuple[str]:
@@ -77,7 +78,6 @@ def postprocess(response: str, bypass_from_preprocessing: str) -> str:
     """Applies a postprocessing step to the LLM's response before the user receives it"""
     return response + bypass_from_preprocessing
 
-@spaces.GPU
 def predict(message: str, history: list[str]) -> str:
     """This function is responsible for crafting a response"""
 
@@ -93,14 +93,29 @@ def predict(message: str, history: list[str]) -> str:
         for idx, msg in enumerate(history)
     ] + [{"role": "user", "content": message}]
 
-    # Create a response
-    response = pipe(history_transformer_format)
-    response_message = response[0]["generated_text"][-1]["content"]
-
-    # Apply postprocessing
-    response_message = postprocess(response_message, bypass)
-
-    return response_message
+    # Stream a response from the model
+    text = tokenizer.apply_chat_template(
+        history_transformer_format,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    model_inputs = tokenizer([text], return_tensors="pt")
+
+    generate_kwargs = dict(
+        model_inputs,
+        streamer=streamer,
+        max_new_tokens=512
+    )
+    t = threading.Thread(target=chatmodel.generate, kwargs=generate_kwargs)
+    t.start()
+
+    partial_message = ""
+    for new_token in streamer:
+        if new_token != '<':
+            partial_message += new_token
+            yield partial_message
+
+    yield bypass
 
 
 # Create and run the gradio interface
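
The change replaces the blocking transformers.pipeline call with token-by-token streaming: generate() runs on a background thread while TextIteratorStreamer yields decoded text as it arrives, so predict becomes a generator the UI can render incrementally. A minimal standalone sketch of the same pattern, assuming the same Qwen/Qwen2-0.5B-Instruct checkpoint; the demo_stream helper and its prompt are illustrative, not part of main.py:

import threading

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
model = transformers.AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map="auto",
)

def demo_stream(prompt: str):
    """Yield the growing partial response, one decoded chunk at a time."""
    # A fresh streamer per call avoids interleaving tokens across requests;
    # main.py shares a single module-level streamer instead.
    streamer = transformers.TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    text = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # generate() blocks until decoding finishes, so it runs on a worker
    # thread while this thread drains the streamer as tokens are produced.
    thread = threading.Thread(
        target=model.generate,
        kwargs=dict(inputs, streamer=streamer, max_new_tokens=512),
    )
    thread.start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial
    thread.join()

for update in demo_stream("What is a tokenizer?"):
    print(update)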
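
The trailing context line suggests main.py ends by wiring predict into a Gradio chat UI, which sits outside this diff. A sketch of how a streaming generator like predict typically plugs in, assuming gr.ChatInterface (hypothetical wiring, not confirmed by the diff):

import gradio as gr

# Gradio treats a generator handler as a streaming response: each yielded
# string re-renders the assistant's chat bubble with the latest partial text.
demo = gr.ChatInterface(fn=predict)
demo.launch()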