Spaces:

microsoft
/

MInference

Running on Zero

iofu728 committed on Jul 2, 2024

Commit

7ea924f

1 Parent(s): c68af66

Feature(MInference): update information

Files changed (1) hide show

app.py CHANGED Viewed

@@ -13,7 +13,10 @@ from transformers.utils.import_utils import _is_package_available
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-DESCRIPTION = """# MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention (Under Review, ES-FoMo @ ICML'24)
 _Huiqiang Jiang†, Yucheng Li†, Chengruidong Zhang†, Qianhui Wu, Xufang Luo, Surin Ahn, Zhenhua Han, Amir H. Abdi, Dongsheng Li, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
@@ -73,7 +76,7 @@ if torch.cuda.is_available() and _is_package_available("pycuda"):
 terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
-@spaces.GPU(duration=120)
 def chat_llama3_8b(
     message: str, history: list, temperature: float, max_new_tokens: int
 ) -> str:

 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+DESCRIPTION = """
+<div>
+<h1>MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention (Under Review, ES-FoMo @ ICML'24)</h1>
+</div>
 _Huiqiang Jiang†, Yucheng Li†, Chengruidong Zhang†, Qianhui Wu, Xufang Luo, Surin Ahn, Zhenhua Han, Amir H. Abdi, Dongsheng Li, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
 terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+# @spaces.GPU(duration=120)
 def chat_llama3_8b(
     message: str, history: list, temperature: float, max_new_tokens: int
 ) -> str: