simone-papicchio committed
Commit 40354df · 1 Parent(s): 45d1f9d

feat add llama8b zero spaces

Files changed (1): prediction.py (+13 −14)
prediction.py CHANGED

@@ -24,6 +24,14 @@ import litellm
 from tqdm import tqdm


+pipeline = transformers.pipeline(
+    "text-generation",
+    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+    model_kwargs={"torch_dtype": torch.bfloat16},
+)
+pipeline.to('cuda')
+
+
 class ModelPrediction:
     def __init__(self):
         self.model_name2pred_func = {
@@ -47,16 +55,6 @@ class ModelPrediction:
             "{db_schema}\n"
         )

-    @property
-    def pipeline(self):
-        if self._pipeline is None:
-            self._pipeline = hf_pipeline(
-                task="text-generation",
-                model=self._model_name,
-                device_map="auto",
-            )
-        return self._pipeline
-
     def _reset_pipeline(self, model_name):
         if self._model_name != model_name:
             self._model_name = model_name
@@ -110,10 +108,11 @@ class ModelPrediction:

     @spaces.GPU
     def predict_with_hf(self, prompt, model_name): # -> dict[str, Any | float]:
-        self._reset_pipeline(model_name)
-        response = self.pipeline([{"role": "user", "content": prompt}])[0][
-            "generated_text"
-        ][-1]["content"]
+        outputs = pipeline(
+            [{"role": "user", "content": prompt}],
+            max_new_tokens=256,
+        )
+        response = outputs[0]["generated_text"][-1]
         return {"response": response, "cost": 0.0}

     def _init_model_prediction(self, model_name):
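
The change replaces the per-instance, lazily built `pipeline` property with a single pipeline created at module import time, which is the pattern Hugging Face ZeroGPU Spaces expect: the model is loaded once at startup, and GPU work happens only inside functions decorated with `@spaces.GPU`. Below is a minimal sketch of that pattern, not the committed file; it assumes `spaces`, `transformers`, and `torch` are installed and the gated Llama checkpoint is accessible, and the standalone function name `generate` is illustrative only.

# Minimal sketch of the ZeroGPU-style setup this commit moves toward.
import spaces
import torch
import transformers

# Load the model once at import time; ZeroGPU attaches a GPU only while a
# @spaces.GPU-decorated function is running.
pipeline = transformers.pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)


@spaces.GPU
def generate(prompt: str) -> str:
    # Chat-style input: the pipeline returns the conversation with the
    # assistant reply appended as the last message.
    outputs = pipeline(
        [{"role": "user", "content": prompt}],
        max_new_tokens=256,
    )
    return outputs[0]["generated_text"][-1]["content"]

Keeping pipeline construction at module level moves model loading out of the request path, which matters on ZeroGPU where the GPU is only attached for the duration of the decorated call.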