Commit a1a2a18 · Parent: d4aa01a
feat add llama8 with API
Changed: prediction.py (+4, -7)
@@ -18,7 +18,7 @@ else:
     return wrapper
 
 from transformers import pipeline as hf_pipeline
-
+import torch
 import litellm
 
 from tqdm import tqdm
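The only change in this hunk is the new `import torch` beside the `transformers` pipeline import. A minimal sketch of the kind of pipeline setup this presumably supports (the model id, dtype, and `device_map` are assumptions; prediction.py builds its actual `pipeline` elsewhere):

import torch
from transformers import pipeline as hf_pipeline

# Hypothetical construction; the real one lives elsewhere in prediction.py.
pipeline = hf_pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # assumed from the "llama-8" branch below
    torch_dtype=torch.bfloat16,  # torch is imported for dtype/device selection like this
    device_map="auto",
)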
@@ -109,9 +109,8 @@ class ModelPrediction:
             "cost": response._hidden_params["response_cost"],
         }
 
-    @spaces.GPU
+    @spaces.GPU(duration=20)
     def predict_with_hf(self, prompt, model_name): # -> dict[str, Any | float]:
-
         start_time = time.time()
         outputs = pipeline(
             [{"role": "user", "content": prompt}],
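`spaces.GPU` is the ZeroGPU decorator from Hugging Face's `spaces` package; the commit replaces the bare form with `@spaces.GPU(duration=20)`, capping each GPU allocation at roughly 20 seconds rather than the default window. A minimal sketch of the pattern, reusing the chat-style call shape from the diff (the standalone function is illustrative; in prediction.py it is the `predict_with_hf` method):

import spaces

@spaces.GPU(duration=20)  # request a ZeroGPU device for up to ~20 s per call
def generate(prompt: str) -> str:
    # GPU-bound work runs here; the device is released when the call returns.
    outputs = pipeline([{"role": "user", "content": prompt}])
    return outputs[0]["generated_text"][-1]["content"]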
@@ -120,10 +119,9 @@ class ModelPrediction:
         end_time = time.time()
         elapsed_time = end_time - start_time
         # inference endpoint costs HF per Hour 3.6$/h -> 0.001 $ per second
-        # https://huggingface.co/docs/inference-endpoints/en/pricing?utm_source=chatgpt.com
         cost_per_second=0.001
         response = outputs[0]["generated_text"][-1]['content']
-        print(response)
+        # print(response)
         return {
             "response": response,
             "cost": elapsed_time * cost_per_second
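Sanity check on the flat rate in this hunk: the $3.6/hour Inference Endpoints price quoted in the comment works out to 3.6 / 3600 = $0.001 per second, so the returned cost is simply the elapsed wall-clock seconds inside this call times 0.001.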
@@ -142,8 +140,7 @@ class ModelPrediction:
         elif "DeepSeek-R1-Distill-Llama-70B" in model_name:
             model_name = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
         elif "llama-8" in model_name:
-            model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
-            predict_fun = self.predict_with_hf
+            model_name = "together_ai/meta-llama/Meta-Llama-3-8B-Instruct"
         else:
             raise ValueError("Model forbidden")
 
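With this hunk, "llama-8" no longer drops into the local HF pipeline path (note the removed `predict_fun = self.predict_with_hf`); the `together_ai/` prefix routes it through litellm's Together AI provider like the other API models. A minimal sketch of that call path (the standalone function is hypothetical; the `messages` shape and the `_hidden_params` cost lookup mirror what prediction.py already does):

import litellm

def predict_with_api(prompt: str, model_name: str) -> dict:
    # litellm dispatches on the provider prefix, e.g. "together_ai/...".
    response = litellm.completion(
        model=model_name,  # e.g. "together_ai/meta-llama/Meta-Llama-3-8B-Instruct"
        messages=[{"role": "user", "content": prompt}],
    )
    return {
        "response": response.choices[0].message.content,
        # litellm attaches its cost estimate to the response object.
        "cost": response._hidden_params["response_cost"],
    }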