Commit a1a2a18 · Parent: d4aa01a
feat add llama8 with API
Changed: prediction.py (+4, -7)
@@ -18,7 +18,7 @@ else:
     return wrapper
 
 from transformers import pipeline as hf_pipeline
-
+import torch
 import litellm
 
 from tqdm import tqdm
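The only change in this hunk is the new `import torch` beside the `transformers` pipeline import. A minimal sketch of the kind of pipeline setup this presumably supports (the model id, dtype, and `device_map` are assumptions; prediction.py builds its actual `pipeline` elsewhere):

import torch
from transformers import pipeline as hf_pipeline

# Hypothetical construction; the real one lives elsewhere in prediction.py.
pipeline = hf_pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # assumed from the "llama-8" branch below
    torch_dtype=torch.bfloat16,  # torch is imported for dtype/device selection like this
    device_map="auto",
)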
@@ -109,9 +109,8 @@ class ModelPrediction:
             "cost": response._hidden_params["response_cost"],
         }
 
-    @spaces.GPU
+    @spaces.GPU(duration=20)
     def predict_with_hf(self, prompt, model_name): # -> dict[str, Any | float]:
-
         start_time = time.time()
         outputs = pipeline(
             [{"role": "user", "content": prompt}],
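`spaces.GPU` is the ZeroGPU decorator from Hugging Face's `spaces` package; the commit replaces the bare form with `@spaces.GPU(duration=20)`, capping each GPU allocation at roughly 20 seconds rather than the default window. A minimal sketch of the pattern, reusing the chat-style call shape from the diff (the standalone function is illustrative; in prediction.py it is the `predict_with_hf` method):

import spaces

@spaces.GPU(duration=20)  # request a ZeroGPU device for up to ~20 s per call
def generate(prompt: str) -> str:
    # GPU-bound work runs here; the device is released when the call returns.
    outputs = pipeline([{"role": "user", "content": prompt}])
    return outputs[0]["generated_text"][-1]["content"]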
@@ -120,10 +119,9 @@ class ModelPrediction:
         end_time = time.time()
         elapsed_time = end_time - start_time
         # inference endpoint costs HF per Hour 3.6$/h -> 0.001 $ per second
-        # https://huggingface.co/docs/inference-endpoints/en/pricing?utm_source=chatgpt.com
         cost_per_second=0.001
         response = outputs[0]["generated_text"][-1]['content']
-        print(response)
+        # print(response)
         return {
             "response": response,
             "cost": elapsed_time * cost_per_second
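Sanity check on the flat rate in this hunk: the $3.6/hour Inference Endpoints price quoted in the comment works out to 3.6 / 3600 = $0.001 per second, so the returned cost is simply the elapsed wall-clock seconds inside this call times 0.001.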
@@ -142,8 +140,7 @@ class ModelPrediction:
         elif "DeepSeek-R1-Distill-Llama-70B" in model_name:
             model_name = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
         elif "llama-8" in model_name:
-            model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
-            predict_fun = self.predict_with_hf
+            model_name = "together_ai/meta-llama/Meta-Llama-3-8B-Instruct"
         else:
             raise ValueError("Model forbidden")
 
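With this hunk, "llama-8" no longer drops into the local HF pipeline path (note the removed `predict_fun = self.predict_with_hf`); the `together_ai/` prefix routes it through litellm's Together AI provider like the other API models. A minimal sketch of that call path (the standalone function is hypothetical; the `messages` shape and the `_hidden_params` cost lookup mirror what prediction.py already does):

import litellm

def predict_with_api(prompt: str, model_name: str) -> dict:
    # litellm dispatches on the provider prefix, e.g. "together_ai/...".
    response = litellm.completion(
        model=model_name,  # e.g. "together_ai/meta-llama/Meta-Llama-3-8B-Instruct"
        messages=[{"role": "user", "content": prompt}],
    )
    return {
        "response": response.choices[0].message.content,
        # litellm attaches its cost estimate to the response object.
        "cost": response._hidden_params["response_cost"],
    }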