Spaces:

simone-papicchio
/

qatch-demo

Sleeping

simone-papicchio committed on Apr 6

Commit

6b75ebd

1 Parent(s): e993e1b

feat: add cost for llama 8b

Files changed (1) hide show

prediction.py CHANGED Viewed

@@ -108,12 +108,23 @@ class ModelPrediction:
     @spaces.GPU
     def predict_with_hf(self, prompt, model_name):  # -> dict[str, Any | float]:
         outputs = pipeline(
             [{"role": "user", "content": prompt}],
             max_new_tokens=256,
         )
         response = outputs[0]["generated_text"][-1]
-        return {"response": response, "cost": 0.0}
     def _init_model_prediction(self, model_name):
         predict_fun = self.predict_with_api

     @spaces.GPU
     def predict_with_hf(self, prompt, model_name):  # -> dict[str, Any | float]:
+        start_time = time.time()
         outputs = pipeline(
             [{"role": "user", "content": prompt}],
             max_new_tokens=256,
         )
+        end_time = time.time()
+        elapsed_time = end_time - start_time
+        # HF Inference Endpoints are billed per hour: 3.6 $/h -> 0.001 $ per second
+        # https://huggingface.co/docs/inference-endpoints/en/pricing?utm_source=chatgpt.com
+        cost_per_second=0.001
         response = outputs[0]["generated_text"][-1]
+        print(response)
+        return {
+            "response": response,
+            "cost": elapsed_time * cost_per_second
+        }
     def _init_model_prediction(self, model_name):
         predict_fun = self.predict_with_api