Spaces:

simone-papicchio
/

qatch-demo

Sleeping

App Files Community

simone-papicchio committed on Mar 25

Commit

ffec641

1 Parent(s): 220b4dd

feat add model on zeroGpu

Browse files

Files changed (3) hide show

prediction.py +19 -17
requirements.txt +1 -1
test_prediction.py +4 -2

prediction.py CHANGED Viewed

@@ -21,6 +21,8 @@ from transformers import pipeline as hf_pipeline
 import torch
 import litellm
 class ModelPrediction:
     def __init__(self):
@@ -32,6 +34,7 @@ class ModelPrediction:
             "DeepSeek-R1-Distill-Llama-70B": self._model_prediction(
                 "DeepSeek-R1-Distill-Llama-70B"
             ),
         }
         self._model_name = None
@@ -50,6 +53,7 @@ class ModelPrediction:
     def _reset_pipeline(self, model_name):
         if self._model_name != model_name:
             self._model_name = model_name
             self._pipeline = None
@@ -63,6 +67,13 @@ class ModelPrediction:
             matches = re.findall(r"```sql(.*?)```", pred, re.DOTALL)
             return matches[-1].strip() if matches else pred
     def make_prediction(self, prompt, model_name):
         if model_name not in self.model_name2pred_func:
             raise ValueError(
@@ -89,34 +100,25 @@ class ModelPrediction:
             model_name = "together_ai/Qwen/QwQ-32B"
         elif "DeepSeek-R1-Distill-Llama-70B" in model_name:
             model_name = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
         else:
             raise ValueError("Model forbidden")
         return partial(predict_fun, model_name=model_name)
     def predict_with_api(self, prompt, model_name):  # -> dict[str, Any | float]:
-        def track_cost_callback(
-            kwargs,  # kwargs to completion
-            completion_response,  # response from completion
-            start_time,
-            end_time,  # start/end time
-        ):
-            try:
-                response_cost = kwargs[
-                    "response_cost"
-                ]  # litellm calculates response cost for you
-                call_cost = response_cost
-            except:
-                pass
-        litellm.success_callback = [track_cost_callback]
-        call_cost = 0.0
         response = litellm.completion(
             model=model_name,
             messages=[{"role": "user", "content": prompt}],
             num_retries=2,
         )
-        return {"response": response, "cost": call_cost}
     @spaces.GPU
     def predict_with_hf(self, prompt, model_name):  # -> dict[str, Any | float]:

 import torch
 import litellm
+from tqdm import tqdm
 class ModelPrediction:
     def __init__(self):
             "DeepSeek-R1-Distill-Llama-70B": self._model_prediction(
                 "DeepSeek-R1-Distill-Llama-70B"
             ),
+            "llama-8": self._model_prediction("llama-8"),
         }
         self._model_name = None
     def _reset_pipeline(self, model_name):
         if self._model_name != model_name:
+            print("Resetting pipeline with model", model_name)
             self._model_name = model_name
             self._pipeline = None
             matches = re.findall(r"```sql(.*?)```", pred, re.DOTALL)
             return matches[-1].strip() if matches else pred
+    def make_predictions(self, prompts, model_name) -> list[dict]:
+        preds = []
+        for prompt in tqdm(prompts, desc=f"Analyzing Prompt with {model_name}"):
+            pred = self.make_prediction(prompt, model_name)
+            preds.append(pred)
+        return preds
     def make_prediction(self, prompt, model_name):
         if model_name not in self.model_name2pred_func:
             raise ValueError(
             model_name = "together_ai/Qwen/QwQ-32B"
         elif "DeepSeek-R1-Distill-Llama-70B" in model_name:
             model_name = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
+        elif "llama-8" in model_name:
+            model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
+            predict_fun = self.predict_with_hf
         else:
             raise ValueError("Model forbidden")
         return partial(predict_fun, model_name=model_name)
     def predict_with_api(self, prompt, model_name):  # -> dict[str, Any | float]:
         response = litellm.completion(
             model=model_name,
             messages=[{"role": "user", "content": prompt}],
             num_retries=2,
         )
+        response_text = response["choices"][0]["message"]["content"]
+        return {
+            "response": response_text,
+            "cost": response._hidden_params["response_cost"],
+        }
     @spaces.GPU
     def predict_with_hf(self, prompt, model_name):  # -> dict[str, Any | float]:

requirements.txt CHANGED Viewed

@@ -10,9 +10,9 @@ eval-type-backport>=0.2.0
 openai==1.66.3
 litellm==1.63.14
 together==1.4.6
-litellm==1.63.14
 # Conditional dependency for Gradio (requires Python >=3.10)
 gradio>=5.20.1; python_version >= "3.10"
 # Test dependencies
 streamlit>=1.43.0

 openai==1.66.3
 litellm==1.63.14
 together==1.4.6
 # Conditional dependency for Gradio (requires Python >=3.10)
 gradio>=5.20.1; python_version >= "3.10"
+accelerate>=0.26.0
 # Test dependencies
 streamlit>=1.43.0

test_prediction.py CHANGED Viewed

@@ -3,8 +3,10 @@ from prediction import ModelPrediction
 def main():
     model = ModelPrediction()
-    response = model.make_prediction("Hi, how are you?", "gpt-3.5")
-    print(response)
 if __name__ == "__main__":
     main()

 def main():
     model = ModelPrediction()
+    response = model.make_prediction("Hi, how are you?", "llama-8")
+    print(response)  # dict[response, response_parsed, cost]
 if __name__ == "__main__":
     main()
+    # do something with prompt