Spaces:

ManishThota
/

GSoC-Super-Rapid-Annotator

Runtime error

App Files Files Community

ManishThota committed on Aug 21, 2024

Commit

9fd7f68

verified ·

1 Parent(s): 57a114d

Update src/text_processor.py

Browse files

Files changed (1) hide show

src/text_processor.py +76 -48

src/text_processor.py CHANGED Viewed

@@ -1,64 +1,92 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from pydantic import BaseModel
-import spaces
-device = 'cuda'
-# Load your LLM model and tokenizer
 torch.random.manual_seed(0)
-model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Phi-3-mini-4k-instruct",
-    device_map=device,
-    torch_dtype="auto",
-    trust_remote_code=True,
-)
-tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
 pipe = pipeline(
     "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-)
-# Pydantic class for output validation
 class VideoAnalysis(BaseModel):
-    indoor: int
     hands_free: int
-    screen_interaction: int
     standing: int
-@spaces.GPU(duration=100)
-def process_description(description):
-    # Construct a prompt for your LLM based on the video description
-    prompt = f"""
-    You are a helpful AI assistant. Analyze the following video description and answer the questions with 0 for True and 1 for False:
-    Video Description: {description}
-    Questions:
-    - Is the scene indoors?
-    - Are the subject's hands free?
-    - Is there screen interaction by the subject?
-    - Is the subject standing?
-    Provide your answers in JSON format like this:
-    {{"indoor": 0, "hands_free": 1, "screen_interaction": 0, "standing": 1}}
-    """
-    generation_args = {
-        "max_new_tokens": 100,  # Adjust as needed
-        "return_full_text": False,
-        "temperature": 0.0,
-        "do_sample": False,
-    }
-    output = pipe(prompt, **generation_args)
-    json_text = output[0]['generated_text']
-    try:
-        # Attempt to parse and validate the JSON response
-        analysis_result = VideoAnalysis.model_validate_json(json_text)
-        return analysis_result.model_dump_json()  # Return as valid JSON
-    except Exception as e:
-        print(f"Error processing LLM output: {e}")
-        return {"error": "Could not process the video description."}

 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from pydantic import BaseModel
+import json
+import warnings
+import spaces
+# Silence every warning in this process: no category or module filter is given,
+# so this also hides deprecation warnings from the libraries imported above.
+warnings.filterwarnings(action='ignore')
+# Seed torch's random number generator with a fixed value.
 torch.random.manual_seed(0)
+# Model identifier, used for both the model and the tokenizer loaded below.
+model_path = "microsoft/Phi-3-mini-4k-instruct"
+# Device string handed to `device_map` when the model is loaded.
+device= "cuda"
+# Build the text-generation pipeline once, at import time; the module-level
+# LLMHelper instance created further down wraps this same object.
 pipe = pipeline(
     "text-generation",
+    model=AutoModelForCausalLM.from_pretrained(
+        model_path,
+        device_map=device,
+        torch_dtype="auto",
+        trust_remote_code=True,
+    ),
+    tokenizer=AutoTokenizer.from_pretrained(model_path),
+)
+# Keyword arguments forwarded to the pipeline on every call made by
+# LLMHelper.generate_logic.
+generation_args = {
+    "max_new_tokens": 50,
+    "return_full_text": False,
+    "temperature": 0.1,
+    "do_sample": True
+}
+class LLMHelper:
+    """Wrap a text-generation pipeline and ask it to encode a description as JSON flags."""
+    def __init__(self, pipeline):
+        # Keep a handle on the pipeline; generate_logic calls it directly.
+        self.chatbot = pipeline
+    def generate_logic(self, llm_output: str):
+        """Query the model about *llm_output* and return the JSON-looking part of its reply.
+        The reply is sliced from its first '{' to its last '}' and returned as text;
+        nothing is parsed or validated here. When a brace is missing, find/rfind
+        return -1 and the slice is then not a JSON object (typically an empty string).
+        """
+        instructions = f"""
+        Provide the response in json string for the below keys and context based on the description: '{llm_output}'.
+        Screen.interaction_yes: This field indicates whether there was an interaction of the person with a screen during the activity. A value of 1 means there was screen interaction (Yes), and a value of 0 means there was no screen interaction (No).
+        Hands.free: This field indicates whether the person's hands were free during the activity. A value of 1 means the person was not holding anything (Yes), indicating free hands. A value of 0 means the person was holding something (No), indicating the hands were not free.
+        Indoors: This field indicates whether the activity took place indoors. A value of 1 means the activity occurred inside a building or enclosed space (Yes), and a value of 0 means the activity took place outside (No).
+        Standing: This field indicates whether the person was standing during the activity. A value of 1 means the person was standing (Yes), and a value of 0 means the person was not standing (No).
+        """
+        # The description is sent twice: once as system context, once inside the user prompt.
+        reply = self.chatbot(
+            [
+                {"role": "system", "content": "Please answer questions just based on this information: " + llm_output},
+                {"role": "user", "content": instructions},
+            ],
+            **generation_args,
+        )[0]['generated_text']
+        # Widest brace-delimited span of the reply.
+        opening = reply.find('{')
+        closing = reply.rfind('}')
+        return reply[opening:closing + 1]
 class VideoAnalysis(BaseModel):
+    """Four integer flags describing one video, built from the model's JSON reply."""
+    screen_interaction_yes: int
     hands_free: int
+    indoors: int
     standing: int
+    @classmethod
+    def from_llm_output(cls, llm_output: str, generated_logic: str) -> 'VideoAnalysis':
+        """Build an instance from the JSON text in *generated_logic*.
+        *llm_output* is accepted but not read. Any expected key that is absent
+        from the parsed JSON falls back to 0. json.loads raises if the text is
+        not valid JSON.
+        """
+        parsed = json.loads(generated_logic)
+        # Model field name -> key as spelled in the prompt sent to the LLM.
+        key_for_field = {
+            "screen_interaction_yes": "Screen.interaction_yes",
+            "hands_free": "Hands.free",
+            "indoors": "Indoors",
+            "standing": "Standing",
+        }
+        return cls(**{field: parsed.get(key, 0) for field, key in key_for_field.items()})
+# Create an instance of LLMHelper (using the already loaded pipeline)
+llm_helper = LLMHelper(pipe)
+class LLMInput(BaseModel):
+    """Input payload for process_llm_output: the free-text description to analyse."""
+    llm_output: str
+def process_llm_output(input: LLMInput) -> dict:
+    """Convert a free-text description into the VideoAnalysis flags as a plain dict.
+    Args:
+        input: object exposing the description text as `input.llm_output`.
+    Returns:
+        A dict with the VideoAnalysis fields (screen_interaction_yes, hands_free,
+        indoors, standing).
+    Raises:
+        json.JSONDecodeError: if the text extracted from the model reply is not valid JSON.
+    """
+    # NOTE: the annotations previously named `LLMInput` and `Dict`, neither of which
+    # was defined or imported, so importing this module failed with NameError.
+    # Ask the model for the flags as JSON text.
+    generated_logic = llm_helper.generate_logic(input.llm_output)
+    # Parse and validate that text into the structured model.
+    structured_output = VideoAnalysis.from_llm_output(input.llm_output, generated_logic)
+    # model_dump() is the Pydantic v2 replacement for the deprecated .dict().
+    return structured_output.model_dump()