File size: 3,958 Bytes
5cbf359
962cccf
6ee5519
9fd7f68
 
eec5a54
a5beb04
eec5a54
f4de9a0
eec5a54
9fd7f68
 
6ee5519
9fd7f68
6ee5519
9fd7f68
eec5a54
9fd7f68
 
 
 
 
 
 
 
5cbf359
eec5a54
5cbf359
 
 
 
 
 
 
 
 
 
eec5a54
5cbf359
eec5a54
 
 
 
5cbf359
eec5a54
 
 
 
 
 
 
 
9fd7f68
eec5a54
 
 
 
9fd7f68
5cbf359
eec5a54
9fd7f68
eec5a54
 
 
032d6c3
 
 
 
 
 
 
 
 
f4de9a0
eec5a54
f4de9a0
9fd7f68
f4de9a0
9fd7f68
f4de9a0
 
9fd7f68
eec5a54
032d6c3
 
 
 
 
9fd7f68
 
 
 
 
 
f4de9a0
eec5a54
5cbf359
 
eec5a54
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import json
import warnings
from pydantic import BaseModel
from typing import Dict
import spaces

# Silence every warning category for the whole process.
warnings.filterwarnings("ignore")

# Seed torch's global RNG with a fixed value.
torch.random.manual_seed(0)

# Device string handed to `device_map` when the model is loaded.
device = "cuda"

# Model identifier plus the keyword arguments forwarded to each pipeline call.
model_path = "microsoft/Phi-3-mini-4k-instruct"
generation_args = dict(
    max_new_tokens=50,
    return_full_text=False,
    temperature=0.1,
    do_sample=True,
)

# Load the model and pipeline once and keep it in memory
def load_model_pipeline(model_path: str):
    """Return a text-generation pipeline for ``model_path``, loading it at most once.

    Pipelines are cached on the function object, keyed by ``model_path``, so a
    request for a different model gets its own pipeline instead of whichever
    one happened to be loaded first.

    Args:
        model_path: Model identifier or path passed to ``from_pretrained``.

    Returns:
        The ``pipeline("text-generation", ...)`` object built for ``model_path``.
    """
    # Function attributes live in ``__dict__``; create the cache on first use.
    cache = load_model_pipeline.__dict__.setdefault("_pipelines", {})
    if model_path not in cache:
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map=device,
            torch_dtype="auto",
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        cache[model_path] = pipeline("text-generation", model=model, tokenizer=tokenizer)
    # Keep the legacy ``pipe`` attribute pointing at the most recently requested pipeline.
    load_model_pipeline.pipe = cache[model_path]
    return cache[model_path]

# Build the pipeline once at import time; generate_logic() reads this module-level object on every call.
pipe = load_model_pipeline(model_path)

# Generate logic from LLM output
def _extract_json_object(text: str) -> str:
    """Return the span from the first '{' to the last '}' in ``text``, or '' if there is none."""
    start_index = text.find('{')
    end_index = text.rfind('}')
    # find/rfind return -1 on a miss. Using -1 as a slice start would pick up
    # the last character, so a reply ending in '}' with no '{' must be
    # rejected explicitly rather than sliced.
    if start_index == -1 or end_index < start_index:
        return ""
    return text[start_index:end_index + 1]


@spaces.GPU(duration=50)
def generate_logic(llm_output: str) -> str:
    """Ask the LLM to summarise an activity description as a JSON string.

    The description is sent both as system context and embedded in the user
    prompt; the reply is then trimmed to the outermost ``{...}`` span.

    Args:
        llm_output: Free-text description of the activity to classify.

    Returns:
        The substring of the model reply from its first '{' to its last '}'.
        The string is not parsed here, so it may still be malformed JSON.

    Raises:
        ValueError: If the reply contains no ``{...}`` span.
    """
    prompt = f"""
    Provide the response in json string for the below keys and context based on the description: '{llm_output}'.
    
    Screen.interaction_yes: This field indicates whether there was an interaction of the person with a screen during the activity. A value of 1 means there was screen interaction (Yes), and a value of 0 means there was no screen interaction (No).
    Hands.free: This field indicates whether the person's hands were free during the activity. A value of 1 means the person was not holding anything (Yes), indicating free hands. A value of 0 means the person was holding something (No), indicating the hands were not free.
    Indoors: This field indicates whether the activity took place indoors. A value of 1 means the activity occurred inside a building or enclosed space (Yes), and a value of 0 means the activity took place outside (No).
    Standing: This field indicates whether the person was standing during the activity. A value of 1 means the person was standing (Yes), and a value of 0 means the person was not standing (No).
    """

    messages = [
        {"role": "system", "content": "Please answer questions just based on this information: " + llm_output},
        {"role": "user", "content": prompt},
    ]

    response = pipe(messages, **generation_args)
    generated_text = response[0]['generated_text']

    # Keep only the JSON object; the model may wrap it in extra prose.
    json_str = _extract_json_object(generated_text)

    # Log the generated JSON string for debugging
    print(f"Generated JSON: {json_str}")

    if not json_str.strip():
        raise ValueError("Generated logic is empty or invalid JSON")

    return json_str

# Pydantic model for structured output
class VideoAnalysis(BaseModel):
    """Structured form of the four yes/no answers requested from the LLM."""

    # Each field holds the integer reported under the matching prompt key
    # (the prompt asks for 1 = yes, 0 = no).
    screen_interaction_yes: int
    hands_free: int
    indoors: int
    standing: int

    @classmethod
    def from_llm_output(cls, generated_logic: str) -> 'VideoAnalysis':
        """Build a ``VideoAnalysis`` from the JSON string produced by the LLM.

        Keys missing from the JSON object default to 0.

        Args:
            generated_logic: JSON text expected to decode to an object with the
                keys ``Screen.interaction_yes``, ``Hands.free``, ``Indoors``
                and ``Standing``.

        Returns:
            A populated ``VideoAnalysis`` instance.

        Raises:
            ValueError: If the text is not valid JSON, or decodes to something
                other than a JSON object.
        """
        try:
            logic_dict = json.loads(generated_logic)
        except json.JSONDecodeError as e:
            raise ValueError(f"Error decoding JSON: {e}") from e

        # Valid JSON can also be a list, number, string, bool or null; those
        # have no .get(), so reject them with the same exception type used for
        # malformed JSON instead of leaking an AttributeError.
        if not isinstance(logic_dict, dict):
            raise ValueError(
                f"Expected a JSON object, got {type(logic_dict).__name__}"
            )

        return cls(
            screen_interaction_yes=logic_dict.get("Screen.interaction_yes", 0),
            hands_free=logic_dict.get("Hands.free", 0),
            indoors=logic_dict.get("Indoors", 0),
            standing=logic_dict.get("Standing", 0)
        )

# Main function to process LLM output
def process_description(description: str) -> Dict:
    """Turn a free-text activity description into a plain dict of flags.

    Runs the description through ``generate_logic`` and validates the reply
    with ``VideoAnalysis.from_llm_output``.

    Args:
        description: Free-text description of the activity.

    Returns:
        A dict with the keys ``screen_interaction_yes``, ``hands_free``,
        ``indoors`` and ``standing``.

    Raises:
        ValueError: Propagated from ``generate_logic`` or
            ``VideoAnalysis.from_llm_output`` when the reply is unusable.
    """
    generated_logic = generate_logic(description)
    structured_output = VideoAnalysis.from_llm_output(generated_logic)
    # Pydantic v2 renamed .dict() to .model_dump() and deprecates the old name;
    # use the new method when it exists and fall back to .dict() on v1.
    if hasattr(structured_output, "model_dump"):
        return structured_output.model_dump()
    return structured_output.dict()