ManishThota committed on
Commit 9fd7f68 · verified · 1 Parent(s): 57a114d

Update src/text_processor.py

Files changed (1)
  1. src/text_processor.py +76 -48
src/text_processor.py CHANGED
@@ -1,64 +1,92 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from pydantic import BaseModel
-import spaces
+import json
+import warnings
+import spaces
 
-device = 'cuda'
+# Ignore warnings
+warnings.filterwarnings(action='ignore')
 
-# Load your LLM model and tokenizer
+# Set random seed
 torch.random.manual_seed(0)
-model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Phi-3-mini-4k-instruct",
-    device_map=device,
-    torch_dtype="auto",
-    trust_remote_code=True,
-)
-tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+# Define the model path
+model_path = "microsoft/Phi-3-mini-4k-instruct"
+device= "cuda"
+
+# Load the model and pipeline outside the function
 pipe = pipeline(
     "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-)
+    model=AutoModelForCausalLM.from_pretrained(
+        model_path,
+        device_map=device,
+        torch_dtype="auto",
+        trust_remote_code=True,
+    ),
+    tokenizer=AutoTokenizer.from_pretrained(model_path),
+)
+
+generation_args = {
+    "max_new_tokens": 50,
+    "return_full_text": False,
+    "temperature": 0.1,
+    "do_sample": True
+}
+
+class LLMHelper:
+    def __init__(self, pipeline):
+        self.chatbot = pipeline
+
+    def generate_logic(self, llm_output: str):
+        prompt = f"""
+        Provide the response in json string for the below keys and context based on the description: '{llm_output}'.
+
+        Screen.interaction_yes: This field indicates whether there was an interaction of the person with a screen during the activity. A value of 1 means there was screen interaction (Yes), and a value of 0 means there was no screen interaction (No).
+        Hands.free: This field indicates whether the person's hands were free during the activity. A value of 1 means the person was not holding anything (Yes), indicating free hands. A value of 0 means the person was holding something (No), indicating the hands were not free.
+        Indoors: This field indicates whether the activity took place indoors. A value of 1 means the activity occurred inside a building or enclosed space (Yes), and a value of 0 means the activity took place outside (No).
+        Standing: This field indicates whether the person was standing during the activity. A value of 1 means the person was standing (Yes), and a value of 0 means the person was not standing (No).
+        """
+
+        messages = [
+            {"role": "system", "content": "Please answer questions just based on this information: " + llm_output},
+            {"role": "user", "content": prompt},
+        ]
+
+        response = self.chatbot(messages, **generation_args)
+        generated_text = response[0]['generated_text']
+        # Extract JSON from the generated text
+        start_index = generated_text.find('{')
+        end_index = generated_text.rfind('}') + 1
+        json_str = generated_text[start_index:end_index]
+        return json_str
 
-# Pydantic class for output validation
 class VideoAnalysis(BaseModel):
-    indoor: int
+    screen_interaction_yes: int
     hands_free: int
-    screen_interaction: int
+    indoors: int
     standing: int
 
-@spaces.GPU(duration=100)
-def process_description(description):
-    # Construct a prompt for your LLM based on the video description
-    prompt = f"""
-    You are a helpful AI assistant. Analyze the following video description and answer the questions with 0 for True and 1 for False:
-
-    Video Description: {description}
-
-    Questions:
-    - Is the scene indoors?
-    - Are the subject's hands free?
-    - Is there screen interaction by the subject?
-    - Is the subject standing?
-
-    Provide your answers in JSON format like this:
-    {{"indoor": 0, "hands_free": 1, "screen_interaction": 0, "standing": 1}}
-    """
+    @classmethod
+    def from_llm_output(cls, llm_output: str, generated_logic: str) -> 'VideoAnalysis':
+        logic_dict = json.loads(generated_logic)
+        return cls(
+            screen_interaction_yes=logic_dict.get("Screen.interaction_yes", 0),
+            hands_free=logic_dict.get("Hands.free", 0),
+            indoors=logic_dict.get("Indoors", 0),
+            standing=logic_dict.get("Standing", 0)
+        )
 
-    generation_args = {
-        "max_new_tokens": 100, # Adjust as needed
-        "return_full_text": False,
-        "temperature": 0.0,
-        "do_sample": False,
-    }
+# Create an instance of LLMHelper (using the already loaded pipeline)
+llm_helper = LLMHelper(pipe)
 
-    output = pipe(prompt, **generation_args)
-    json_text = output[0]['generated_text']
+def process_llm_output(input: LLMInput) -> Dict:
+    # Generate the logic from the LLM output
+    generated_logic = llm_helper.generate_logic(input.llm_output)
+
+    # Create the structured output
+    structured_output = VideoAnalysis.from_llm_output(input.llm_output, generated_logic)
+
+    # Return the structured output as a dictionary
+    return structured_output.dict()
 
-    try:
-        # Attempt to parse and validate the JSON response
-        analysis_result = VideoAnalysis.model_validate_json(json_text)
-        return analysis_result.model_dump_json() # Return as valid JSON
-    except Exception as e:
-        print(f"Error processing LLM output: {e}")
-        return {"error": "Could not process the video description."}
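
Note on the new code: process_llm_output is annotated with LLMInput and Dict, but neither name is imported or defined in this file as committed. The sketch below is a minimal guess at the missing pieces, assuming LLMInput is a simple Pydantic request model whose only field is llm_output (the real definition may live in another module of the repo); the sample description and the __main__ driver are invented for illustration and would need to be appended to src/text_processor.py to run.

from typing import Dict
from pydantic import BaseModel

class LLMInput(BaseModel):
    # Assumed request shape: process_llm_output only reads input.llm_output,
    # so a single string field is sufficient for this sketch.
    llm_output: str

if __name__ == "__main__":
    # Hypothetical end-to-end call; requires the Phi-3 weights and a CUDA device.
    sample = LLMInput(llm_output="A person is standing at a desk indoors, typing on a laptop.")
    print(process_llm_output(sample))
    # Expected shape (values depend on the model's answer):
    # {"screen_interaction_yes": 1, "hands_free": 0, "indoors": 1, "standing": 1}

A small related point: structured_output.dict() is the Pydantic v1 spelling; under Pydantic v2 it still works but emits a deprecation warning, and model_dump() is the current equivalent.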