youssef
committed on
Commit
·
bc47c2c
1
Parent(s):
b75046f
feat: add better prompt
Browse files
- src/video_processor/processor.py +24 -12
src/video_processor/processor.py
CHANGED
@@ -34,14 +34,25 @@ class VideoAnalyzer:
|
|
34 |
def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
|
35 |
logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
|
36 |
try:
|
37 |
-
# Create message for model
|
38 |
-
messages = [
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
# Process video using chat template
|
47 |
inputs = self.processor.apply_chat_template(
|
@@ -52,11 +63,12 @@ class VideoAnalyzer:
|
|
52 |
return_tensors="pt"
|
53 |
).to(self.model.device)
|
54 |
|
55 |
-
# Generate description
|
56 |
generated_ids = self.model.generate(
|
57 |
**inputs,
|
58 |
-
do_sample=
|
59 |
-
|
|
|
60 |
)
|
61 |
description = self.processor.batch_decode(
|
62 |
generated_ids,
|
@@ -64,7 +76,7 @@ class VideoAnalyzer:
|
|
64 |
)[0]
|
65 |
|
66 |
return [{
|
67 |
-
"description": description
|
68 |
}]
|
69 |
|
70 |
except Exception as e:
|
|
|
34 |
def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
|
35 |
logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
|
36 |
try:
|
37 |
+
# Create message for model with detailed system prompt
|
38 |
+
messages = [
|
39 |
+
{
|
40 |
+
"role": "system",
|
41 |
+
"content": [
|
42 |
+
{
|
43 |
+
"type": "text",
|
44 |
+
"text": "You are a detailed video analysis assistant that can understand videos. Your task is to provide comprehensive descriptions including all events, actions, and important details with their timestamps. Focus on being specific and thorough."
|
45 |
+
}
|
46 |
+
]
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"role": "user",
|
50 |
+
"content": [
|
51 |
+
{"type": "video", "path": video_path},
|
52 |
+
{"type": "text", "text": "Please provide a detailed analysis of this video. Include:\n1. All significant actions and events\n2. Temporal information and timestamps\n3. Important visual details and context\n4. Any text or speech content if present\n5. Scene transitions and changes\nBe thorough and specific so the description can be used for detailed searching later."}
|
53 |
+
]
|
54 |
+
}
|
55 |
+
]
|
56 |
|
57 |
# Process video using chat template
|
58 |
inputs = self.processor.apply_chat_template(
|
|
|
63 |
return_tensors="pt"
|
64 |
).to(self.model.device)
|
65 |
|
66 |
+
# Generate description with increased token limit
|
67 |
generated_ids = self.model.generate(
|
68 |
**inputs,
|
69 |
+
do_sample=True,
|
70 |
+
temperature=0.7,
|
71 |
+
max_new_tokens=512 # Increased from 100 to get more detailed descriptions
|
72 |
)
|
73 |
description = self.processor.batch_decode(
|
74 |
generated_ids,
|
|
|
76 |
)[0]
|
77 |
|
78 |
return [{
|
79 |
+
"description": description.split("Assistant: ")[-1] # Remove assistant prefix if present
|
80 |
}]
|
81 |
|
82 |
except Exception as e:
|