Spaces:
Configuration error
Configuration error
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
import logging | |
import tempfile | |
from pathlib import Path | |
from typing import List, Optional | |
import ffmpeg | |
from PIL import Image | |
from scenedetect import ( # type: ignore[import-untyped] | |
SceneManager, | |
VideoManager, | |
) | |
from scenedetect.detectors import ( # type: ignore[import-untyped] | |
ContentDetector, | |
) | |
from camel.agents import ChatAgent | |
from camel.messages import BaseMessage | |
from camel.models import OpenAIAudioModels, BaseModelBackend | |
from camel.toolkits.base import BaseToolkit | |
from camel.toolkits.function_tool import FunctionTool | |
from camel.utils import dependencies_required | |
from .video_downloader_toolkit import ( | |
VideoDownloaderToolkit, | |
_capture_screenshot, | |
) | |
logger = logging.getLogger(__name__) | |
VIDEO_QA_PROMPT = """ | |
Analyze the provided video frames and corresponding audio transcription to \ | |
answer the given question(s) thoroughly and accurately. | |
Instructions: | |
1. Visual Analysis: | |
- Examine the video frames to identify visible entities. | |
- Differentiate objects, species, or features based on key attributes \ | |
such as size, color, shape, texture, or behavior. | |
- Note significant groupings, interactions, or contextual patterns \ | |
relevant to the analysis. | |
2. Audio Integration: | |
- Use the audio transcription to complement or clarify your visual \ | |
observations. | |
- Identify names, descriptions, or contextual hints in the \ | |
transcription that help confirm or refine your visual analysis. | |
3. Detailed Reasoning and Justification: | |
- Provide a brief explanation of how you identified and distinguished \ | |
each species or object. | |
- Highlight specific features or contextual clues that informed \ | |
your reasoning. | |
4. Comprehensive Answer: | |
- Specify the total number of distinct species or object types \ | |
identified in the video. | |
- Describe the defining characteristics and any supporting evidence \ | |
from the video and transcription. | |
5. Important Considerations: | |
- Pay close attention to subtle differences that could distinguish \ | |
similar-looking species or objects | |
(e.g., juveniles vs. adults, closely related species). | |
- Provide concise yet complete explanations to ensure clarity. | |
**Audio Transcription:** | |
{audio_transcription} | |
**Question:** | |
{question} | |
""" | |
class VideoAnalysisToolkit(BaseToolkit): | |
r"""A class for analysing videos with vision-language model. | |
Args: | |
download_directory (Optional[str], optional): The directory where the | |
video will be downloaded to. If not provided, video will be stored | |
in a temporary directory and will be cleaned up after use. | |
(default: :obj:`None`) | |
""" | |
def __init__( | |
self, | |
download_directory: Optional[str] = None, | |
model: Optional[BaseModelBackend] = None, | |
) -> None: | |
self._cleanup = download_directory is None | |
self._download_directory = Path( | |
download_directory or tempfile.mkdtemp() | |
).resolve() | |
self.video_downloader_toolkit = VideoDownloaderToolkit( | |
download_directory=str(self._download_directory) | |
) | |
try: | |
self._download_directory.mkdir(parents=True, exist_ok=True) | |
except FileExistsError: | |
raise ValueError( | |
f"{self._download_directory} is not a valid directory." | |
) | |
except OSError as e: | |
raise ValueError( | |
f"Error creating directory {self._download_directory}: {e}" | |
) | |
logger.info(f"Video will be downloaded to {self._download_directory}") | |
self.vl_model = model | |
self.vl_agent = ChatAgent( | |
model=self.vl_model, output_language="English" | |
) | |
self.audio_models = OpenAIAudioModels() | |
def _extract_audio_from_video( | |
self, video_path: str, output_format: str = "mp3" | |
) -> str: | |
r"""Extract audio from the video. | |
Args: | |
video_path (str): The path to the video file. | |
output_format (str): The format of the audio file to be saved. | |
(default: :obj:`"mp3"`) | |
Returns: | |
str: The path to the audio file.""" | |
output_path = video_path.rsplit('.', 1)[0] + f".{output_format}" | |
try: | |
( | |
ffmpeg.input(video_path) | |
.output(output_path, vn=None, acodec="libmp3lame") | |
.run() | |
) | |
return output_path | |
except ffmpeg.Error as e: | |
raise RuntimeError(f"FFmpeg-Python failed: {e}") | |
def _transcribe_audio(self, audio_path: str) -> str: | |
r"""Transcribe the audio of the video.""" | |
audio_transcript = self.audio_models.speech_to_text(audio_path) | |
return audio_transcript | |
def _extract_keyframes( | |
self, video_path: str, num_frames: int, threshold: float = 25.0 | |
) -> List[Image.Image]: | |
r"""Extract keyframes from a video based on scene changes | |
and return them as PIL.Image.Image objects. | |
Args: | |
video_path (str): Path to the video file. | |
num_frames (int): Number of keyframes to extract. | |
threshold (float): The threshold value for scene change detection. | |
Returns: | |
list: A list of PIL.Image.Image objects representing | |
the extracted keyframes. | |
""" | |
video_manager = VideoManager([video_path]) | |
scene_manager = SceneManager() | |
scene_manager.add_detector(ContentDetector(threshold=threshold)) | |
video_manager.set_duration() | |
video_manager.start() | |
scene_manager.detect_scenes(video_manager) | |
scenes = scene_manager.get_scene_list() | |
keyframes: List[Image.Image] = [] | |
for start_time, _ in scenes: | |
if len(keyframes) >= num_frames: | |
break | |
frame = _capture_screenshot(video_path, start_time) | |
keyframes.append(frame) | |
print(len(keyframes)) | |
return keyframes | |
def ask_question_about_video( | |
self, | |
video_path: str, | |
question: str, | |
num_frames: int = 28, | |
# 28 is the maximum number of frames | |
# that can be displayed in a single message for | |
# the Qwen-VL-Max model | |
) -> str: | |
r"""Ask a question about the video. | |
Args: | |
video_path (str): The path to the video file. | |
It can be a local file or a URL (such as Youtube website). | |
question (str): The question to ask about the video. | |
num_frames (int): The number of frames to extract from the video. | |
To be adjusted based on the length of the video. | |
(default: :obj:`28`) | |
Returns: | |
str: The answer to the question. | |
""" | |
from urllib.parse import urlparse | |
parsed_url = urlparse(video_path) | |
is_url = all([parsed_url.scheme, parsed_url.netloc]) | |
if is_url: | |
video_path = self.video_downloader_toolkit.download_video( | |
video_path | |
) | |
audio_path = self._extract_audio_from_video(video_path) | |
video_frames = self._extract_keyframes(video_path, num_frames) | |
audio_transcript = self._transcribe_audio(audio_path) | |
prompt = VIDEO_QA_PROMPT.format( | |
audio_transcription=audio_transcript, | |
question=question, | |
) | |
print(prompt) | |
msg = BaseMessage.make_user_message( | |
role_name="User", | |
content=prompt, | |
image_list=video_frames, | |
) | |
response = self.vl_agent.step(msg) | |
answer = response.msgs[0].content | |
return answer | |
def get_tools(self) -> List[FunctionTool]: | |
r"""Returns a list of FunctionTool objects representing the | |
functions in the toolkit. | |
Returns: | |
List[FunctionTool]: A list of FunctionTool objects representing | |
the functions in the toolkit. | |
""" | |
return [FunctionTool(self.ask_question_about_video)] |