Update README.md
Browse files
README.md
CHANGED
|
@@ -133,10 +133,24 @@ pip install flash-attn==2.7.4.post1 --no-build-isolation --no-cache-dir
|
|
| 133 |
|
| 134 |
Using 🤗Transformers for Inference:
|
| 135 |
```python
|
|
|
|
|
|
|
| 136 |
import torch
|
| 137 |
from transformers import AutoModelForImageTextToText, AutoProcessor
|
| 138 |
from qwen_vl_utils import process_vision_info
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
# Load model and processor
|
| 141 |
model = AutoModelForImageTextToText.from_pretrained(
|
| 142 |
"TencentARC/TimeLens-8B",
|
|
@@ -152,8 +166,8 @@ processor = AutoProcessor.from_pretrained(
|
|
| 152 |
)
|
| 153 |
|
| 154 |
# Prepare input
|
| 155 |
-
query = "A man
|
| 156 |
-
video_path = "https://huggingface.co/datasets/JungleGym/TimeLens-Assets/
|
| 157 |
|
| 158 |
GROUNDER_PROMPT = "Please find the visual event described by the sentence '{}', determining its starting and ending times. The format should be: 'The event happens in <start time> - <end time> seconds'."
|
| 159 |
|
|
|
|
| 133 |
|
| 134 |
Using 🤗Transformers for Inference:
|
| 135 |
```python
|
| 136 |
+
import requests
|
| 137 |
+
import os
|
| 138 |
import torch
|
| 139 |
from transformers import AutoModelForImageTextToText, AutoProcessor
|
| 140 |
from qwen_vl_utils import process_vision_info
|
| 141 |
|
| 142 |
+
|
| 143 |
+
def download_video(url):
    """Download the video at ``url`` into the current directory.

    The local file is named after the last path component of ``url``. If a
    file with that name already exists, nothing is downloaded and the
    existing path is returned.

    Args:
        url: HTTP(S) URL of the video file.

    Returns:
        The local path of the downloaded (or already present) file.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the file cannot be written.
    """
    save_path = os.path.basename(url)
    if os.path.exists(save_path):
        return save_path

    print(f"Downloading video from {url}...")
    # Stream into a temporary sibling file and rename it only once the
    # download has finished; otherwise an interrupted transfer would leave a
    # truncated file that the existence check above treats as complete.
    tmp_path = save_path + ".part"
    try:
        # (connect, read) timeouts: without them a stalled server would make
        # this call block forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, save_path)
    finally:
        # After a successful rename the temporary file no longer exists; on
        # any failure this removes the partial download.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return save_path
|
| 153 |
+
|
| 154 |
# Load model and processor
|
| 155 |
model = AutoModelForImageTextToText.from_pretrained(
|
| 156 |
"TencentARC/TimeLens-8B",
|
|
|
|
| 166 |
)
|
| 167 |
|
| 168 |
# Prepare input
|
| 169 |
+
query = "A man drinks water with a glass"
|
| 170 |
+
video_path = download_video("https://huggingface.co/datasets/JungleGym/TimeLens-Assets/resolve/main/2Y8XQ.mp4")
|
| 171 |
|
| 172 |
GROUNDER_PROMPT = "Please find the visual event described by the sentence '{}', determining its starting and ending times. The format should be: 'The event happens in <start time> - <end time> seconds'."
|
| 173 |
|