JungleGym committed on
Commit
07accc3
·
verified ·
1 Parent(s): 1047061

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +16 -2
README.md CHANGED
@@ -133,10 +133,24 @@ pip install flash-attn==2.7.4.post1 --no-build-isolation --no-cache-dir
133
 
134
  Using 🤗Transformers for Inference:
135
  ```python
 
 
136
  import torch
137
  from transformers import AutoModelForImageTextToText, AutoProcessor
138
  from qwen_vl_utils import process_vision_info
139
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  # Load model and processor
141
  model = AutoModelForImageTextToText.from_pretrained(
142
  "TencentARC/TimeLens-8B",
@@ -152,8 +166,8 @@ processor = AutoProcessor.from_pretrained(
152
  )
153
 
154
  # Prepare input
155
- query = "A man is sitting on a chair"
156
- video_path = "https://huggingface.co/datasets/JungleGym/TimeLens-Assets/blob/main/2Y8XQ.mp4"
157
 
158
  GROUNDER_PROMPT = "Please find the visual event described by the sentence '{}', determining its starting and ending times. The format should be: 'The event happens in <start time> - <end time> seconds'."
159
 
 
133
 
134
  Using 🤗Transformers for Inference:
135
  ```python
136
+ import requests
137
+ import os
138
  import torch
139
  from transformers import AutoModelForImageTextToText, AutoProcessor
140
  from qwen_vl_utils import process_vision_info
141
 
142
+
143
+ def download_video(url):
144
+ save_path = os.path.basename(url)
145
+ if not os.path.exists(save_path):
146
+ print(f"Downloading video from {url}...")
147
+ response = requests.get(url, stream=True)
148
+ response.raise_for_status()
149
+ with open(save_path, 'wb') as f:
150
+ for chunk in response.iter_content(chunk_size=8192):
151
+ f.write(chunk)
152
+ return save_path
153
+
154
  # Load model and processor
155
  model = AutoModelForImageTextToText.from_pretrained(
156
  "TencentARC/TimeLens-8B",
 
166
  )
167
 
168
  # Prepare input
169
+ query = "A man drinks water with a glass"
170
+ video_path = download_video("https://huggingface.co/datasets/JungleGym/TimeLens-Assets/resolve/main/2Y8XQ.mp4")
171
 
172
  GROUNDER_PROMPT = "Please find the visual event described by the sentence '{}', determining its starting and ending times. The format should be: 'The event happens in <start time> - <end time> seconds'."
173