Spaces · Build error

Commit 4c7362f · Miquel Farre committed · 1 parent: 2f5ae20
initial test

Files changed:
- app.py (+9 −1)
- video_highlight_detector.py (+48 −9)
app.py
CHANGED

@@ -8,6 +8,8 @@
 import torch
 from pathlib import Path
 import time
+import torch
+
 
 from video_highlight_detector import (
     load_model,

@@ -159,5 +161,11 @@ def create_ui(examples_path: str):
     return app
 
 if __name__ == "__main__":
+    # Initialize CUDA
+    if not torch.cuda.is_available():
+        raise RuntimeError("This application requires a GPU to run")
+    torch.cuda.init()
+    torch.cuda.empty_cache()
+
     app = create_ui("video_spec.json")
-    app.launch(
+    app.launch()
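Net effect in app.py: the Space now fails fast on CPU-only hardware and eagerly initializes CUDA before building the UI. A condensed view of the resulting __main__ block, with explanatory comments added here for readability (the comments are not part of the commit):

if __name__ == "__main__":
    # Fail fast when no GPU is visible; on a CPU-only Space this raises at startup,
    # which surfaces in the build/runtime logs.
    if not torch.cuda.is_available():
        raise RuntimeError("This application requires a GPU to run")
    torch.cuda.init()         # eagerly create the CUDA context
    torch.cuda.empty_cache()  # release cached allocator blocks before model loading

    app = create_ui("video_spec.json")
    app.launch()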
video_highlight_detector.py
CHANGED

@@ -732,35 +732,74 @@ class BatchedVideoHighlightDetector:
 
 def load_model(
     checkpoint_path: Optional[str] = None,
-    base_model_id: str = "HuggingFaceTB/SmolVLM-2.2B-Instruct",
+    base_model_id: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
     device: str = "cuda"
 ):
     """Load the model and processor."""
-    # For demonstration, we set the target size
+    if device == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("CUDA requested but not available")
+
+    if device == "cuda":
+        torch.cuda.empty_cache()
+        # Initialize CUDA
+        torch.cuda.init()
+
     video_target_size = 384
-
     processor = AutoProcessor.from_pretrained(base_model_id)
-    # Configure the image processor
     processor.image_processor.size = {"longest_edge": video_target_size}
     processor.image_processor.do_resize = True
     processor.image_processor.do_image_splitting = False
 
+    model_kwargs = {
+        "torch_dtype": torch.bfloat16,
+        "device_map": device
+    }
+
     if checkpoint_path:
         model = SmolVLMForConditionalGeneration.from_pretrained(
             checkpoint_path,
-            torch_dtype=torch.bfloat16,
-            device_map=device
+            **model_kwargs
         )
     else:
-        model = SmolVLMForConditionalGeneration.from_pretrained(
+        model = SmolVLMForConditionalGeneration.from_pretrained(
             base_model_id,
-            torch_dtype=torch.bfloat16,
-            device_map=device
+            **model_kwargs
         )
 
     return model, processor
 
 
+# def load_model(
+#     checkpoint_path: Optional[str] = None,
+#     base_model_id: str = "HuggingFaceTB/SmolVLM-2.2B-Instruct",
+#     device: str = "cuda"
+# ):
+#     """Load the model and processor."""
+#     # For demonstration, we set the target size
+#     video_target_size = 384
+
+#     processor = AutoProcessor.from_pretrained(base_model_id)
+#     # Configure the image processor
+#     processor.image_processor.size = {"longest_edge": video_target_size}
+#     processor.image_processor.do_resize = True
+#     processor.image_processor.do_image_splitting = False
+
+#     if checkpoint_path:
+#         model = SmolVLMForConditionalGeneration.from_pretrained(
+#             checkpoint_path,
+#             torch_dtype=torch.bfloat16,
+#             device_map=device
+#         )
+#     else:
+#         model = SmolVLMForConditionalGeneration.from_pretrained(
+#             base_model_id,
+#             torch_dtype=torch.bfloat16,
+#             device_map=device
+#         )
+
+#     return model, processor
+
+
 def main():
     checkpoint_path = "/fsx/miquel/smolvlmvideo/checkpoints/final-visionUnfrozen-balanced/checkpoint-6550"
     base_model_id = "HuggingFaceTB/SmolVLM-2.2B-Instruct"