|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
from .clip_encoder import CLIPVisionTower |
|
from .dino_encoder import DINOVisionTower |
|
|
|
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Create the vision tower selected by ``vision_tower_cfg``.

    The tower identifier is read from the ``mm_vision_tower`` attribute of
    the configuration object, falling back to ``vision_tower`` when the
    former is absent.

    The identifier is accepted when it is an existing filesystem path or
    when it starts with ``"openai"``, ``"facebook"`` or ``"microsoft"``.
    The encoder class is then picked by a case-insensitive substring match:
    ``"clip"`` selects ``CLIPVisionTower`` and ``"dino"`` selects
    ``DINOVisionTower``, with ``"clip"`` checked first.

    Args:
        vision_tower_cfg: Configuration object; also forwarded to the
            encoder constructor as ``args``.
        **kwargs: Extra keyword arguments forwarded unchanged to the
            encoder constructor.

    Returns:
        A ``CLIPVisionTower`` or ``DINOVisionTower`` instance.

    Raises:
        ValueError: If no identifier is configured, if the identifier is
            neither an existing path nor carries an accepted prefix, or if
            it mentions neither ``"clip"`` nor ``"dino"``.
    """
    legacy_name = getattr(vision_tower_cfg, 'vision_tower', None)
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', legacy_name)
    if vision_tower is None:
        raise ValueError("No vision tower specified in configuration.")

    # The path check runs first; prefixes are only consulted when the
    # identifier does not exist on disk.
    accepted_prefixes = ("openai", "facebook", "microsoft")
    is_recognised = (
        os.path.exists(vision_tower)
        or vision_tower.startswith(accepted_prefixes)
    )
    if not is_recognised:
        raise ValueError(f'Unknown vision tower: {vision_tower}')

    lowered = vision_tower.lower()
    if "clip" in lowered:
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    if "dino" in lowered:
        return DINOVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    raise ValueError(f'Unknown vision model type in vision_tower: {vision_tower}')