import gradio as gr from huggingface_hub import InferenceClient import json import uuid from PIL import Image from bs4 import BeautifulSoup import requests import random from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer from threading import Thread import re import time import torch import cv2 model_id = "llava-hf/llava-interleave-qwen-0.5b-hf" processor = LlavaProcessor.from_pretrained(model_id) model = LlavaForConditionalGeneration.from_pretrained(model_id) model.to("cpu") def replace_video_with_images(text, frames): return text.replace("