Update app.py
app.py CHANGED
@@ -16,11 +16,12 @@ from langchain_community.llms import OpenAI
 from PIL import Image
 from decord import VideoReader, cpu
 import requests
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
 
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 MODEL = 'llama3-groq-70b-8192-tool-use-preview'
 
-# Load MiniCPM-V-2_6 with 4-bit quantization
 text_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
                                        device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
@@ -28,9 +29,15 @@ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_co
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
 
-
-
-
+# Corrected image model and pipeline setup
+base = "stabilityai/stable-diffusion-xl-base-1.0"
+repo = "ByteDance/SDXL-Lightning"
+ckpt = "sdxl_lightning_4step_unet.safetensors"
+
+unet = UNet2DConditionModel.from_config(base, subfolder="unet").to("cuda", torch.float16)
+unet.load_state_dict(load_file(hf_hub_download(repo, ckpt), device="cuda"))
+image_pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet, torch_dtype=torch.float16, variant="fp16").to("cuda")
+image_pipe.scheduler = EulerDiscreteScheduler.from_config(image_pipe.scheduler.config, timestep_spacing="trailing")
 
 # Initialize voice-only mode
 def play_voice_output(response):
@@ -177,7 +184,7 @@ def initialize_tools():
 def main_interface(user_prompt, image=None, video=None, audio=None, doc=None, voice_only=False):
     text_model.to(device='cuda', dtype=torch.bfloat16)
     tts_model.to("cuda")
-
+    unet.to("cuda", torch.float16)
     image_pipe.to("cuda")
     response = handle_input(user_prompt, image=image, video=video, audio=audio, doc=doc)
     if voice_only:
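For context, the SDXL-Lightning 4-step UNet loaded in this commit is distilled to run with exactly four denoising steps and with classifier-free guidance disabled, which is also why the scheduler is switched to "trailing" timestep spacing. A minimal usage sketch of the resulting image_pipe follows; the prompt and output path are illustrative and not part of this commit:

# Usage sketch for the image_pipe configured above (illustrative only).
# Per the ByteDance/SDXL-Lightning model card, the 4-step checkpoint expects
# num_inference_steps=4 and guidance_scale=0; other values will degrade output.
image = image_pipe(
    "a watercolor lighthouse at dawn",  # hypothetical prompt
    num_inference_steps=4,
    guidance_scale=0,
).images[0]
image.save("generated.png")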
|