Bils committed
Commit 6b4c086 · verified · 1 Parent(s): baeb6b3

Update app.py

Files changed (1)
  1. app.py +11 -16
app.py CHANGED

@@ -5,37 +5,35 @@ import gradio as gr
 from dotenv import load_dotenv
 import torch
 from scipy.io.wavfile import write
-from diffusers import DiffusionPipeline
+from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
 
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 
-# Initialize pipelines globally (in CPU mode)
+device_id = 0 if torch.cuda.is_available() else -1
+
 captioning_pipeline = pipeline(
     "image-to-text",
-    model="nlpconnect/vit-gpt2-image-captioning"
+    model="nlpconnect/vit-gpt2-image-captioning",
+    device=device_id
 )
 
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
     use_auth_token=hf_token
 )
+# The AudioLDM pipeline can be moved to CUDA/CPU explicitly inside the function.
 
 @spaces.GPU(duration=120)
 def analyze_image_with_free_model(image_file):
     try:
-        # Move captioning pipeline to GPU
-        captioning_pipeline.to("cuda")
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
             temp_image_path = temp_file.name
 
         results = captioning_pipeline(temp_image_path)
-        # Move back to CPU (optional)
-        captioning_pipeline.to("cpu")
-
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
@@ -50,7 +48,6 @@ def analyze_image_with_free_model(image_file):
 @spaces.GPU(duration=120)
 def get_audioldm_from_caption(caption):
     try:
-        # Move AudioLDM pipeline to GPU
         pipe.to("cuda")
         audio_output = pipe(
             prompt=caption,
@@ -61,7 +58,7 @@ def get_audioldm_from_caption(caption):
         audio = audio_output.audios[0]
 
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-            write(temp_wav.name, 16000, audio)
+            write(temp_wav.name, 16000, audio)
             return temp_wav.name
 
     except Exception as e:
@@ -78,12 +75,10 @@ css = """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-            <h1 style="text-align: center;">
-                🎶 Generate Sound Effects from Image
-            </h1>
-            <p style="text-align: center;">
-                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-            </p>
+            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
+            <p style="text-align: center;">
+                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+            </p>
         """)
 
         gr.Markdown("""
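
Note on the main change: rather than creating the captioning pipeline on CPU and shuttling it to CUDA and back inside the request handler, the commit pins it to a device once at load time. A minimal sketch of that pattern, assuming a standard transformers install (the device argument of transformers.pipeline takes -1 for CPU or a CUDA device index such as 0; the usage lines are illustrative, not from the commit):

# Device-selection pattern adopted for the captioning pipeline.
import torch
from transformers import pipeline

device_id = 0 if torch.cuda.is_available() else -1  # first GPU if present, else CPU

captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# Illustrative usage: caption a local image file.
# results = captioning_pipeline("photo.jpg")
# print(results[0]["generated_text"])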
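The AudioLDM pipeline keeps the opposite strategy: load once on CPU at import, then move to CUDA inside the @spaces.GPU-decorated handler, which on ZeroGPU Spaces grants a GPU only for the call's duration. A sketch of that retained path, assuming diffusers and scipy are installed (num_inference_steps is a placeholder, since the diff truncates the actual pipe(...) arguments):

# Load on CPU at import; move to CUDA inside the GPU-allocated handler.
import tempfile

import torch
from diffusers import DiffusionPipeline
from scipy.io.wavfile import write

pipe = DiffusionPipeline.from_pretrained("cvssp/audioldm2")

def get_audioldm_from_caption(caption: str) -> str:
    if torch.cuda.is_available():
        pipe.to("cuda")
    # num_inference_steps is illustrative; the commit's exact kwargs are elided.
    audio = pipe(prompt=caption, num_inference_steps=100).audios[0]
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
        write(temp_wav.name, 16000, audio)  # AudioLDM2 generates 16 kHz audio
        return temp_wav.name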