Bils committed on
Commit
9f4cca0
·
verified ·
1 Parent(s): 24da5c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -25
app.py CHANGED
@@ -8,31 +8,19 @@ from diffusers import DiffusionPipeline
8
  from transformers import pipeline
9
  from pathlib import Path
10
 
11
- # Load environment variables from .env file if needed
12
  load_dotenv()
13
 
14
- # If you have any Hugging Face tokens for private models (AudioLDM2 requires HF_TKN)
15
  hf_token = os.getenv("HF_TKN")
16
 
17
- # ------------------------------------------------
18
- # 1) INITIALIZE FREE IMAGE CAPTIONING PIPELINE
19
- # ------------------------------------------------
20
- # Replace "nlpconnect/vit-gpt2-image-captioning" with any other free image captioning model you prefer.
21
  captioning_pipeline = pipeline(
22
  "image-to-text",
23
  model="nlpconnect/vit-gpt2-image-captioning",
24
- # If the model is private or requires auth, pass the token here: use_auth_token=hf_token,
25
  )
26
 
27
- # ------------------------------------------------
28
- # 2) INITIALIZE AUDIO LDM-2 PIPELINE
29
- # ------------------------------------------------
30
- # AudioLDM2 is also from Hugging Face. If it’s a private model, pass your token via use_auth_token.
31
- # If you’re using the public version, you may not need the token at all.
32
  device = "cuda" if torch.cuda.is_available() else "cpu"
33
  pipe = DiffusionPipeline.from_pretrained(
34
  "cvssp/audioldm2",
35
- use_auth_token=hf_token # remove or comment out if not needed
36
  )
37
  pipe = pipe.to(device)
38
 
@@ -42,17 +30,14 @@ def analyze_image_with_free_model(image_file):
42
  Returns: (caption_text, is_error_flag)
43
  """
44
  try:
45
- # Save uploaded image to a temporary file
46
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
47
  temp_file.write(image_file)
48
  temp_image_path = temp_file.name
49
 
50
- # Run the image captioning pipeline
51
  results = captioning_pipeline(temp_image_path)
52
  if not results or not isinstance(results, list):
53
  return "Error: Could not generate caption.", True
54
 
55
- # Typically, pipeline returns a list of dicts with a "generated_text" key
56
  caption = results[0].get("generated_text", "").strip()
57
  if not caption:
58
  return "No caption was generated.", True
@@ -68,7 +53,6 @@ def get_audioldm_from_caption(caption):
68
  Returns the filename (path) of the generated .wav file.
69
  """
70
  try:
71
- # Generate audio from the caption
72
  audio_output = pipe(
73
  prompt=caption,
74
  num_inference_steps=50,
@@ -76,7 +60,6 @@ def get_audioldm_from_caption(caption):
76
  )
77
  audio = audio_output.audios[0]
78
 
79
- # Write the audio to a temporary .wav file
80
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
81
  write(temp_wav.name, 16000, audio)
82
  return temp_wav.name
@@ -85,9 +68,6 @@ def get_audioldm_from_caption(caption):
85
  print(f"Error generating audio from caption: {e}")
86
  return None
87
 
88
- # ------------------------------------------------
89
- # 3) GRADIO INTERFACE
90
- # ------------------------------------------------
91
  css = """
92
  #col-container{
93
  margin: 0 auto;
@@ -96,7 +76,6 @@ css = """
96
  """
97
 
98
  with gr.Blocks(css=css) as demo:
99
- # Main Title and App Description
100
  with gr.Column(elem_id="col-container"):
101
  gr.HTML("""
102
  <h1 style="text-align: center;">
@@ -145,15 +124,13 @@ with gr.Blocks(css=css) as demo:
145
  Enjoy exploring the auditory landscape of your images!
146
  """)
147
 
148
- # Function to update the caption display based on the uploaded image
149
  def update_caption(image_file):
150
  description, error_flag = analyze_image_with_free_model(image_file)
151
  return description
152
 
153
- # Function to generate sound from the description
154
  def generate_sound(description):
155
  if not description or description.startswith("Error"):
156
- return None # or some default sound
157
  audio_path = get_audioldm_from_caption(description)
158
  return audio_path
159
 
 
8
  from transformers import pipeline
9
  from pathlib import Path
10
 
 
11
  load_dotenv()
12
 
 
13
  hf_token = os.getenv("HF_TKN")
14
 
 
 
 
 
15
  captioning_pipeline = pipeline(
16
  "image-to-text",
17
  model="nlpconnect/vit-gpt2-image-captioning",
 
18
  )
19
 
 
 
 
 
 
20
  device = "cuda" if torch.cuda.is_available() else "cpu"
21
  pipe = DiffusionPipeline.from_pretrained(
22
  "cvssp/audioldm2",
23
+ use_auth_token=hf_token
24
  )
25
  pipe = pipe.to(device)
26
 
 
30
  Returns: (caption_text, is_error_flag)
31
  """
32
  try:
 
33
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
34
  temp_file.write(image_file)
35
  temp_image_path = temp_file.name
36
 
 
37
  results = captioning_pipeline(temp_image_path)
38
  if not results or not isinstance(results, list):
39
  return "Error: Could not generate caption.", True
40
 
 
41
  caption = results[0].get("generated_text", "").strip()
42
  if not caption:
43
  return "No caption was generated.", True
 
53
  Returns the filename (path) of the generated .wav file.
54
  """
55
  try:
 
56
  audio_output = pipe(
57
  prompt=caption,
58
  num_inference_steps=50,
 
60
  )
61
  audio = audio_output.audios[0]
62
 
 
63
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
64
  write(temp_wav.name, 16000, audio)
65
  return temp_wav.name
 
68
  print(f"Error generating audio from caption: {e}")
69
  return None
70
 
 
 
 
71
  css = """
72
  #col-container{
73
  margin: 0 auto;
 
76
  """
77
 
78
  with gr.Blocks(css=css) as demo:
 
79
  with gr.Column(elem_id="col-container"):
80
  gr.HTML("""
81
  <h1 style="text-align: center;">
 
124
  Enjoy exploring the auditory landscape of your images!
125
  """)
126
 
 
127
  def update_caption(image_file):
128
  description, error_flag = analyze_image_with_free_model(image_file)
129
  return description
130
 
 
131
  def generate_sound(description):
132
  if not description or description.startswith("Error"):
133
+ return None
134
  audio_path = get_audioldm_from_caption(description)
135
  return audio_path
136