aitik2000 committed on
Commit
4d1a984
·
verified ·
1 Parent(s): b90ae10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -13
app.py CHANGED
@@ -14,17 +14,18 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
14
  # tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
15
  # "/3bcb8321394f671bd948ebf0d086d694dda95464")
16
 
 
 
 
 
 
 
17
# Image-to-text pipeline (BLIP captioning, large variant), placed on the
# detected device (`device` is chosen earlier from CUDA availability).
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)

# Text-to-speech pipeline (VITS model trained on LJ Speech).
narrator = pipeline(
    "text-to-speech",
    model="kakao-enterprise/vits-ljs",
)
22
 
23
- # caption_image = pipeline("image-to-text",
24
- # model=model_path, device=device)
25
-
26
- # narrator = pipeline("text-to-speech",
27
- # model=tts_model_path)
28
 
29
  def generate_audio(text):
30
  # Generate the narrated text
@@ -38,12 +39,24 @@ def generate_audio(text):
38
 
39
 
40
def caption_my_image(pil_image):
    """Caption a PIL image and return narrated audio of that caption.

    Runs the image through the captioning pipeline, takes the generated
    text of the first result, and hands it to `generate_audio`.
    """
    caption_text = caption_image(images=pil_image)[0]['generated_text']
    audio = generate_audio(caption_text)
    return audio
43
-
44
# Build and launch the Gradio app: one image input, one audio output.
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[gr.Audio(label="Image Caption")],
    title="Story Generation From Images",
    description="THIS APPLICATION WILL BE USED TO GENERATE STORY OF THE IMAGE.",
)
demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
14
  # tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
15
  # "/3bcb8321394f671bd948ebf0d086d694dda95464")
16
 
17
+ # caption_image = pipeline("image-to-text",
18
+ # model=model_path, device=device)
19
+
20
+ # narrator = pipeline("text-to-speech",
21
+ # model=tts_model_path)
22
+
23
  caption_image = pipeline("image-to-text",
24
  model="Salesforce/blip-image-captioning-large", device=device)
25
 
26
  narrator = pipeline("text-to-speech",
27
  model="kakao-enterprise/vits-ljs")
28
 
 
 
 
 
 
29
 
30
  def generate_audio(text):
31
  # Generate the narrated text
 
39
 
40
 
41
def caption_my_image(pil_image):
    """Caption a PIL image and narrate the caption.

    Returns a 2-tuple of (caption text, path/handle of the generated
    audio), matching the two Gradio outputs (Textbox, Audio).
    """
    # First stage: image -> caption text (take the first pipeline result).
    caption_text = caption_image(images=pil_image)[0]['generated_text']
    # Second stage: caption text -> narrated audio.
    narration = generate_audio(caption_text)
    return caption_text, narration
48
+
49
+
50
+ # Define the Gradio interface
51
+ demo = gr.Interface(
52
+ fn=caption_my_image,
53
+ inputs=[gr.Image(label="Select Image", type="pil")],
54
+ outputs=[
55
+ gr.Textbox(label="Generated Caption"),
56
+ gr.Audio(label="Image Caption")
57
+ ],
58
+ title="Story Generation From Images",
59
+ description="THIS APPLICATION WILL BE USED TO GENERATE STORY OF THE IMAGE.")
60
+ )
61
+
62
+ demo.launch()