Update app.py
app.py
CHANGED
@@ -14,17 +14,18 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 # tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
 #                   "/3bcb8321394f671bd948ebf0d086d694dda95464")
 
+# caption_image = pipeline("image-to-text",
+#                          model=model_path, device=device)
+
+# narrator = pipeline("text-to-speech",
+#                     model=tts_model_path)
+
 caption_image = pipeline("image-to-text",
                          model="Salesforce/blip-image-captioning-large", device=device)
 
 narrator = pipeline("text-to-speech",
                     model="kakao-enterprise/vits-ljs")
 
-# caption_image = pipeline("image-to-text",
-#                          model=model_path, device=device)
-
-# narrator = pipeline("text-to-speech",
-#                     model=tts_model_path)
 
 def generate_audio(text):
     # Generate the narrated text
@@ -38,12 +39,24 @@ def generate_audio(text):
 
 
 def caption_my_image(pil_image):
+    # Generate the caption
     semantics = caption_image(images=pil_image)[0]['generated_text']
-
-
-
-
-
-
-
-demo.
+    # Generate the audio for the caption
+    audio_path = generate_audio(semantics)
+    # Return both the caption and the audio
+    return semantics, audio_path
+
+
+# Define the Gradio interface
+demo = gr.Interface(
+    fn=caption_my_image,
+    inputs=[gr.Image(label="Select Image", type="pil")],
+    outputs=[
+        gr.Textbox(label="Generated Caption"),
+        gr.Audio(label="Image Caption")
+    ],
+    title="Story Generation From Images",
+    description="THIS APPLICATION WILL BE USED TO GENERATE A STORY FROM THE IMAGE.",
+)
+
+demo.launch()
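
For context, here is a minimal sketch of app.py as it stands after this commit, assembled from the two hunks above. The imports and the body of generate_audio are not part of the diff, so those pieces (the soundfile-based audio writing and the "output.wav" path in particular) are illustrative assumptions, not the Space's confirmed code:

import torch
import gradio as gr
import soundfile as sf  # assumed; used here to write the narrated audio to disk
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

caption_image = pipeline("image-to-text",
                         model="Salesforce/blip-image-captioning-large", device=device)

narrator = pipeline("text-to-speech",
                    model="kakao-enterprise/vits-ljs")


def generate_audio(text):
    # Generate the narrated text (body assumed: the diff shows only this comment).
    # The text-to-speech pipeline returns a dict with "audio" and "sampling_rate".
    narrated = narrator(text)
    sf.write("output.wav", narrated["audio"][0], narrated["sampling_rate"])
    return "output.wav"


def caption_my_image(pil_image):
    # Generate the caption
    semantics = caption_image(images=pil_image)[0]['generated_text']
    # Generate the audio for the caption
    audio_path = generate_audio(semantics)
    # Return both the caption and the audio
    return semantics, audio_path


# Define the Gradio interface
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Image Caption")
    ],
    title="Story Generation From Images",
    description="THIS APPLICATION WILL BE USED TO GENERATE A STORY FROM THE IMAGE.",
)

demo.launch()

With this layout, caption_my_image returns a (text, filepath) tuple that maps onto the Textbox and Audio outputs in order, and demo.launch() serves the UI when the Space starts.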