Update app.py
app.py
CHANGED
@@ -14,17 +14,18 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 # tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
 #                   "/3bcb8321394f671bd948ebf0d086d694dda95464")
 
+# caption_image = pipeline("image-to-text",
+#                          model=model_path, device=device)
+
+# narrator = pipeline("text-to-speech",
+#                     model=tts_model_path)
+
 caption_image = pipeline("image-to-text",
                          model="Salesforce/blip-image-captioning-large", device=device)
 
 narrator = pipeline("text-to-speech",
                     model="kakao-enterprise/vits-ljs")
 
-# caption_image = pipeline("image-to-text",
-#                          model=model_path, device=device)
-
-# narrator = pipeline("text-to-speech",
-#                     model=tts_model_path)
 
 def generate_audio(text):
     # Generate the narrated text
@@ -38,12 +39,24 @@ def generate_audio(text):
 
 
 def caption_my_image(pil_image):
+    # Generate the caption
     semantics = caption_image(images=pil_image)[0]['generated_text']
-
-
-
-
-
-
-
-demo.
+    # Generate the audio for the caption
+    audio_path = generate_audio(semantics)
+    # Return both the caption and the audio
+    return semantics, audio_path
+
+
+# Define the Gradio interface
+demo = gr.Interface(
+    fn=caption_my_image,
+    inputs=[gr.Image(label="Select Image", type="pil")],
+    outputs=[
+        gr.Textbox(label="Generated Caption"),
+        gr.Audio(label="Image Caption")
+    ],
+    title="Story Generation From Images",
+    description="THIS APPLICATION WILL BE USED TO GENERATE A STORY FROM THE IMAGE.",
+)
+
+demo.launch()
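
For context, here is a minimal sketch of app.py as it stands after this commit, assembled from the two hunks above. The imports and the body of generate_audio are not part of the diff, so those pieces (the soundfile-based audio writing and the "output.wav" path in particular) are illustrative assumptions, not the Space's confirmed code:

import torch
import gradio as gr
import soundfile as sf  # assumed; used here to write the narrated audio to disk
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

caption_image = pipeline("image-to-text",
                         model="Salesforce/blip-image-captioning-large", device=device)

narrator = pipeline("text-to-speech",
                    model="kakao-enterprise/vits-ljs")


def generate_audio(text):
    # Generate the narrated text (body assumed: the diff shows only this comment).
    # The text-to-speech pipeline returns a dict with "audio" and "sampling_rate".
    narrated = narrator(text)
    sf.write("output.wav", narrated["audio"][0], narrated["sampling_rate"])
    return "output.wav"


def caption_my_image(pil_image):
    # Generate the caption
    semantics = caption_image(images=pil_image)[0]['generated_text']
    # Generate the audio for the caption
    audio_path = generate_audio(semantics)
    # Return both the caption and the audio
    return semantics, audio_path


# Define the Gradio interface
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Image Caption")
    ],
    title="Story Generation From Images",
    description="THIS APPLICATION WILL BE USED TO GENERATE A STORY FROM THE IMAGE.",
)

demo.launch()

With this layout, caption_my_image returns a (text, filepath) tuple that maps onto the Textbox and Audio outputs in order, and demo.launch() serves the UI when the Space starts.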