File size: 2,606 Bytes
193d60a
 
 
 
0815464
 
 
 
193d60a
 
0815464
 
b736fc0
 
 
0815464
 
 
b736fc0
 
 
0815464
b736fc0
 
c5af96c
b736fc0
c5af96c
b736fc0
 
 
 
 
 
 
 
c5af96c
b736fc0
 
c5af96c
 
 
 
b736fc0
44dd12d
b736fc0
 
c5af96c
b736fc0
 
 
 
c5af96c
b736fc0
 
 
 
c5af96c
b736fc0
 
 
44dd12d
b736fc0
 
c5af96c
44dd12d
b736fc0
 
 
 
 
c5af96c
 
b736fc0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import hashlib
import tempfile
from pathlib import Path

import streamlit as st
from transformers import pipeline

# function part
# img2text
def img2text(url):
    """Caption an image with the ``Salesforce/blip-image-captioning-base`` model.

    Args:
        url: Image reference handed unchanged to the ``image-to-text``
            pipeline (``main`` passes a local file path).

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    # NOTE: the pipeline object is constructed anew on every call.
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    captions = captioner(url)
    return captions[0]["generated_text"]

# text2story
def text2story(text):
    """Expand a prompt into a story with ``aspis/gpt2-genre-story-generation``.

    Args:
        text: Prompt passed to the ``text-generation`` pipeline.

    Returns:
        The ``generated_text`` field of the single requested sequence.
    """
    # NOTE: the pipeline object is constructed anew on every call.
    generator = pipeline(
        "text-generation",
        model="aspis/gpt2-genre-story-generation",
    )
    # Exactly one sequence is requested, capped at max_length=150.
    candidates = generator(text, max_length=150, num_return_sequences=1)
    first_candidate = candidates[0]
    return first_candidate["generated_text"]

# text2audio
def text2audio(story_text):
    """Synthesize speech for a story with ``facebook/mms-tts-eng``.

    Args:
        story_text: Text passed to the ``text-to-speech`` pipeline.

    Returns:
        The pipeline output, unmodified. ``main`` reads its ``'audio'`` and
        ``'sampling_rate'`` entries when playing the result.
    """
    # NOTE: the pipeline object is constructed anew on every call.
    synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
    return synthesizer(story_text)


# Main part
def main():
    """Render the Streamlit page: upload an image, caption it, narrate a story.

    Results are kept in ``st.session_state`` under the keys ``scenario``
    (caption), ``story``, ``audio_data`` and ``image_digest`` so that reruns
    (for example, pressing "Play Audio") do not repeat the model calls.

    The three model stages run only when the uploaded bytes differ from the
    ones the stored results were produced from. Uploading a different image
    therefore regenerates everything, and clearing the uploader resets the
    stored results.
    """
    st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
    st.header("Turn Your Image to Audio Story")

    state_keys = ("scenario", "story", "audio_data", "image_digest")
    for key in state_keys:
        if key not in st.session_state:
            st.session_state[key] = None

    uploaded_file = st.file_uploader("Select an Image...")

    if uploaded_file is None:
        # The uploader is empty: drop results from any earlier image so that
        # nothing stale is displayed and st.image is never handed None.
        for key in state_keys:
            st.session_state[key] = None
        return

    image_bytes = uploaded_file.getvalue()
    # Identify the upload by content so a new image (even with the same
    # filename) invalidates the cached caption, story and audio.
    image_digest = hashlib.sha256(image_bytes).hexdigest()

    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    if st.session_state.image_digest != image_digest:
        # Stage 1: Image to Text
        st.text('Processing img2text...')
        # img2text takes a path, so stage the bytes in a private temporary
        # directory instead of the working directory under a client-chosen
        # name; the directory is removed as soon as the caption is ready.
        with tempfile.TemporaryDirectory() as tmp_dir:
            suffix = Path(uploaded_file.name).suffix
            image_path = Path(tmp_dir) / f"upload{suffix}"
            image_path.write_bytes(image_bytes)
            scenario = img2text(str(image_path))
        st.write(scenario)

        # Stage 2: Text to Story
        st.text('Generating a story...')
        story = text2story(scenario)
        st.write(story)

        # Stage 3: Story to Audio Data
        st.text('Generating audio data...')
        audio_data = text2audio(story)

        # Commit only after every stage succeeded, so a failure part-way
        # through never leaves results from two different images mixed.
        st.session_state.scenario = scenario
        st.session_state.story = story
        st.session_state.audio_data = audio_data
        st.session_state.image_digest = image_digest
    else:
        # Same image as the stored results: just redisplay them.
        st.write("Image Caption: ", st.session_state.scenario)
        st.write("Generated Story: ", st.session_state.story)

    # Play button (No reprocessing)
    if st.session_state.audio_data and st.button("Play Audio"):
        st.audio(
            st.session_state.audio_data['audio'],
            format="audio/wav",
            start_time=0,
            sample_rate=st.session_state.audio_data['sampling_rate'],
        )

# Entry point: build the page only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()