# Imports
import spaces  # required on Hugging Face ZeroGPU Spaces; must be imported early
import os
import streamlit as st
import requests
from transformers import pipeline
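
# NOTE: the API calls below expect two secrets to be present in the environment
# (e.g. as Hugging Face Space secrets): TOGETHER_API_KEY and HUGGINGFACEHUB_API_TOKEN.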

# Suppressing all warnings
import warnings
warnings.filterwarnings("ignore")

# Image-to-text
def img2txt(url):
    print("Initializing captioning model...")
    captioning_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    
    print("Generating text from the image...")
    text = captioning_model(url, max_new_tokens=20)[0]["generated_text"]
    
    print(text)
    return text
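

# A minimal sketch (illustrative, not wired into main()): on Streamlit, the BLIP
# pipeline could be loaded once per process with st.cache_resource instead of
# being re-created on every img2txt() call, which is the slowest step above.
@st.cache_resource
def get_captioning_model():
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")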

# Text-to-story
def txt2story(img_text, top_k, top_p, temperature):

    headers = {"Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"}

    data = {
        "model": "togethercomputer/llama-2-70b-chat",
        "messages": [
            {"role": "system", "content": '''As an experienced short story writer, write story title and then create a meaningful story influenced by provided words. 
        Ensure stories conclude positively within 100 words. Remember the story must end within 100 words''', "temperature": temperature},
            {"role": "user", "content": f"Here is input set of words: {img_text}", "temperature": temperature} 
        ],
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature
    }
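    # Sampling knobs passed above: top_k restricts sampling to the k most
    # likely tokens, top_p to the smallest set with cumulative probability p,
    # and temperature scales the logits (higher values = more varied stories).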
    
    response = requests.post("https://api.together.xyz/inference", headers=headers, json=data)
    
    story = response.json()["output"]["choices"][0]["text"] 
    return story


# Text-to-speech
def txt2speech(text):
    print("Initializing text-to-speech conversion...")
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
    payloads = {'inputs': text}

    response = requests.post(API_URL, headers=headers, json=payloads)
    response.raise_for_status()  # the endpoint returns raw audio bytes on success

    # Note: the model may emit FLAC/WAV bytes; the .mp3 name is just a label,
    # and st.audio still plays the file in the browser.
    with open('audio_story.mp3', 'wb') as file:
        file.write(response.content)
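

# Hedged sketch (illustrative, not used above): the hosted Inference API can
# answer HTTP 503 while a model is cold-loading, so a small retry helper such
# as this hypothetical one would make txt2speech-style calls more robust.
import time

def post_with_retry(url, headers, payload, attempts=3, wait_s=10):
    """Retry a POST while the endpoint reports 503 (model still loading)."""
    response = requests.post(url, headers=headers, json=payload)
    for _ in range(attempts - 1):
        if response.status_code != 503:
            break
        time.sleep(wait_s)  # give the model time to load, then try again
        response = requests.post(url, headers=headers, json=payload)
    return response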

        
# Streamlit web app main function
def main():
    st.set_page_config(page_title="🎨 Image-to-Audio Story 🎧", page_icon="🖼️")
    st.title("Turn the Image into Audio Story")

    # Allows users to upload an image file
    uploaded_file = st.file_uploader("# 📷 Upload an image...", type=["jpg", "jpeg", "png"])

    # Parameters for LLM model (in the sidebar)
    st.sidebar.markdown("# LLM Inference Configuration Parameters")
    top_k = st.sidebar.number_input("Top-K", min_value=1, max_value=100, value=5)
    top_p = st.sidebar.number_input("Top-P", min_value=0.0, max_value=1.0, value=0.8)
    temperature = st.sidebar.number_input("Temperature", min_value=0.1, max_value=2.0, value=1.5)

    if uploaded_file is not None:
        # Reads and saves uploaded image file
        bytes_data = uploaded_file.read()
        with open("uploaded_image.jpg", "wb") as file:
            file.write(bytes_data)
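        # Saving to a fixed path lets img2txt() read the image from disk
        # (the captioning pipeline accepts a local file path or URL)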

        st.image(uploaded_file, caption='🖼️ Uploaded Image', use_column_width=True)

        # Initiates AI processing and story generation
        with st.spinner("## 🤖 AI is at Work!"):
            scenario = img2txt("uploaded_image.jpg")  # Extracts text from the image
            story = txt2story(scenario, top_k, top_p, temperature)  # Generates a story based on the image text, LLM params
            txt2speech(story)  # Converts the story to audio

            st.markdown("---")
            st.markdown("## 📜 Image Caption")
            st.write(scenario)

            st.markdown("---")
            st.markdown("## 📖 Story")
            st.write(story)

            st.markdown("---")
            st.markdown("## 🎧 Audio Story")
            st.audio("audio_story.mp3")

if __name__ == '__main__':
    main()

# Credits (module-level, so this renders after main() under `streamlit run`)
st.markdown("### Credits")
st.caption('''
            Made with ❤️ by @Aditya-Neural-Net-Ninja\n
            Utilizes Image-to-Text, Text Generation, Text-to-Speech Transformer Models\n
            Gratitude to Streamlit, 🤗 Spaces for Deployment & Hosting
            ''')