import streamlit as st
from transformers import pipeline
from huggingface_hub import InferenceClient
from PIL import Image
import os


# Read the Hugging Face token from the environment rather than hard-coding it.
api_key = os.getenv("HUGGINGFACE_TOKEN")
client = InferenceClient(api_key=api_key)
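

# Cache the captioning pipeline (helper added here) so the heavyweight BLIP
# model is downloaded and loaded once per process instead of on every
# Streamlit rerun.
@st.cache_resource
def load_captioner():
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")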

st.header("Character Captions (IN PROGRESS!)")
st.write("Have a character caption any image you upload!")
character = st.selectbox("Choose a character", ["rapper", "shrek", "unintelligible"])

uploaded_img = st.file_uploader("Upload an image")

if uploaded_img is not None:

    # Display the uploaded image back to the user.
    image = Image.open(uploaded_img)
    st.image(image)

    image_captioner = load_captioner()

    # The pipeline returns a list of dicts, e.g. [{'generated_text': '...'}].
    response = image_captioner(image)
    caption = response[0]['generated_text']

    # Map each persona to a prompt that rewrites the literal caption in character.
    character_prompts = {
        "rapper": f"Describe this scene like you're a rapper: {caption}.",
        "shrek": f"Describe this scene like you're Shrek: {caption}.",
        "unintelligible": f"Describe this scene in a way that makes no sense: {caption}."
    }

    prompt = character_prompts[character]

    messages = [
        { "role": "user", "content": prompt }
    ]

    # Stream the chat completion so tokens arrive incrementally.
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=messages,
        max_tokens=500,
        stream=True
    )

    # Accumulate the streamed tokens; delta.content can be None on some
    # chunks (e.g. the final one), so fall back to an empty string.
    response = ''
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''

    st.write(response)
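
# To try it locally: export HUGGINGFACE_TOKEN, then run
# `streamlit run app.py` (app.py is the assumed filename for this file).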