# wd/gpt_captions.py — AlotaibiFahad, "End of training" (commit 16b48db, verified)
import base64
from mimetypes import guess_type
from openai import AzureOpenAI
import os
def local_image_to_data_url(image_path):
    """Encode a local image file as a ``data:`` URL.

    The MIME type is guessed from the file extension; ``.webp`` files get
    an explicit ``image/webp`` type in case the platform's mimetypes table
    does not know the extension, and anything unrecognized falls back to
    ``application/octet-stream``.
    """
    guessed, _ = guess_type(image_path)
    if guessed is None or guessed == 'application/octet-stream':
        # Guess failed (or was the generic binary type): pick a type by hand.
        if image_path.lower().endswith('.webp'):
            guessed = 'image/webp'
        else:
            guessed = 'application/octet-stream'
    # Read the raw bytes and base64-encode them into the URL payload.
    with open(image_path, "rb") as fh:
        payload = base64.b64encode(fh.read()).decode('utf-8')
    return f"data:{guessed};base64,{payload}"
# --- Configuration -------------------------------------------------------
# Directory holding the training images to caption.
images_path = "/eph/nvme0/azureml/cr/j/8569d5e3aa08485780b67a53d671e109/exe/wd/1_2M_Dataset"

# Only pick up actual image files. The captioning loop writes a ".txt"
# caption file per image into this same directory, so an unfiltered
# os.listdir() would feed those caption files back into the model on
# any re-run of the script.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif")
imgs_list = [
    file for file in os.listdir(images_path)
    if file.lower().endswith(IMAGE_EXTENSIONS)
]

# --- Azure OpenAI credentials -------------------------------------------
# SECURITY: the API key was previously hard-coded here and is therefore
# compromised — rotate it, and supply the new one via the environment.
api_base = "https://allam-swn-gpt-01.openai.azure.com/"  # your endpoint should look like the following https://YOUR_RESOURCE_NAME.openai.azure.com/
api_key = os.environ["AZURE_OPENAI_API_KEY"]
deployment_name = "gpt-4o-900ptu"
api_version = "2024-02-15-preview"  # this might change in the future

# Client pointed directly at the deployment's endpoint.
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base}openai/deployments/{deployment_name}",
)
# Iterate over all images, caption each one, and write the caption to a
# sibling ".txt" file with the same base name.
for img_name in imgs_list:
    img_path = os.path.join(images_path, img_name)
    # os.path.splitext keeps dotted filenames intact (e.g. "a.b.jpg" ->
    # "a.b.txt"); the previous split(".")[0] would have collapsed it to "a.txt".
    txt_file_name = os.path.splitext(img_name)[0] + ".txt"
    txt_path = os.path.join(images_path, txt_file_name)
    # Inline the image bytes as a data URL so the API accepts a local file.
    data_url = local_image_to_data_url(img_path)
    response = client.chat.completions.create(
        model=deployment_name,
        messages=[
            { "role": "system", "content": "You are an image captioning assistant." },
            { "role": "user", "content": [
                {
                    "type": "text",
                    "text": """You are my captioning model, I will give you a punch of images with their main subject,
and I want you to write a detailed caption based on what you see in the images alone. Take these consideration when writing the caption:
Order the terms in the caption and use commas. The order of the words in the caption directly corresponds to their weight when generating the final image,
so a main subject should always be at the start of the prompt. If we want to add more details,
do it in a "narrative style" and using commas to help separate the terms for the FLUX model to read. The tag of this image is: 1/2M cup"""
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_url
                    }
                }
            ] }
        ],
        max_tokens=2000
    )
    # Explicit UTF-8 so non-ASCII caption text doesn't depend on the locale.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(response.choices[0].message.content)