import base64
import os
from mimetypes import guess_type

from openai import AzureOpenAI

def local_image_to_data_url(image_path):
    """Read a local image file and return it as a base64 data URL."""
    mime_type, _ = guess_type(image_path)

    # guess_type does not recognise every format (e.g. .webp on older
    # Python versions), so fall back to an explicit extension check before
    # giving up and using the generic binary type.
    if mime_type is None or mime_type == 'application/octet-stream':
        if image_path.lower().endswith('.webp'):
            mime_type = 'image/webp'
        else:
            mime_type = 'application/octet-stream'

    with open(image_path, "rb") as image_file:
        base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')

    return f"data:{mime_type};base64,{base64_encoded_data}"

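# For illustration, the helper returns URLs of this shape ("sample.webp" is
# a hypothetical file name, not part of the dataset):
#
#   local_image_to_data_url("sample.webp")
#   # -> "data:image/webp;base64,<base64-encoded file bytes>"
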
# Directory with the source images. Captions are written next to the images
# as .txt files, so list only image files here; otherwise a re-run would try
# to caption the caption files themselves. Adjust the extension list to match
# the dataset.
images_path = "/eph/nvme0/azureml/cr/j/8569d5e3aa08485780b67a53d671e109/exe/wd/1_2M_Dataset"

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp')
imgs_list = [file for file in os.listdir(images_path)
             if file.lower().endswith(IMAGE_EXTENSIONS)]

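# Skip images that were already captioned by an earlier run, so an
# interrupted job can be resumed. This assumes an existing .txt file means
# its caption is complete; delete the .txt files to force re-captioning.
imgs_list = [img for img in imgs_list
             if not os.path.exists(
                 os.path.join(images_path, os.path.splitext(img)[0] + ".txt"))]
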
api_base = "https://allam-swn-gpt-01.openai.azure.com/"
# Do not hard-code the key in source; read it from the environment instead
# (export AZURE_OPENAI_API_KEY before running).
api_key = os.environ["AZURE_OPENAI_API_KEY"]
deployment_name = "gpt-4o-900ptu"
api_version = "2024-02-15-preview"

client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base}openai/deployments/{deployment_name}",
)

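# Captioning a dataset of this size will eventually hit transient 429/5xx
# responses. A minimal retry sketch with exponential backoff; the schedule
# and retry count are assumptions, not tuned values. The loop below could
# call create_with_retry(...) in place of client.chat.completions.create(...).
import time

from openai import APIError


def create_with_retry(max_retries=5, **kwargs):
    """Call the chat completions API, retrying with exponential backoff.

    APIError is the base class of the SDK's rate-limit and server errors,
    so catching it covers both.
    """
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(**kwargs)
        except APIError:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s, ...
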
for img_name in imgs_list:
    img_path = os.path.join(images_path, img_name)

    # The caption file shares the image's base name; splitext handles file
    # names that contain extra dots correctly.
    txt_file_name = os.path.splitext(img_name)[0] + ".txt"
    txt_path = os.path.join(images_path, txt_file_name)

    data_url = local_image_to_data_url(img_path)

    response = client.chat.completions.create(
        model=deployment_name,
        messages=[
            {"role": "system", "content": "You are an image captioning assistant."},
            {"role": "user", "content": [
                {
                    "type": "text",
                    "text": """You are my captioning model. I will give you a bunch of images with their main subject, \
and I want you to write a detailed caption based on what you see in the images alone. Take these considerations into account when writing the caption: \
order the terms in the caption and separate them with commas. The order of the words in the caption directly corresponds to their weight when generating the final image, \
so the main subject should always be at the start of the prompt. If we want to add more details, \
do it in a "narrative style", using commas to help separate the terms for the FLUX model to read. The tag of this image is: 1/2M cup"""
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_url
                    }
                }
            ]}
        ],
        max_tokens=2000
    )

    # Write the caption next to the image; UTF-8 in case the model returns
    # non-ASCII characters.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(response.choices[0].message.content)
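
# A sequential pass over ~1.2M images will be slow. One way to raise
# throughput (sketch only; max_workers is an assumption and should be sized
# to the deployment's rate limits) is to caption several images concurrently:
#
#   from concurrent.futures import ThreadPoolExecutor
#
#   def caption_one(img_name):
#       ...  # the body of the loop above, one image per call
#
#   with ThreadPoolExecutor(max_workers=8) as pool:
#       list(pool.map(caption_one, imgs_list))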