# Source listing: clamp2/process_data/gpt4_summarize.py (sander-wood, "Upload 32 files", commit 3c428bc, verified, 10.3 kB)
# User configuration: replace every placeholder below before running the script.
# These module-level names are read by the client setup, process_json and the
# script entry point further down. Avoid committing a real API key.
input_dir = "<path_to_your_metadata_json_files>" # Folder (searched recursively) containing the metadata (.json) files
base_url = "<your_base_url>" # Base URL of the API endpoint the OpenAI client talks to
api_key = "<your_api_key>" # API key used to authenticate against that endpoint
model = "<your_model>" # Name of the chat model used to generate the summaries
import os
import json
import random
from openai import OpenAI
# Shared OpenAI client, created once at import time from the base_url and
# api_key settings; process_json uses it for every chat-completion request.
client = OpenAI(base_url=base_url, api_key=api_key)
def log_error(file_path, error_message):
    """Append a one-line failure record to the summarizer's error log.

    The ``logs`` directory (relative to the current working directory) is
    created on demand, and the record is appended to
    ``logs/gpt4_summarize_error_log.txt`` so earlier entries are preserved.

    Parameters:
    - file_path: Path of the file whose processing failed.
    - error_message: Description of the failure.
    """
    log_dir = "logs"
    os.makedirs(log_dir, exist_ok=True)
    with open("logs/gpt4_summarize_error_log.txt", 'a', encoding='utf-8') as sink:
        # print() supplies the trailing newline that terminates the record.
        print(f"Error processing {file_path}: {error_message}", file=sink)
def process_json(metadata, language):
    """
    Summarize the metadata of one music piece in English and in `language`.

    The metadata (minus its 'filepaths' entry and any fields whose value is
    None) is serialized to JSON together with a "nen_language" field and sent
    to the module-level `client` using the module-level `model`, preceded by
    a system prompt and two few-shot examples. The model is expected to reply
    either with a JSON object holding "summary_en" and "summary_nen", or with
    the literal text `None` when the metadata is not informative enough.

    The caller's dictionary is not modified; a new dictionary is returned.

    Parameters:
    - metadata (dict): Metadata of the music piece. Must contain 'filepaths'.
    - language (str): The target non-English language for the summary.

    Returns:
    - dict: The non-None metadata fields, followed by "summary_en",
      "summary_nen" (a dict with "language" and "summary") and the original
      "filepaths" value.

    Raises:
    - KeyError: If `metadata` has no 'filepaths' entry, or the reply lacks
      one of the expected summary keys.
    - ValueError: If the model declines to summarize (replies `None` or sends
      no content), if the reply is not valid JSON (json.JSONDecodeError is a
      ValueError subclass), or if the language reported in the reply differs
      from `language`.
    - Any exception raised by the API client propagates unchanged.
    """
    # The JSON template is closed *before* the fallback instruction so the
    # model is shown a well-formed object to imitate.
    system = """Your task is to provide a concise, comprehensive, and coherent summary of the music piece using the provided metadata. Please write the summary in English first, and then write an equivalent summary in the specified non-English language from the "nen_language" field. Use this JSON format:
{
"summary_en": "Your English summary here.",
"summary_nen": {
"language": "Specified non-English language.",
"summary": "Your non-English summary here."
}
}
If there is not enough music-related information, return `None` instead.
"""
    # Raw string: the \" escapes must reach the model verbatim so that this
    # few-shot example is valid JSON (a normal string would drop the
    # backslashes and leave unescaped quotes inside the description).
    user1 = r"""{
"title": "Brejeiro",
"composer": "Ernesto Nazareth",
"genres": ["Choro", "Classical", "Instrumental"],
"description": "\"Brejeiro\" is in A major and 2/4 time. A joyful melody begins at bar six, and a lively tango rhythm starts at bar fourteen. It has a D.C. al Fine at bar fifty-three and ends on two quarter notes in bar thirty-seven. The piece, with its vibrant melodies and rhythms, reflects celebration and carefreeness, embodying the spirit of Brazilian music.",
"tags": ["Brazilian", "Choro", "Piano"],
"ensembles": ["Solo Piano", "Small Ensemble"],
"instruments": ["Piano"],
"nen_language": "Japanese"
}
"""
    assistant1 = """{
"summary_en": "Brejeiro, composed by Ernesto Nazareth, is a lively choro piece in A major and 2/4 time. It features a joyful melody that begins at bar six and a vibrant tango rhythm introduced at bar fourteen. The piece includes a D.C. al Fine at bar fifty-three, concluding on two quarter notes in bar thirty-seven. With its themes of celebration and carefreeness, Brejeiro beautifully captures the essence of Brazilian music and is well-suited for solo piano and small ensembles.",
"summary_nen": {
"language": "Japanese",
"summary": "「ブレジェイロ」は、エルネスト・ナザレが作曲した活気あふれるショーロの作品で、イ長調の2/4拍子で書かれています。第6小節から始まる喜びに満ちたメロディーと、第14小節で導入される活気あるタンゴのリズムが特徴です。この曲には、第53小節でのD.C. al Fineが含まれ、また第37小節で二つの四分音符で締めくくられています。「ブレジェイロ」は、お祝いと無邪気さのテーマを持ち、ブラジル音楽の本質を美しく捉えており、ソロピアノや小編成のアンサンブルにぴったりの作品です。"
}
}
"""
    user2 = """{
"title": "Untitled",
"composer": "Unknown",
"description": "This is a good song.",
"nen_language": "Russian"
}
"""
    assistant2 = "None"

    # Work on a shallow copy so the caller's dictionary is left untouched.
    fields = dict(metadata)
    # File paths are bookkeeping, not musical information: keep them out of
    # the prompt and re-attach them to the result afterwards.
    filepaths = fields.pop('filepaths')
    fields = {k: v for k, v in fields.items() if v is not None}
    fields["nen_language"] = language
    prompt = json.dumps(fields, ensure_ascii=False, indent=4)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user1},
            {"role": "assistant", "content": assistant1},
            {"role": "user", "content": user2},
            {"role": "assistant", "content": assistant2},
            {"role": "user", "content": prompt},
        ]
    )
    reply = response.choices[0].message.content

    # The content field may be absent, and the model may pad its "None"
    # answer with whitespace; both mean "no usable summary".
    if reply is None or reply.strip() == "None":
        raise ValueError("Received 'None' as summaries response")

    summaries = json.loads(reply)
    if fields["nen_language"] != summaries["summary_nen"]["language"]:
        raise ValueError("Language mismatch: nen_language does not match summary_nen language")

    # "nen_language" was only needed for the prompt; the language is recorded
    # inside "summary_nen" in the result.
    fields.pop("nen_language")
    fields["summary_en"] = summaries["summary_en"]
    fields["summary_nen"] = summaries["summary_nen"]
    fields["filepaths"] = filepaths
    return fields
def process_files(input_dir):
    """
    Summarize every metadata (.json) file found under `input_dir`.

    The directory tree is walked recursively and mirrored under a sibling
    directory named `<input_dir>_summarized`. For each `.json` file a target
    language is drawn at random from a fixed list, the file is passed to
    `process_json`, and the result is written to the mirrored location.
    A failure on one file is printed and recorded via `log_error`; processing
    then continues with the next file.

    Parameters:
    - input_dir (str): Root directory containing the metadata files. A
      trailing path separator is tolerated.
    """
    # Normalize first: with a trailing separator, plain concatenation would
    # yield "<input_dir>/_summarized", i.e. an output tree nested inside the
    # input tree that a later run would walk and summarize again.
    output_dir = os.path.normpath(input_dir) + "_summarized"

    # Candidate non-English languages, one per line.
    language_block = """Afrikaans
Amharic
Arabic
Assamese
Azerbaijani
Belarusian
Bulgarian
Bengali
Bengali (Romanized)
Breton
Bosnian
Catalan
Czech
Welsh
Danish
German
Greek
Esperanto
Spanish
Estonian
Basque
Persian
Finnish
French
Western Frisian
Irish
Scottish Gaelic
Galician
Gujarati
Hausa
Hebrew
Hindi
Hindi (Romanized)
Croatian
Hungarian
Armenian
Indonesian
Icelandic
Italian
Japanese
Javanese
Georgian
Kazakh
Khmer
Kannada
Korean
Kurdish (Kurmanji)
Kyrgyz
Latin
Lao
Lithuanian
Latvian
Malagasy
Macedonian
Malayalam
Mongolian
Marathi
Malay
Burmese
Burmese (Romanized)
Nepali
Dutch
Norwegian
Oromo
Oriya
Punjabi
Polish
Pashto
Portuguese
Romanian
Russian
Sanskrit
Sindhi
Sinhala
Slovak
Slovenian
Somali
Albanian
Serbian
Sundanese
Swedish
Swahili
Tamil
Tamil (Romanized)
Telugu
Telugu (Romanized)
Thai
Filipino
Turkish
Uyghur
Ukrainian
Urdu
Urdu (Romanized)
Uzbek
Vietnamese
Xhosa
Yiddish
Chinese (Simplified)
Chinese (Traditional)
Cantonese"""
    languages = [name.strip() for name in language_block.splitlines() if name.strip()]

    for root, _, files in os.walk(input_dir):
        # Mirror the current sub-directory inside the output tree.
        relative_path = os.path.relpath(root, input_dir)
        output_path = os.path.join(output_dir, relative_path)
        os.makedirs(output_path, exist_ok=True)

        for filename in files:
            if not filename.endswith('.json'):
                continue

            input_file = os.path.join(root, filename)
            output_file = os.path.join(output_path, filename)

            try:
                with open(input_file, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)

                # Each piece gets an independently drawn target language.
                language = random.choice(languages)
                processed_metadata = process_json(metadata, language)

                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(processed_metadata, f, indent=4, ensure_ascii=False)

                print(f"Processed: {input_file} -> {output_file}")
            except Exception as e:
                # Batch boundary: report and log the failure, then carry on
                # so one bad file or API error does not abort the whole run.
                print(f"Failed to process {input_file}: {e}")
                log_error(input_file, str(e))
if __name__ == "__main__":
    # Script entry point: summarize every metadata file under the configured
    # input directory.
    process_files(input_dir)