|
# Root directory walked recursively for metadata `.json` files; results are written to a sibling "<input_dir>_summarized" tree.
input_dir = "<path_to_your_metadata_json_files>"
|
|
# Base URL of the API endpoint handed to the OpenAI client (placeholder; must be filled in before running).
base_url = "<your_base_url>"
|
|
# API key handed to the OpenAI client (placeholder).
# NOTE(review): avoid committing a real key here; consider loading it from an environment variable instead.
api_key = "<your_api_key>"
|
|
# Model identifier passed as `model=` on every chat-completion request (placeholder).
model = "<your_model>"
|
|
|
|
import os
|
|
import json
|
|
import random
|
|
from openai import OpenAI
|
|
|
|
|
|
# Shared API client, built once at import time from base_url/api_key; process_json sends every request through it.
client = OpenAI(base_url=base_url, api_key=api_key)
|
|
|
|
def log_error(file_path, error_message, log_path="logs/gpt4_summarize_error_log.txt"):
    """
    Append one error line for a failed file to a log file.

    Parameters:
    - file_path (str): Path of the file whose processing failed.
    - error_message (str): Description of the failure.
    - log_path (str): Log file to append to. Defaults to
      "logs/gpt4_summarize_error_log.txt" relative to the current working
      directory, which is the location used when the argument is omitted.

    The log file's parent directory is created if it does not exist yet.
    """
    log_dir = os.path.dirname(log_path)
    # A bare file name has no directory component; makedirs("") would raise.
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Append mode keeps the errors from earlier files and earlier runs.
    with open(log_path, 'a', encoding='utf-8') as log_file:
        log_file.write(f"Error processing {file_path}: {error_message}\n")
|
|
|
|
def process_json(metadata, language):
    """
    Summarize a music piece's metadata in English and one non-English language.

    The metadata (minus its 'filepaths' entry and any field whose value is
    None) is serialized to JSON, tagged with the target language under
    'nen_language', and sent to the chat model together with two few-shot
    examples. The model's JSON reply is parsed and merged into the result.

    Parameters:
    - metadata (dict): Metadata of the music piece. Must contain a 'filepaths'
      key. The dictionary passed in is not modified.
    - language (str): The target non-English language for the summary.

    Returns:
    - dict: The non-None metadata fields plus 'summary_en', 'summary_nen'
      (a dict with 'language' and 'summary') and the original 'filepaths'.

    Raises:
    - KeyError: If 'filepaths' is missing from the metadata, or the reply
      lacks one of the expected keys.
    - ValueError: If the model replies with nothing or with `None` (not enough
      music-related information), if the reply is not valid JSON, or if the
      language reported in the reply differs from the requested one.
    """
    # The fallback instruction must come after the closing brace; otherwise it
    # reads as part of the JSON template the model is asked to reproduce.
    system = """Your task is to provide a concise, comprehensive, and coherent summary of the music piece using the provided metadata. Please write the summary in English first, and then write an equivalent summary in the specified non-English language from the "nen_language" field. Use this JSON format:
{
    "summary_en": "Your English summary here.",
    "summary_nen": {
        "language": "Specified non-English language.",
        "summary": "Your non-English summary here."
    }
}
If there is not enough music-related information, return `None` instead.
"""
    # The inner quotes are written as \\" so the prompt text contains the JSON
    # escape \" (a plain \" would collapse to a bare quote and make the example
    # invalid JSON, unlike the json.dumps output sent for real metadata).
    user1 = """{
    "title": "Brejeiro",
    "composer": "Ernesto Nazareth",
    "genres": ["Choro", "Classical", "Instrumental"],
    "description": "\\"Brejeiro\\" is in A major and 2/4 time. A joyful melody begins at bar six, and a lively tango rhythm starts at bar fourteen. It has a D.C. al Fine at bar fifty-three and ends on two quarter notes in bar thirty-seven. The piece, with its vibrant melodies and rhythms, reflects celebration and carefreeness, embodying the spirit of Brazilian music.",
    "tags": ["Brazilian", "Choro", "Piano"],
    "ensembles": ["Solo Piano", "Small Ensemble"],
    "instruments": ["Piano"],
    "nen_language": "Japanese"
}
"""
    assistant1 = """{
    "summary_en": "Brejeiro, composed by Ernesto Nazareth, is a lively choro piece in A major and 2/4 time. It features a joyful melody that begins at bar six and a vibrant tango rhythm introduced at bar fourteen. The piece includes a D.C. al Fine at bar fifty-three, concluding on two quarter notes in bar thirty-seven. With its themes of celebration and carefreeness, Brejeiro beautifully captures the essence of Brazilian music and is well-suited for solo piano and small ensembles.",
    "summary_nen": {
        "language": "Japanese",
        "summary": "「ブレジェイロ」は、エルネスト・ナザレが作曲した活気あふれるショーロの作品で、イ長調の2/4拍子で書かれています。第6小節から始まる喜びに満ちたメロディーと、第14小節で導入される活気あるタンゴのリズムが特徴です。この曲には、第53小節でのD.C. al Fineが含まれ、また第37小節で二つの四分音符で締めくくられています。「ブレジェイロ」は、お祝いと無邪気さのテーマを持ち、ブラジル音楽の本質を美しく捉えており、ソロピアノや小編成のアンサンブルにぴったりの作品です。"
    }
}
"""
    # Second example: metadata too thin to summarize, answered with a bare None.
    user2 = """{
    "title": "Untitled",
    "composer": "Unknown",
    "description": "This is a good song.",
    "nen_language": "Russian"
}
"""
    assistant2 = "None"

    # Work on a copy so the caller's dictionary keeps its 'filepaths' entry.
    fields = dict(metadata)
    # File paths are not sent to the model; they are re-attached to the result.
    filepaths = fields.pop('filepaths')
    fields = {key: value for key, value in fields.items() if value is not None}
    fields["nen_language"] = language
    payload = json.dumps(fields, ensure_ascii=False, indent=4)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user1},
            {"role": "assistant", "content": assistant1},
            {"role": "user", "content": user2},
            {"role": "assistant", "content": assistant2},
            {"role": "user", "content": payload},
        ]
    )
    # The message content can be absent; normalize it to a stripped string.
    reply = (response.choices[0].message.content or "").strip()

    if not reply:
        raise ValueError("Received empty summaries response")
    # The prompt spells the sentinel as `None`, so accept it with or without
    # the backticks and regardless of surrounding whitespace.
    if reply.strip("`").strip() == "None":
        raise ValueError("Received 'None' as summaries response")

    summaries = json.loads(reply)

    # Guard against the model answering in (or labelling) a different language.
    if fields["nen_language"] != summaries["summary_nen"]["language"]:
        raise ValueError("Language mismatch: nen_language does not match summary_nen language")

    # 'nen_language' was only a request parameter; the language is kept inside
    # 'summary_nen' instead.
    fields.pop("nen_language")
    fields["summary_en"] = summaries["summary_en"]
    fields["summary_nen"] = summaries["summary_nen"]
    fields["filepaths"] = filepaths
    return fields
|
|
|
|
def process_files(input_dir):
    """
    Summarize every metadata `.json` file found under a directory tree.

    The tree rooted at `input_dir` is mirrored into a sibling directory named
    "<input_dir>_summarized". For each `.json` file a target language is drawn
    at random, the metadata is passed to `process_json`, and the result is
    written to the same relative path in the output tree. A file that fails
    for any reason is reported on stdout, recorded via `log_error`, and
    skipped, so one bad file does not stop the batch.

    Parameters:
    - input_dir (str): Root directory containing the metadata JSON files.
    """
    # Drop trailing separators before adding the suffix: "data/" would
    # otherwise become "data/_summarized", i.e. a directory inside the input
    # tree that a later run would walk and summarize again.
    separators = os.sep + (os.altsep or "")
    output_dir = (input_dir.rstrip(separators) or input_dir) + "_summarized"

    # Candidate target languages, one per line (English is deliberately absent).
    languages = """
        Afrikaans
        Amharic
        Arabic
        Assamese
        Azerbaijani
        Belarusian
        Bulgarian
        Bengali
        Bengali (Romanized)
        Breton
        Bosnian
        Catalan
        Czech
        Welsh
        Danish
        German
        Greek
        Esperanto
        Spanish
        Estonian
        Basque
        Persian
        Finnish
        French
        Western Frisian
        Irish
        Scottish Gaelic
        Galician
        Gujarati
        Hausa
        Hebrew
        Hindi
        Hindi (Romanized)
        Croatian
        Hungarian
        Armenian
        Indonesian
        Icelandic
        Italian
        Japanese
        Javanese
        Georgian
        Kazakh
        Khmer
        Kannada
        Korean
        Kurdish (Kurmanji)
        Kyrgyz
        Latin
        Lao
        Lithuanian
        Latvian
        Malagasy
        Macedonian
        Malayalam
        Mongolian
        Marathi
        Malay
        Burmese
        Burmese (Romanized)
        Nepali
        Dutch
        Norwegian
        Oromo
        Oriya
        Punjabi
        Polish
        Pashto
        Portuguese
        Romanian
        Russian
        Sanskrit
        Sindhi
        Sinhala
        Slovak
        Slovenian
        Somali
        Albanian
        Serbian
        Sundanese
        Swedish
        Swahili
        Tamil
        Tamil (Romanized)
        Telugu
        Telugu (Romanized)
        Thai
        Filipino
        Turkish
        Uyghur
        Ukrainian
        Urdu
        Urdu (Romanized)
        Uzbek
        Vietnamese
        Xhosa
        Yiddish
        Chinese (Simplified)
        Chinese (Traditional)
        Cantonese
    """
    # Strip the layout indentation and skip the blank first/last lines.
    languages = [line.strip() for line in languages.splitlines() if line.strip()]

    for root, _, files in os.walk(input_dir):
        # Mirror the current sub-directory inside the output tree.
        relative_path = os.path.relpath(root, input_dir)
        output_path = os.path.join(output_dir, relative_path)
        os.makedirs(output_path, exist_ok=True)

        for file in files:
            if not file.endswith('.json'):
                continue

            input_file = os.path.join(root, file)
            output_file = os.path.join(output_path, file)

            try:
                with open(input_file, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)

                # Each file gets its own randomly drawn target language.
                language = random.choice(languages)
                processed_metadata = process_json(metadata, language)

                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(processed_metadata, f, indent=4, ensure_ascii=False)

                print(f"Processed: {input_file} -> {output_file}")

            except Exception as e:
                # Batch boundary: report, log, and move on to the next file.
                print(f"Failed to process {input_file}: {e}")
                log_error(input_file, str(e))
|
|
|
|
# Start the batch run over the configured directory only when this file is
# executed as a script, not when it is imported as a module.
if __name__ == "__main__":
    process_files(input_dir)
|
|
|