ACMCMC
committed on
Commit
·
4aad28a
1
Parent(s):
e79532a
Bugfix
Browse files
app.py
CHANGED
|
@@ -11,7 +11,7 @@ import matplotlib.pyplot as plt
|
|
| 11 |
from utils import (
|
| 12 |
process_chat_file,
|
| 13 |
transform_conversations_dataset_into_training_examples,
|
| 14 |
-
convert_gpt_to_gemini_format, #
|
| 15 |
)
|
| 16 |
from validation import check_format_errors, estimate_cost, get_distributions
|
| 17 |
|
|
@@ -71,6 +71,7 @@ def file_upload_callback(
|
|
| 71 |
user_role,
|
| 72 |
model_role,
|
| 73 |
whatsapp_name,
|
|
|
|
| 74 |
datetime_dayfirst,
|
| 75 |
message_line_format,
|
| 76 |
minutes_threshold,
|
|
@@ -79,7 +80,6 @@ def file_upload_callback(
|
|
| 79 |
split_conversation_threshold,
|
| 80 |
progress=gr.Progress(),
|
| 81 |
):
|
| 82 |
-
output_format = "GPT"
|
| 83 |
logger.info(f"Processing {files}")
|
| 84 |
full_system_prompt = f"""# Task
|
| 85 |
You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
|
|
@@ -131,6 +131,30 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
|
|
| 131 |
f"Total number of generated examples: {total_number_of_generated_examples}"
|
| 132 |
)
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
# Split into training and validation datasets (80% and 20%)
|
| 135 |
try:
|
| 136 |
split_examples_ds = full_examples_ds.train_test_split(
|
|
@@ -194,18 +218,12 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
|
|
| 194 |
training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
|
| 195 |
validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
|
| 196 |
|
| 197 |
-
#
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
# Save as JSON files with Gemini format
|
| 202 |
-
file_path = f"training_examples_gemini_{uuid}.json"
|
| 203 |
-
with open(file_path, 'w', encoding='utf-8') as f:
|
| 204 |
-
json.dump(training_gemini_list, f, ensure_ascii=False, indent=2)
|
| 205 |
|
| 206 |
-
file_path_validation = f"validation_examples_gemini_{uuid}.
|
| 207 |
-
|
| 208 |
-
json.dump(validation_gemini_list, f, ensure_ascii=False, indent=2)
|
| 209 |
else:
|
| 210 |
# Original GPT format - JSONL
|
| 211 |
file_path = f"training_examples_{uuid}.jsonl"
|
|
@@ -413,7 +431,7 @@ with gr.Blocks(theme=theme) as demo:
|
|
| 413 |
user_role,
|
| 414 |
model_role,
|
| 415 |
whatsapp_name,
|
| 416 |
-
|
| 417 |
datetime_dayfirst,
|
| 418 |
message_line_format,
|
| 419 |
minutes_threshold,
|
|
|
|
| 11 |
from utils import (
|
| 12 |
process_chat_file,
|
| 13 |
transform_conversations_dataset_into_training_examples,
|
| 14 |
+
convert_gpt_to_gemini_format, # Use the fixed version
|
| 15 |
)
|
| 16 |
from validation import check_format_errors, estimate_cost, get_distributions
|
| 17 |
|
|
|
|
| 71 |
user_role,
|
| 72 |
model_role,
|
| 73 |
whatsapp_name,
|
| 74 |
+
output_format,
|
| 75 |
datetime_dayfirst,
|
| 76 |
message_line_format,
|
| 77 |
minutes_threshold,
|
|
|
|
| 80 |
split_conversation_threshold,
|
| 81 |
progress=gr.Progress(),
|
| 82 |
):
|
|
|
|
| 83 |
logger.info(f"Processing {files}")
|
| 84 |
full_system_prompt = f"""# Task
|
| 85 |
You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
|
|
|
|
| 131 |
f"Total number of generated examples: {total_number_of_generated_examples}"
|
| 132 |
)
|
| 133 |
|
| 134 |
+
# Remove messages where we don't have the pattern user -> model -> user -> model
# (there should be at least one user message and one model message after every user message)
def has_valid_message_pattern(example):
    """Return True iff ``example["messages"]`` strictly alternates user -> model.

    An optional leading system message is skipped. Every user message must be
    immediately followed by a model message, so a valid example has an even
    number of non-system messages and at least one user/model pair.
    `user_role` and `model_role` are read from the enclosing scope.
    """
    messages = example["messages"]
    if not messages:
        return False
    # The first message should be a system message, so we can skip it
    if messages[0]["role"] == "system":
        messages = messages[1:]
    # If there are less than 2 messages, we can't have a valid pattern
    if len(messages) < 2:
        return False
    # Bugfix: an odd count means a trailing user message with no model
    # reply — the original loop never inspected the last element, letting
    # such examples through.
    if len(messages) % 2 != 0:
        return False
    for i in range(0, len(messages) - 1, 2):
        # Each pair must be (user, model); any other ordering breaks the pattern.
        if messages[i]["role"] != user_role or messages[i + 1]["role"] != model_role:
            return False
    return True
|
| 152 |
+
|
| 153 |
+
full_examples_ds = full_examples_ds.filter(has_valid_message_pattern)
|
| 154 |
+
logger.info(
|
| 155 |
+
f"Number of examples after filtering for valid message pattern: {len(full_examples_ds)}"
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
# Split into training and validation datasets (80% and 20%)
|
| 159 |
try:
|
| 160 |
split_examples_ds = full_examples_ds.train_test_split(
|
|
|
|
| 218 |
training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
|
| 219 |
validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
|
| 220 |
|
| 221 |
+
# Save as JSONL files with Gemini format - one JSON object per line
|
| 222 |
+
file_path = f"training_examples_gemini_{uuid}.jsonl"
|
| 223 |
+
training_examples_gemini.to_json(path_or_buf=file_path, force_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
+
file_path_validation = f"validation_examples_gemini_{uuid}.jsonl"
|
| 226 |
+
validation_examples_gemini.to_json(path_or_buf=file_path_validation, force_ascii=False)
|
|
|
|
| 227 |
else:
|
| 228 |
# Original GPT format - JSONL
|
| 229 |
file_path = f"training_examples_{uuid}.jsonl"
|
|
|
|
| 431 |
user_role,
|
| 432 |
model_role,
|
| 433 |
whatsapp_name,
|
| 434 |
+
output_format,
|
| 435 |
datetime_dayfirst,
|
| 436 |
message_line_format,
|
| 437 |
minutes_threshold,
|
utils.py
CHANGED
|
@@ -525,52 +525,43 @@ def transform_conversations_dataset_into_training_examples(
|
|
| 525 |
|
| 526 |
def convert_gpt_to_gemini_format(gpt_dataset):
|
| 527 |
"""
|
| 528 |
-
Convert GPT format
|
| 529 |
|
| 530 |
GPT format: {"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
|
| 531 |
Gemini format: {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]}, "contents": [{"role": "user", "parts": [{"text": "..."}]}, {"role": "model", "parts": [{"text": "..."}]}]}
|
| 532 |
"""
|
| 533 |
-
def
|
| 534 |
-
|
| 535 |
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
messages = json.loads(messages)
|
| 541 |
-
|
| 542 |
-
# Find system message and other contents
|
| 543 |
-
system_instruction = None
|
| 544 |
-
contents = []
|
| 545 |
-
|
| 546 |
-
for msg in messages:
|
| 547 |
-
if msg["role"] == "system":
|
| 548 |
-
system_instruction = {
|
| 549 |
-
"role": "system",
|
| 550 |
-
"parts": [{"text": msg["content"]}]
|
| 551 |
-
}
|
| 552 |
-
elif msg["role"] in ["user", "assistant", "model"]:
|
| 553 |
-
# Convert assistant to model for Gemini
|
| 554 |
-
role = "model" if msg["role"] == "assistant" else msg["role"]
|
| 555 |
-
contents.append({
|
| 556 |
-
"role": role,
|
| 557 |
-
"parts": [{"text": msg["content"]}]
|
| 558 |
-
})
|
| 559 |
-
|
| 560 |
-
gemini_example = {"contents": contents}
|
| 561 |
-
if system_instruction:
|
| 562 |
-
gemini_example["systemInstruction"] = system_instruction
|
| 563 |
-
|
| 564 |
-
gemini_examples.append(gemini_example)
|
| 565 |
|
| 566 |
-
#
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
|
| 576 |
-
|
|
|
|
|
|
| 525 |
|
| 526 |
def convert_gpt_to_gemini_format(gpt_dataset):
    """Convert GPT-format training examples to Gemini format via ``Dataset.map``.

    GPT format: {"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
    Gemini format: {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]}, "contents": [{"role": "user", "parts": [{"text": "..."}]}, {"role": "model", "parts": [{"text": "..."}]}]}

    Args:
        gpt_dataset: a dataset exposing ``.map(fn, remove_columns=...)`` whose
            rows carry a ``"messages"`` field (list of dicts, or a JSON string).

    Returns:
        The mapped dataset with the ``"messages"`` column replaced by
        ``"contents"`` and, when a system message is present,
        ``"systemInstruction"``.
    """
    # Hoisted here so the import doesn't re-execute for every mapped example.
    import json

    def convert_example(example):
        messages = example["messages"]

        # Messages may arrive JSON-serialized; normalize to a list of dicts.
        if isinstance(messages, str):
            messages = json.loads(messages)

        # Find system message and other contents
        system_instruction = None
        contents = []

        for msg in messages:
            if msg["role"] == "system":
                system_instruction = {
                    "role": "system",
                    "parts": [{"text": msg["content"]}],
                }
            elif msg["role"] in ("user", "assistant", "model"):
                # Gemini uses "model" where the GPT format uses "assistant".
                role = "model" if msg["role"] == "assistant" else msg["role"]
                contents.append({
                    "role": role,
                    "parts": [{"text": msg["content"]}],
                })

        # Build Gemini example - always include contents, optionally include systemInstruction
        gemini_example = {"contents": contents}
        if system_instruction:
            gemini_example["systemInstruction"] = system_instruction

        return gemini_example

    # Use Dataset.map to convert each example individually; drop the original
    # "messages" column so only the Gemini-format fields remain.
    return gpt_dataset.map(convert_example, remove_columns=["messages"])
|