Vodalus

Runtime error

App Files Files Community

BeTaLabs committed on Jun 29, 2024

Commit

113eb1c

verified ·

1 Parent(s): 8158179

Update app.py

Browse files

Files changed (1) hide show

app.py +493 -72

app.py CHANGED Viewed

@@ -1,20 +1,54 @@
 import gradio as gr
 import json
 import re
 from datetime import datetime
 from typing import Literal
 import os
 import importlib
-from llm_handler import send_to_llm, agent, settings
 from main import generate_data, PROMPT_1
 from topics import TOPICS
 from system_messages import SYSTEM_MESSAGES_VODALUS
 import random
 ANNOTATION_CONFIG_FILE = "annotation_config.json"
 OUTPUT_FILE_PATH = "dataset.jsonl"
 def load_annotation_config():
     try:
         with open(ANNOTATION_CONFIG_FILE, 'r') as f:
@@ -57,6 +91,19 @@ def load_annotation_config():
             ]
         }
 def save_annotation_config(config):
     with open(ANNOTATION_CONFIG_FILE, 'w') as f:
         json.dump(config, f, indent=2)
@@ -66,8 +113,44 @@ def load_jsonl_dataset(file_path):
         return []
     with open(file_path, 'r') as f:
         return [json.loads(line.strip()) for line in f if line.strip()]
 def save_row(file_path, index, row_data):
     with open(file_path, 'r') as f:
         lines = f.readlines()
@@ -75,8 +158,23 @@ def save_row(file_path, index, row_data):
     with open(file_path, 'w') as f:
         f.writelines(lines)
-    return f"Row {index} saved successfully"
 def get_row(file_path, index):
     data = load_jsonl_dataset(file_path)
@@ -106,19 +204,19 @@ def markdown_to_json(markdown_str):
     }
     return json.dumps(json_data, indent=2)
-def navigate_rows(file_path: str, current_index: int, direction: Literal[-1, 1], metadata_config):
-    new_index = max(0, current_index + direction)
     return load_and_show_row(file_path, new_index, metadata_config)
 def load_and_show_row(file_path, index, metadata_config):
     row_data, total = get_row(file_path, index)
     if not row_data:
-        return ("", index, total, "3", [], [], [], "")
     try:
         data = json.loads(row_data)
     except json.JSONDecodeError:
-        return (row_data, index, total, "3", [], [], [], "Error: Invalid JSON")
     metadata = data.get("metadata", {}).get("annotation", {})
@@ -128,7 +226,7 @@ def load_and_show_row(file_path, index, metadata_config):
     toxic_tags = metadata.get("tags", {}).get("toxic", [])
     other = metadata.get("free_text", {}).get("Additional Notes", "")
-    return (row_data, index, total, quality,
             high_quality_tags, low_quality_tags, toxic_tags, other)
 def save_row_with_metadata(file_path, index, row_data, config, quality, high_quality_tags, low_quality_tags, toxic_tags, other):
@@ -182,7 +280,12 @@ def load_config_to_ui(config):
         [[field["name"], field["description"]] for field in config["free_text_fields"]]
     )
-def save_config_from_ui(name, description, scale, categories, fields):
     new_config = {
         "quality_scale": {
             "name": name,
@@ -190,7 +293,8 @@ def save_config_from_ui(name, description, scale, categories, fields):
             "scale": [{"value": row[0], "label": row[1]} for row in scale]
         },
         "tag_categories": [{"name": row[0], "type": row[1], "tags": row[2].split(", ")} for row in categories],
-        "free_text_fields": [{"name": row[0], "description": row[1]} for row in fields]
     }
     save_annotation_config(new_config)
     return "Configuration saved successfully", new_config
@@ -218,7 +322,7 @@ def generate_preview(row_data, quality, high_quality_tags, low_quality_tags, tox
         return "Error: Invalid JSON in the current row data"
 def load_dataset_config():
-    # Load VODALUS_SYSTEM_MESSAGE from system_messages.py
     with open("system_messages.py", "r") as f:
         system_messages_content = f.read()
         vodalus_system_message = re.search(r'SYSTEM_MESSAGES_VODALUS = \[(.*?)\]', system_messages_content, re.DOTALL).group(1).strip()[3:-3]  # Extract the content between triple quotes
@@ -232,9 +336,37 @@ def load_dataset_config():
     topics_module = importlib.import_module("topics")
     topics_list = topics_module.TOPICS
-    return vodalus_system_message, prompt_1, [[topic] for topic in topics_list]
-def save_dataset_config(system_messages, prompt_1, topics):
     # Save VODALUS_SYSTEM_MESSAGE to system_messages.py
     with open("system_messages.py", "w") as f:
         f.write(f'SYSTEM_MESSAGES_VODALUS = [\n"""\n{system_messages}\n""",\n]\n')
@@ -261,8 +393,17 @@ def save_dataset_config(system_messages, prompt_1, topics):
     with open("topics.py", "w") as f:
         f.write(topics_content)
     return "Dataset configuration saved successfully"
 def chat_with_llm(message, history):
@@ -273,7 +414,12 @@ def chat_with_llm(message, history):
             msg_list.append({"role": "assistant", "content": h[1]})
         msg_list.append({"role": "user", "content": message})
-        response, _ = send_to_llm(agent, msg_list)
         return history + [[message, response]]
     except Exception as e:
@@ -283,14 +429,15 @@ def chat_with_llm(message, history):
 def update_chat_context(row_data, index, total, quality, high_quality_tags, low_quality_tags, toxic_tags, other):
     context = f"""Current app state:
     Row: {index + 1}/{total}
-    Data: {row_data}
     Quality: {quality}
     High Quality Tags: {', '.join(high_quality_tags)}
     Low Quality Tags: {', '.join(low_quality_tags)}
     Toxic Tags: {', '.join(toxic_tags)}
     Additional Notes: {other}
     """
-    return [[None, context]]  # Return as a list of message pairs
 async def run_generate_dataset(num_workers, num_generations, output_file_path):
@@ -309,34 +456,191 @@ async def run_generate_dataset(num_workers, num_generations, output_file_path):
     return f"Generated {num_generations} entries and saved to {output_file_path}", "\n".join(generated_data[:5]) + "\n..."
-demo = gr.Blocks()
 with demo:
-    gr.Markdown("# JSONL Dataset Editor and Annotation Tool")
     config = gr.State(load_annotation_config())
     with gr.Row():
-        with gr.Column(scale=3):
             with gr.Tab("Dataset Editor"):
-                with gr.Row():
-                    file_path = gr.Textbox(label="JSONL File Path", value=OUTPUT_FILE_PATH)
-                    load_button = gr.Button("Load Dataset")
                 with gr.Row():
                     prev_button = gr.Button("← Previous")
-                    row_index = gr.Number(value=0, label="Current Row", precision=0)
-                    total_rows = gr.Number(value=0, label="Total Rows", precision=0)
                     next_button = gr.Button("Next →")
                 with gr.Row():
                     with gr.Column(scale=3):
-                        row_editor = gr.TextArea(label="Edit Row", lines=20)
                     with gr.Column(scale=2):
                         quality_label = gr.Radio(label="Relevance for Training", choices=[])
                         tag_components = [gr.CheckboxGroup(label=f"Tag Group {i+1}", choices=[]) for i in range(3)]
                         other_description = gr.Textbox(label="Additional annotations", lines=3)
                 with gr.Row():
                     to_markdown_button = gr.Button("Convert to Markdown")
@@ -349,50 +653,94 @@ with demo:
             with gr.Tab("Annotation Configuration"):
                 with gr.Row():
-                    with gr.Column():
-                        quality_scale_name = gr.Textbox(label="Quality Scale Name")
-                        quality_scale_description = gr.Textbox(label="Quality Scale Description")
                         quality_scale = gr.Dataframe(
                             headers=["Value", "Label"],
                             datatype=["str", "str"],
-                            label="Quality Scale",
-                            interactive=True
                         )
                 with gr.Row():
-                    tag_categories = gr.Dataframe(
-                        headers=["Name", "Type", "Tags"],
-                        datatype=["str", "str", "str"],
-                        label="Tag Categories",
-                        interactive=True
-                    )
                 with gr.Row():
-                    free_text_fields = gr.Dataframe(
-                        headers=["Name", "Description"],
-                        datatype=["str", "str"],
-                        label="Free Text Fields",
-                        interactive=True
-                    )
-                save_config_btn = gr.Button("Save Configuration")
-                config_status = gr.Textbox(label="Status")
             with gr.Tab("Dataset Configuration"):
                 with gr.Row():
-                    vodalus_system_message = gr.TextArea(label="VODALUS_SYSTEM_MESSAGE", lines=10)
-                    prompt_1 = gr.TextArea(label="PROMPT_1", lines=10)
                 with gr.Row():
-                    topics = gr.Dataframe(
-                        headers=["Topic"],
-                        datatype=["str"],
-                        label="TOPICS",
-                        interactive=True
-                    )
-                save_dataset_config_btn = gr.Button("Save Dataset Configuration")
-                dataset_config_status = gr.Textbox(label="Status")
             with gr.Tab("Dataset Generation"):
                 with gr.Row():
@@ -406,16 +754,54 @@ with demo:
                 generation_status = gr.Textbox(label="Generation Status")
                 generation_output = gr.TextArea(label="Generation Output", lines=10)
-        with gr.Column(scale=1):
-            gr.Markdown("## AI Assistant")
-            chatbot = gr.Chatbot(height=600)
-            msg = gr.Textbox(label="Chat with AI Assistant")
-            clear = gr.Button("Clear")
     load_button.click(
-        load_and_show_row,
-        inputs=[file_path, gr.Number(value=0), config],
-        outputs=[row_editor, row_index, total_rows, quality_label, *tag_components, other_description]
     ).then(
         update_annotation_ui,
         inputs=[config],
@@ -424,8 +810,8 @@ with demo:
     prev_button.click(
         navigate_rows,
-        inputs=[file_path, row_index, gr.Number(value=-1), config],
-        outputs=[row_editor, row_index, total_rows, quality_label, *tag_components, other_description]
     ).then(
         update_annotation_ui,
         inputs=[config],
@@ -434,8 +820,8 @@ with demo:
     next_button.click(
         navigate_rows,
-        inputs=[file_path, row_index, gr.Number(value=1), config],
-        outputs=[row_editor, row_index, total_rows, quality_label, *tag_components, other_description]
     ).then(
         update_annotation_ui,
         inputs=[config],
@@ -444,7 +830,7 @@ with demo:
     save_row_button.click(
         save_row_with_metadata,
-        inputs=[file_path, row_index, row_editor, config, quality_label,
                 tag_components[0], tag_components[1], tag_components[2], other_description],
         outputs=[editor_status]
     ).then(
@@ -476,7 +862,7 @@ with demo:
     save_config_btn.click(
         save_config_from_ui,
-        inputs=[quality_scale_name, quality_scale_description, quality_scale, tag_categories, free_text_fields],
         outputs=[config_status, config]
     ).then(
         update_annotation_ui,
@@ -492,12 +878,12 @@ with demo:
     demo.load(
         load_dataset_config,
-        outputs=[vodalus_system_message, prompt_1, topics]
     )
     save_dataset_config_btn.click(
         save_dataset_config,
-        inputs=[vodalus_system_message, prompt_1, topics],
         outputs=[dataset_config_status]
     )
@@ -507,10 +893,21 @@ with demo:
         outputs=[generation_status, generation_output]
     )
     msg.submit(chat_with_llm, [msg, chatbot], [chatbot])
     clear.click(lambda: None, None, chatbot, queue=False)
-    # Update chat context when navigating rows or loading dataset
     for button in [load_button, prev_button, next_button]:
         button.click(
             update_chat_context,
@@ -518,6 +915,30 @@ with demo:
             outputs=[chatbot]
         )
-if __name__ == "__main__":
-    demo.launch(share=True)

 import gradio as gr
+from gradio import update
 import json
 import re
 from datetime import datetime
 from typing import Literal
 import os
 import importlib
+from llm_handler import send_to_llm
 from main import generate_data, PROMPT_1
 from topics import TOPICS
 from system_messages import SYSTEM_MESSAGES_VODALUS
 import random
+from params import load_params, save_params
+import pandas as pd
+import csv
 ANNOTATION_CONFIG_FILE = "annotation_config.json"
 OUTPUT_FILE_PATH = "dataset.jsonl"
def load_llm_config():
    """Read the stored LLM settings and return them in UI order.

    Returns:
        A 9-tuple: provider, base URL, workspace, API key, max tokens,
        temperature, top-p, frequency penalty, presence penalty. A key that
        is missing from the stored parameters falls back to the default
        listed below.
    """
    stored = load_params()
    # (key, fallback) pairs, in the exact order the tuple is returned.
    fields = (
        ('PROVIDER', ''),
        ('BASE_URL', ''),
        ('WORKSPACE', ''),
        ('API_KEY', ''),
        ('max_tokens', 2048),
        ('temperature', 0.7),
        ('top_p', 0.9),
        ('frequency_penalty', 0.0),
        ('presence_penalty', 0.0),
    )
    return tuple(stored.get(key, fallback) for key, fallback in fields)
def save_llm_config(provider, base_url, workspace, api_key, max_tokens, temperature, top_p, frequency_penalty, presence_penalty):
    """Persist the LLM connection and sampling settings via ``save_params``.

    Returns:
        A fixed confirmation message for the status textbox.
    """
    settings = dict(
        PROVIDER=provider,
        BASE_URL=base_url,
        WORKSPACE=workspace,
        API_KEY=api_key,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
    )
    save_params(settings)
    return "LLM configuration saved successfully"
 def load_annotation_config():
     try:
         with open(ANNOTATION_CONFIG_FILE, 'r') as f:
             ]
         }
def load_csv_dataset(file_path):
    """Load a CSV file into a list of row dictionaries.

    Args:
        file_path: Path of the CSV file; the first line is used as header.

    Returns:
        One dict per data row, mapping column name to the cell text.
    """
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields that contain line breaks are returned unchanged.
    with open(file_path, 'r', newline='') as f:
        return list(csv.DictReader(f))
def load_txt_dataset(file_path):
    """Load a text file as one ``{"content": ...}`` record per non-blank line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped.
    """
    records = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                records.append({"content": text})
    return records
 def save_annotation_config(config):
     with open(ANNOTATION_CONFIG_FILE, 'w') as f:
         json.dump(config, f, indent=2)
         return []
     with open(file_path, 'r') as f:
         return [json.loads(line.strip()) for line in f if line.strip()]
def load_dataset(file):
    """Load an uploaded dataset file and return the UI state for its first row.

    Args:
        file: ``None``, a filesystem path string, or an object exposing the
            path as ``.name`` (the shape of an uploaded-file wrapper).

    Returns:
        A 9-tuple with the same layout as ``load_and_show_row``: row text,
        row index, total rows, status text, quality ("3"), three tag lists,
        and the free-text note. On any failure the row text is empty, the
        counters are 0, and the status text carries the reason.
    """
    def _state(row_text, total, status):
        # Fresh lists on every call so callers never share mutable defaults.
        return row_text, 0, total, status, "3", [], [], [], ""

    if file is None:
        return _state("", 0, "No file uploaded")

    # Accept a bare path as well as an object carrying the path in `.name`.
    file_path = file if isinstance(file, str) else file.name
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.csv':
        data = load_csv_dataset(file_path)
    elif file_extension == '.txt':
        data = load_txt_dataset(file_path)
    elif file_extension == '.jsonl':
        data = load_jsonl_dataset(file_path)
    else:
        return _state("", 0, f"Unsupported file type: {file_extension}")

    if not data:
        return _state("", 0, "No data found in the file")

    first_row = json.dumps(data[0], indent=2)
    return _state(first_row, len(data), f"Row: 1/{len(data)}")
 def save_row(file_path, index, row_data):
+    file_extension = file_path.split('.')[-1].lower()
+    if file_extension == 'jsonl':
+        save_jsonl_row(file_path, index, row_data)
+    elif file_extension == 'csv':
+        save_csv_row(file_path, index, row_data)
+    elif file_extension == 'txt':
+        save_txt_row(file_path, index, row_data)
+    else:
+        raise ValueError(f"Unsupported file format: {file_extension}")
+    return f"Row {index} saved successfully"
+def save_jsonl_row(file_path, index, row_data):
     with open(file_path, 'r') as f:
         lines = f.readlines()
     with open(file_path, 'w') as f:
         f.writelines(lines)
def save_csv_row(file_path, index, row_data):
    """Overwrite the cells of one CSV row with values from a JSON object.

    Args:
        file_path: Path of the CSV file, rewritten in place.
        index: Zero-based data-row position (the header is not counted).
        row_data: JSON string of a ``{column: value}`` mapping. A column not
            present in the file is added.

    Raises:
        IndexError: If ``index`` does not address an existing row.
    """
    # Read every cell as raw text: type inference would rewrite rows that
    # were never edited ("01234" -> 1234, "NA" -> empty) when the whole
    # file is written back.
    df = pd.read_csv(file_path, dtype=object, keep_default_na=False)
    row_dict = json.loads(row_data)
    # Without this check `df.at` would silently append a new row.
    if not 0 <= index < len(df):
        raise IndexError(f"Row index {index} out of range for {len(df)} rows")
    for col, value in row_dict.items():
        df.at[index, col] = value
    df.to_csv(file_path, index=False)
def save_txt_row(file_path, index, row_data):
    """Replace one non-blank line of a text dataset.

    Args:
        file_path: Path of the text file, rewritten in place.
        index: Zero-based position among the file's non-blank lines, which
            is how ``load_txt_dataset`` numbers rows.
        row_data: JSON string of an object; its ``content`` value (default
            empty) becomes the new line.

    Raises:
        IndexError: If ``index`` does not address an existing non-blank line.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()
    row_dict = json.loads(row_data)

    # Blank lines are skipped on load, so translate the row index into the
    # position of the matching physical line before overwriting it.
    content_positions = [pos for pos, line in enumerate(lines) if line.strip()]
    if not 0 <= index < len(content_positions):
        raise IndexError(
            f"Row index {index} out of range for {len(content_positions)} rows"
        )
    lines[content_positions[index]] = row_dict.get('content', '') + '\n'

    with open(file_path, 'w') as f:
        f.writelines(lines)
 def get_row(file_path, index):
     data = load_jsonl_dataset(file_path)
     }
     return json.dumps(json_data, indent=2)
def navigate_rows(file_path: str, current_index: int, direction: Literal["prev", "next"], metadata_config):
    """Step one row backwards or forwards and return that row's UI state.

    ``"prev"`` moves back one row; any other value moves forward one row.
    The target index is clamped at 0 and is not clamped at the upper end.
    """
    step = -1 if direction == "prev" else 1
    target_index = current_index + step
    if target_index < 0:
        target_index = 0
    return load_and_show_row(file_path, target_index, metadata_config)
 def load_and_show_row(file_path, index, metadata_config):
     row_data, total = get_row(file_path, index)
     if not row_data:
+        return ("", index, total, f"Row: {index + 1}/{total}", "3", [], [], [], "")
     try:
         data = json.loads(row_data)
     except json.JSONDecodeError:
+        return (row_data, index, total, f"Row: {index + 1}/{total}", "3", [], [], [], "Error: Invalid JSON")
     metadata = data.get("metadata", {}).get("annotation", {})
     toxic_tags = metadata.get("tags", {}).get("toxic", [])
     other = metadata.get("free_text", {}).get("Additional Notes", "")
+    return (row_data, index, total, f"Row: {index + 1}/{total}", quality,
             high_quality_tags, low_quality_tags, toxic_tags, other)
 def save_row_with_metadata(file_path, index, row_data, config, quality, high_quality_tags, low_quality_tags, toxic_tags, other):
         [[field["name"], field["description"]] for field in config["free_text_fields"]]
     )
+def save_config_from_ui(name, description, scale, categories, fields, topics, all_topics_text):
+    if all_topics_text.visible:
+        topics_list = [topic.strip() for topic in all_topics_text.split("\n") if topic.strip()]
+    else:
+        topics_list = [topic[0] for topic in topics]
     new_config = {
         "quality_scale": {
             "name": name,
             "scale": [{"value": row[0], "label": row[1]} for row in scale]
         },
         "tag_categories": [{"name": row[0], "type": row[1], "tags": row[2].split(", ")} for row in categories],
+        "free_text_fields": [{"name": row[0], "description": row[1]} for row in fields],
+        "topics": topics_list
     }
     save_annotation_config(new_config)
     return "Configuration saved successfully", new_config
         return "Error: Invalid JSON in the current row data"
 def load_dataset_config():
+    params = load_params()
     with open("system_messages.py", "r") as f:
         system_messages_content = f.read()
         vodalus_system_message = re.search(r'SYSTEM_MESSAGES_VODALUS = \[(.*?)\]', system_messages_content, re.DOTALL).group(1).strip()[3:-3]  # Extract the content between triple quotes
     topics_module = importlib.import_module("topics")
     topics_list = topics_module.TOPICS
+    return (
+        vodalus_system_message,
+        prompt_1,
+        [[topic] for topic in topics_list],
+        params.get('max_tokens', 2048),
+        params.get('temperature', 0.7),
+        params.get('top_p', 0.9),
+        params.get('frequency_penalty', 0.0),
+        params.get('presence_penalty', 0.0)
+    )
def edit_all_topics_func(topics):
    """Switch the topics table into bulk-edit mode.

    NOTE: this function is defined again later in the file; the later
    definition is the one bound at import time.

    Args:
        topics: The topics table, either a pandas DataFrame or a list of
            rows; the first cell of each row is the topic.

    Returns:
        Three component updates: hide the first output, show the second
        with one ``{"topic": ...}`` JSON line per topic, show the third.
    """
    # Iterating a DataFrame yields column labels, not rows, so convert it
    # to row lists first (add_topic_row accepts the same two shapes).
    rows = topics.values.tolist() if isinstance(topics, pd.DataFrame) else topics
    topics_list = [row[0] for row in rows if len(row) > 0]
    jsonl_rows = "\n".join(json.dumps({"topic": topic}) for topic in topics_list)
    return (
        gr.update(visible=False),
        gr.update(value=jsonl_rows, visible=True),
        gr.update(visible=True)
    )
def update_topics_from_text(text):
    """Parse bulk-edited topics text and switch back to the table view.

    NOTE: this function is defined again later in the file; the later
    definition is the one bound at import time.

    Args:
        text: One topic per line, either as ``{"topic": ...}`` JSON objects
            or as plain text. Blank lines are ignored.

    Returns:
        Two component updates: the first shows the parsed topics as
        single-cell rows, the second hides its component.
    """
    lines = [line for line in text.split("\n") if line.strip()]
    try:
        # Try parsing as JSONL
        topics_list = [json.loads(line)["topic"] for line in lines]
    except (json.JSONDecodeError, KeyError, TypeError):
        # Not JSONL: a line failed to parse, or parsed to something without
        # a "topic" key (e.g. a bare number). Treat every line as plain text.
        topics_list = [line.strip() for line in lines]
    # gr.update matches edit_all_topics_func and does not depend on the
    # per-component `.update` class methods.
    return (
        gr.update(value=[[topic] for topic in topics_list], visible=True),
        gr.update(visible=False),
    )
+def save_dataset_config(system_messages, prompt_1, topics, max_tokens, temperature, top_p, frequency_penalty, presence_penalty):
     # Save VODALUS_SYSTEM_MESSAGE to system_messages.py
     with open("system_messages.py", "w") as f:
         f.write(f'SYSTEM_MESSAGES_VODALUS = [\n"""\n{system_messages}\n""",\n]\n')
     with open("topics.py", "w") as f:
         f.write(topics_content)
+    save_params({
+        'max_tokens': max_tokens,
+        'temperature': temperature,
+        'top_p': top_p,
+        'frequency_penalty': frequency_penalty,
+        'presence_penalty': presence_penalty
+    })
     return "Dataset configuration saved successfully"
 def chat_with_llm(message, history):
             msg_list.append({"role": "assistant", "content": h[1]})
         msg_list.append({"role": "user", "content": message})
+        response, _ = send_to_llm(msg_list)
+        return history + [[message, response]]
+    except Exception as e:
+        print(f"Error in chat_with_llm: {str(e)}")
+        return history + [[message, f"Error: {str(e)}"]]
         return history + [[message, response]]
     except Exception as e:
 def update_chat_context(row_data, index, total, quality, high_quality_tags, low_quality_tags, toxic_tags, other):
     context = f"""Current app state:
     Row: {index + 1}/{total}
     Quality: {quality}
     High Quality Tags: {', '.join(high_quality_tags)}
     Low Quality Tags: {', '.join(low_quality_tags)}
     Toxic Tags: {', '.join(toxic_tags)}
     Additional Notes: {other}
+    Data: {row_data}
     """
+    return [[None, context]]
 async def run_generate_dataset(num_workers, num_generations, output_file_path):
     return f"Generated {num_generations} entries and saved to {output_file_path}", "\n".join(generated_data[:5]) + "\n..."
def add_topic_row(data):
    """Return the topics table with a "New Topic" row appended.

    A pandas DataFrame gets the row concatenated under a "Topic" column with
    the index renumbered; any other input is extended by list concatenation.
    The input itself is not modified.
    """
    if not isinstance(data, pd.DataFrame):
        return data + [["New Topic"]]
    placeholder = pd.DataFrame({"Topic": ["New Topic"]})
    return pd.concat([data, placeholder], ignore_index=True)
def remove_last_topic_row(data):
    """Return the table without its last row, always keeping at least one.

    A table with one row or none is returned unchanged.
    """
    if len(data) <= 1:
        return data
    return data[:-1]
def edit_all_topics_func(topics):
    """Switch the topics table into bulk-edit mode.

    Args:
        topics: The topics table, either a pandas DataFrame or a list of
            rows; the first cell of each row is the topic.

    Returns:
        Three component updates: hide the first output, show the second
        with one ``{"topic": ...}`` JSON line per topic, show the third.
    """
    # Iterating a DataFrame yields column labels, not rows, so convert it
    # to row lists first (add_topic_row accepts the same two shapes).
    rows = topics.values.tolist() if isinstance(topics, pd.DataFrame) else topics
    topics_list = [row[0] for row in rows if len(row) > 0]
    jsonl_rows = "\n".join(json.dumps({"topic": topic}) for topic in topics_list)
    return (
        gr.update(visible=False),
        gr.update(value=jsonl_rows, visible=True),
        gr.update(visible=True)
    )
def update_topics_from_text(text):
    """Parse bulk-edited topics text and switch back to the table view.

    Args:
        text: One topic per line, either as ``{"topic": ...}`` JSON objects
            or as plain text. Blank lines are ignored.

    Returns:
        Two component updates: the first shows the parsed topics as
        single-cell rows, the second hides its component.
    """
    lines = [line for line in text.split("\n") if line.strip()]
    try:
        # Try parsing as JSONL
        topics_list = [json.loads(line)["topic"] for line in lines]
    except (json.JSONDecodeError, KeyError, TypeError):
        # Not JSONL: a line failed to parse, or parsed to something without
        # a "topic" key (e.g. a bare number). Treat every line as plain text.
        topics_list = [line.strip() for line in lines]
    # gr.update matches edit_all_topics_func and does not depend on the
    # per-component `.update` class methods.
    return (
        gr.update(value=[[topic] for topic in topics_list], visible=True),
        gr.update(visible=False),
    )
+css = """
+body, #root {
+    margin: 0;
+    padding: 0;
+    width: 100%;
+    height: 100%;
+    overflow-x: hidden;
+}
+.gradio-container {
+    max-width: 100% !important;
+    width: 100% !important;
+    margin: 0 auto !important;
+    padding: 0 !important;
+}
+.message-row {
+    justify-content: space-evenly !important;
+}
+.message-bubble-border {
+    border-radius: 6px !important;
+}
+.message-buttons-bot, .message-buttons-user {
+    right: 10px !important;
+    left: auto !important;
+    bottom: 2px !important;
+}
+.dark.message-bubble-border {
+    border-color: #343140 !important;
+}
+.dark.user {
+    background: #1e1c26 !important;
+}
+.dark.assistant.dark, .dark.pending.dark {
+    background: #16141c !important;
+}
+.tab-nav {
+    border-bottom: 2px solid #e0e0e0 !important;
+}
+.tab-nav button {
+    font-size: 16px !important;
+    padding: 10px 20px !important;
+}
+.input-row {
+    margin-bottom: 20px !important;
+}
+.button-row {
+    display: flex !important;
+    justify-content: space-between !important;
+    margin-top: 20px !important;
+}
+#row-editor {
+    height: 80vh !important;
+    font-size: 16px !important;
+}
+.file-upload-row {
+    height: 50px !important;
+    margin-bottom: 1rem !important;
+}
+.file-upload-row > .gr-column {
+    min-width: 0 !important;
+}
+.compact-file-upload {
+    height: 50px !important;
+    overflow: hidden !important;
+}
+.compact-file-upload > .file-preview {
+    min-height: 0 !important;
+    max-height: 50px !important;
+    padding: 0 !important;
+}
+.compact-file-upload > .file-preview > .file-preview-handler {
+    height: 50px !important;
+    padding: 0 8px !important;
+    display: flex !important;
+    align-items: center !important;
+}
+.compact-file-upload > .file-preview > .file-preview-handler > .file-preview-title {
+    white-space: nowrap !important;
+    overflow: hidden !important;
+    text-overflow: ellipsis !important;
+    flex: 1 !important;
+}
+.compact-file-upload > .file-preview > .file-preview-handler > .file-preview-remove {
+    padding: 0 !important;
+    min-width: 24px !important;
+    width: 24px !important;
+    height: 24px !important;
+}
+.compact-button {
+    height: 50px !important;
+    min-height: 40px !important;
+    width: 100% !important;
+}
+.compact-file-upload > label {
+    height: 50px !important;
+    padding: 0 8px !important;
+    display: flex !important;
+    align-items: center !important;
+    justify-content: left !important;
+}
+"""
+demo = gr.Blocks(theme='Ama434/neutral-barlow', css=css)
 with demo:
+    gr.Markdown("# Dataset Editor and Annotation Tool")
     config = gr.State(load_annotation_config())
     with gr.Row():
+        with gr.Column(min_width=1000):
             with gr.Tab("Dataset Editor"):
+                with gr.Row(elem_classes="file-upload-row"):
+                    with gr.Column(scale=3, min_width=400):
+                        file_upload = gr.File(label="Upload Dataset File (.txt, .jsonl, or .csv)", elem_classes="compact-file-upload")
+                    with gr.Column(scale=1, min_width=100):
+                        load_button = gr.Button("Load Dataset", elem_classes="compact-button")
                 with gr.Row():
                     prev_button = gr.Button("← Previous")
+                    row_index = gr.State(value=0)
+                    total_rows = gr.State(value=0)
+                    current_row_display = gr.Textbox(label="Current Row", interactive=False)
                     next_button = gr.Button("Next →")
                 with gr.Row():
                     with gr.Column(scale=3):
+                        row_editor = gr.TextArea(label="Edit Row", lines=40)
                     with gr.Column(scale=2):
                         quality_label = gr.Radio(label="Relevance for Training", choices=[])
                         tag_components = [gr.CheckboxGroup(label=f"Tag Group {i+1}", choices=[]) for i in range(3)]
                         other_description = gr.Textbox(label="Additional annotations", lines=3)
+                        # Add the AI Assistant as a dropdown
+                        with gr.Accordion("AI Assistant", open=False):
+                            chatbot = gr.Chatbot(height=300)
+                            msg = gr.Textbox(label="Chat with AI Assistant")
+                            clear = gr.Button("Clear")
                 with gr.Row():
                     to_markdown_button = gr.Button("Convert to Markdown")
             with gr.Tab("Annotation Configuration"):
                 with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.Markdown("### Quality Scale")
+                        quality_scale_name = gr.Textbox(label="Scale Name")
+                        quality_scale_description = gr.Textbox(label="Scale Description", lines=2)
+                    with gr.Column(scale=2):
                         quality_scale = gr.Dataframe(
                             headers=["Value", "Label"],
                             datatype=["str", "str"],
+                            label="Quality Scale Options",
+                            interactive=True,
+                            col_count=(2, "fixed"),
+                            row_count=(5, "dynamic"),
+                            height=400,
+                            wrap=True
                         )
+                gr.Markdown("### Tag Categories")
+                tag_categories = gr.Dataframe(
+                    headers=["Name", "Type", "Tags"],
+                    datatype=["str", "str", "str"],
+                    label="Tag Categories",
+                    interactive=True,
+                    col_count=(3, "fixed"),
+                    row_count=(3, "dynamic"),
+                    height=250,
+                    wrap=True
+                )
                 with gr.Row():
+                    add_tag_category = gr.Button("Add Category")
+                    remove_tag_category = gr.Button("Remove Last Category")
+                gr.Markdown("### Free Text Fields")
+                free_text_fields = gr.Dataframe(
+                    headers=["Name", "Description"],
+                    datatype=["str", "str"],
+                    label="Free Text Fields",
+                    interactive=True,
+                    col_count=(2, "fixed"),
+                    row_count=(2, "dynamic"),
+                    height=300,
+                    wrap=True
+                )
                 with gr.Row():
+                    add_free_text_field = gr.Button("Add Field")
+                    remove_free_text_field = gr.Button("Remove Last Field")
+                with gr.Row():
+                    save_config_btn = gr.Button("Save Configuration", variant="primary")
+                    config_status = gr.Textbox(label="Status", interactive=False)
             with gr.Tab("Dataset Configuration"):
                 with gr.Row():
+                    vodalus_system_message = gr.TextArea(label="System Message for JSONL Dataset", lines=10)
+                    prompt_1 = gr.TextArea(label="Dataset Gerenation Prompt", lines=10)
+                gr.Markdown("### Topics")
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        topics = gr.Dataframe(
+                            headers=["Topic"],
+                            datatype=["str"],
+                            label="Topics",
+                            interactive=True,
+                            col_count=(1, "fixed"),
+                            row_count=(5, "dynamic"),
+                            height=200,
+                            wrap=True
+                        )
+                    with gr.Column(scale=1):
+                        with gr.Row():
+                            add_topic = gr.Button("Add Topic")
+                            remove_topic = gr.Button("Remove Last Topic")
+                        edit_all_topics = gr.Button("Edit All Topics")
+                        all_topics_edit = gr.TextArea(label="Edit All Topics (JSONL or Plain Text)", visible=False, lines=10)
+                        format_info = gr.Markdown("""
+                        Enter topics as JSONL (e.g., {"topic": "Example Topic"}) or plain text (one topic per line).
+                        JSONL format allows for additional metadata if needed.
+                        """, visible=False)
                 with gr.Row():
+                    save_dataset_config_btn = gr.Button("Save Dataset Configuration", variant="primary")
+                    dataset_config_status = gr.Textbox(label="Status")
             with gr.Tab("Dataset Generation"):
                 with gr.Row():
                 generation_status = gr.Textbox(label="Generation Status")
                 generation_output = gr.TextArea(label="Generation Output", lines=10)
+            with gr.Tab("LLM Configuration"):
+                with gr.Row():
+                    provider = gr.Dropdown(choices=["local-model", "anything-llm"], label="LLM Provider")
+                    base_url = gr.Textbox(label="Base URL (for local model)")
+                with gr.Row():
+                    workspace = gr.Textbox(label="Workspace (for AnythingLLM)")
+                    api_key = gr.Textbox(label="API Key (for AnythingLLM)")
+                with gr.Accordion("Advanced Options", open=False):
+                    with gr.Row():
+                        max_tokens = gr.Slider(minimum=100, maximum=4096, value=2048, step=1, label="Max Tokens")
+                        temperature = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.01, label="Temperature")
+                    with gr.Row():
+                        top_p = gr.Slider(minimum=0, maximum=1, value=0.9, step=0.01, label="Top P")
+                        frequency_penalty = gr.Slider(minimum=0, maximum=2, value=0.0, step=0.01, label="Frequency Penalty")
+                        presence_penalty = gr.Slider(minimum=0, maximum=2, value=0.0, step=0.01, label="Presence Penalty")
+                save_llm_config_btn = gr.Button("Save LLM Configuration")
+                llm_config_status = gr.Textbox(label="Status")
+    add_topic.click(
+        lambda x: x + [["New Topic"]],
+        inputs=[topics],
+        outputs=[topics]
+    )
+    remove_topic.click(
+        lambda x: x[:-1] if len(x) > 0 else x,
+        inputs=[topics],
+        outputs=[topics]
+    )
+    edit_all_topics.click(
+        edit_all_topics_func,
+        inputs=[topics],
+        outputs=[topics, all_topics_edit, format_info]
+    )
+    all_topics_edit.submit(
+        update_topics_from_text,
+        inputs=[all_topics_edit],
+        outputs=[topics, all_topics_edit, format_info]
+    )
     load_button.click(
+        load_dataset,
+        inputs=[file_upload],
+        outputs=[row_editor, row_index, total_rows, current_row_display, quality_label, *tag_components, other_description]
     ).then(
         update_annotation_ui,
         inputs=[config],
     prev_button.click(
         navigate_rows,
+        inputs=[file_upload, row_index, gr.State("prev"), config],
+        outputs=[row_editor, row_index, total_rows, current_row_display, quality_label, *tag_components, other_description]
     ).then(
         update_annotation_ui,
         inputs=[config],
     next_button.click(
         navigate_rows,
+        inputs=[file_upload, row_index, gr.State("next"), config],
+        outputs=[row_editor, row_index, total_rows, current_row_display, quality_label, *tag_components, other_description]
     ).then(
         update_annotation_ui,
         inputs=[config],
     save_row_button.click(
         save_row_with_metadata,
+        inputs=[file_upload, row_index, row_editor, config, quality_label,
                 tag_components[0], tag_components[1], tag_components[2], other_description],
         outputs=[editor_status]
     ).then(
     save_config_btn.click(
         save_config_from_ui,
+        inputs=[quality_scale_name, quality_scale_description, quality_scale, tag_categories, free_text_fields, topics, all_topics_edit],
         outputs=[config_status, config]
     ).then(
         update_annotation_ui,
     # On page load, call load_dataset_config and write its return values, in order, to
     # the eight components listed in `outputs`.
     # NOTE(review): other demo.load handlers further down (load_llm_config and a second
     # load_dataset_config call) also write to max_tokens ... presence_penalty; which value
     # ends up displayed presumably depends on handler completion order - confirm.
     demo.load(
         load_dataset_config,
+        outputs=[vodalus_system_message, prompt_1, topics, max_tokens, temperature, top_p, frequency_penalty, presence_penalty]
     )
     # "Save Dataset Configuration": pass the current values of the eight listed components
     # to save_dataset_config and show its return value in dataset_config_status.
     save_dataset_config_btn.click(
         save_dataset_config,
+        inputs=[vodalus_system_message, prompt_1, topics, max_tokens, temperature, top_p, frequency_penalty, presence_penalty],
         outputs=[dataset_config_status]
     )
         outputs=[generation_status, generation_output]
     )
+    demo.load(
+        load_llm_config,
+        outputs=[provider, base_url, workspace, api_key, max_tokens, temperature, top_p, frequency_penalty, presence_penalty]
+    )
+    save_llm_config_btn.click(
+        save_llm_config,
+        inputs=[provider, base_url, workspace, api_key, max_tokens, temperature, top_p, frequency_penalty, presence_penalty],
+        outputs=[llm_config_status]
+    )
     # Submitting the message box sends the message and chat history to chat_with_llm,
     # whose result replaces the chatbot content.
     msg.submit(fn=chat_with_llm, inputs=[msg, chatbot], outputs=[chatbot])
     # The clear button resets the chatbot by returning None; it bypasses the queue.
     clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)
     # Each of the load / previous / next buttons additionally runs update_chat_context,
     # whose return value is written to the chatbot.
     for button in (load_button, prev_button, next_button):
         button.click(fn=update_chat_context, outputs=[chatbot])
+    demo.load(
+        lambda: (
+            initial_values := load_dataset_config(),
+            gr.update(value=initial_values[0]),  # vodalus_system_message
+            gr.update(value=initial_values[1]),  # prompt_1
+            gr.update(value=initial_values[2]),  # topics_data
+            gr.update(value=initial_values[3]),  # max_tokens_val
+            gr.update(value=initial_values[4]),  # temperature_val
+            gr.update(value=initial_values[5]),  # top_p_val
+            gr.update(value=initial_values[6]),  # frequency_penalty_val
+            gr.update(value=initial_values[7])   # presence_penalty_val
+        )[1:],  # We return a tuple slice to exclude the initial_values assignment
+        outputs=[
+            vodalus_system_message,
+            prompt_1,
+            topics,
+            max_tokens,
+            temperature,
+            top_p,
+            frequency_penalty,
+            presence_penalty
+        ]
+    )
+# Launch the Gradio app only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    demo.launch(share=True)  # share=True asks Gradio to also create a public share link