|
import gradio as gr
from huggingface_hub import InferenceClient
import os
import re
import pandas as pd
from typing import List, Dict, Tuple, Iterator
import io
import traceback
import csv
from openai import OpenAI
|
|
|
|
|
css = """ |
|
footer { |
|
visibility: hidden; |
|
} |
|
#chatbot-container, #chatbot-data-upload { |
|
height: 700px; |
|
overflow-y: scroll; |
|
} |
|
#chatbot-container .message, #chatbot-data-upload .message { |
|
font-size: 14px; |
|
} |
|
/* μ
λ ₯μ°½ λ°°κ²½μ λ° κΈμμ λ³κ²½ */ |
|
textarea, input[type="text"] { |
|
background-color: #ffffff; |
|
color: #000000; |
|
} |
|
/* νμΌ μ
λ‘λ μμ λμ΄ μ‘°μ */ |
|
#parquet-upload-area { |
|
max-height: 150px; |
|
overflow-y: auto; |
|
} |
|
/* μ΄κΈ° μ€λͺ
κΈμ¨ ν¬κΈ° μ‘°μ */ |
|
#initial-description { |
|
font-size: 14px; |
|
} |
|
/* API Key μ
λ ₯ μΉμ
μ€νμΌ */ |
|
.api-key-section { |
|
margin: 10px 0; |
|
padding: 10px; |
|
border: 1px solid #ddd; |
|
border-radius: 5px; |
|
} |
|
.api-key-status { |
|
margin-top: 5px; |
|
font-weight: bold; |
|
} |
|
""" |
|
|
|
|
|
# Hugging Face inference client (defined here but not used by the chat
# features below, which use the OpenAI client)
hf_client = InferenceClient(
    "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
)
|
|
|
def load_code(filename: str) -> str:
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        return f"File not found: {filename}"
    except Exception as e:
        return f"Error reading file: {str(e)}"
|
|
|
def load_parquet(filename: str) -> str:
    try:
        df = pd.read_parquet(filename, engine='pyarrow')
        return df.head(10).to_markdown(index=False)
    except FileNotFoundError:
        return f"File not found: {filename}"
    except Exception as e:
        return f"Error reading file: {str(e)}"
|
|
|
def clean_response(text: str) -> str:
    """Clean response text by removing duplicate sentences."""
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    unique_sentences = []
    seen = set()

    for sentence in sentences:
        # Normalize whitespace and case so near-identical sentences are caught
        normalized = ' '.join(sentence.lower().split())
        if normalized not in seen:
            seen.add(normalized)
            unique_sentences.append(sentence)

    cleaned_text = '. '.join(unique_sentences)
    if cleaned_text and not cleaned_text.endswith('.'):
        cleaned_text += '.'

    return cleaned_text
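
# Quick sanity check of the deduplication behavior (hypothetical input):
#   clean_response("Hello there. hello  THERE. Something else.")
#   -> "Hello there. Something else."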
|
|
|
def remove_duplicates(text: str) -> str:
    """Remove exact duplicate sentences from text."""
    sentences = text.split('.')
    unique_sentences = []
    seen = set()

    for sentence in sentences:
        sentence = sentence.strip()
        if sentence and sentence not in seen:
            seen.add(sentence)
            unique_sentences.append(sentence)

    return '. '.join(unique_sentences)
|
|
|
def upload_csv(file_path: str) -> Tuple[str, str]:
    try:
        df = pd.read_csv(file_path, sep=',')
        required_columns = {'id', 'text', 'label', 'metadata'}
        available_columns = set(df.columns)
        missing_columns = required_columns - available_columns
        if missing_columns:
            return f"The CSV file is missing required columns: {', '.join(missing_columns)}", ""

        df.drop_duplicates(inplace=True)
        df.fillna('', inplace=True)
        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})

        parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
        return f"{parquet_filename} was uploaded and converted successfully.", parquet_filename
    except Exception as e:
        return f"Error while uploading and converting the CSV file: {str(e)}", ""
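
# Expected CSV input shape for upload_csv (illustrative values):
#   id,text,label,metadata
#   1,"Some sentence","Culture","source: example"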
|
|
|
def upload_parquet(file_path: str) -> Tuple[str, str, str]:
    try:
        df = pd.read_parquet(file_path, engine='pyarrow')

        data_info = {
            "Total records": len(df),
            "Columns": list(df.columns),
            "Data types": df.dtypes.to_dict(),
            "Missing values": df.isnull().sum().to_dict()
        }

        summary = []
        summary.append("### Dataset overview:")
        summary.append(f"- Total records: {data_info['Total records']}")
        summary.append(f"- Columns: {', '.join(data_info['Columns'])}")

        summary.append("\n### Column details:")
        for col in df.columns:
            if df[col].dtype in ['int64', 'float64']:
                stats = df[col].describe()
                summary.append(f"\n{col} (numeric):")
                summary.append(f"- Mean: {stats['mean']:.2f}")
                summary.append(f"- Min: {stats['min']}")
                summary.append(f"- Max: {stats['max']}")
            elif df[col].dtype == 'object' or df[col].dtype == 'string':
                unique_count = df[col].nunique()
                summary.append(f"\n{col} (text):")
                summary.append(f"- Unique values: {unique_count}")
                if unique_count < 10:
                    value_counts = df[col].value_counts().head(5)
                    summary.append("- Top 5 values:")
                    for val, count in value_counts.items():
                        summary.append(f"  • {val}: {count}")

        preview = df.head(10).to_markdown(index=False)
        summary.append("\n### Data preview:")
        summary.append(preview)

        parquet_content = "\n".join(summary)
        parquet_json = df.to_json(orient='records', force_ascii=False)

        return "Parquet file uploaded successfully.", parquet_content, parquet_json
    except Exception as e:
        return f"Error while uploading the Parquet file: {str(e)}", "", ""
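
# Note: upload_parquet returns (status message, markdown summary,
# records-oriented JSON string); the JSON is stored in parquet_data_state
# below and later passed to respond() as parquet_data.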
|
|
|
def text_to_parquet(text: str) -> Tuple[str, str, str]:
    try:
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        data = []

        for line in lines:
            try:
                # Expect: id,text,label,metadata (only metadata may contain commas)
                pattern = r'(\d+),([^,]+),([^,]+),(.+)'
                match = re.match(pattern, line)

                if match:
                    id_val, text_val, label_val, metadata_val = match.groups()
                    text_val = text_val.strip().strip('"')
                    label_val = label_val.strip().strip('"')
                    metadata_val = metadata_val.strip().strip('"')

                    data.append({
                        'id': int(id_val),
                        'text': text_val,
                        'label': label_val,
                        'metadata': metadata_val
                    })
            except Exception as e:
                print(f"Error parsing line: {line}\n{str(e)}")
                continue

        if not data:
            return "No data to convert.", "", ""

        df = pd.DataFrame(data)
        df = df.astype({
            'id': 'int32',
            'text': 'string',
            'label': 'string',
            'metadata': 'string'
        })

        parquet_filename = 'text_to_parquet.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
        preview = df.to_markdown(index=False)

        return (
            f"{parquet_filename} was converted successfully. {len(df)} records were processed.",
            preview,
            parquet_filename
        )

    except Exception as e:
        error_message = f"Error during text conversion: {str(e)}"
        print(f"{error_message}\n{traceback.format_exc()}")
        return error_message, "", ""
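
# Illustrative inputs for text_to_parquet (hypothetical values):
#   accepted: 1,"Yi Sun-sin","General","Turtle Ship, 16th century"
#   skipped:  lines without a leading numeric id or with fewer than four fields
# Only the metadata field may contain commas; id, text, and label are each
# matched up to the next comma.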
|
|
|
def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None, api_key: str = None) -> Iterator[str]:
    if not api_key:
        yield "⚠️ API Key is not set. Please enter your API Key to use the service."
        return

    client = OpenAI(api_key=api_key)

    system_prefix = """Always answer in Korean. Your role is to answer questions based on the uploaded data.

Key guidelines:
1. Answer concisely, covering only what is directly relevant to the question
2. Do not repeat content from previous answers
3. Do not add unnecessary examples or side explanations
4. Do not restate the same content in different words
5. Deliver only the key information
"""

    if parquet_data:
        try:
            df = pd.read_json(io.StringIO(parquet_data))
            data_summary = df.describe(include='all').to_string()
            system_prefix += f"\n\nData summary:\n{data_summary}"
        except Exception as e:
            print(f"Data load error: {str(e)}")

    messages = [{"role": "system", "content": system_prefix}]
    # Keep only the three most recent turns to limit prompt size
    recent_history = history[-3:] if history else []
    for chat in recent_history:
        messages.append({"role": chat["role"], "content": chat["content"]})

    messages.append({"role": "user", "content": message})

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True
        )

        full_response = ""
        for chunk in response:
            if chunk.choices and chunk.choices[0].delta.content:
                full_response += chunk.choices[0].delta.content
                yield clean_response(full_response)

    except Exception as e:
        error_message = f"Error while generating the response: {str(e)}"
        print(f"{error_message}\n{traceback.format_exc()}")
        yield error_message
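
# Note: respond() is a generator; the messages payload it streams to the API
# is shaped like (illustrative):
#   [{"role": "system", "content": system_prefix (+ data summary)},
#    ...up to the three most recent turns...,
#    {"role": "user", "content": message}]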
|
|
|
def preprocess_text_with_llm(input_text: str, api_key: str = None) -> str:
    if not api_key:
        return "⚠️ API Key is not set. Please enter your API Key to use the service."

    client = OpenAI(api_key=api_key)

    system_prompt = """Always answer in Korean. You are a data preprocessing expert. Convert the input text into a CSV dataset format.

Rules:
1. Output format: id,text,label,metadata
2. id: sequential number starting at 1
3. text: text split into meaningful units
4. label: choose exactly one category based on the topic of the text
   - Historical_Figure
   - Military_History
   - Technology
   - Politics
   - Culture
5. metadata: additional information such as dates and sources"""

    try:
        response = client.chat.completions.create(
            model="gpt-4-0125-preview",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": input_text}
            ],
            max_tokens=4000,
            temperature=0.1,
            stream=True
        )

        full_response = ""
        for chunk in response:
            if chunk.choices and chunk.choices[0].delta.content:
                full_response += chunk.choices[0].delta.content

        processed_text = clean_response(full_response)

        # Validate that the output parses as CSV; csv.reader is lazy, so the
        # rows must actually be consumed for parsing errors to surface
        try:
            list(csv.reader(io.StringIO(processed_text)))
            return processed_text
        except csv.Error:
            return "The LLM did not produce valid CSV. Please try again."

    except Exception as e:
        error_message = f"Error during preprocessing: {str(e)}"
        print(error_message)
        return error_message
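
# Illustrative preprocessed output (hypothetical values):
#   1,"Yi Sun-sin was a military officer of mid-Joseon","Historical_Figure","Joseon era"
#   2,"He led the navy during the Imjin War","Military_History","1592-1598"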
|
|
|
|
|
|
|
with gr.Blocks(css=css) as demo:
    api_key_state = gr.State("")

    gr.Markdown("# MyEzRAG: Generate content and answers with an LLM grounded in your own data", elem_id="initial-description")

    with gr.Row(elem_classes="api-key-section"):
        with gr.Column(scale=3):
            api_key_input = gr.Textbox(
                label="OpenAI API Key",
                placeholder="sk-...",
                type="password",
                show_label=True
            )
        with gr.Column(scale=1):
            api_key_button = gr.Button("Set API Key", variant="primary")

    api_key_status = gr.Markdown("⚠️ API Key is not set. Please enter your API Key to use the service.", elem_classes="api-key-status")

    def set_api_key(api_key: str):
        if not api_key.strip():
            return "⚠️ API Key is not set. Please enter your API Key to use the service.", ""
        if not api_key.startswith("sk-"):
            return "❌ Invalid API Key format. Please check and try again.", ""
        return "✅ API Key set successfully.", api_key

    api_key_button.click(
        set_api_key,
        inputs=[api_key_input],
        outputs=[api_key_status, api_key_state]
    )

    gr.Markdown(
        "### See the 'How to Use' tab for detailed instructions.\n"
        "### Tip) Try the 'Examples' to explore the various use cases; dataset upload previews show only the first 10 rows.",
        elem_id="initial-description"
    )
|
|
|
|
|
with gr.Tab("My λ°μ΄ν°μ
+LLM"): |
|
gr.Markdown("### LLMκ³Ό λννκΈ°") |
|
chatbot_data_upload = gr.Chatbot(label="μ±λ΄", type="messages", elem_id="chatbot-data-upload") |
|
msg_data_upload = gr.Textbox(label="λ©μμ§ μ
λ ₯", placeholder="μ¬κΈ°μ λ©μμ§λ₯Ό μ
λ ₯νμΈμ...") |
|
send_data_upload = gr.Button("μ μ‘") |
|
|
|
with gr.Accordion("μμ€ν
ν둬ννΈ λ° μ΅μ
μ€μ ", open=False): |
|
system_message = gr.Textbox(label="System Message", value="λλ AI μ‘°μΈμ μν μ΄λ€.") |
|
max_tokens = gr.Slider(minimum=1, maximum=8000, value=1000, label="Max Tokens") |
|
temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature") |
|
top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P") |
|
|
|
parquet_data_state = gr.State() |
|
|
|
def handle_message_data_upload(message: str, history: List[Dict[str, str]], system_message: str, max_tokens: int, temperature: float, top_p: float, parquet_data: str, api_key: str): |
|
if not api_key: |
|
history = history or [] |
|
history.append({"role": "assistant", "content": "β οΈ API Keyκ° μ€μ λμ§ μμμ΅λλ€. μλΉμ€ μ΄μ©μ μν΄ API Keyλ₯Ό μ
λ ₯ν΄μ£ΌμΈμ."}) |
|
yield history, "" |
|
return |
|
|
|
history = history or [] |
|
recent_questions = [chat['content'].strip().lower() for chat in history[-3:] if chat['role'] == 'user'] |
|
if message.strip().lower() in recent_questions: |
|
yield history + [{"role": "assistant", "content": "λμΌν μ§λ¬Έμ΄ μ΅κ·Όμ μμμ΅λλ€. λ€λ₯Έ μ§λ¬Έμ ν΄μ£ΌμΈμ."}], "" |
|
return |
|
|
|
try: |
|
history.append({"role": "user", "content": message}) |
|
response_gen = respond( |
|
message, |
|
history, |
|
system_message, |
|
max_tokens, |
|
temperature=0.3, |
|
top_p=top_p, |
|
parquet_data=parquet_data, |
|
api_key=api_key |
|
) |
|
|
|
partial_response = "" |
|
for partial in response_gen: |
|
partial_response = partial |
|
display_history = history + [{"role": "assistant", "content": partial_response}] |
|
yield display_history, "" |
|
|
|
history.append({"role": "assistant", "content": partial_response}) |
|
except Exception as e: |
|
response = f"μ€λ₯ λ°μ: {str(e)}" |
|
history.append({"role": "assistant", "content": response}) |
|
yield history, "" |
|
|
|
send_data_upload.click( |
|
handle_message_data_upload, |
|
inputs=[ |
|
msg_data_upload, |
|
chatbot_data_upload, |
|
system_message, |
|
max_tokens, |
|
temperature, |
|
top_p, |
|
parquet_data_state, |
|
api_key_state, |
|
], |
|
outputs=[chatbot_data_upload, msg_data_upload], |
|
queue=True |
|
) |
|
|
|
|
|
with gr.Accordion("μμ ", open=False): |
|
gr.Examples( |
|
examples=[ |
|
["μ
λ‘λλ λ°μ΄ν°μ
μ λν΄ μμ½ μ€λͺ
νλΌ."], |
|
["μ
λ‘λλ λ°μ΄ν°μ
νμΌμ νμ΅ λ°μ΄ν°λ‘ νμ©νμ¬, λ³Έ μλΉμ€λ₯Ό SEO μ΅μ ννμ¬ λΈλ‘κ·Έ ν¬μ€νΈ(κ°μ, λ°°κ²½ λ° νμμ±, κΈ°μ‘΄ μ μ¬ μ ν/μλΉμ€μ λΉκ΅νμ¬ νΉμ₯μ , νμ©μ², κ°μΉ, κΈ°λν¨κ³Ό, κ²°λ‘ μ ν¬ν¨)λ‘ 4000 ν ν° μ΄μ μμ±νλΌ"], |
|
["μ
λ‘λλ λ°μ΄ν°μ
νμΌμ νμ΅ λ°μ΄ν°λ‘ νμ©νμ¬, μ¬μ© λ°©λ²κ³Ό μ°¨λ³μ , νΉμ§, κ°μ μ μ€μ¬μΌλ‘ 4000 ν ν° μ΄μ μ νλΈ μμ μ€ν¬λ¦½νΈ ννλ‘ μμ±νλΌ"], |
|
["μ
λ‘λλ λ°μ΄ν°μ
νμΌμ νμ΅ λ°μ΄ν°λ‘ νμ©νμ¬, μ ν μμΈ νμ΄μ§ νμμ λ΄μ©μ 4000 ν ν° μ΄μ μμΈν μ€λͺ
νλΌ"], |
|
["μ
λ‘λλ λ°μ΄ν°μ
νμΌμ νμ΅ λ°μ΄ν°λ‘ νμ©νμ¬, FAQ 20건μ μμΈνκ² μμ±νλΌ. 4000ν ν° μ΄μ μ¬μ©νλΌ."], |
|
["μ
λ‘λλ λ°μ΄ν°μ
νμΌμ νμ΅ λ°μ΄ν°λ‘ νμ©νμ¬, νΉν μΆμμ νμ©ν κΈ°μ λ° λΉμ¦λμ€ λͺ¨λΈ μΈ‘λ©΄μ ν¬ν¨νμ¬ νΉν μΆμμ ꡬμ±μ λ§κ² νμ μ μΈ μ°½μ λ°λͺ
λ΄μ©μ μ€μ¬μΌλ‘ 4000 ν ν° μ΄μ μμ±νλΌ."], |
|
], |
|
inputs=msg_data_upload, |
|
label="μμ μ ν", |
|
) |
|
|
|
|
|
gr.Markdown("### Parquet νμΌ μ
λ‘λ") |
|
with gr.Row(): |
|
with gr.Column(): |
|
parquet_upload = gr.File( |
|
label="Parquet νμΌ μ
λ‘λ", type="filepath", elem_id="parquet-upload-area" |
|
) |
|
parquet_upload_button = gr.Button("μ
λ‘λ") |
|
parquet_upload_status = gr.Textbox(label="μ
λ‘λ μν", interactive=False) |
|
parquet_preview_chat = gr.Markdown(label="Parquet νμΌ λ―Έλ¦¬λ³΄κΈ°") |
|
|
|
def handle_parquet_upload(file_path: str): |
|
message, parquet_content, parquet_json = upload_parquet(file_path) |
|
if parquet_json: |
|
return message, parquet_content, parquet_json |
|
else: |
|
return message, "", "" |
|
|
|
parquet_upload_button.click( |
|
handle_parquet_upload, |
|
inputs=parquet_upload, |
|
outputs=[parquet_upload_status, parquet_preview_chat, parquet_data_state] |
|
) |
|
|
|
|
|
with gr.Tab("CSV to My λ°μ΄ν°μ
"): |
|
gr.Markdown("### CSV νμΌ μ
λ‘λ λ° Parquet λ³ν") |
|
with gr.Row(): |
|
with gr.Column(): |
|
csv_file = gr.File(label="CSV νμΌ μ
λ‘λ", type="filepath") |
|
upload_button = gr.Button("μ
λ‘λ λ° λ³ν") |
|
upload_status = gr.Textbox(label="μ
λ‘λ μν", interactive=False) |
|
parquet_preview = gr.Markdown(label="Parquet νμΌ λ―Έλ¦¬λ³΄κΈ°") |
|
download_button = gr.File(label="Parquet νμΌ λ€μ΄λ‘λ", interactive=False) |
|
|
|
def handle_csv_upload(file_path: str): |
|
message, parquet_filename = upload_csv(file_path) |
|
if parquet_filename: |
|
parquet_content = load_parquet(parquet_filename) |
|
return message, parquet_content, parquet_filename |
|
else: |
|
return message, "", None |
|
|
|
upload_button.click( |
|
handle_csv_upload, |
|
inputs=csv_file, |
|
outputs=[upload_status, parquet_preview, download_button] |
|
) |
|
|
|
|
|
with gr.Tab("Text to My λ°μ΄ν°μ
"): |
|
gr.Markdown("### ν
μ€νΈλ₯Ό μ
λ ₯νλ©΄ CSVλ‘ λ³ν ν ParquetμΌλ‘ μλ μ νλ©λλ€.") |
|
with gr.Row(): |
|
with gr.Column(): |
|
text_input = gr.Textbox( |
|
label="ν
μ€νΈ μ
λ ₯ (κ° νμ `id,text,label,metadata` νμμΌλ‘ μ
λ ₯)", |
|
lines=10, |
|
placeholder='μ: 1,"μ΄μμ ","μ₯κ΅°","κ±°λΆμ "\n2,"μκ· ","μ₯κ΅°","λͺ¨ν¨"\n3,"μ μ‘°","μ","μκΈ°"\n4,"λμν λ―Έ νλ°μμ","μ","μΉ¨λ΅"' |
|
) |
|
convert_button = gr.Button("λ³ν λ° λ€μ΄λ‘λ") |
|
convert_status = gr.Textbox(label="λ³ν μν", interactive=False) |
|
parquet_preview_convert = gr.Markdown(label="Parquet νμΌ λ―Έλ¦¬λ³΄κΈ°") |
|
download_parquet_convert = gr.File(label="Parquet νμΌ λ€μ΄λ‘λ", interactive=False) |
|
|
|
def handle_text_to_parquet(text: str): |
|
message, parquet_content, parquet_filename = text_to_parquet(text) |
|
if parquet_filename: |
|
return message, parquet_content, parquet_filename |
|
else: |
|
return message, "", None |
|
|
|
convert_button.click( |
|
handle_text_to_parquet, |
|
inputs=text_input, |
|
outputs=[convert_status, parquet_preview_convert, download_parquet_convert] |
|
) |
|
|
|
|
|
with gr.Tab("Text Preprocessing with LLM"): |
|
gr.Markdown("### ν
μ€νΈλ₯Ό μ
λ ₯νλ©΄ LLMμ΄ λ°μ΄ν°μ
νμμ λ§κ² μ μ²λ¦¬νμ¬ μΆλ ₯ν©λλ€.") |
|
with gr.Row(): |
|
with gr.Column(): |
|
raw_text_input = gr.Textbox( |
|
label="ν
μ€νΈ μ
λ ₯", |
|
lines=15, |
|
placeholder="μ¬κΈ°μ μ μ²λ¦¬ν ν
μ€νΈλ₯Ό μ
λ ₯νμΈμ..." |
|
) |
|
|
|
with gr.Row(): |
|
preprocess_button = gr.Button("μ μ²λ¦¬ μ€ν", variant="primary") |
|
clear_button = gr.Button("μ΄κΈ°ν") |
|
|
|
preprocess_status = gr.Textbox( |
|
label="μ μ²λ¦¬ μν", |
|
interactive=False, |
|
value="λκΈ° μ€..." |
|
) |
|
|
|
processed_text_output = gr.Textbox( |
|
label="μ μ²λ¦¬λ λ°μ΄ν°μ
μΆλ ₯", |
|
lines=15, |
|
interactive=False |
|
) |
|
|
|
convert_to_parquet_button = gr.Button("ParquetμΌλ‘ λ³ν") |
|
download_parquet = gr.File(label="λ³νλ Parquet νμΌ λ€μ΄λ‘λ") |
|
|
|
def handle_text_preprocessing(input_text: str, api_key: str): |
|
if not api_key: |
|
yield "β οΈ API Keyκ° μ€μ λμ§ μμμ΅λλ€.", "" |
|
return |
|
|
|
if not input_text.strip(): |
|
yield "μ
λ ₯ ν
μ€νΈκ° μμ΅λλ€.", "" |
|
return |
|
|
|
try: |
|
yield "μ μ²λ¦¬λ₯Ό μμν©λλ€...", "" |
|
processed_text = preprocess_text_with_llm(input_text, api_key) |
|
|
|
if processed_text: |
|
yield "μ μ²λ¦¬κ° μλ£λμμ΅λλ€.", processed_text |
|
else: |
|
yield "μ μ²λ¦¬ κ²°κ³Όκ° μμ΅λλ€.", "" |
|
|
|
except Exception as e: |
|
yield f"μ²λ¦¬ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}", "" |
|
|
|
def clear_inputs(): |
|
return "", "λκΈ° μ€...", "" |
|
|
|
def convert_to_parquet_file(processed_text: str): |
|
if not processed_text.strip(): |
|
return "λ³νν ν
μ€νΈκ° μμ΅λλ€.", None |
|
|
|
try: |
|
message, parquet_content, parquet_filename = text_to_parquet(processed_text) |
|
if parquet_filename: |
|
return message, parquet_filename |
|
return message, None |
|
except Exception as e: |
|
return f"Parquet λ³ν μ€ μ€λ₯ λ°μ: {str(e)}", None |
|
|
|
preprocess_button.click( |
|
handle_text_preprocessing, |
|
inputs=[raw_text_input, api_key_state], |
|
outputs=[preprocess_status, processed_text_output], |
|
queue=True |
|
) |
|
|
|
clear_button.click( |
|
clear_inputs, |
|
outputs=[raw_text_input, preprocess_status, processed_text_output] |
|
) |
|
|
|
convert_to_parquet_button.click( |
|
convert_to_parquet_file, |
|
inputs=[processed_text_output], |
|
outputs=[preprocess_status, download_parquet] |
|
) |
|
|
|
with gr.Accordion("μμ ν
μ€νΈ", open=False): |
|
gr.Examples( |
|
examples=[ |
|
["μ΄μμ μ μ‘°μ μ€κΈ°μ 무μ μ΄λ€. κ·Έλ μμ§μλ λΉμ ν΄κ΅°μ μ΄λμλ€. κ±°λΆμ μ λ§λ€μ΄ μκ΅°κ³Ό μΈμ λ€."], |
|
["μΈκ³΅μ§λ₯μ μ»΄ν¨ν° κ³Όνμ ν λΆμΌμ΄λ€. κΈ°κ³νμ΅μ μΈκ³΅μ§λ₯μ νμ λΆμΌμ΄λ€. λ₯λ¬λμ κΈ°κ³νμ΅μ ν λ°©λ²μ΄λ€."] |
|
], |
|
inputs=raw_text_input, |
|
label="μμ μ ν" |
|
) |
|
|
|
|
|
with gr.Tab("π μ¬μ© λ°©λ²"): |
|
gr.Markdown(""" |
|
# MyEzRAG μ¬μ© κ°μ΄λ |
|
|
|
## π API Key μ€μ |
|
1. OpenAI API Keyλ₯Ό μλ¨ μ
λ ₯μ°½μ μ
λ ₯ |
|
2. 'API Key μ€μ ' λ²νΌ ν΄λ¦ |
|
3. μ€μ μ±κ³΅ λ©μμ§ νμΈ |
|
|
|
## 1οΈβ£ My λ°μ΄ν°μ
+LLM ν |
|
### κΈ°λ₯ |
|
- μ
λ‘λλ Parquet λ°μ΄ν°μ
μ κΈ°λ°μΌλ‘ LLMκ³Ό λν |
|
- λ°μ΄ν°μ
μ λ΄μ©μ νμ©ν μ½ν
μΈ μμ± |
|
|
|
### μ¬μ© λ°©λ² |
|
1. Parquet νμΌ μ
λ‘λ μΉμ
μμ λ°μ΄ν°μ
νμΌμ μ
λ‘λ |
|
2. μ±ν
μ°½μ μνλ μ§λ¬Έμ΄λ μμ²μ¬ν μ
λ ₯ |
|
3. μμ λ²νΌμ νμ©νμ¬ λ€μν νμ© μ¬λ‘ 체ν |
|
|
|
### ν |
|
- μμ€ν
ν둬ννΈ μ€μ μΌλ‘ μλ΅ μ€νμΌ μ‘°μ κ°λ₯ |
|
- μμΈν μ§λ¬ΈμΌμλ‘ λ μ νν λ΅λ³ μ 곡 |
|
|
|
--- |
|
|
|
## 2οΈβ£ CSV to My λ°μ΄ν°μ
ν |
|
### κΈ°λ₯ |
|
- CSV νμΌμ Parquet νμμΌλ‘ λ³ν |
|
- λ°μ΄ν° μ΅μ ν λ° μ μ |
|
|
|
### μ¬μ© λ°©λ² |
|
1. CSV νμΌ μ€λΉ (νμ 컬λΌ: id, text, label, metadata) |
|
2. νμΌ μ
λ‘λ ν 'μ
λ‘λ λ° λ³ν' λ²νΌ ν΄λ¦ |
|
3. λ³νλ Parquet νμΌ λ€μ΄λ‘λ |
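
An illustrative CSV (hypothetical values):
```
id,text,label,metadata
1,"Yi Sun-sin was a naval commander","Historical_Figure","Joseon era"
2,"The turtle ship was an armored warship","Military_History","16th century"
```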
|
|
|
### Notes
- The CSV file must include all of the required columns
- UTF-8 encoding is recommended

---

## 3️⃣ Text to My Dataset Tab
### Features
- Convert text-format data to Parquet
- Supports manual data entry

### How to Use
1. Enter text in the prescribed format
```
1,"Yi Sun-sin","General","Turtle Ship"
2,"Won Gyun","General","Slander"
```
2. Click the 'Convert and Download' button
3. Check the converted file and download it

### Input Format
- id: sequential number
- text: the actual text content
- label: classification label
- metadata: additional information

---

## 4️⃣ Text Preprocessing with LLM Tab
### Features
- Automatic text preprocessing with an LLM
- Structured dataset generation

### How to Use
1. Enter the raw text
2. Click 'Run Preprocessing'
3. Review the result and convert it to Parquet if needed

### Characteristics
- Automatic labeling
- Sentence-level splitting
- Duplicate removal
- Data normalization

## 💡 General Tips
- Keep your API Key safe and rotate it regularly
- Study each tab's examples to learn the workflow
- Higher-quality data gives better results
- If an error occurs, check the input data format
- For large inputs, split the work into appropriately sized chunks

## ⚠️ Cautions
- Do not share your API Key with others
- Do not include sensitive personal information
- Back up your data
- Check your network status
- Clear your browser cache periodically

## 🔍 Troubleshooting
- API Key errors: check the key's format and validity
- If an error occurs, check the input data format
- Upload failures: check the file size and format
- Conversion failures: check the data encoding
- Slow responses: reduce the data size
""")
|
|
|
gr.Markdown("### [email protected]", elem_id="initial-description") |
|
|
|
if __name__ == "__main__":
    demo.launch(share=True)
|
|