|
|
|
from datasets import load_dataset |
|
from datasets import load_from_disk |
|
import pandas as pd |
|
from huggingface_hub import interpreter_login |
|
|
|
# Authenticate against the Hugging Face Hub before touching the dataset.
interpreter_login()

# With an explicit split, load_dataset returns a single Dataset object.
dataset = load_dataset("tatvamasi/medquad-std", split="train")
print(dataset)

# Convert through the Dataset API instead of having pandas iterate the
# dataset one row at a time.
df = dataset.to_pandas()
print(df.head(2))
|
|
|
|
|
|
|
def format_row(row) -> str:
    """Build one instruction-style training string from a Q&A record.

    Args:
        row: A mapping-like record (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) indexable by ``'Question'``
            and ``'Answer'``.

    Returns:
        The text ``"[INST] <question> [/INST] <answer> "``. The trailing
        space is part of the template and is kept as-is.

    Raises:
        KeyError: Propagated from the lookup if either key is missing.
    """
    question = row['Question']
    answer = row['Answer']
    return f"[INST] {question} [/INST] {answer} "
|
|
|
|
|
|
|
# Render every Q&A row into a single prompt/response string.
df['Formatted'] = df.apply(format_row, axis=1)
print(df['Formatted'])

# Keep only the formatted text, exposed under the column name 'Text'.
new_df = df[['Formatted']].rename(columns={'Formatted': 'Text'})
print(new_df.head(3))

# One definition of the output path so the write and the read-back below
# cannot drift apart.
output_csv = 'formatted_qna_set.csv'
new_df.to_csv(output_csv, index=False)

# Read the file back to inspect what was actually written to disk.
df_from_disk = pd.read_csv(output_csv)
print(df_from_disk.head(2))