In [19]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
import pandas as pd
from huggingface_hub import notebook_login
from transformers import DataCollatorForSeq2Seq

MODEL_NAME = "t5-small"
DATASET_NAME = "embedding-data/amazon-QA"
FINETUNED_MODEL_NAME = "MujtabaShopifyChatbot"
HF_TOKEN = "your_huggingface_token"

chatbot_pipe = None

def show_dataset_head(dataset, num_rows=5):

    print("\nDataset Preview:")
    if isinstance(dataset, dict):
        for split in dataset.keys():
            print(f"\nSplit: {split}")
            df = pd.DataFrame(dataset[split][:num_rows])
            cols = [col for col in ['query', 'pos', 'question', 'answer'] if col in df.columns]
            if cols:
                print(df[cols].to_markdown(index=False))
    else:
        df = pd.DataFrame(dataset[:num_rows])
        cols = [col for col in ['query', 'pos', 'question', 'answer'] if col in df.columns]
        if cols:
            print(df[cols].to_markdown(index=False))
    return dataset

def load_and_preprocess_data():
    print("Loading dataset...")
    dataset = load_dataset(DATASET_NAME)
    show_dataset_head(dataset)

    df = pd.DataFrame(dataset['train'])

    if 'query' in df.columns and 'pos' in df.columns:
        df = df.rename(columns={'query': 'question', 'pos': 'answer'})
    elif 'question' not in df.columns or 'answer' not in df.columns:
        df = df.rename(columns={df.columns[0]: 'question', df.columns[1]: 'answer'})

    df = df[['question', 'answer']].dropna()
    df = df[:5000]

    df['answer'] = df['answer'].astype(str).str.replace(r'\[\^|\].*', '', regex=True)

    processed_dataset = Dataset.from_pandas(df)
    show_dataset_head(processed_dataset, num_rows=3)
    return processed_dataset.train_test_split(test_size=0.1)

def tokenize_data(dataset):
    print("Tokenizing data...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def preprocess_function(examples):
        inputs = [f"question: {q} answer:" for q in examples["question"]]
        targets = [str(a) for a in examples["answer"]]

        model_inputs = tokenizer(
            inputs,
            max_length=128,
            truncation=True,
            padding='max_length'
        )
        labels = tokenizer(
            targets,
            max_length=128,
            truncation=True,
            padding='max_length'
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    return dataset.map(preprocess_function, batched=True)


def fine_tune_model(tokenized_dataset):
    print("Fine-tuning model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        padding='longest',
        max_length=128,
        pad_to_multiple_of=8
    )

    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        save_total_limit=3,
        fp16=torch.cuda.is_available(),
        push_to_hub=False,
        report_to="none",
        logging_steps=100,
        save_steps=500,
        gradient_accumulation_steps=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    trainer.train()
    return model

def initialize_chatbot():
    global chatbot_pipe
    print("Loading chatbot...")
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    chatbot_pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )
    return chatbot_pipe

def generate_response(message, history):
    response = chatbot_pipe(
        f"question: {message} answer:",
        max_length=128,
        do_sample=True,
        temperature=0.7
    )[0]['generated_text']
    return response.split("answer:")[-1].strip()

def deploy_chatbot():
    demo = gr.ChatInterface(
        fn=generate_response,
        title="Mujtaba's Shopify Assistant",
        description="Ask about products, shipping, or store policies",
        examples=[
            "Will this work with iPhone 15?",
            "What's the return window?",
            "Do you ship to Lahore?"
        ],
        theme="soft",
        cache_examples=False
    )
    return demo

if __name__ == "__main__":
    notebook_login()
    dataset = load_and_preprocess_data()
    tokenized_data = tokenize_data(dataset)

    model = fine_tune_model(tokenized_data)

    initialize_chatbot()
    deploy_chatbot().launch()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading dataset...

Dataset Preview:

Split: train
| query                                                      | pos                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Fine-tuning model...


  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.9606,2.700889


Loading chatbot...


Device set to use cpu
  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a3e682dd7134cd6ae5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [7]:
show_dataset_head()


Dataset Preview:
| query                                                      | pos                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

Dataset({
    features: ['query', 'pos'],
    num_rows: 5000
})