mojad121 committed
Commit b8b26b5 · verified · 1 Parent(s): 94647e3

Update app.py

Files changed (1): app.py (+110 -72)
app.py CHANGED
Before:

@@ -1,72 +1,103 @@
- # app.py - Complete Chatbot with Fine-tuning and Deployment
  import gradio as gr
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, Trainer, TrainingArguments
  from datasets import load_dataset, Dataset
  import torch
  import pandas as pd
- from huggingface_hub import notebook_login, Repository

- # Configuration
- MODEL_NAME = "t5-small"  # Lightweight model good for chatbots
- DATASET_NAME = "AmazonQA"
  FINETUNED_MODEL_NAME = "MujtabaShopifyChatbot"
- HF_TOKEN = "your_huggingface_token"  # Replace with your token

- # --- Step 1: Load and Prepare Dataset ---
  def load_and_preprocess_data():
-     print("Loading AmazonQA dataset...")
      dataset = load_dataset(DATASET_NAME)

-     # Convert to pandas for easier processing
      df = pd.DataFrame(dataset['train'])

-     # Preprocessing - create consistent Q&A pairs
      df = df[['question', 'answer']].dropna()
-     df = df[:5000]  # Use subset for faster training

-     # Convert back to Hugging Face Dataset
-     processed_dataset = Dataset.from_pandas(df)

-     # Split into train and eval
-     split_dataset = processed_dataset.train_test_split(test_size=0.1)
-     return split_dataset

- # --- Step 2: Tokenization ---
  def tokenize_data(dataset):
-     print("Tokenizing data...")
      tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

      def preprocess_function(examples):
          inputs = [f"question: {q} answer:" for q in examples["question"]]
-         targets = examples["answer"]

-         model_inputs = tokenizer(inputs, max_length=128, truncation=True)
-         labels = tokenizer(targets, max_length=128, truncation=True)

          model_inputs["labels"] = labels["input_ids"]
          return model_inputs

-     tokenized_dataset = dataset.map(preprocess_function, batched=True)
-     return tokenized_dataset

- # --- Step 3: Fine-tuning ---
  def fine_tune_model(tokenized_dataset):
-     print("Fine-tuning model...")
      model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

      training_args = TrainingArguments(
          output_dir="./results",
-         evaluation_strategy="epoch",
-         learning_rate=2e-5,
-         per_device_train_batch_size=8,
-         per_device_eval_batch_size=8,
          num_train_epochs=3,
          weight_decay=0.01,
          save_total_limit=3,
          fp16=torch.cuda.is_available(),
-         push_to_hub=True,
-         hub_model_id=FINETUNED_MODEL_NAME,
-         hub_token=HF_TOKEN,
      )

      trainer = Trainer(
@@ -74,70 +105,77 @@ def fine_tune_model(tokenized_dataset):
          args=training_args,
          train_dataset=tokenized_dataset["train"],
          eval_dataset=tokenized_dataset["test"],
      )

      trainer.train()
-     trainer.push_to_hub()
      return model

- # --- Step 4: Chatbot Interface ---
  def initialize_chatbot():
-     print("Loading chatbot...")
      try:
-         # Try loading fine-tuned model first
          model = AutoModelForSeq2SeqLM.from_pretrained(FINETUNED_MODEL_NAME)
          tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL_NAME)
-     except:
-         # Fallback to pre-trained model
-         model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
-         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-
-     chatbot_pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
      return chatbot_pipe

  def generate_response(message, history):
-     # Format the input for the model
-     input_text = f"question: {message} answer:"
-
-     # Generate response
-     response = chatbot_pipe(input_text, max_length=128, do_sample=True)[0]['generated_text']

-     # Clean up the response
-     if "answer:" in response:
-         response = response.split("answer:")[-1].strip()
-     return response

- # --- Step 5: Deployment ---
  def deploy_chatbot():
-     print("Launching chatbot interface...")
      demo = gr.ChatInterface(
          fn=generate_response,
-         title="Mujtaba's Shopify Chatbot",
-         description="Ask me anything about products, shipping, or returns!",
          examples=[
-             "What's the return policy?",
-             "How long does shipping take to Karachi?",
-             "Do you have size charts for kurtas?"
          ],
-         theme="soft"
      )
      return demo

- # --- Main Execution ---
  if __name__ == "__main__":
-     # Login to Hugging Face Hub
      notebook_login()
-
-     # Dataset preparation
      dataset = load_and_preprocess_data()
-     tokenized_dataset = tokenize_data(dataset)
-
-     # Fine-tuning (uncomment to run)
-     # fine_tune_model(tokenized_dataset)

-     # Initialize chatbot
-     chatbot_pipe = initialize_chatbot()

-     # Launch interface
-     demo = deploy_chatbot()
-     demo.launch(share=True)
 
After:

  import gradio as gr
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, Trainer, TrainingArguments
  from datasets import load_dataset, Dataset
  import torch
  import pandas as pd
+ from huggingface_hub import notebook_login
+ from transformers import DataCollatorForSeq2Seq

+ MODEL_NAME = "microsoft/DialoGPT-small"
+ DATASET_NAME = "embedding-data/amazon-QA"
  FINETUNED_MODEL_NAME = "MujtabaShopifyChatbot"
+ HF_TOKEN = "your_huggingface_token"
+
+ chatbot_pipe = None
+
+ def show_dataset_head(dataset, num_rows=5):
+     print("Displaying dataset preview ", dataset)
+     if isinstance(dataset, dict):
+         for split in dataset.keys():
+             print("Current split ", split)
+             df = pd.DataFrame(dataset[split][:num_rows])
+             cols = [col for col in ['query', 'pos', 'question', 'answer'] if col in df.columns]
+             if cols:
+                 print("Dataset columns ", cols)

  def load_and_preprocess_data():
+     print("Loading dataset from ", DATASET_NAME)
      dataset = load_dataset(DATASET_NAME)
+     show_dataset_head(dataset)

      df = pd.DataFrame(dataset['train'])

+     if 'query' in df.columns and 'pos' in df.columns:
+         df = df.rename(columns={'query': 'question', 'pos': 'answer'})
+     elif 'question' not in df.columns or 'answer' not in df.columns:
+         df = df.rename(columns={df.columns[0]: 'question', df.columns[1]: 'answer'})
+
      df = df[['question', 'answer']].dropna()
+     df = df[:5000]

+     df['answer'] = df['answer'].astype(str).str.replace(r'\[\^|\].*', '', regex=True)

+     processed_dataset = Dataset.from_pandas(df)
+     show_dataset_head(processed_dataset)
+     return processed_dataset.train_test_split(test_size=0.1)

  def tokenize_data(dataset):
+     print("Tokenizing data with model ", MODEL_NAME)
      tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

      def preprocess_function(examples):
          inputs = [f"question: {q} answer:" for q in examples["question"]]
+         targets = [str(a) for a in examples["answer"]]

+         model_inputs = tokenizer(
+             inputs,
+             max_length=128,
+             truncation=True,
+             padding='max_length'
+         )
+         labels = tokenizer(
+             targets,
+             max_length=128,
+             truncation=True,
+             padding='max_length'
+         )

          model_inputs["labels"] = labels["input_ids"]
          return model_inputs

+     return dataset.map(preprocess_function, batched=True)

  def fine_tune_model(tokenized_dataset):
+     print("Starting fine-tuning process")
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
      model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

+     data_collator = DataCollatorForSeq2Seq(
+         tokenizer,
+         model=model,
+         padding='longest',
+         max_length=128,
+         pad_to_multiple_of=8
+     )
+
      training_args = TrainingArguments(
          output_dir="./results",
+         eval_strategy="epoch",
+         learning_rate=5e-5,
+         per_device_train_batch_size=4,
+         per_device_eval_batch_size=4,
          num_train_epochs=3,
          weight_decay=0.01,
          save_total_limit=3,
          fp16=torch.cuda.is_available(),
+         push_to_hub=False,
+         report_to="none",
+         logging_steps=100,
+         save_steps=500,
+         gradient_accumulation_steps=1
      )

      trainer = Trainer(
  … (one unchanged line not shown in the diff)
          args=training_args,
          train_dataset=tokenized_dataset["train"],
          eval_dataset=tokenized_dataset["test"],
+         data_collator=data_collator,
+         tokenizer=tokenizer
      )

      trainer.train()
+     print("Training completed, saving model")
+     model.save_pretrained(FINETUNED_MODEL_NAME)
+     tokenizer.save_pretrained(FINETUNED_MODEL_NAME)
      return model

  def initialize_chatbot():
+     global chatbot_pipe
+     print("Initializing chatbot with model ", FINETUNED_MODEL_NAME)
      try:
          model = AutoModelForSeq2SeqLM.from_pretrained(FINETUNED_MODEL_NAME)
          tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL_NAME)
+         chatbot_pipe = pipeline(
+             "text2text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             device=0 if torch.cuda.is_available() else -1
+         )
+         print("Chatbot initialized successfully")
+     except Exception as e:
+         print("Error initializing chatbot ", e)
+         return None
      return chatbot_pipe

  def generate_response(message, history):
+     if chatbot_pipe is None:
+         print("Chatbot pipeline not initialized")
+         return "System error: Chatbot not ready"

+     try:
+         print("Generating response for query ", message)
+         response = chatbot_pipe(
+             f"question: {message} answer:",
+             max_length=128,
+             do_sample=True,
+             temperature=0.7,
+             top_p=0.9
+         )[0]['generated_text']
+         final_response = response.split("answer:")[-1].strip()
+         print("Generated response ", final_response)
+         return final_response
+     except Exception as e:
+         print("Error generating response ", e)
+         return "Sorry, I encountered an error processing your request"

  def deploy_chatbot():
+     print("Launching chatbot interface")
      demo = gr.ChatInterface(
          fn=generate_response,
+         title="Mujtaba's Shopify Assistant",
+         description="Ask about products, shipping, or store policies",
          examples=[
+             "Will this work with iPhone 15?",
+             "What's the return window?",
+             "Do you ship to Lahore?"
          ],
+         theme="soft",
+         cache_examples=False
      )
      return demo

  if __name__ == "__main__":
      notebook_login()
      dataset = load_and_preprocess_data()
+     tokenized_data = tokenize_data(dataset)

+     model = fine_tune_model(tokenized_data)

+     initialize_chatbot()
+     deploy_chatbot().launch()
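
Note on the updated file: microsoft/DialoGPT-small is a decoder-only (GPT-2-style) checkpoint, so AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME) will raise an "Unrecognized configuration class" error, and its GPT-2 tokenizer ships without a pad token, which padding='max_length' requires. A minimal sketch of loading the same checkpoint with the causal-LM classes instead; the pad-token line is the usual workaround and is not part of this commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

MODEL_NAME = "microsoft/DialoGPT-small"  # decoder-only, GPT-2 architecture

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 tokenizers define no pad token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)  # not AutoModelForSeq2SeqLM

# Decoder-only models pair with the "text-generation" task,
# not "text2text-generation".
chatbot_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

Alternatively, keeping a genuine seq2seq checkpoint such as the previous t5-small would let the rest of the script work unchanged.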
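
A related detail: preprocess_function pads the labels to max_length with real pad-token ids, and DataCollatorForSeq2Seq substitutes the loss-ignored index -100 only for padding it adds itself, so the pre-padded label positions still count toward the loss. A sketch of the usual pattern, truncating only and leaving padding to the collator; it assumes a seq2seq tokenizer (t5-small here) and transformers >= 4.21 for the text_target argument:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # assumed seq2seq checkpoint

def preprocess_function(examples):
    inputs = [f"question: {q} answer:" for q in examples["question"]]
    targets = [str(a) for a in examples["answer"]]

    # Truncate only; DataCollatorForSeq2Seq pads each batch dynamically
    # and fills label padding with -100 so the loss ignores it.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    labels = tokenizer(text_target=targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs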
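
On the data side, embedding-data/amazon-QA appears to store the answer in pos as a list of strings (which is what the query/pos rename and the bracket-stripping regex are working around); astype(str) then yields text like "['…']", and the regex \[\^|\].* removes only the closing bracket onward, leaving the opening bracket and quote in place. If the goal is the bare answer text, taking the first list element is more direct. A sketch of a possible replacement for the astype/regex cleanup inside load_and_preprocess_data, assuming every pos value is a list:

# After renaming query/pos, take the first answer from each list
# instead of stringifying the whole list and regexing brackets away.
df = df.rename(columns={'query': 'question', 'pos': 'answer'})
df = df[df['answer'].str.len() > 0]      # drop rows with an empty answer list
df['answer'] = df['answer'].str[0].astype(str)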