mojad121 committed · verified
Commit 934b0c6 · 1 Parent(s): 2320b23

Update app.py

Files changed (1): app.py +116 -116
app.py CHANGED
@@ -1,180 +1,180 @@
-
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, Trainer, TrainingArguments
  from datasets import load_dataset, Dataset
  import torch
  import pandas as pd
- from transformers import DataCollatorForSeq2Seq

- MODEL_NAME = "microsoft/DialoGPT-small"
- DATASET_NAME = "embedding-data/amazon-QA"
  FINETUNED_MODEL_NAME = "MujtabaShopifyChatbot"

  chatbot_pipe = None

  def show_dataset_head(dataset, num_rows=5):
-     print("Displaying dataset preview ", dataset)
      if isinstance(dataset, dict):
          for split in dataset.keys():
-             print("Current split ", split)
              df = pd.DataFrame(dataset[split][:num_rows])
-             cols = [col for col in ['query', 'pos', 'question', 'answer'] if col in df.columns]
-             if cols:
-                 print("Dataset columns ", cols)

  def load_and_preprocess_data():
-     print("Loading dataset from ", DATASET_NAME)
-     dataset = load_dataset(DATASET_NAME)
-     show_dataset_head(dataset)
-
-     df = pd.DataFrame(dataset['train'])
-
-     if 'query' in df.columns and 'pos' in df.columns:
-         df = df.rename(columns={'query': 'question', 'pos': 'answer'})
-     elif 'question' not in df.columns or 'answer' not in df.columns:
-         df = df.rename(columns={df.columns[0]: 'question', df.columns[1]: 'answer'})
-
-     df = df[['question', 'answer']].dropna()
-     df = df[:5000]
-
-     df['answer'] = df['answer'].astype(str).str.replace(r'\[\^|\].*', '', regex=True)
-
-     processed_dataset = Dataset.from_pandas(df)
-     show_dataset_head(processed_dataset)
-     return processed_dataset.train_test_split(test_size=0.1)
-
- def tokenize_data(dataset):
-     print("Tokenizing data with model ", MODEL_NAME)
      tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-
      def preprocess_function(examples):
-         inputs = [f"question: {q} answer:" for q in examples["question"]]
-         targets = [str(a) for a in examples["answer"]]
-
-         model_inputs = tokenizer(
-             inputs,
-             max_length=128,
-             truncation=True,
-             padding='max_length'
-         )
-         labels = tokenizer(
-             targets,
-             max_length=128,
              truncation=True,
-             padding='max_length'
          )
-
-         model_inputs["labels"] = labels["input_ids"]
-         return model_inputs

-     return dataset.map(preprocess_function, batched=True)

- def fine_tune_model(tokenized_dataset):
-     print("Starting fine-tuning process")
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-     model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
-
-     data_collator = DataCollatorForSeq2Seq(
-         tokenizer,
-         model=model,
-         padding='longest',
-         max_length=128,
-         pad_to_multiple_of=8
-     )
-
      training_args = TrainingArguments(
          output_dir="./results",
-         eval_strategy="epoch",
-         learning_rate=5e-5,
-         per_device_train_batch_size=4,
-         per_device_eval_batch_size=4,
-         num_train_epochs=3,
          weight_decay=0.01,
-         save_total_limit=3,
          fp16=torch.cuda.is_available(),
-         push_to_hub=False,
-         report_to="none",
          logging_steps=100,
-         save_steps=500,
-         gradient_accumulation_steps=1
      )
-
      trainer = Trainer(
          model=model,
          args=training_args,
-         train_dataset=tokenized_dataset["train"],
-         eval_dataset=tokenized_dataset["test"],
-         data_collator=data_collator,
-         tokenizer=tokenizer
      )
-
      trainer.train()
-     print("Training completed, saving model")
      model.save_pretrained(FINETUNED_MODEL_NAME)
      tokenizer.save_pretrained(FINETUNED_MODEL_NAME)
      return model

  def initialize_chatbot():
-     global chatbot_pipe
-     print("Initializing chatbot with model ", FINETUNED_MODEL_NAME)
      try:
-         model = AutoModelForSeq2SeqLM.from_pretrained(FINETUNED_MODEL_NAME)
          tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL_NAME)
          chatbot_pipe = pipeline(
-             "text2text-generation",
              model=model,
              tokenizer=tokenizer,
              device=0 if torch.cuda.is_available() else -1
          )
-         print("Chatbot initialized successfully")
      except Exception as e:
-         print("Error initializing chatbot ", e)
-         return None
-     return chatbot_pipe

  def generate_response(message, history):
-     if chatbot_pipe is None:
-         print("Chatbot pipeline not initialized")
-         return "System error: Chatbot not ready"
-
      try:
-         print("Generating response for query ", message)
          response = chatbot_pipe(
-             f"question: {message} answer:",
-             max_length=128,
              do_sample=True,
              temperature=0.7,
-             top_p=0.9
          )[0]['generated_text']
-         final_response = response.split("answer:")[-1].strip()
-         print("Generated response ", final_response)
-         return final_response
      except Exception as e:
-         print("Error generating response ", e)
-         return "Sorry, I encountered an error processing your request"

  def deploy_chatbot():
-     print("Launching chatbot interface")
      demo = gr.ChatInterface(
          fn=generate_response,
-         title="Mujtaba's Shopify Assistant",
-         description="Ask about products, shipping, or store policies",
          examples=[
-             "Will this work with iPhone 15?",
-             "What's the return window?",
-             "Do you ship to Lahore?"
-         ],
-         theme="soft",
-         cache_examples=False
      )
      return demo

  if __name__ == "__main__":
-
-     dataset = load_and_preprocess_data()
-     tokenized_data = tokenize_data(dataset)
-
-     model = fine_tune_model(tokenized_data)
-
      initialize_chatbot()
      deploy_chatbot().launch()
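Note that the version above pairs microsoft/DialoGPT-small, a decoder-only GPT-2-style checkpoint, with the encoder-decoder classes AutoModelForSeq2SeqLM and DataCollatorForSeq2Seq, so the model load in fine_tune_model() raises before training starts. A minimal sketch of the mismatch (illustrative only, not part of the commit):

from transformers import AutoConfig, AutoModelForSeq2SeqLM

# DialoGPT ships a GPT2Config; the seq2seq auto-class has no
# architecture registered for that config and raises a ValueError.
config = AutoConfig.from_pretrained("microsoft/DialoGPT-small")
print(type(config).__name__)  # GPT2Config

try:
    AutoModelForSeq2SeqLM.from_pretrained("microsoft/DialoGPT-small")
except ValueError as err:
    print(f"seq2seq load fails: {err}")

The revised app.py below switches to the matching causal-LM stack.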
 
 
  import gradio as gr
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     pipeline,
+     Trainer,
+     TrainingArguments,
+     DataCollatorForLanguageModeling
+ )
  from datasets import load_dataset, Dataset
  import torch
  import pandas as pd
+ from sklearn.model_selection import train_test_split

+ # Configuration
+ MODEL_NAME = "microsoft/DialoGPT-medium"
+ DATASET_NAME = "embedding-data/Amazon-QA"
  FINETUNED_MODEL_NAME = "MujtabaShopifyChatbot"
+ MAX_LENGTH = 128
+ BATCH_SIZE = 8

  chatbot_pipe = None
+ tokenizer = None

  def show_dataset_head(dataset, num_rows=5):
+     """Dataset preview"""
      if isinstance(dataset, dict):
          for split in dataset.keys():
              df = pd.DataFrame(dataset[split][:num_rows])
+             print(f"\n{split} split preview:")
+             print(df[['question', 'answer']].head() if 'question' in df.columns else df.head())

  def load_and_preprocess_data():
+     """Data loading with cleaning"""
+     print(f"Loading {DATASET_NAME}")
+     try:
+         dataset = load_dataset(DATASET_NAME)
+         show_dataset_head(dataset)
+
+         df = pd.DataFrame(dataset['train'])
+
+         # Column normalization
+         if 'query' in df.columns and 'pos' in df.columns:
+             df = df.rename(columns={'query': 'question', 'pos': 'answer'})
+         elif 'question' not in df.columns or 'answer' not in df.columns:
+             if len(df.columns) >= 2:
+                 df = df.rename(columns={df.columns[0]: 'question', df.columns[1]: 'answer'})
+             else:
+                 raise ValueError("Dataset must have at least two columns for question and answer")
+
+         # Cleaning
+         df = df[['question', 'answer']].dropna()
+         df = df[~df['answer'].str.contains(r'\[|\^|\]', regex=True, na=False)]
+         df = df[df['answer'].str.len() > 10]
+         df = df[:10000]
+
+         # Split
+         train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
+         return Dataset.from_pandas(train_df), Dataset.from_pandas(test_df)
+     except Exception as e:
+         print(f"Data error: {str(e)}")
+         raise
+
+ def tokenize_data(train_dataset, test_dataset):
+     """Basic tokenization"""
+     global tokenizer
+     print(f"Tokenizing with {MODEL_NAME}")
      tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+     tokenizer.pad_token = tokenizer.eos_token
+
      def preprocess_function(examples):
+         texts = [f"{q} {tokenizer.eos_token} {a}" for q, a in zip(examples["question"], examples["answer"])]
+         return tokenizer(
+             texts,
+             max_length=MAX_LENGTH,
              truncation=True,
+             padding="max_length",
+             return_tensors="pt"
          )

+     train_tokenized = train_dataset.map(preprocess_function, batched=True, remove_columns=['question', 'answer'])
+     test_tokenized = test_dataset.map(preprocess_function, batched=True, remove_columns=['question', 'answer'])
+     return train_tokenized, test_tokenized
+
+ def fine_tune_model(train_data, test_data):
+     """Optimized training"""
+     print("Starting fine-tuning")
+     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

      training_args = TrainingArguments(
          output_dir="./results",
+         eval_strategy="steps",
+         eval_steps=500,
+         learning_rate=3e-5,
+         per_device_train_batch_size=BATCH_SIZE,
+         per_device_eval_batch_size=BATCH_SIZE,
+         num_train_epochs=4,
          weight_decay=0.01,
+         warmup_ratio=0.1,
          fp16=torch.cuda.is_available(),
          logging_steps=100,
+         save_steps=1000,
+         save_total_limit=2,
+         load_best_model_at_end=True,
+         report_to="none"  # Disable W&B logging
      )
+
      trainer = Trainer(
          model=model,
          args=training_args,
+         train_dataset=train_data,
+         eval_dataset=test_data,
+         data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
      )
+
      trainer.train()
      model.save_pretrained(FINETUNED_MODEL_NAME)
      tokenizer.save_pretrained(FINETUNED_MODEL_NAME)
      return model

  def initialize_chatbot():
+     """Initialize generation pipeline"""
+     global chatbot_pipe, tokenizer
+     print(f"Loading {FINETUNED_MODEL_NAME}")
      try:
          tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL_NAME)
+         tokenizer.pad_token = tokenizer.eos_token
+         model = AutoModelForCausalLM.from_pretrained(FINETUNED_MODEL_NAME)
+
          chatbot_pipe = pipeline(
+             "text-generation",
              model=model,
              tokenizer=tokenizer,
              device=0 if torch.cuda.is_available() else -1
          )
      except Exception as e:
+         print(f"Initialization failed: {str(e)}")
+         raise

  def generate_response(message, history):
+     """Direct generation without prompt engineering"""
+     if not chatbot_pipe:
+         return "System initializing..."
+
      try:
          response = chatbot_pipe(
+             message,
+             max_length=MAX_LENGTH,
              do_sample=True,
              temperature=0.7,
+             top_k=50,
+             top_p=0.9,
+             repetition_penalty=1.2,
+             num_return_sequences=1
          )[0]['generated_text']
+
+         return response.split(tokenizer.eos_token)[-1].strip()
      except Exception as e:
+         print(f"Generation error: {str(e)}")
+         return "Please try again later."

  def deploy_chatbot():
+     """Gradio interface"""
      demo = gr.ChatInterface(
          fn=generate_response,
+         title="Shopify Assistant",
          examples=[
+             "Does this work with iPhone 15?",
+             "What's the return policy?",
+             "Do you ship internationally?"
+         ]
      )
      return demo

  if __name__ == "__main__":
+     train_data, test_data = load_and_preprocess_data()
+     train_tokenized, test_tokenized = tokenize_data(train_data, test_data)
+     model = fine_tune_model(train_tokenized, test_tokenized)
      initialize_chatbot()
      deploy_chatbot().launch()
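Once the Space has trained and saved the model, the artifacts can be smoke-tested outside Gradio. A minimal sketch, assuming the MujtabaShopifyChatbot directory written by fine_tune_model() exists locally; the prompt is illustrative:

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load the artifacts written by save_pretrained() (assumes training has run).
tok = AutoTokenizer.from_pretrained("MujtabaShopifyChatbot")
model = AutoModelForCausalLM.from_pretrained("MujtabaShopifyChatbot")
chat = pipeline("text-generation", model=model, tokenizer=tok)

# Training joined each pair as "question <eos> answer", so the text after
# the last EOS token is the answer, mirroring generate_response().
text = chat("What's the return policy?", max_length=128, do_sample=True,
            temperature=0.7, top_p=0.9)[0]["generated_text"]
print(text.split(tok.eos_token)[-1].strip())

The split on tok.eos_token mirrors the app's own generate_response(); if the pipeline strips special tokens during decoding, the split is a no-op and the full generated text is returned unchanged.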