In [4]:
!pip install transformers datasets accelerate -q

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# 1. Load dataset
csv_path = "/kaggle/input/python-dataset/Dataset_Python_Question_Answer.csv"
df = pd.read_csv(csv_path)

# Clean Answer column
df['Answer'] = df['Answer'].str.strip("[]").str.strip('"').str.strip("'")

# Combine Q&A into single text field
df['text'] = "Question: " + df['Question'] + "\nAnswer: " + df['Answer']

dataset = Dataset.from_pandas(df[['text']])

# 2. Load tokenizer & model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT2 fix

# 3. Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# 4. Train-test split
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)

# 5. Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 6. Load model
model = AutoModelForCausalLM.from_pretrained(model_name)

# 7. Training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    overwrite_output_dir=True,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,  # Just 2 epochs
    weight_decay=0.01,
    logging_dir="/kaggle/working/logs",
    save_total_limit=1,
    report_to="none"  # No wandb
)

# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 9. Train model
trainer.train()

# 10. Save model
model_save_path = "/kaggle/working/distilgpt2-finetuned-pythonqa"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"✅ Model saved to: {model_save_path}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/419 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss


✅ Model saved to: /kaggle/working/distilgpt2-finetuned-pythonqa


In [26]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

model_path = "/kaggle/working/distilgpt2-finetuned-pythonqa"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Question: What is a list in python?\nAnswer:"
output = generator(prompt, max_new_tokens=50, num_return_sequences=1)[0]["generated_text"]

print(output)


Device set to use cuda:0


Question: What is a list in python?
Answer: A list is a collection of elements that are created independently of a specific type. A list can be created with different types of values, or with different types of values. A list can be created with different types of values, or with different types of
