{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":12737505,"sourceType":"datasetVersion","datasetId":8051539}],"dockerImageVersionId":31090,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install transformers datasets accelerate -q\n\nimport pandas as pd\nfrom datasets import Dataset\nfrom transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling\n\n# 1. Load dataset\ncsv_path = \"/kaggle/input/python-dataset/Dataset_Python_Question_Answer.csv\"\ndf = pd.read_csv(csv_path)\n\n# Clean Answer column\ndf['Answer'] = df['Answer'].str.strip(\"[]\").str.strip('\"').str.strip(\"'\")\n\n# Combine Q&A into single text field\ndf['text'] = \"Question: \" + df['Question'] + \"\\nAnswer: \" + df['Answer']\n\ndataset = Dataset.from_pandas(df[['text']])\n\n# 2. Load tokenizer & model\nmodel_name = \"distilgpt2\"\ntokenizer = AutoTokenizer.from_pretrained(model_name)\ntokenizer.pad_token = tokenizer.eos_token # GPT2 fix\n\n# 3. Tokenize dataset\ndef tokenize_function(examples):\n return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=128)\n\ntokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=[\"text\"])\n\n# 4. Train-test split\nsplit_dataset = tokenized_dataset.train_test_split(test_size=0.1)\n\n# 5. Data collator\ndata_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n\n# 6. Load model\nmodel = AutoModelForCausalLM.from_pretrained(model_name)\n\n# 7. Training arguments\ntraining_args = TrainingArguments(\n output_dir=\"/kaggle/working/results\",\n overwrite_output_dir=True,\n learning_rate=5e-5,\n per_device_train_batch_size=2,\n per_device_eval_batch_size=2,\n num_train_epochs=2, # Just 2 epochs\n weight_decay=0.01,\n logging_dir=\"/kaggle/working/logs\",\n save_total_limit=1,\n report_to=\"none\" # No wandb\n)\n\n# 8. Trainer\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=split_dataset[\"train\"],\n eval_dataset=split_dataset[\"test\"],\n tokenizer=tokenizer,\n data_collator=data_collator,\n)\n\n# 9. Train model\ntrainer.train()\n\n# 10. Save model\nmodel_save_path = \"/kaggle/working/distilgpt2-finetuned-pythonqa\"\ntrainer.save_model(model_save_path)\ntokenizer.save_pretrained(model_save_path)\n\nprint(f\"✅ Model saved to: {model_save_path}\")\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-11T20:51:01.866267Z","iopub.execute_input":"2025-08-11T20:51:01.866620Z","iopub.status.idle":"2025-08-11T20:51:31.189517Z","shell.execute_reply.started":"2025-08-11T20:51:01.866590Z","shell.execute_reply":"2025-08-11T20:51:31.188448Z"}},"outputs":[{"name":"stderr","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Map: 0%| | 0/419 [00:00","text/html":"\n
\n \n \n [378/378 00:22, Epoch 2/2]\n
\n \n \n \n \n \n \n \n \n \n
StepTraining Loss

"},"metadata":{}},{"name":"stdout","text":"✅ Model saved to: /kaggle/working/distilgpt2-finetuned-pythonqa\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer\n\nmodel_path = \"/kaggle/working/distilgpt2-finetuned-pythonqa\"\ntokenizer = AutoTokenizer.from_pretrained(model_path)\nmodel = AutoModelForCausalLM.from_pretrained(model_path)\n\ngenerator = pipeline(\"text-generation\", model=model, tokenizer=tokenizer)\n\nprompt = \"Question: What is a list in python?\\nAnswer:\"\noutput = generator(prompt, max_new_tokens=50, num_return_sequences=1)[0][\"generated_text\"]\n\nprint(output)\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-11T21:04:48.467240Z","iopub.execute_input":"2025-08-11T21:04:48.467834Z","iopub.status.idle":"2025-08-11T21:04:49.171722Z","shell.execute_reply.started":"2025-08-11T21:04:48.467811Z","shell.execute_reply":"2025-08-11T21:04:49.171007Z"}},"outputs":[{"name":"stderr","text":"Device set to use cuda:0\n","output_type":"stream"},{"name":"stdout","text":"Question: What is a list in python?\nAnswer: A list is a collection of elements that are created independently of a specific type. A list can be created with different types of values, or with different types of values. A list can be created with different types of values, or with different types of\n","output_type":"stream"}],"execution_count":26},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}