# -*- coding: utf-8 -*-
"""Untitled15.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1fx7o1di_oHCoQdFAAh8tqJ9NPQ82-rOH
"""
import pandas as pd
# Mount Google Drive (optional if you want to save files there)
from google.colab import drive
drive.mount('/content/drive')
# Define file paths
input_csv_path = "/content/drive/MyDrive/judicial_cases.csv" # Ensure you have uploaded this file
train_csv_path = "/content/training_judicial_cases.csv"
val_csv_path = "/content/validation_judicial_cases.csv"
# Load the dataset
df = pd.read_csv(input_csv_path)
# Split dataset (80% training, 20% validation)
train_df = df.sample(frac=0.8, random_state=42) # Random sampling for training
val_df = df.drop(train_df.index) # Remaining 20% for validation
# Save training and validation sets as CSV
train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)
print(f"β
Training set saved: {train_csv_path}")
print(f"β
Validation set saved: {val_csv_path}")
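# If the CSV carries an outcome/label column, a stratified split keeps the class
# balance identical in both sets. A minimal sketch, assuming a hypothetical
# "decision" column (adjust to the real column name):
from sklearn.model_selection import train_test_split

label_col = "decision"  # hypothetical label column
if label_col in df.columns:
    strat_train_df, strat_val_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df[label_col]
    )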
# Copy to Google Drive (optional)
train_drive_path = "/content/drive/MyDrive/training_judicial_cases.csv"
val_drive_path = "/content/drive/MyDrive/validation_judicial_cases.csv"
!cp {train_csv_path} {train_drive_path}
!cp {val_csv_path} {val_drive_path}
print(f"π Training set also saved to Google Drive: {train_drive_path}")
print(f"π Validation set also saved to Google Drive: {val_drive_path}")
import os
file_path = "/content/drive/MyDrive/training_data.jsonl"
if os.path.exists(file_path):
    print("✅ File exists, proceeding with upload...")
else:
    print("❌ File not found! Check file path.")
import torch
if torch.cuda.is_available():
    print("✅ GPU is available:", torch.cuda.get_device_name(0))
else:
    print("❌ No GPU found! Go to Runtime → Change runtime type → Select GPU.")
import pandas as pd
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/judicial_cases.csv")
# Display first few rows
print(df.head())
!pip install datasets
!pip install torch transformers peft bitsandbytes datasets accelerate sentencepiece
from huggingface_hub import login
login(token="") # Paste your HF token here
print("β
Hugging Face login successful!")
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)  # use_auth_token is deprecated; token=True uses the stored HF login
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", token=True)
print("✅ LLaMA 2 model loaded successfully!")
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
# Define QLoRA configuration
lora_config = LoraConfig(
    r=16,                                 # Low-rank adaptation size
    lora_alpha=32,                        # Scaling factor
    lora_dropout=0.05,                    # Dropout to prevent overfitting
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
    task_type="CAUSAL_LM"                 # Tells PEFT this wraps a causal-LM head
)
# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
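# The config above is labeled QLoRA, but the base model was loaded in full/auto
# precision. True QLoRA loads it in 4-bit first; a sketch of that step (not part
# of the original flow), using the bitsandbytes package installed earlier:
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",            # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.float16  # Compute in fp16 while storing 4-bit weights
)
# model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
# model = prepare_model_for_kbit_training(model)  # Re-enables grads on the right params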
json_path = "/content/drive/MyDrive/judicial_cases.json"
from datasets import load_dataset
dataset = load_dataset("json", data_files={"train": json_path})
print("β
Dataset loaded successfully!")
import os
json_path = "/content/drive/MyDrive/judicial_cases.json" # Update the path if needed
if os.path.exists(json_path):
    print(f"✅ JSON file found: {json_path}")
else:
    print("❌ JSON file not found! You need to generate it first.")
!pip install --upgrade datasets transformers
import datasets
from datasets import load_dataset
print("β
Hugging Face `datasets` library is installed and working!")
from datasets import load_dataset
# Load dataset from JSON file
dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"})
# Split dataset into training (80%) and evaluation (20%)
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"] # Required for evaluation
print("β
Dataset split into training and evaluation sets!")
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token="")
# Load model without offloading
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    # device_map="auto",        # Remove automatic device mapping
    # offload_folder="offload"  # Remove offloading
)
# Manually move the model to the desired device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device) # Move entire model to GPU if available, else CPU
print("β
Model loaded successfully!")
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="/content/fine_tuned_llama2",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=500,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",  # Renamed from evaluation_strategy (fixes the deprecation warning)
    push_to_hub=False
)
from transformers import Trainer
# Note: train_dataset/eval_dataset are still raw text at this point; they are
# tokenized and the Trainer is rebuilt further below.
trainer = Trainer(
    model=model,                # Do NOT move manually; Trainer handles placement
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset   # Include evaluation dataset if available
)
print("✅ Trainer initialized successfully!")
model.save_pretrained("/content/fine_tuned_llama2")
tokenizer.save_pretrained("/content/fine_tuned_llama2")
print("β
Model saved successfully!")
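# Note: because the model is wrapped with PEFT, save_pretrained here writes only
# the small LoRA adapter weights; reloading later needs the base LLaMA 2
# checkpoint plus this adapter (e.g. via peft.PeftModel.from_pretrained).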
# Optional: Upload to Hugging Face
from huggingface_hub import notebook_login
notebook_login()
# Replace "your-hf-username" with your actual Hugging Face username
model.push_to_hub("and89/fine_tuned_llama2")
tokenizer.push_to_hub("and89/fine_tuned_llama2")
print("π Model uploaded to Hugging Face!")
from huggingface_hub import HfApi
api = HfApi()
repo_files = api.list_repo_files("and89/fine_tuned_llama2")  # renamed from `datasets` to avoid shadowing the library import
print("✅ Uploaded dataset files:", repo_files)
api.upload_file(
    path_or_fileobj="/content/drive/MyDrive/training_data.jsonl",  # Update file path
    path_in_repo="training_data.jsonl",
    repo_id="and89/fine_tuned_llama2"
)
from transformers import Trainer
# Tokenize the dataset
# LLaMA's tokenizer ships without a pad token; padding="max_length" needs one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Assuming "facts" is the column you want to use for input
    return tokenizer(examples["facts"], padding="max_length", truncation=True)
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
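# The tokenized examples still have no "labels" field, which the Trainer needs to
# compute a loss. A common fix (a sketch, not shown in the original flow) is the
# causal-LM collator, which copies input_ids into labels:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # mlm=False -> causal-LM labels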
# Now initialize the Trainer
trainer = Trainer(
    model=model,                 # Do NOT move manually; Trainer handles placement
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,   # Include evaluation dataset if available
    data_collator=data_collator  # Supplies the labels for the causal-LM loss
)
print("β
Trainer initialized successfully!")
from datasets import load_dataset
# Replace with your dataset name
dataset = load_dataset("and89/fine_tuned_llama2")
# Check dataset format
print(dataset)
print(dataset["train"][0]) # Print first row to check structure
from datasets import load_dataset
dataset = load_dataset("and89/fine_tuned_llama2")
print("β
Dataset loaded successfully!")
print(dataset)
from transformers import AutoTokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(dataset["train"].features)
def preprocess_function(examples):
    text_column = list(dataset["train"].features.keys())[0]  # Get the text column name
    # Ensure the input is a list of strings
    texts = examples[text_column]
    # Convert all values to strings in case they are not
    texts = [str(text) for text in texts]
    return tokenizer(texts, padding="max_length", truncation=True)
tokenized_datasets = dataset.map(preprocess_function, batched=True, desc="Tokenizing dataset")
print("β
Tokenization successful!")
print(tokenized_datasets)
tokenized_datasets.save_to_disk("tokenized_dataset")
# Reload and verify
from datasets import load_from_disk
reloaded_dataset = load_from_disk("tokenized_dataset")
print("β
Reloaded Tokenized Dataset:", reloaded_dataset)
from datasets import load_dataset
# Load dataset
dataset = load_dataset("and89/fine_tuned_llama2")
# Split dataset (90% train, 10% test)
train_test_split = dataset["train"].train_test_split(test_size=0.1)
# Verify new splits
print(train_test_split)
from datasets import DatasetDict
# Split dataset into train and test (90% train, 10% test)
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1)
# Convert to DatasetDict
tokenized_datasets = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})
print("✅ Train-Test split created:", tokenized_datasets)
print(tokenized_datasets["train"][0])
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Same non-deprecated name as used above
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    hub_model_id="your_username/your_model_name",
    remove_unused_columns=False  # Ensure input columns are kept
)
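# A text-classification pipeline is used further below, but the classification
# model itself is never constructed here. A minimal sketch, assuming two labels
# (Not Guilty / Guilty) and a "labels" column in the tokenized dataset:
from transformers import AutoModelForSequenceClassification, Trainer

cls_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
cls_trainer = Trainer(
    model=cls_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)
# cls_trainer.train()  # left commented: requires the "labels" column to exist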
from huggingface_hub import notebook_login
# Authenticate with Hugging Face
notebook_login()
# Push model and tokenizer
model.push_to_hub("and89/fine_tuned_llama2")
tokenizer.push_to_hub("and89/fine_tuned_llama2")
from transformers import pipeline
# Load model from Hugging Face
classifier = pipeline("text-classification", model="and89/fine_tuned_llama2")
# Run inference
result = classifier("Your input text here")
print(result)
!pip install gradio
import gradio as gr
def predict(text):
    return classifier(text)
demo = gr.Interface(fn=predict, inputs="text", outputs="text")
demo.launch()
from transformers import pipeline
# Load the fine-tuned model
model_name = "and89/fine_tuned_llama2" # Replace with your actual model name
classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
def predict(text):
    return classifier(text)[0]["label"]  # Extracts the predicted label
# Test the function
print("β
Model loaded successfully!")
print(predict("Help me to analyze this case: employee filed complaint against supervisor terminated fine imposed"))
from huggingface_hub import login
login() # This will automatically use the HF_TOKEN secret
from google.colab import runtime
runtime.unassign()  # Disconnects the Colab runtime to release the GPU
import gradio as gr
from transformers import pipeline
# Load the fine-tuned model
model_name = "and89/fine_tuned_llama2" # Replace with your actual model name
classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
# Define label mapping (adjust based on your dataset)
label_mapping = {
    "LABEL_0": "Not Guilty",
    "LABEL_1": "Guilty"
}
def predict(text):
    result = classifier(text)[0]  # Extract the first result
    label = result["label"]       # Get the predicted label (e.g., "LABEL_1")
    score = result["score"]       # Confidence score
    # Map label to meaningful text
    label_text = label_mapping.get(label, "Unknown")
    return f"Prediction: {label_text} (Confidence: {score:.2f})"
# Gradio UI
demo = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="text",
    title="Legal Case Decision Predictor",
    description="Enter a legal case scenario, and the model will predict whether the decision is 'Guilty' or 'Not Guilty'."
)
# Launch the Gradio app
demo.launch()
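# For a shareable public URL from Colab, Gradio can open a tunnel instead:
# demo.launch(share=True)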