|
|
|
"""Untitled15.ipynb |
|
|
|
Automatically generated by Colab. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1fx7o1di_oHCoQdFAAh8tqJ9NPQ82-rOH |
|
""" |
|
|
|
import pandas as pd |
|
|
|
|
|
from google.colab import drive |
|
import shutil

# Mount Google Drive so the source CSV under MyDrive can be read.
drive.mount('/content/drive')

# Source CSV on Drive and the local destinations for the two splits.
input_csv_path = "/content/drive/MyDrive/judicial_cases.csv"
train_csv_path = "/content/training_judicial_cases.csv"
val_csv_path = "/content/validation_judicial_cases.csv"

df = pd.read_csv(input_csv_path)

# 80/20 split: sample 80% of the rows with a fixed seed; every row that was
# not sampled (selected by index label) becomes the validation set.
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)

# The status literals were split across two lines by a mis-decoded emoji;
# they are rejoined here into single valid string literals.
print(f"✅ Training set saved: {train_csv_path}")
print(f"✅ Validation set saved: {val_csv_path}")

# Destinations on Drive for a second copy of both splits.
train_drive_path = "/content/drive/MyDrive/training_judicial_cases.csv"
val_drive_path = "/content/drive/MyDrive/validation_judicial_cases.csv"

# shutil.copy replaces the IPython-only `!cp` shell magic so this also runs
# as plain Python.
shutil.copy(train_csv_path, train_drive_path)
shutil.copy(val_csv_path, val_drive_path)

# NOTE(review): the leading "π" looks like a mis-decoded emoji; the original
# glyph cannot be recovered from the file, so the text is left untouched.
print(f"π Training set also saved to Google Drive: {train_drive_path}")
print(f"π Validation set also saved to Google Drive: {val_drive_path}")
|
|
|
import os |
|
|
|
# JSONL training file expected on the mounted Drive.
file_path = "/content/drive/MyDrive/training_data.jsonl"

# Report whether the file is present. The status literals are rejoined onto
# one line with the mis-decoded emoji restored.
if os.path.exists(file_path):
    print("✅ File exists, proceeding with upload...")
else:
    print("❌ File not found! Check file path.")
|
|
|
import torch |
|
|
|
# Report whether a CUDA device is visible to PyTorch. The status literals are
# rejoined onto one line with the mis-decoded emoji and arrows restored.
if torch.cuda.is_available():
    print("✅ GPU is available:", torch.cuda.get_device_name(0))
else:
    print("❌ No GPU found! Go to Runtime → Change runtime type → Select GPU.")
|
|
|
import pandas as pd |
|
|
|
|
|
# Re-read the full judicial-cases CSV from Drive into `df`.
df = pd.read_csv("/content/drive/MyDrive/judicial_cases.csv")

# Show the rows returned by DataFrame.head() as a quick sanity check.
preview_rows = df.head()
print(preview_rows)
|
|
|
!pip install datasets |
|
|
|
!pip install torch transformers peft bitsandbytes datasets accelerate sentencepiece |
|
|
|
from huggingface_hub import login |
|
|
|
import os

# Authenticate against the Hugging Face Hub. The token is read from the
# HF_TOKEN environment variable instead of a hard-coded (empty) string.
# When the variable is unset, `token=None` makes `login` prompt for it.
login(token=os.environ.get("HF_TOKEN"))
print("✅ Hugging Face login successful!")
|
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
# Base checkpoint to fine-tune.
model_name = "meta-llama/Llama-2-7b-hf"

# `token=True` reuses the credential stored by the earlier Hub login; it
# replaces the deprecated `use_auth_token=True` and matches the `token=`
# keyword used by the later tokenizer load in this file.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    token=True,
)

print("✅ LLaMA 2 model loaded successfully!")
|
|
|
from peft import LoraConfig, get_peft_model |
|
from transformers import TrainingArguments |
|
|
|
|
|
# LoRA hyperparameters: rank, scaling alpha, dropout, and the projection
# modules the adapters are attached to.
lora_settings = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "target_modules": ["q_proj", "v_proj"],
}
lora_config = LoraConfig(**lora_settings)

# Wrap the loaded model with the adapters and print the trainable-parameter
# summary.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
|
|
|
# JSON export of the judicial cases on the mounted Drive.
json_path = "/content/drive/MyDrive/judicial_cases.json"

from datasets import load_dataset

# Load the JSON file as the "train" split of a DatasetDict.
dataset = load_dataset("json", data_files={"train": json_path})

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Dataset loaded successfully!")
|
|
|
import os |
|
|
|
# JSON export of the judicial cases on the mounted Drive.
json_path = "/content/drive/MyDrive/judicial_cases.json"

# Report whether the JSON file is present. The status literals are rejoined
# onto one line, and the placeholder-free message no longer uses an f-string.
if os.path.exists(json_path):
    print(f"✅ JSON file found: {json_path}")
else:
    print("❌ JSON file not found! You need to generate it first.")
|
|
|
!pip install --upgrade datasets transformers |
|
|
|
# Import check for the Hugging Face `datasets` package. The original cell was
# duplicated verbatim; a single copy is kept, with its split status literal
# rejoined.
import datasets
from datasets import load_dataset

print("✅ Hugging Face `datasets` library is installed and working!")
|
|
|
from datasets import load_dataset |
|
|
|
|
|
# Load the JSON export as the "train" split.
dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"})

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Dataset split into training and evaluation sets!")
|
|
|
from google.colab import drive |
|
# Mount Google Drive (again) so the JSON export is reachable.
drive.mount('/content/drive')

from datasets import load_dataset

# Load the JSON file referenced by `json_path` as the "train" split.
dataset = load_dataset("json", data_files={"train": json_path})

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Dataset loaded successfully!")
|
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
model_name = "meta-llama/Llama-2-7b-hf"

# `token=True` reuses the credential stored by the Hub login. The original
# passed an empty string, which is sent as a (blank) token instead of falling
# back to the stored credential.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)

# NOTE(review): this rebinds `model` to a freshly loaded base checkpoint, so
# the LoRA wrapper applied earlier via get_peft_model is no longer attached.
# Confirm whether the adapters should be re-applied here.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
)

# Move the model to the GPU when one is available, otherwise keep it on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Model loaded successfully!")
|
|
|
from datasets import load_dataset |
|
|
|
|
|
# Load the JSON export as the "train" split.
dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"})

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Dataset split into training and evaluation sets!")
|
|
|
from transformers import TrainingArguments |
|
|
|
# Hyperparameters for the fine-tuning run, collected in one mapping and
# expanded into TrainingArguments below.
training_options = {
    "output_dir": "/content/fine_tuned_llama2",
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "warmup_steps": 100,
    "max_steps": 500,
    "learning_rate": 2e-4,
    "fp16": True,
    "logging_steps": 10,
    "save_strategy": "epoch",
    "eval_strategy": "epoch",
    "push_to_hub": False,
}
training_args = TrainingArguments(**training_options)
|
|
|
from transformers import Trainer |
|
|
|
# Build the Trainer from the current model, arguments and dataset splits.
# NOTE(review): at this point in the file the splits have not been through
# tokenize_function yet (that happens further down) — confirm this instance
# is not meant to be trained as-is.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Trainer initialized successfully!")
|
|
|
# Write the model weights and tokenizer files to the output directory.
# NOTE(review): no trainer.train() call appears before this point in the
# file, so the saved weights look like the loaded checkpoint — confirm.
model.save_pretrained("/content/fine_tuned_llama2")
tokenizer.save_pretrained("/content/fine_tuned_llama2")

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Model saved successfully!")
|
|
|
|
|
from huggingface_hub import notebook_login |
|
# Interactive Hub login for the notebook session.
notebook_login()

# Upload the model first, then the tokenizer, to the same Hub repository.
hub_repo_id = "and89/fine_tuned_llama2"
for artifact in (model, tokenizer):
    artifact.push_to_hub(hub_repo_id)

print("π Model uploaded to Hugging Face!")
|
|
|
from huggingface_hub import HfApi |
|
|
|
api = HfApi()

# List the files currently in the Hub repository. The result is stored as
# `repo_files`; the original name `datasets` shadowed the imported
# `datasets` module.
repo_files = api.list_repo_files("and89/fine_tuned_llama2")

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Uploaded dataset files:", repo_files)

# Upload the JSONL training file from Drive into the same repository.
api.upload_file(
    path_or_fileobj="/content/drive/MyDrive/training_data.jsonl",
    path_in_repo="training_data.jsonl",
    repo_id="and89/fine_tuned_llama2",
)
|
|
|
from transformers import Trainer |
|
|
|
|
|
def tokenize_function(examples, max_length=512):
    """Tokenize the "facts" column of a batch with the module-level tokenizer.

    Args:
        examples: Batch mapping passed by ``Dataset.map(batched=True)``; only
            its ``"facts"`` entry is read.
        max_length: Length every sequence is padded or truncated to. The
            default keeps padding bounded when the tokenizer itself does not
            declare a usable maximum length.

    Returns:
        The tokenizer output for the batch.
    """
    # padding="max_length" needs a pad token; when the tokenizer defines
    # none, fall back to its end-of-sequence token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer(
        examples["facts"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
|
|
|
|
|
|
|
# Tokenize both splits in batches with tokenize_function.
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Rebuild the Trainer so it sees the tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Trainer initialized successfully!")
|
|
|
from datasets import load_dataset |
|
|
|
|
|
# Load the Hub repository as a dataset and inspect it.
dataset = load_dataset("and89/fine_tuned_llama2")

# Print the dataset summary followed by the first training row ...
print(dataset)
first_row = dataset["train"][0]
print(first_row)

# ... and once more, this time with a label in front of the row.
print(dataset)
print("Sample row:", first_row)
|
|
|
from datasets import load_dataset |
|
|
|
# Load the Hub repository as a dataset and print its summary.
dataset = load_dataset("and89/fine_tuned_llama2")

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Dataset loaded successfully!")
print(dataset)
|
|
|
from transformers import AutoTokenizer |
|
|
|
# Switch to the BERT tokenizer. This rebinds both `model_name` and
# `tokenizer`, replacing the values bound earlier in the file.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Show the column schema of the training split.
train_features = dataset["train"].features
print(train_features)
|
|
|
def preprocess_function(examples):
    """Tokenize the first column of the dataset for one batch.

    The column name is taken from the first feature of the module-level
    ``dataset["train"]``; every value in that column is converted with
    ``str`` before being handed to the module-level ``tokenizer``.
    """
    first_column = list(dataset["train"].features)[0]
    # Convert every entry to a string before tokenizing.
    as_strings = list(map(str, examples[first_column]))
    return tokenizer(as_strings, padding="max_length", truncation=True)
|
|
|
# Tokenize every split in batches. The original ran the identical map twice
# (the second time only adding a progress description); one pass is enough.
tokenized_datasets = dataset.map(preprocess_function, batched=True, desc="Tokenizing dataset")

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Tokenization successful!")
print(tokenized_datasets)
|
|
|
# Persist the tokenized dataset to a local directory ...
tokenized_datasets.save_to_disk("tokenized_dataset")

from datasets import load_from_disk

# ... and read it back to confirm the round trip works.
reloaded_dataset = load_from_disk("tokenized_dataset")

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Reloaded Tokenized Dataset:", reloaded_dataset)

print(tokenized_datasets)
|
|
|
from datasets import load_dataset |
|
|
|
|
|
# Load the Hub repository as a dataset once more.
dataset = load_dataset("and89/fine_tuned_llama2")

# Hold out 10% of the training split (no seed is given) and show the result.
train_portion = dataset["train"]
train_test_split = train_portion.train_test_split(test_size=0.1)
print(train_test_split)
|
|
|
from datasets import DatasetDict |
|
|
|
|
|
# Hold out 10% of the tokenized training split (no seed is given).
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1)

# Re-wrap both parts as a DatasetDict with "train" and "test" keys.
tokenized_datasets = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"],
})

# The status literal was split in two by a mis-decoded emoji; rejoined here.
print("✅ Train-Test split created:", tokenized_datasets)

print(tokenized_datasets["train"][0])
|
|
|
# Second training configuration (epoch-based, with Hub upload enabled).
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the keyword used by the earlier TrainingArguments in
    # this file; the older `evaluation_strategy` spelling is rejected by
    # recent transformers releases, which this notebook upgrades to.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    # Replaces the "your_username/your_model_name" placeholder with the
    # repository used everywhere else in this file.
    # NOTE(review): confirm this is the intended target repository.
    hub_model_id="and89/fine_tuned_llama2",
    remove_unused_columns=False,
)
|
|
|
from huggingface_hub import notebook_login |
|
|
|
|
|
# Interactive Hub login for the notebook session.
notebook_login()

# Upload the model, then the tokenizer, to the same Hub repository.
# NOTE(review): by this point `tokenizer` has been rebound to the
# "bert-base-uncased" tokenizer earlier in the file, while `model` is still
# the Llama checkpoint — confirm the pair pushed here is the intended one.
target_repo = "and89/fine_tuned_llama2"
for artifact in (model, tokenizer):
    artifact.push_to_hub(target_repo)
|
|
|
from transformers import pipeline |
|
|
|
|
|
# Build a text-classification pipeline from the uploaded repository.
# NOTE(review): the weights uploaded earlier in this file come from a causal
# language model; confirm a classification head is actually available.
classifier = pipeline(task="text-classification", model="and89/fine_tuned_llama2")

# Smoke-test the pipeline on a placeholder input and print the raw output.
sample_input = "Your input text here"
result = classifier(sample_input)
print(result)
|
|
|
!pip install gradio |
|
|
|
import gradio as gr |
|
|
|
def predict(text):
    """Run the module-level ``classifier`` on ``text`` and return its raw output."""
    raw_output = classifier(text)
    return raw_output
|
|
|
# Minimal Gradio UI: one text input wired to predict(), one text output.
interface_options = {"fn": predict, "inputs": "text", "outputs": "text"}
demo = gr.Interface(**interface_options)
demo.launch()
|
|
|
from transformers import pipeline |
|
|
|
|
|
# Rebuild the pipeline, taking both model and tokenizer from the same Hub
# repository.
model_name = "and89/fine_tuned_llama2"
classifier = pipeline(
    task="text-classification",
    model=model_name,
    tokenizer=model_name,
)
|
|
|
def predict(text):
    """Return the label of the first prediction the classifier gives for ``text``."""
    predictions = classifier(text)
    first_prediction = predictions[0]
    return first_prediction["label"]
|
|
|
|
|
print("β
Model loaded successfully!") |
|
print(predict("Help me to analyze this case: employee filed complaint against supervisor terminated fine imposed")) |
|
|
|
from huggingface_hub import login |
|
# Log in to the Hugging Face Hub; no token is passed to login().
login()

from google.colab import runtime

# NOTE(review): presumably this releases the current Colab runtime, which
# would stop execution of everything below when run top to bottom — confirm.
runtime.unassign()
|
|
|
import gradio as gr |
|
from transformers import pipeline |
|
|
|
|
|
# Pipeline for the final app; model and tokenizer come from the same Hub
# repository.
model_name = "and89/fine_tuned_llama2"
classifier = pipeline(
    task="text-classification",
    model=model_name,
    tokenizer=model_name,
)

# Human-readable names for the two pipeline labels handled by predict().
label_mapping = dict(
    LABEL_0="Not Guilty",
    LABEL_1="Guilty",
)
|
|
|
def predict(text):
    """Classify ``text`` and format the top prediction for display.

    The first prediction's label is looked up in the module-level
    ``label_mapping`` (labels missing from it are shown as "Unknown") and
    returned together with the score formatted to two decimals.
    """
    top_prediction = classifier(text)[0]
    readable_label = label_mapping.get(top_prediction["label"], "Unknown")
    confidence = top_prediction["score"]
    return f"Prediction: {readable_label} (Confidence: {confidence:.2f})"
|
|
|
|
|
# Gradio front end for predict(): a single text box in, a single text box
# out, plus a title and description.
app_options = {
    "fn": predict,
    "inputs": "text",
    "outputs": "text",
    "title": "Legal Case Decision Predictor",
    "description": "Enter a legal case scenario, and the model will predict whether the decision is 'Guilty' or 'Not Guilty'.",
}
demo = gr.Interface(**app_options)

demo.launch()
|
|