# Spaces: and89 / demo3-app (Sleeping)
# File size: 12,555 Bytes

# -*- coding: utf-8 -*-
"""Untitled15.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1fx7o1di_oHCoQdFAAh8tqJ9NPQ82-rOH
"""

import shutil

import pandas as pd

# Mount Google Drive so the source CSV can be read and the splits copied back
from google.colab import drive
drive.mount('/content/drive')

# Source CSV on Drive and the two local output files
input_csv_path = "/content/drive/MyDrive/judicial_cases.csv"  # Must already be uploaded to Drive
train_csv_path = "/content/training_judicial_cases.csv"
val_csv_path = "/content/validation_judicial_cases.csv"

# Load the dataset
df = pd.read_csv(input_csv_path)

# 80/20 split: sample the training rows with a fixed seed, then use every
# row whose index label was not sampled as the validation set.
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

# Write both splits locally without the row index
train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)

print(f"✅ Training set saved: {train_csv_path}")
print(f"✅ Validation set saved: {val_csv_path}")

# Copy to Google Drive (optional)
train_drive_path = "/content/drive/MyDrive/training_judicial_cases.csv"
val_drive_path = "/content/drive/MyDrive/validation_judicial_cases.csv"

# shutil.copy replaces the IPython-only `!cp` escapes: it is valid Python,
# and a failed copy raises instead of being followed by the success
# messages below.
shutil.copy(train_csv_path, train_drive_path)
shutil.copy(val_csv_path, val_drive_path)

print(f"📂 Training set also saved to Google Drive: {train_drive_path}")
print(f"📂 Validation set also saved to Google Drive: {val_drive_path}")

import os

# JSONL training data expected on the mounted Drive
file_path = "/content/drive/MyDrive/training_data.jsonl"

# Report whether the file is present; nothing else is done with it here
print(
    "✅ File exists, proceeding with upload..."
    if os.path.exists(file_path)
    else "❌ File not found! Check file path."
)

import torch

# Print the name of CUDA device 0, or a hint on enabling a GPU runtime
if not torch.cuda.is_available():
    print("❌ No GPU found! Go to Runtime → Change runtime type → Select GPU.")
else:
    print("✅ GPU is available:", torch.cuda.get_device_name(0))

import pandas as pd

# Re-read the source CSV from Drive into `df`
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/judicial_cases.csv")

# Preview the first five rows (the default of DataFrame.head)
print(df.head(n=5))

import subprocess
import sys

# Install the fine-tuning stack into the interpreter that runs this script.
# `python -m pip` replaces the IPython-only `!pip` escapes, which are a
# syntax error in plain Python. `datasets` appeared in both original
# commands, so one install covers it.
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "torch", "transformers", "peft", "bitsandbytes",
    "datasets", "accelerate", "sentencepiece",
])

import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable instead of
# pasting it into the source. When the variable is unset or empty, None is
# passed so that login() asks for the token interactively rather than
# receiving an empty string.
hf_token = os.environ.get("HF_TOKEN") or None
login(token=hf_token)
print("✅ Hugging Face login successful!")

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"

# `token=True` reuses the credential stored by the Hugging Face login; it
# replaces the deprecated `use_auth_token=True` argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)

# dtype is taken from the checkpoint and layers are placed automatically
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    token=True,
)

print("✅ LLaMA 2 model loaded successfully!")

from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments

# LoRA adapter configuration. This is plain LoRA, not QLoRA: no
# quantization config is created in this cell.
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank update matrices
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.05,  # Dropout on the adapter path
    target_modules=["q_proj", "v_proj"],  # Attention query/value projections
    task_type="CAUSAL_LM",  # Tell PEFT the wrapped model is a causal LM
)

# Wrap the model with the adapter and report how many parameters will train
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

from datasets import load_dataset

# JSON export of the cases on the mounted Drive
json_path = "/content/drive/MyDrive/judicial_cases.json"

# Expose the whole file as a single "train" split
dataset = load_dataset("json", data_files=dict(train=json_path))
print("✅ Dataset loaded successfully!")

import os

json_path = "/content/drive/MyDrive/judicial_cases.json"  # Update the path if needed

# Report whether the JSON file is present at that path
if not os.path.exists(json_path):
    print("❌ JSON file not found! You need to generate it first.")
else:
    print(f"✅ JSON file found: {json_path}")

import subprocess
import sys

# Upgrade `datasets` and `transformers` in the running interpreter's
# environment. `python -m pip` replaces the IPython-only `!pip` escape.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "--upgrade", "datasets", "transformers"]
)

import datasets
from datasets import load_dataset

# Import check. The notebook ran this in two identical consecutive cells;
# running it once gives the same information.
print("✅ Hugging Face `datasets` library is installed and working!")

from datasets import load_dataset

# Read the cases from the JSON file on Drive as a single "train" split
dataset = load_dataset(
    "json",
    data_files={"train": "/content/drive/MyDrive/judicial_cases.json"},
)

# Seeded 80/20 split; the "test" part serves as the evaluation set
split_dataset = dataset["train"].train_test_split(seed=42, test_size=0.2)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

print("✅ Dataset split into training and evaluation sets!")

# Mount Drive at /content/drive before reading from it
from google.colab import drive
drive.mount('/content/drive')

from datasets import load_dataset

# Reload the JSON file referenced by `json_path` as a single "train" split
dataset = load_dataset("json", data_files=dict(train=json_path))

print("✅ Dataset loaded successfully!")

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"

# Load tokenizer. No explicit token is passed, exactly like the model load
# below, so the stored Hugging Face login is used; an empty-string token is
# not a usable credential.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model without automatic device mapping or offloading.
# NOTE(review): this rebinds `model` to a freshly loaded base model, so any
# adapter wrapper applied to the previous `model` object is not carried
# over. Confirm that is intended.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
)

# Place the whole model on the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("✅ Model loaded successfully!")

from datasets import load_dataset

# Single "train" split backed by the JSON file on Drive
dataset = load_dataset(
    "json",
    data_files=dict(train="/content/drive/MyDrive/judicial_cases.json"),
)

# 80% training / 20% evaluation, reproducible through the fixed seed
split_dataset = dataset["train"].train_test_split(seed=42, test_size=0.2)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

print("✅ Dataset split into training and evaluation sets!")

from transformers import TrainingArguments

training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub from the Trainer
    output_dir="/content/fine_tuned_llama2",
    push_to_hub=False,
    # Batching: 2 samples per device, gradients accumulated over 4 steps
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Optimisation schedule
    learning_rate=2e-4,
    warmup_steps=100,
    max_steps=500,
    fp16=True,
    # Logging, checkpointing and evaluation cadence
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
)

from transformers import Trainer

# Bundle the model, the training arguments and both dataset splits
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

print("✅ Trainer initialized successfully!")

# Write the model and the tokenizer files into the same directory.
# NOTE(review): no trainer.train() call is visible before this point in the
# file. Confirm the saved weights are the ones intended.
model.save_pretrained(save_directory="/content/fine_tuned_llama2")
tokenizer.save_pretrained(save_directory="/content/fine_tuned_llama2")

print("✅ Model saved successfully!")

# Optional: publish to the Hugging Face Hub
from huggingface_hub import notebook_login

# Interactive authentication inside the notebook
notebook_login()

# Model and tokenizer go to the same Hub repository
model.push_to_hub(
    "and89/fine_tuned_llama2"
)
tokenizer.push_to_hub(
    "and89/fine_tuned_llama2"
)
print("🚀 Model uploaded to Hugging Face!")

from huggingface_hub import HfApi

api = HfApi()

# List the files currently in the Hub repository. The result is bound to
# `repo_files` rather than `datasets` so the imported `datasets` module is
# not shadowed by a list of file names.
repo_files = api.list_repo_files("and89/fine_tuned_llama2")

print("✅ Uploaded dataset files:", repo_files)

# Upload the JSONL training data from Drive into the same repository
api.upload_file(
    path_or_fileobj="/content/drive/MyDrive/training_data.jsonl",  # Update file path
    path_in_repo="training_data.jsonl",
    repo_id="and89/fine_tuned_llama2"
)

from transformers import Trainer

# Tokenize the dataset
def tokenize_function(examples, max_length=None):
    """Tokenize the ``"facts"`` entry of a batch with the module-level tokenizer.

    Args:
        examples: Mapping that provides a ``"facts"`` entry (a batch when
            used with ``Dataset.map(..., batched=True)``).
        max_length: Optional length forwarded to the tokenizer for padding
            and truncation. ``None`` (the default) leaves the choice to the
            tokenizer, which is what the previous fixed call did; pass a
            value through ``map(..., fn_kwargs={"max_length": N})``.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        examples["facts"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Assuming "facts" is the column you want to use for input

# Tokenize both splits batch-wise with tokenize_function
train_dataset = train_dataset.map(function=tokenize_function, batched=True)
eval_dataset = eval_dataset.map(function=tokenize_function, batched=True)

# Build the Trainer on the tokenized splits
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

print("✅ Trainer initialized successfully!")

from datasets import load_dataset

# Load the Hub repository by name (replace with your dataset name)
dataset = load_dataset(path="and89/fine_tuned_llama2")

# Inspect the splits and the first "train" row
print(dataset)

print(dataset["train"][0])  # First row, to check the structure

print(dataset)  # Dataset details again
print("Sample row:", dataset["train"][0])  # First row again, labelled

# Second load of the same repository, followed by a confirmation message
dataset = load_dataset(path="and89/fine_tuned_llama2")
print("✅ Dataset loaded successfully!")
print(dataset)

from transformers import AutoTokenizer

# From here on `model_name` and `tokenizer` refer to bert-base-uncased
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

# Show the column schema of the "train" split
print(dataset["train"].features)

def preprocess_function(examples):
    """Tokenize the first column of the module-level ``dataset`` for a batch.

    The column name is the first key of ``dataset["train"].features``. Every
    value of that column in ``examples`` is converted with ``str`` before
    the batch is handed to the module-level ``tokenizer`` with max-length
    padding and truncation enabled.
    """
    first_column = [*dataset["train"].features.keys()][0]

    # The tokenizer is given a list of strings, whatever the stored values are
    as_strings = list(map(str, examples[first_column]))

    return tokenizer(as_strings, truncation=True, padding="max_length")

# First tokenization pass over every split
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

print("✅ Tokenization successful!")

# Second pass with a progress-bar description; rebinds the same name
tokenized_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
    desc="Tokenizing dataset",
)

print("✅ Tokenization successful!")
print(tokenized_datasets)

# Persist to the local "tokenized_dataset" directory
tokenized_datasets.save_to_disk("tokenized_dataset")

# Reload and verify
from datasets import load_from_disk

reloaded_dataset = load_from_disk("tokenized_dataset")

print("✅ Reloaded Tokenized Dataset:", reloaded_dataset)

print(tokenized_datasets)  # Available dataset splits

from datasets import DatasetDict, load_dataset

# Load the Hub repository again
dataset = load_dataset(path="and89/fine_tuned_llama2")

# 90/10 split of the raw "train" split (no seed is given)
train_test_split = dataset["train"].train_test_split(test_size=0.1)

# Verify new splits
print(train_test_split)

# 90/10 split of the tokenized "train" split; rebinds `train_test_split`
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1)

# Repackage the two parts as a DatasetDict under the same names
tokenized_datasets = DatasetDict(
    {part: train_test_split[part] for part in ("train", "test")}
)

print("✅ Train-Test split created:", tokenized_datasets)

print(tokenized_datasets["train"][0])

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    # NOTE(review): placeholder repository id; set a real one before training
    hub_model_id="your_username/your_model_name",
    remove_unused_columns=False,  # Ensure input columns are kept
)

from huggingface_hub import notebook_login

# Interactive Hugging Face authentication
notebook_login()

# Publish model and tokenizer to the same repository.
# NOTE(review): `tokenizer` was last bound to the bert-base-uncased
# tokenizer earlier in this file. Confirm that is the one to publish here.
model.push_to_hub(
    "and89/fine_tuned_llama2"
)
tokenizer.push_to_hub(
    "and89/fine_tuned_llama2"
)

from transformers import pipeline

# Text-classification pipeline backed by the Hub repository
classifier = pipeline(task="text-classification", model="and89/fine_tuned_llama2")

# One inference call on a placeholder input; print the raw result
result = classifier("Your input text here")
print(result)

import subprocess
import sys

# Install gradio into the running interpreter's environment. `python -m pip`
# replaces the IPython-only `!pip` escape, which is a syntax error in plain
# Python.
subprocess.check_call([sys.executable, "-m", "pip", "install", "gradio"])

import gradio as gr

def predict(text):
    """Return the raw output of the module-level ``classifier`` for *text*."""
    raw_output = classifier(text)
    return raw_output

# Minimal text-in / text-out UI around predict()
demo = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="text",
)
demo.launch()

from transformers import pipeline

# The same Hub repository supplies both the model and the tokenizer
model_name = "and89/fine_tuned_llama2"  # Replace with your actual model name
classifier = pipeline(
    task="text-classification",
    model=model_name,
    tokenizer=model_name,
)

def predict(text):
    """Return only the ``"label"`` field of the first classifier result for *text*."""
    first_result = classifier(text)[0]
    return first_result["label"]

# Smoke test: confirmation message, then one prediction on a sample prompt
print("✅ Model loaded successfully!")
print(
    predict(
        "Help me to analyze this case: employee filed complaint against supervisor terminated fine imposed"
    )
)

# Log in to the Hugging Face Hub without passing a token explicitly.
# NOTE(review): the inline comment below says the HF_TOKEN secret is picked
# up automatically. Confirm this against the installed huggingface_hub.
from huggingface_hub import login
login()  # This will automatically use the HF_TOKEN secret

# NOTE(review): presumably releases the current Colab runtime, in which case
# nothing after this call runs in the same session. Confirm.
from google.colab import runtime
runtime.unassign()

import gradio as gr
from transformers import pipeline

# The same Hub repository supplies both the model and the tokenizer
model_name = "and89/fine_tuned_llama2"  # Replace with your actual model name
classifier = pipeline(
    task="text-classification",
    model=model_name,
    tokenizer=model_name,
)

# Display text for each raw pipeline label (adjust based on your dataset)
label_mapping = dict(
    LABEL_0="Not Guilty",
    LABEL_1="Guilty",
)

def predict(text):
    """Classify *text* and format the first result as a one-line summary.

    The first entry returned by the module-level ``classifier`` supplies the
    raw label and score. The label is translated through ``label_mapping``
    (``"Unknown"`` when it has no entry) and the score is shown with two
    decimals.
    """
    top = classifier(text)[0]
    readable = label_mapping.get(top["label"], "Unknown")
    return f"Prediction: {readable} (Confidence: {top['score']:.2f})"

# Text-in / text-out Gradio interface around predict()
demo = gr.Interface(
    title="Legal Case Decision Predictor",
    description="Enter a legal case scenario, and the model will predict whether the decision is 'Guilty' or 'Not Guilty'.",
    fn=predict,
    inputs="text",
    outputs="text",
)

# Start serving the app
demo.launch()