# demo3-app / app.py
# -*- coding: utf-8 -*-
"""Untitled15.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1fx7o1di_oHCoQdFAAh8tqJ9NPQ82-rOH
"""
import pandas as pd
# Mount Google Drive (required here, since the input CSV lives there)
from google.colab import drive
drive.mount('/content/drive')
# Define file paths
input_csv_path = "/content/drive/MyDrive/judicial_cases.csv" # Ensure you have uploaded this file
train_csv_path = "/content/training_judicial_cases.csv"
val_csv_path = "/content/validation_judicial_cases.csv"
# Load the dataset
df = pd.read_csv(input_csv_path)
# Split dataset (80% training, 20% validation)
train_df = df.sample(frac=0.8, random_state=42) # Random sampling for training
val_df = df.drop(train_df.index) # Remaining 20% for validation
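# Sanity check (an addition, safe to run): the two splits should be disjoint and cover the whole dataset
assert len(train_df) + len(val_df) == len(df)
assert train_df.index.intersection(val_df.index).empty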
# Save training and validation sets as CSV
train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)
print(f"βœ… Training set saved: {train_csv_path}")
print(f"βœ… Validation set saved: {val_csv_path}")
# Copy to Google Drive (optional)
train_drive_path = "/content/drive/MyDrive/training_judicial_cases.csv"
val_drive_path = "/content/drive/MyDrive/validation_judicial_cases.csv"
!cp {train_csv_path} {train_drive_path}
!cp {val_csv_path} {val_drive_path}
print(f"πŸ“‚ Training set also saved to Google Drive: {train_drive_path}")
print(f"πŸ“‚ Validation set also saved to Google Drive: {val_drive_path}")
import os
file_path = "/content/drive/MyDrive/training_data.jsonl"
if os.path.exists(file_path):
    print("✅ File exists, proceeding with upload...")
else:
    print("❌ File not found! Check file path.")
import torch
if torch.cuda.is_available():
    print("✅ GPU is available:", torch.cuda.get_device_name(0))
else:
    print("❌ No GPU found! Go to Runtime → Change runtime type → Select GPU.")
import pandas as pd
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/judicial_cases.csv")
# Display first few rows
print(df.head())
!pip install torch transformers peft bitsandbytes datasets accelerate sentencepiece
from huggingface_hub import login
login(token="") # Paste your HF token here
print("βœ… Hugging Face login successful!")
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)  # token=True uses the cached login (use_auth_token is deprecated)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", token=True)
print("βœ… LLaMA 2 model loaded successfully!")
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
# Define LoRA configuration (for full QLoRA the base model is also quantized; see the sketch below)
lora_config = LoraConfig(
    r=16,                  # Low-rank adaptation size
    lora_alpha=32,         # Scaling factor
    lora_dropout=0.05,     # Dropout to prevent overfitting
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
    task_type="CAUSAL_LM"  # Tell PEFT this wraps a causal language model
)
# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
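# The config above is plain LoRA; QLoRA additionally loads the base model in
# 4-bit before attaching the adapters. A minimal sketch, kept commented out
# since the model is already loaded above (bitsandbytes comes from the pip
# cell earlier):
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )
# model = AutoModelForCausalLM.from_pretrained(
#     model_name, quantization_config=bnb_config, device_map="auto"
# )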
json_path = "/content/drive/MyDrive/judicial_cases.json"
from datasets import load_dataset
dataset = load_dataset("json", data_files={"train": json_path})
print("βœ… Dataset loaded successfully!")
import os
json_path = "/content/drive/MyDrive/judicial_cases.json" # Update the path if needed
if os.path.exists(json_path):
    print(f"✅ JSON file found: {json_path}")
else:
    print("❌ JSON file not found! You need to generate it first.")
!pip install --upgrade datasets transformers
import datasets
from datasets import load_dataset
print("βœ… Hugging Face `datasets` library is installed and working!")
from datasets import load_dataset
# Load dataset from JSON file
dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"})
# Split dataset into training (80%) and evaluation (20%)
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"] # Required for evaluation
print("βœ… Dataset split into training and evaluation sets!")
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token="")  # Paste your HF token here
# Load model without offloading
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    # device_map="auto",        # removed: automatic device mapping
    # offload_folder="offload"  # removed: disk offloading
)
# Manually move the model to the desired device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move entire model to GPU if available, else CPU
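# Optional sanity check (an addition): parameter count and where the weights landed
print(f"Loaded {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters on {device}")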
print("βœ… Model loaded successfully!")
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="/content/fine_tuned_llama2",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=500,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",  # renamed from evaluation_strategy; fixes the deprecation warning
    push_to_hub=False
)
from transformers import Trainer
trainer = Trainer(
    model=model,  # Trainer handles device placement; do not move manually
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset  # Include evaluation dataset if available
)
print("βœ… Trainer initialized successfully!")
# Note: trainer.train() has not been run yet; see the training step further
# down before treating these weights as fine-tuned.
model.save_pretrained("/content/fine_tuned_llama2")
tokenizer.save_pretrained("/content/fine_tuned_llama2")
print("✅ Model saved successfully!")
# Optional: Upload to Hugging Face
from huggingface_hub import notebook_login
notebook_login()
# Replace "your-hf-username" with your actual Hugging Face username
model.push_to_hub("and89/fine_tuned_llama2")
tokenizer.push_to_hub("and89/fine_tuned_llama2")
print("πŸš€ Model uploaded to Hugging Face!")
from huggingface_hub import HfApi
api = HfApi()
repo_files = api.list_repo_files("and89/fine_tuned_llama2")  # renamed so it doesn't shadow the datasets module
print("✅ Uploaded repo files:", repo_files)
api.upload_file(
    path_or_fileobj="/content/drive/MyDrive/training_data.jsonl",  # Update file path if needed
    path_in_repo="training_data.jsonl",
    repo_id="and89/fine_tuned_llama2"
)
from transformers import Trainer
# Tokenize the dataset ("facts" is the input text column)
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token  # LLaMA ships without a pad token
def tokenize_function(examples):
    return tokenizer(examples["facts"], padding="max_length", truncation=True)
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
# Now initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset  # Include evaluation dataset if available
)
print("βœ… Trainer initialized successfully!")
from datasets import load_dataset
# Replace with your dataset name
dataset = load_dataset("and89/fine_tuned_llama2")
# Check dataset format
print(dataset)
print("Sample row:", dataset["train"][0])  # Print the first row to check structure
from transformers import AutoTokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(dataset["train"].features)
def preprocess_function(examples):
    text_column = list(dataset["train"].features.keys())[0]  # Use the first column as the text field
    texts = examples[text_column]
    texts = [str(text) for text in texts]  # Convert all values to strings in case they are not
    return tokenizer(texts, padding="max_length", truncation=True)
tokenized_datasets = dataset.map(preprocess_function, batched=True, desc="Tokenizing dataset")
print("✅ Tokenization successful!")
print(tokenized_datasets)
tokenized_datasets.save_to_disk("tokenized_dataset")
# Reload and verify
from datasets import load_from_disk
reloaded_dataset = load_from_disk("tokenized_dataset")
print("βœ… Reloaded Tokenized Dataset:", reloaded_dataset)
from datasets import DatasetDict
# Split the tokenized dataset into train and test (90% train, 10% test)
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1)
tokenized_datasets = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})
print("✅ Train-Test split created:", tokenized_datasets)
print(tokenized_datasets["train"][0])
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # renamed from evaluation_strategy, matching the earlier cell
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    hub_model_id="your_username/your_model_name",  # Replace with your own repo id
    remove_unused_columns=False  # Ensure input columns are kept
)
from huggingface_hub import notebook_login
# Authenticate with Hugging Face
notebook_login()
# Push model and tokenizer
model.push_to_hub("and89/fine_tuned_llama2")
tokenizer.push_to_hub("and89/fine_tuned_llama2")
from transformers import pipeline
# Load model from Hugging Face
classifier = pipeline("text-classification", model="and89/fine_tuned_llama2")
# Run inference
result = classifier("Your input text here")
print(result)
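# Caveat: pipeline("text-classification") expects a sequence-classification head,
# while the checkpoint pushed above was fine-tuned as a causal LM. If the pipeline
# call fails, one option is to attach a fresh classification head (num_labels=2 is
# an assumption, matching the Guilty/Not-Guilty mapping used further down):
# from transformers import AutoModelForSequenceClassification
# clf_model = AutoModelForSequenceClassification.from_pretrained(
#     "and89/fine_tuned_llama2", num_labels=2
# )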
!pip install gradio
import gradio as gr
def predict(text):
    return classifier(text)
demo = gr.Interface(fn=predict, inputs="text", outputs="text")
demo.launch()
from transformers import pipeline
# Load the fine-tuned model
model_name = "and89/fine_tuned_llama2" # Replace with your actual model name
classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
def predict(text):
    return classifier(text)[0]["label"]  # Extract the predicted label
# Test the function
print("βœ… Model loaded successfully!")
print(predict("Help me to analyze this case: employee filed complaint against supervisor terminated fine imposed"))
from huggingface_hub import login
login() # This will automatically use the HF_TOKEN secret
from google.colab import runtime
runtime.unassign()  # Releases the Colab runtime; run this only when you are finished
import gradio as gr
from transformers import pipeline
# Load the fine-tuned model
model_name = "and89/fine_tuned_llama2" # Replace with your actual model name
classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
# Define label mapping (adjust based on your dataset)
label_mapping = {
    "LABEL_0": "Not Guilty",
    "LABEL_1": "Guilty"
}
def predict(text):
    result = classifier(text)[0]  # Extract the first result
    label = result["label"]       # Predicted label (e.g., "LABEL_1")
    score = result["score"]       # Confidence score
    label_text = label_mapping.get(label, "Unknown")  # Map label to meaningful text
    return f"Prediction: {label_text} (Confidence: {score:.2f})"
# Gradio UI
demo = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="text",
    title="Legal Case Decision Predictor",
    description="Enter a legal case scenario, and the model will predict whether the decision is 'Guilty' or 'Not Guilty'."
)
# Launch the Gradio app
demo.launch()