# demo3-app / app.py
# -*- coding: utf-8 -*-
"""Untitled15.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1fx7o1di_oHCoQdFAAh8tqJ9NPQ82-rOH
"""
import pandas as pd
# Mount Google Drive (required here, since the input CSV lives there)
from google.colab import drive
drive.mount('/content/drive')
# Define file paths
input_csv_path = "/content/drive/MyDrive/judicial_cases.csv" # Ensure you have uploaded this file
train_csv_path = "/content/training_judicial_cases.csv"
val_csv_path = "/content/validation_judicial_cases.csv"
# Load the dataset
df = pd.read_csv(input_csv_path)
# Split dataset (80% training, 20% validation)
train_df = df.sample(frac=0.8, random_state=42) # Random sampling for training
val_df = df.drop(train_df.index) # Remaining 20% for validation
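# Sanity check (an addition, safe to run): the two splits should be disjoint and cover the whole dataset
assert len(train_df) + len(val_df) == len(df)
assert train_df.index.intersection(val_df.index).empty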
# Save training and validation sets as CSV
train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)
print(f"βœ… Training set saved: {train_csv_path}")
print(f"βœ… Validation set saved: {val_csv_path}")
# Copy to Google Drive (optional)
train_drive_path = "/content/drive/MyDrive/training_judicial_cases.csv"
val_drive_path = "/content/drive/MyDrive/validation_judicial_cases.csv"
!cp {train_csv_path} {train_drive_path}
!cp {val_csv_path} {val_drive_path}
print(f"πŸ“‚ Training set also saved to Google Drive: {train_drive_path}")
print(f"πŸ“‚ Validation set also saved to Google Drive: {val_drive_path}")
import os
file_path = "/content/drive/MyDrive/training_data.jsonl"
if os.path.exists(file_path):
    print("✅ File exists, proceeding with upload...")
else:
    print("❌ File not found! Check file path.")
import torch
if torch.cuda.is_available():
    print("✅ GPU is available:", torch.cuda.get_device_name(0))
else:
    print("❌ No GPU found! Go to Runtime → Change runtime type → Select GPU.")
import pandas as pd
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/judicial_cases.csv")
# Display first few rows
print(df.head())
!pip install torch transformers peft bitsandbytes datasets accelerate sentencepiece
from huggingface_hub import login
login(token="") # Paste your HF token here
print("βœ… Hugging Face login successful!")
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)  # token=True uses the cached login (use_auth_token is deprecated)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", token=True)
print("βœ… LLaMA 2 model loaded successfully!")
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
# Define LoRA configuration (for full QLoRA the base model is also quantized; see the sketch below)
lora_config = LoraConfig(
    r=16,                  # Low-rank adaptation size
    lora_alpha=32,         # Scaling factor
    lora_dropout=0.05,     # Dropout to prevent overfitting
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
    task_type="CAUSAL_LM"  # Tell PEFT this wraps a causal language model
)
# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
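# The config above is plain LoRA; QLoRA additionally loads the base model in
# 4-bit before attaching the adapters. A minimal sketch, kept commented out
# since the model is already loaded above (bitsandbytes comes from the pip
# cell earlier):
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )
# model = AutoModelForCausalLM.from_pretrained(
#     model_name, quantization_config=bnb_config, device_map="auto"
# )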
json_path = "/content/drive/MyDrive/judicial_cases.json"
from datasets import load_dataset
dataset = load_dataset("json", data_files={"train": json_path})
print("βœ… Dataset loaded successfully!")
import os
json_path = "/content/drive/MyDrive/judicial_cases.json" # Update the path if needed
if os.path.exists(json_path):
    print(f"✅ JSON file found: {json_path}")
else:
    print("❌ JSON file not found! You need to generate it first.")
!pip install --upgrade datasets transformers
import datasets
from datasets import load_dataset
print("βœ… Hugging Face `datasets` library is installed and working!")
from datasets import load_dataset
# Load dataset from JSON file
dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"})
# Split dataset into training (80%) and evaluation (20%)
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"] # Required for evaluation
print("βœ… Dataset split into training and evaluation sets!")
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token="")  # Paste your HF token here
# Load model without offloading
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    # device_map="auto",        # removed: automatic device mapping
    # offload_folder="offload"  # removed: disk offloading
)
# Manually move the model to the desired device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move entire model to GPU if available, else CPU
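# Optional sanity check (an addition): parameter count and where the weights landed
print(f"Loaded {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters on {device}")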
print("βœ… Model loaded successfully!")
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="/content/fine_tuned_llama2",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=500,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",  # renamed from evaluation_strategy; fixes the deprecation warning
    push_to_hub=False
)
from transformers import Trainer
trainer = Trainer(
    model=model,  # Trainer handles device placement; do not move manually
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset  # Include evaluation dataset if available
)
print("βœ… Trainer initialized successfully!")
# Note: trainer.train() has not been run yet; see the training step further
# down before treating these weights as fine-tuned.
model.save_pretrained("/content/fine_tuned_llama2")
tokenizer.save_pretrained("/content/fine_tuned_llama2")
print("✅ Model saved successfully!")
# Optional: Upload to Hugging Face
from huggingface_hub import notebook_login
notebook_login()
# Replace "your-hf-username" with your actual Hugging Face username
model.push_to_hub("and89/fine_tuned_llama2")
tokenizer.push_to_hub("and89/fine_tuned_llama2")
print("πŸš€ Model uploaded to Hugging Face!")
from huggingface_hub import HfApi
api = HfApi()
repo_files = api.list_repo_files("and89/fine_tuned_llama2")  # renamed so it doesn't shadow the datasets module
print("✅ Uploaded repo files:", repo_files)
api.upload_file(
    path_or_fileobj="/content/drive/MyDrive/training_data.jsonl",  # Update file path if needed
    path_in_repo="training_data.jsonl",
    repo_id="and89/fine_tuned_llama2"
)
from transformers import Trainer
# Tokenize the dataset ("facts" is the input text column)
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token  # LLaMA ships without a pad token
def tokenize_function(examples):
    return tokenizer(examples["facts"], padding="max_length", truncation=True)
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
# Now initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset  # Include evaluation dataset if available
)
print("βœ… Trainer initialized successfully!")
from datasets import load_dataset
# Replace with your dataset name
dataset = load_dataset("and89/fine_tuned_llama2")
# Check dataset format
print(dataset)
print("Sample row:", dataset["train"][0])  # Print the first row to check structure
from transformers import AutoTokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(dataset["train"].features)
def preprocess_function(examples):
    text_column = list(dataset["train"].features.keys())[0]  # Use the first column as the text field
    texts = examples[text_column]
    texts = [str(text) for text in texts]  # Convert all values to strings in case they are not
    return tokenizer(texts, padding="max_length", truncation=True)
tokenized_datasets = dataset.map(preprocess_function, batched=True, desc="Tokenizing dataset")
print("✅ Tokenization successful!")
print(tokenized_datasets)
tokenized_datasets.save_to_disk("tokenized_dataset")
# Reload and verify
from datasets import load_from_disk
reloaded_dataset = load_from_disk("tokenized_dataset")
print("βœ… Reloaded Tokenized Dataset:", reloaded_dataset)
from datasets import DatasetDict
# Split the tokenized dataset into train and test (90% train, 10% test)
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1)
tokenized_datasets = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})
print("✅ Train-Test split created:", tokenized_datasets)
print(tokenized_datasets["train"][0])
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # renamed from evaluation_strategy, matching the earlier cell
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    hub_model_id="your_username/your_model_name",  # Replace with your own repo id
    remove_unused_columns=False  # Ensure input columns are kept
)
from huggingface_hub import notebook_login
# Authenticate with Hugging Face
notebook_login()
# Push model and tokenizer
model.push_to_hub("and89/fine_tuned_llama2")
tokenizer.push_to_hub("and89/fine_tuned_llama2")
from transformers import pipeline
# Load model from Hugging Face
classifier = pipeline("text-classification", model="and89/fine_tuned_llama2")
# Run inference
result = classifier("Your input text here")
print(result)
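# Caveat: pipeline("text-classification") expects a sequence-classification head,
# while the checkpoint pushed above was fine-tuned as a causal LM. If the pipeline
# call fails, one option is to attach a fresh classification head (num_labels=2 is
# an assumption, matching the Guilty/Not-Guilty mapping used further down):
# from transformers import AutoModelForSequenceClassification
# clf_model = AutoModelForSequenceClassification.from_pretrained(
#     "and89/fine_tuned_llama2", num_labels=2
# )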
!pip install gradio
import gradio as gr
def predict(text):
    return classifier(text)
demo = gr.Interface(fn=predict, inputs="text", outputs="text")
demo.launch()
from transformers import pipeline
# Load the fine-tuned model
model_name = "and89/fine_tuned_llama2" # Replace with your actual model name
classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
def predict(text):
    return classifier(text)[0]["label"]  # Extract the predicted label
# Test the function
print("βœ… Model loaded successfully!")
print(predict("Help me to analyze this case: employee filed complaint against supervisor terminated fine imposed"))
from huggingface_hub import login
login() # This will automatically use the HF_TOKEN secret
from google.colab import runtime
runtime.unassign()  # Releases the Colab runtime; run this only when you are finished
import gradio as gr
from transformers import pipeline
# Load the fine-tuned model
model_name = "and89/fine_tuned_llama2" # Replace with your actual model name
classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
# Define label mapping (adjust based on your dataset)
label_mapping = {
    "LABEL_0": "Not Guilty",
    "LABEL_1": "Guilty"
}
def predict(text):
    result = classifier(text)[0]  # Extract the first result
    label = result["label"]       # Predicted label (e.g., "LABEL_1")
    score = result["score"]       # Confidence score
    label_text = label_mapping.get(label, "Unknown")  # Map label to meaningful text
    return f"Prediction: {label_text} (Confidence: {score:.2f})"
# Gradio UI
demo = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="text",
    title="Legal Case Decision Predictor",
    description="Enter a legal case scenario, and the model will predict whether the decision is 'Guilty' or 'Not Guilty'."
)
# Launch the Gradio app
demo.launch()