# -*- coding: utf-8 -*-
"""Untitled15.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1fx7o1di_oHCoQdFAAh8tqJ9NPQ82-rOH
"""

import pandas as pd

# Mount Google Drive (optional, only needed if you want to read/save files there)
from google.colab import drive
drive.mount('/content/drive')

# Define file paths
input_csv_path = "/content/drive/MyDrive/judicial_cases.csv"  # Ensure you have uploaded this file
train_csv_path = "/content/training_judicial_cases.csv"
val_csv_path = "/content/validation_judicial_cases.csv"

# Load the dataset
df = pd.read_csv(input_csv_path)

# Split the dataset: 80% training, 20% validation
train_df = df.sample(frac=0.8, random_state=42)  # random sample for training
val_df = df.drop(train_df.index)                 # remaining 20% for validation

# Save training and validation sets as CSV
train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)

print(f"✅ Training set saved: {train_csv_path}")
print(f"✅ Validation set saved: {val_csv_path}")

# Copy to Google Drive (optional)
train_drive_path = "/content/drive/MyDrive/training_judicial_cases.csv"
val_drive_path = "/content/drive/MyDrive/validation_judicial_cases.csv"

!cp {train_csv_path} {train_drive_path}
!cp {val_csv_path} {val_drive_path}

print(f"📂 Training set also saved to Google Drive: {train_drive_path}")
print(f"📂 Validation set also saved to Google Drive: {val_drive_path}")

# Verify the JSONL training file exists before attempting any upload
import os

file_path = "/content/drive/MyDrive/training_data.jsonl"
if os.path.exists(file_path):
    print("✅ File exists, proceeding with upload...")
else:
    print("❌ File not found! Check the file path.")

# Confirm a GPU is attached to the runtime
import torch

if torch.cuda.is_available():
    print("✅ GPU is available:", torch.cuda.get_device_name(0))
else:
    print("❌ No GPU found! Go to Runtime → Change runtime type → Select GPU.")

# Load the dataset and display the first few rows as a sanity check
df = pd.read_csv("/content/drive/MyDrive/judicial_cases.csv")
print(df.head())

# Install dependencies
!pip install torch transformers peft bitsandbytes datasets accelerate sentencepiece

from huggingface_hub import login

login(token="")  # Paste your HF token here (required for gated models such as LLaMA 2)
print("✅ Hugging Face login successful!")

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"
# `token=True` replaces the deprecated `use_auth_token=True` argument
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    token=True,
)
print("✅ LLaMA 2 model loaded successfully!")

from peft import LoraConfig, get_peft_model

# Define the LoRA configuration (true QLoRA would also load the base model
# in 4-bit via bitsandbytes; here the adapters are applied to a full-precision model)
lora_config = LoraConfig(
    r=16,                                # low-rank adaptation size
    lora_alpha=32,                       # scaling factor
    lora_dropout=0.05,                   # dropout to reduce overfitting
    target_modules=["q_proj", "v_proj"]  # apply LoRA to the attention projections
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load the JSON dataset
json_path = "/content/drive/MyDrive/judicial_cases.json"

from datasets import load_dataset

dataset = load_dataset("json", data_files={"train": json_path})
print("✅ Dataset loaded successfully!")

# Verify the JSON file exists
json_path = "/content/drive/MyDrive/judicial_cases.json"  # Update the path if needed
if os.path.exists(json_path):
    print(f"✅ JSON file found: {json_path}")
else:
    print("❌ JSON file not found! You need to generate it first.")
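# The check above expects /content/drive/MyDrive/judicial_cases.json to exist
# but the notebook never shows how to create it. A minimal sketch for generating
# it from the CSV loaded earlier, written as JSON lines so that
# load_dataset("json", ...) can read it. The helper name `csv_to_json` is
# hypothetical, and the CSV column layout is an assumption — check df.columns.
import json

def csv_to_json(csv_path: str, json_out_path: str) -> None:
    """Convert the judicial-cases CSV into a JSON-lines file, one record per case."""
    frame = pd.read_csv(csv_path)
    records = frame.to_dict(orient="records")  # one dict per row
    with open(json_out_path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"Wrote {len(records)} records to {json_out_path}")

# Example usage (uncomment once the CSV columns are confirmed):
# csv_to_json(input_csv_path, json_path)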
You need to generate it first.") !pip install --upgrade datasets transformers import datasets from datasets import load_dataset print("✅ Hugging Face `datasets` library is installed and working!") import datasets from datasets import load_dataset print("✅ Hugging Face `datasets` library is installed and working!") from datasets import load_dataset # Load dataset from JSON file dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"}) # Split dataset into training (80%) and evaluation (20%) split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) train_dataset = split_dataset["train"] eval_dataset = split_dataset["test"] # Required for evaluation print("✅ Dataset split into training and evaluation sets!") from google.colab import drive drive.mount('/content/drive') from datasets import load_dataset dataset = load_dataset("json", data_files={"train": json_path}) print("✅ Dataset loaded successfully!") from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "meta-llama/Llama-2-7b-hf" # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_name, token="") # Load model without offloading model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", #device_map="auto", # Remove automatic device mapping #offload_folder="offload" # Remove offloading ) # Manually move the model to the desired device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) # Move entire model to GPU if available, else CPU print("✅ Model loaded successfully!") from datasets import load_dataset # Load dataset from JSON file dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"}) # Split dataset into training (80%) and evaluation (20%) split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) train_dataset = split_dataset["train"] eval_dataset = split_dataset["test"] # Required for evaluation print("✅ Dataset split into training and evaluation sets!") from transformers import TrainingArguments training_args = TrainingArguments( output_dir="/content/fine_tuned_llama2", per_device_train_batch_size=2, gradient_accumulation_steps=4, warmup_steps=100, max_steps=500, learning_rate=2e-4, fp16=True, logging_steps=10, save_strategy="epoch", eval_strategy="epoch", # Fix deprecation warning push_to_hub=False ) from transformers import Trainer trainer = Trainer( model=model, # Do NOT move manually args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset # Include evaluation dataset if available ) print("✅ Trainer initialized successfully!") model.save_pretrained("/content/fine_tuned_llama2") tokenizer.save_pretrained("/content/fine_tuned_llama2") print("✅ Model saved successfully!") # Optional: Upload to Hugging Face from huggingface_hub import notebook_login notebook_login() # Replace "your-hf-username" with your actual Hugging Face username model.push_to_hub("and89/fine_tuned_llama2") tokenizer.push_to_hub("and89/fine_tuned_llama2") print("🚀 Model uploaded to Hugging Face!") from huggingface_hub import HfApi api = HfApi() datasets = api.list_repo_files("and89/fine_tuned_llama2") print("✅ Uploaded dataset files:", datasets) api.upload_file( path_or_fileobj="/content/drive/MyDrive/training_data.jsonl", # Update file path path_in_repo="training_data.jsonl", repo_id="and89/fine_tuned_llama2" ) from transformers import Trainer # Tokenize the dataset def tokenize_function(examples): return tokenizer(examples["facts"], 
padding="max_length", truncation=True) # Assuming "facts" is the column you want to use for input train_dataset = train_dataset.map(tokenize_function, batched=True) eval_dataset = eval_dataset.map(tokenize_function, batched=True) # Now initialize the Trainer trainer = Trainer( model=model, # Do NOT move manually args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset # Include evaluation dataset if available ) print("✅ Trainer initialized successfully!") from datasets import load_dataset # Replace with your dataset name dataset = load_dataset("and89/fine_tuned_llama2") # Check dataset format print(dataset) print(dataset["train"][0]) # Print first row to check structure print(dataset) # Prints dataset details print("Sample row:", dataset["train"][0]) # Prints the first row from datasets import load_dataset dataset = load_dataset("and89/fine_tuned_llama2") print("✅ Dataset loaded successfully!") print(dataset) from transformers import AutoTokenizer model_name = "bert-base-uncased" tokenizer = AutoTokenizer.from_pretrained(model_name) print(dataset["train"].features) def preprocess_function(examples): text_column = list(dataset["train"].features.keys())[0] # Get the text column name # Ensure the input is a list of strings texts = examples[text_column] # Convert all values to strings in case they are not texts = [str(text) for text in texts] return tokenizer(texts, padding="max_length", truncation=True) tokenized_datasets = dataset.map(preprocess_function, batched=True) print("✅ Tokenization successful!") tokenized_datasets = dataset.map(preprocess_function, batched=True, desc="Tokenizing dataset") print("✅ Tokenization successful!") print(tokenized_datasets) tokenized_datasets.save_to_disk("tokenized_dataset") # Reload and verify from datasets import load_from_disk reloaded_dataset = load_from_disk("tokenized_dataset") print("✅ Reloaded Tokenized Dataset:", reloaded_dataset) print(tokenized_datasets) # Prints available dataset splits from datasets import load_dataset # Load dataset dataset = load_dataset("and89/fine_tuned_llama2") # Split dataset (90% train, 10% test) train_test_split = dataset["train"].train_test_split(test_size=0.1) # Verify new splits print(train_test_split) from datasets import DatasetDict # Split dataset into train and test (90% train, 10% test) train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1) # Convert to DatasetDict tokenized_datasets = DatasetDict({ "train": train_test_split["train"], "test": train_test_split["test"] }) print("✅ Train-Test split created:", tokenized_datasets) print(tokenized_datasets["train"][0]) training_args = TrainingArguments( output_dir="./results", evaluation_strategy="epoch", save_strategy="epoch", per_device_train_batch_size=8, per_device_eval_batch_size=8, num_train_epochs=3, weight_decay=0.01, push_to_hub=True, hub_model_id="your_username/your_model_name", remove_unused_columns=False # Ensure input columns are kept ) from huggingface_hub import notebook_login # Authenticate with Hugging Face notebook_login() # Push model and tokenizer model.push_to_hub("and89/fine_tuned_llama2") tokenizer.push_to_hub("and89/fine_tuned_llama2") from transformers import pipeline # Load model from Hugging Face classifier = pipeline("text-classification", model="and89/fine_tuned_llama2") # Run inference result = classifier("Your input text here") print(result) !pip install gradio import gradio as gr def predict(text): return classifier(text) demo = gr.Interface(fn=predict, inputs="text", outputs="text") 
from transformers import pipeline

# Load the fine-tuned model
model_name = "and89/fine_tuned_llama2"  # replace with your actual model name
classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)

def predict(text):
    return classifier(text)[0]["label"]  # extract the predicted label

# Test the function
print("✅ Model loaded successfully!")
print(predict("Help me to analyze this case: employee filed complaint against supervisor terminated fine imposed"))

from huggingface_hub import login

login()  # automatically uses the HF_TOKEN secret when one is configured

# Caution: this releases the Colab runtime — run it only when completely
# finished, since no cell after it will execute on the same session.
from google.colab import runtime
runtime.unassign()

import gradio as gr
from transformers import pipeline

# Load the fine-tuned model
model_name = "and89/fine_tuned_llama2"  # replace with your actual model name
classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)

# Define the label mapping (adjust based on your dataset)
label_mapping = {
    "LABEL_0": "Not Guilty",
    "LABEL_1": "Guilty"
}

def predict(text):
    result = classifier(text)[0]  # extract the first (top) prediction
    label = result["label"]       # predicted label, e.g. "LABEL_1"
    score = result["score"]       # confidence score
    # Map the raw label to meaningful text
    label_text = label_mapping.get(label, "Unknown")
    return f"Prediction: {label_text} (Confidence: {score:.2f})"

# Gradio UI
demo = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="text",
    title="Legal Case Decision Predictor",
    description="Enter a legal case scenario, and the model will predict whether the decision is 'Guilty' or 'Not Guilty'."
)

# Launch the Gradio app
demo.launch()
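# The pipelines above assume the Hub repo hosts a sequence-classification head.
# Since the fine-tuning earlier in this notebook produced a PEFT (LoRA) adapter
# on top of a causal LM, a sketch of the matching way to reload it for text
# generation follows — the repo id and prompt come from this notebook, but
# treating the repo as a pure LoRA adapter is an assumption.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_name = "meta-llama/Llama-2-7b-hf"
adapter_repo = "and89/fine_tuned_llama2"

# Reload the base model, then attach the fine-tuned LoRA weights on top
base_model = AutoModelForCausalLM.from_pretrained(base_name, torch_dtype="auto", device_map="auto")
peft_model = PeftModel.from_pretrained(base_model, adapter_repo)

gen_tokenizer = AutoTokenizer.from_pretrained(base_name)
inputs = gen_tokenizer("Help me to analyze this case:", return_tensors="pt").to(peft_model.device)
output_ids = peft_model.generate(**inputs, max_new_tokens=100)
print(gen_tokenizer.decode(output_ids[0], skip_special_tokens=True))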