Spaces:

and89
/

demo3-app

Sleeping

App Files Files Community

and89 committed on Mar 7

Commit

d58462e

verified ·

1 Parent(s): 6f16bf3

requirements.txt

Browse files

torch==2.2.0
transformers==4.39.1
datasets==2.18.0
accelerate==0.27.2
peft==0.10.0
bitsandbytes==0.41.0
sentencepiece==0.1.99
gradio==4.20.0
google-colab
pandas
huggingface_hub==0.21.3

Files changed (1) hide show

app.py +436 -0

app.py ADDED Viewed

	@@ -0,0 +1,436 @@

+# -*- coding: utf-8 -*-
+"""Untitled15.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1fx7o1di_oHCoQdFAAh8tqJ9NPQ82-rOH
+"""
+import pandas as pd
+# Mount Google Drive (optional if you want to save files there)
+from google.colab import drive
+drive.mount('/content/drive')
+# Define file paths
+input_csv_path = "/content/drive/MyDrive/judicial_cases.csv"  # Ensure you have uploaded this file
+train_csv_path = "/content/training_judicial_cases.csv"
+val_csv_path = "/content/validation_judicial_cases.csv"
+# Load the dataset
+df = pd.read_csv(input_csv_path)
+# Split dataset (80% training, 20% validation)
+train_df = df.sample(frac=0.8, random_state=42)  # Random sampling for training
+val_df = df.drop(train_df.index)  # Remaining 20% for validation
+# Save training and validation sets as CSV
+train_df.to_csv(train_csv_path, index=False)
+val_df.to_csv(val_csv_path, index=False)
+print(f"✅ Training set saved: {train_csv_path}")
+print(f"✅ Validation set saved: {val_csv_path}")
+# Copy to Google Drive (optional)
+train_drive_path = "/content/drive/MyDrive/training_judicial_cases.csv"
+val_drive_path = "/content/drive/MyDrive/validation_judicial_cases.csv"
+!cp {train_csv_path} {train_drive_path}
+!cp {val_csv_path} {val_drive_path}
+print(f"📂 Training set also saved to Google Drive: {train_drive_path}")
+print(f"📂 Validation set also saved to Google Drive: {val_drive_path}")
+import os
+file_path = "/content/drive/MyDrive/training_data.jsonl"
+if os.path.exists(file_path):
+    print("✅ File exists, proceeding with upload...")
+else:
+    print("❌ File not found! Check file path.")
+import torch
+if torch.cuda.is_available():
+    print("✅ GPU is available:", torch.cuda.get_device_name(0))
+else:
+    print("❌ No GPU found! Go to Runtime → Change runtime type → Select GPU.")
+import pandas as pd
+# Load dataset
+df = pd.read_csv("/content/drive/MyDrive/judicial_cases.csv")
+# Display first few rows
+print(df.head())
+!pip install datasets
+!pip install torch transformers peft bitsandbytes datasets accelerate sentencepiece
+from huggingface_hub import login
+login(token="")  # Paste your HF token here
+print("✅ Hugging Face login successful!")
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "meta-llama/Llama-2-7b-hf"
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", use_auth_token=True)
+print("✅ LLaMA 2 model loaded successfully!")
+from peft import LoraConfig, get_peft_model
+from transformers import TrainingArguments
+# Define the LoRA adapter configuration. NOTE(review): the original comment called this QLoRA, but no quantization config is passed when the model is loaded in this file — confirm intent
+lora_config = LoraConfig(
+    r=16,  # Low-rank adaptation size
+    lora_alpha=32,  # Scaling factor
+    lora_dropout=0.05,  # Dropout to prevent overfitting
+    target_modules=["q_proj", "v_proj"]  # Apply LoRA to the modules named q_proj and v_proj (attention projections)
+)
+# Wrap the loaded model with the LoRA adapters and rebind `model` to the wrapped version
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()  # Prints a summary of trainable parameters
+json_path = "/content/drive/MyDrive/judicial_cases.json"
+from datasets import load_dataset
+dataset = load_dataset("json", data_files={"train": json_path})
+print("✅ Dataset loaded successfully!")
+import os
+json_path = "/content/drive/MyDrive/judicial_cases.json"  # Update the path if needed
+if os.path.exists(json_path):
+    print(f"✅ JSON file found: {json_path}")
+else:
+    print(f"❌ JSON file not found! You need to generate it first.")
+!pip install --upgrade datasets transformers
+import datasets
+from datasets import load_dataset
+print("✅ Hugging Face `datasets` library is installed and working!")
+import datasets
+from datasets import load_dataset
+print("✅ Hugging Face `datasets` library is installed and working!")
+from datasets import load_dataset
+# Load dataset from JSON file
+dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"})
+# Split dataset into training (80%) and evaluation (20%)
+split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
+train_dataset = split_dataset["train"]
+eval_dataset = split_dataset["test"]  # Required for evaluation
+print("✅ Dataset split into training and evaluation sets!")
+from google.colab import drive
+drive.mount('/content/drive')
+from datasets import load_dataset
+dataset = load_dataset("json", data_files={"train": json_path})
+print("✅ Dataset loaded successfully!")
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "meta-llama/Llama-2-7b-hf"
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, token="")
+# Load model without offloading
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    #device_map="auto",  # Remove automatic device mapping
+    #offload_folder="offload"  # Remove offloading
+)
+# Manually move the model to the desired device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device) # Move entire model to GPU if available, else CPU
+print("✅ Model loaded successfully!")
+from datasets import load_dataset
+# Load dataset from JSON file
+dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"})
+# Split dataset into training (80%) and evaluation (20%)
+split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
+train_dataset = split_dataset["train"]
+eval_dataset = split_dataset["test"]  # Required for evaluation
+print("✅ Dataset split into training and evaluation sets!")
+from transformers import TrainingArguments
+training_args = TrainingArguments(
+    output_dir="/content/fine_tuned_llama2",
+    per_device_train_batch_size=2,
+    gradient_accumulation_steps=4,
+    warmup_steps=100,
+    max_steps=500,
+    learning_rate=2e-4,
+    fp16=True,
+    logging_steps=10,
+    save_strategy="epoch",
+    eval_strategy="epoch",  # Fix deprecation warning
+    push_to_hub=False
+)
+from transformers import Trainer
+trainer = Trainer(
+    model=model,  # Do NOT move manually. NOTE(review): an earlier cell in this file does call model.to(device) — confirm which is intended
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset  # Evaluation split produced by train_test_split above
+)
+print("✅ Trainer initialized successfully!")
+model.save_pretrained("/content/fine_tuned_llama2")  # NOTE(review): no trainer.train() call appears before this point in the file — confirm the saved weights are actually fine-tuned
+tokenizer.save_pretrained("/content/fine_tuned_llama2")  # Tokenizer files go next to the model weights
+print("✅ Model saved successfully!")
+# Optional: upload the model and tokenizer to the Hugging Face Hub
+from huggingface_hub import notebook_login
+notebook_login()
+# The repo id below is "<username>/<repo name>"; replace "and89" with your own Hugging Face username
+model.push_to_hub("and89/fine_tuned_llama2")
+tokenizer.push_to_hub("and89/fine_tuned_llama2")
+print("🚀 Model uploaded to Hugging Face!")
+from huggingface_hub import HfApi
+api = HfApi()
+datasets = api.list_repo_files("and89/fine_tuned_llama2")
+print("✅ Uploaded dataset files:", datasets)
+api.upload_file(
+    path_or_fileobj="/content/drive/MyDrive/training_data.jsonl",  # Update file path
+    path_in_repo="training_data.jsonl",
+    repo_id="and89/fine_tuned_llama2"
+)
+from transformers import Trainer
+# Tokenize the dataset
+def tokenize_function(examples):
+    return tokenizer(examples["facts"], padding="max_length", truncation=True)
+# Assuming "facts" is the column you want to use for input
+train_dataset = train_dataset.map(tokenize_function, batched=True)
+eval_dataset = eval_dataset.map(tokenize_function, batched=True)
+# Now initialize the Trainer
+trainer = Trainer(
+    model=model,  # Do NOT move manually
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset  # Include evaluation dataset if available
+)
+print("✅ Trainer initialized successfully!")
+from datasets import load_dataset
+# Replace with your dataset name. NOTE(review): this is the same repo id the model and tokenizer were pushed to earlier in this file — confirm it resolves as a dataset
+dataset = load_dataset("and89/fine_tuned_llama2")
+# Check dataset format
+print(dataset)
+print(dataset["train"][0])  # Print first row to check structure
+print(dataset)  # Prints dataset details
+print("Sample row:", dataset["train"][0])  # Prints the first row
+from datasets import load_dataset
+dataset = load_dataset("and89/fine_tuned_llama2")  # Same load repeated (duplicate notebook cell)
+print("✅ Dataset loaded successfully!")
+print(dataset)
+from transformers import AutoTokenizer
+model_name = "bert-base-uncased"  # Rebinds model_name, previously the LLaMA 2 checkpoint name
+tokenizer = AutoTokenizer.from_pretrained(model_name)  # Rebinds the global tokenizer to the BERT tokenizer
+print(dataset["train"].features)  # Shows the train split's column names and types
+def preprocess_function(examples):
+    text_column = list(dataset["train"].features.keys())[0]  # Get the text column name
+    # Ensure the input is a list of strings
+    texts = examples[text_column]
+    # Convert all values to strings in case they are not
+    texts = [str(text) for text in texts]
+    return tokenizer(texts, padding="max_length", truncation=True)
+tokenized_datasets = dataset.map(preprocess_function, batched=True)  # Tokenize every split in batches
+print("✅ Tokenization successful!")
+tokenized_datasets = dataset.map(preprocess_function, batched=True, desc="Tokenizing dataset")  # Same map again with a progress description; overwrites the previous result
+print("✅ Tokenization successful!")
+print(tokenized_datasets)
+tokenized_datasets.save_to_disk("tokenized_dataset")  # Written to a path relative to the working directory
+# Reload the saved copy from disk and print it as a check
+from datasets import load_from_disk
+reloaded_dataset = load_from_disk("tokenized_dataset")
+print("✅ Reloaded Tokenized Dataset:", reloaded_dataset)
+print(tokenized_datasets)  # Prints available dataset splits
+from datasets import load_dataset
+# Load dataset
+dataset = load_dataset("and89/fine_tuned_llama2")
+# Split dataset (90% train, 10% test)
+train_test_split = dataset["train"].train_test_split(test_size=0.1)
+# Verify new splits
+print(train_test_split)
+from datasets import DatasetDict
+# Split dataset into train and test (90% train, 10% test)
+train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1)
+# Convert to DatasetDict
+tokenized_datasets = DatasetDict({
+    "train": train_test_split["train"],
+    "test": train_test_split["test"]
+})
+print("✅ Train-Test split created:", tokenized_datasets)
+print(tokenized_datasets["train"][0])
+training_args = TrainingArguments(
+    output_dir="./results",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    push_to_hub=True,
+    hub_model_id="your_username/your_model_name",
+    remove_unused_columns=False  # Ensure input columns are kept
+)
+from huggingface_hub import notebook_login
+# Authenticate with Hugging Face
+notebook_login()
+# Push model and tokenizer to the same Hub repo
+model.push_to_hub("and89/fine_tuned_llama2")
+tokenizer.push_to_hub("and89/fine_tuned_llama2")  # NOTE(review): `tokenizer` was last rebound to the bert-base-uncased tokenizer earlier in this file — confirm that is the one meant to sit next to the LLaMA weights
+from transformers import pipeline
+# Load model from Hugging Face
+classifier = pipeline("text-classification", model="and89/fine_tuned_llama2")
+# Run inference
+result = classifier("Your input text here")
+print(result)
+!pip install gradio
+import gradio as gr
+def predict(text):
+    return classifier(text)
+demo = gr.Interface(fn=predict, inputs="text", outputs="text")
+demo.launch()
+from transformers import pipeline
+# Load the fine-tuned model
+model_name = "and89/fine_tuned_llama2"  # Replace with your actual model name
+classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
+def predict(text):
+    return classifier(text)[0]["label"]  # Extracts the predicted label
+# Test the function
+print("✅ Model loaded successfully!")
+print(predict("Help me to analyze this case: employee filed complaint against supervisor terminated fine imposed"))
+from huggingface_hub import login
+login()  # Called without a token. NOTE(review): the original comment says this automatically uses the HF_TOKEN secret — confirm it does not prompt instead
+from google.colab import runtime
+runtime.unassign()  # NOTE(review): presumably releases the Colab runtime; the Gradio app below is defined after this call — confirm the intended order
+import gradio as gr
+from transformers import pipeline
+# Load the fine-tuned model
+model_name = "and89/fine_tuned_llama2"  # Replace with your actual model name
+classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
+# Define label mapping (adjust based on your dataset)
+label_mapping = {
+    "LABEL_0": "Not Guilty",
+    "LABEL_1": "Guilty"
+}
+def predict(text):
+    result = classifier(text)[0]  # Extract the first result
+    label = result["label"]  # Get the predicted label (e.g., "LABEL_1")
+    score = result["score"]  # Confidence score
+    # Map label to meaningful text
+    label_text = label_mapping.get(label, "Unknown")
+    return f"Prediction: {label_text} (Confidence: {score:.2f})"
+# Gradio UI
+demo = gr.Interface(
+    fn=predict,
+    inputs="text",
+    outputs="text",
+    title="Legal Case Decision Predictor",
+    description="Enter a legal case scenario, and the model will predict whether the decision is 'Guilty' or 'Not Guilty'."
+)
+# Launch the Gradio app
+demo.launch()