and89 committed on
Commit
d58462e
·
verified ·
1 Parent(s): 6f16bf3

requirements.txt

Browse files

torch==2.2.0
transformers==4.39.1
datasets==2.18.0
accelerate==0.27.2
peft==0.10.0
bitsandbytes==0.41.0
sentencepiece==0.1.99
gradio==4.20.0
google-colab
pandas
huggingface_hub==0.21.3

Files changed (1) hide show
  1. app.py +436 -0
app.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Untitled15.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1fx7o1di_oHCoQdFAAh8tqJ9NPQ82-rOH
8
+ """
9
+
10
+ import pandas as pd
11
+
12
+ # Mount Google Drive (optional if you want to save files there)
13
+ from google.colab import drive
14
+ drive.mount('/content/drive')
15
+
16
+ # Define file paths
17
+ input_csv_path = "/content/drive/MyDrive/judicial_cases.csv" # Ensure you have uploaded this file
18
+ train_csv_path = "/content/training_judicial_cases.csv"
19
+ val_csv_path = "/content/validation_judicial_cases.csv"
20
+
21
+ # Load the dataset
22
+ df = pd.read_csv(input_csv_path)
23
+
24
+ # Split dataset (80% training, 20% validation)
25
+ train_df = df.sample(frac=0.8, random_state=42) # Random sampling for training
26
+ val_df = df.drop(train_df.index) # Remaining 20% for validation
27
+
28
+ # Save training and validation sets as CSV
29
+ train_df.to_csv(train_csv_path, index=False)
30
+ val_df.to_csv(val_csv_path, index=False)
31
+
32
+ print(f"✅ Training set saved: {train_csv_path}")
33
+ print(f"✅ Validation set saved: {val_csv_path}")
34
+
35
+ # Copy to Google Drive (optional)
36
+ train_drive_path = "/content/drive/MyDrive/training_judicial_cases.csv"
37
+ val_drive_path = "/content/drive/MyDrive/validation_judicial_cases.csv"
38
+
39
+ !cp {train_csv_path} {train_drive_path}
40
+ !cp {val_csv_path} {val_drive_path}
41
+
42
+ print(f"📂 Training set also saved to Google Drive: {train_drive_path}")
43
+ print(f"📂 Validation set also saved to Google Drive: {val_drive_path}")
44
+
45
+ import os
46
+
47
+ file_path = "/content/drive/MyDrive/training_data.jsonl"
48
+
49
+ if os.path.exists(file_path):
50
+ print("✅ File exists, proceeding with upload...")
51
+ else:
52
+ print("❌ File not found! Check file path.")
53
+
54
+ import torch
55
+
56
+ if torch.cuda.is_available():
57
+ print("✅ GPU is available:", torch.cuda.get_device_name(0))
58
+ else:
59
+ print("❌ No GPU found! Go to Runtime → Change runtime type → Select GPU.")
60
+
61
+ import pandas as pd
62
+
63
+ # Load dataset
64
+ df = pd.read_csv("/content/drive/MyDrive/judicial_cases.csv")
65
+
66
+ # Display first few rows
67
+ print(df.head())
68
+
69
+ !pip install datasets
70
+
71
+ !pip install torch transformers peft bitsandbytes datasets accelerate sentencepiece
72
+
73
+ from huggingface_hub import login
74
+
75
+ login(token="") # Paste your HF token here
76
+ print("✅ Hugging Face login successful!")
77
+
78
+ from transformers import AutoModelForCausalLM, AutoTokenizer
79
+
80
+ model_name = "meta-llama/Llama-2-7b-hf"
81
+
82
+ tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
83
+ model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", use_auth_token=True)
84
+
85
+ print("✅ LLaMA 2 model loaded successfully!")
86
+
87
+ from peft import LoraConfig, get_peft_model
88
+ from transformers import TrainingArguments
89
+
90
+ # Define QLoRA configuration
91
+ lora_config = LoraConfig(
92
+ r=16, # Low-rank adaptation size
93
+ lora_alpha=32, # Scaling factor
94
+ lora_dropout=0.05, # Dropout to prevent overfitting
95
+ target_modules=["q_proj", "v_proj"] # Apply LoRA to attention layers
96
+ )
97
+
98
+ # Apply LoRA to the model
99
+ model = get_peft_model(model, lora_config)
100
+ model.print_trainable_parameters()
101
+
102
+ json_path = "/content/drive/MyDrive/judicial_cases.json"
103
+
104
+ from datasets import load_dataset
105
+
106
+ dataset = load_dataset("json", data_files={"train": json_path})
107
+ print("✅ Dataset loaded successfully!")
108
+
109
+ import os
110
+
111
+ json_path = "/content/drive/MyDrive/judicial_cases.json" # Update the path if needed
112
+
113
+ if os.path.exists(json_path):
114
+ print(f"✅ JSON file found: {json_path}")
115
+ else:
116
+ print(f"❌ JSON file not found! You need to generate it first.")
117
+
118
+ !pip install --upgrade datasets transformers
119
+
120
+ import datasets
121
+ from datasets import load_dataset
122
+
123
+ print("✅ Hugging Face `datasets` library is installed and working!")
124
+
125
+ import datasets
126
+ from datasets import load_dataset
127
+
128
+ print("✅ Hugging Face `datasets` library is installed and working!")
129
+
130
+ from datasets import load_dataset
131
+
132
+ # Load dataset from JSON file
133
+ dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"})
134
+
135
+ # Split dataset into training (80%) and evaluation (20%)
136
+ split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
137
+
138
+ train_dataset = split_dataset["train"]
139
+ eval_dataset = split_dataset["test"] # Required for evaluation
140
+
141
+ print("✅ Dataset split into training and evaluation sets!")
142
+
143
+ from google.colab import drive
144
+ drive.mount('/content/drive')
145
+
146
+ from datasets import load_dataset
147
+
148
+ dataset = load_dataset("json", data_files={"train": json_path})
149
+
150
+ print("✅ Dataset loaded successfully!")
151
+
152
+ from transformers import AutoModelForCausalLM, AutoTokenizer
153
+
154
+ model_name = "meta-llama/Llama-2-7b-hf"
155
+
156
+ # Load tokenizer
157
+ tokenizer = AutoTokenizer.from_pretrained(model_name, token="")
158
+
159
+ # Load model without offloading
160
+ model = AutoModelForCausalLM.from_pretrained(
161
+ model_name,
162
+ torch_dtype="auto",
163
+ #device_map="auto", # Remove automatic device mapping
164
+ #offload_folder="offload" # Remove offloading
165
+ )
166
+
167
+ # Manually move the model to the desired device
168
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
169
+ model.to(device) # Move entire model to GPU if available, else CPU
170
+
171
+ print("✅ Model loaded successfully!")
172
+
173
+ from datasets import load_dataset
174
+
175
+ # Load dataset from JSON file
176
+ dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/judicial_cases.json"})
177
+
178
+ # Split dataset into training (80%) and evaluation (20%)
179
+ split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
180
+
181
+ train_dataset = split_dataset["train"]
182
+ eval_dataset = split_dataset["test"] # Required for evaluation
183
+
184
+ print("✅ Dataset split into training and evaluation sets!")
185
+
186
+ from transformers import TrainingArguments
187
+
188
+ training_args = TrainingArguments(
189
+ output_dir="/content/fine_tuned_llama2",
190
+ per_device_train_batch_size=2,
191
+ gradient_accumulation_steps=4,
192
+ warmup_steps=100,
193
+ max_steps=500,
194
+ learning_rate=2e-4,
195
+ fp16=True,
196
+ logging_steps=10,
197
+ save_strategy="epoch",
198
+ eval_strategy="epoch", # Fix deprecation warning
199
+ push_to_hub=False
200
+ )
201
+
202
+ from transformers import Trainer
203
+
204
+ trainer = Trainer(
205
+ model=model, # Do NOT move manually
206
+ args=training_args,
207
+ train_dataset=train_dataset,
208
+ eval_dataset=eval_dataset # Include evaluation dataset if available
209
+ )
210
+
211
+ print("✅ Trainer initialized successfully!")
212
+
213
+ model.save_pretrained("/content/fine_tuned_llama2")
214
+ tokenizer.save_pretrained("/content/fine_tuned_llama2")
215
+
216
+ print("✅ Model saved successfully!")
217
+
218
+ # Optional: Upload to Hugging Face
219
+ from huggingface_hub import notebook_login
220
+ notebook_login()
221
+
222
+ # Replace "your-hf-username" with your actual Hugging Face username
223
+ model.push_to_hub("and89/fine_tuned_llama2")
224
+ tokenizer.push_to_hub("and89/fine_tuned_llama2")
225
+ print("🚀 Model uploaded to Hugging Face!")
226
+
227
+ from huggingface_hub import HfApi
228
+
229
+ api = HfApi()
230
+ datasets = api.list_repo_files("and89/fine_tuned_llama2")
231
+
232
+ print("✅ Uploaded dataset files:", datasets)
233
+
234
+ api.upload_file(
235
+ path_or_fileobj="/content/drive/MyDrive/training_data.jsonl", # Update file path
236
+ path_in_repo="training_data.jsonl",
237
+ repo_id="and89/fine_tuned_llama2"
238
+ )
239
+
240
+ from transformers import Trainer
241
+
242
+ # Tokenize the dataset
243
+ def tokenize_function(examples):
244
+ return tokenizer(examples["facts"], padding="max_length", truncation=True)
245
+
246
+ # Assuming "facts" is the column you want to use for input
247
+
248
+ train_dataset = train_dataset.map(tokenize_function, batched=True)
249
+ eval_dataset = eval_dataset.map(tokenize_function, batched=True)
250
+
251
+ # Now initialize the Trainer
252
+ trainer = Trainer(
253
+ model=model, # Do NOT move manually
254
+ args=training_args,
255
+ train_dataset=train_dataset,
256
+ eval_dataset=eval_dataset # Include evaluation dataset if available
257
+ )
258
+
259
+ print("✅ Trainer initialized successfully!")
260
+
261
+ from datasets import load_dataset
262
+
263
+ # Replace with your dataset name
264
+ dataset = load_dataset("and89/fine_tuned_llama2")
265
+
266
+ # Check dataset format
267
+ print(dataset)
268
+
269
+ print(dataset["train"][0]) # Print first row to check structure
270
+
271
+ print(dataset) # Prints dataset details
272
+ print("Sample row:", dataset["train"][0]) # Prints the first row
273
+
274
+ from datasets import load_dataset
275
+
276
+ dataset = load_dataset("and89/fine_tuned_llama2")
277
+ print("✅ Dataset loaded successfully!")
278
+ print(dataset)
279
+
280
+ from transformers import AutoTokenizer
281
+
282
+ model_name = "bert-base-uncased"
283
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
284
+
285
+ print(dataset["train"].features)
286
+
287
+ def preprocess_function(examples):
288
+ text_column = list(dataset["train"].features.keys())[0] # Get the text column name
289
+
290
+ # Ensure the input is a list of strings
291
+ texts = examples[text_column]
292
+
293
+ # Convert all values to strings in case they are not
294
+ texts = [str(text) for text in texts]
295
+
296
+ return tokenizer(texts, padding="max_length", truncation=True)
297
+
298
+ tokenized_datasets = dataset.map(preprocess_function, batched=True)
299
+
300
+ print("✅ Tokenization successful!")
301
+
302
+ tokenized_datasets = dataset.map(preprocess_function, batched=True, desc="Tokenizing dataset")
303
+
304
+ print("✅ Tokenization successful!")
305
+ print(tokenized_datasets)
306
+
307
+ tokenized_datasets.save_to_disk("tokenized_dataset")
308
+
309
+ # Reload and verify
310
+ from datasets import load_from_disk
311
+ reloaded_dataset = load_from_disk("tokenized_dataset")
312
+
313
+ print("✅ Reloaded Tokenized Dataset:", reloaded_dataset)
314
+
315
+ print(tokenized_datasets) # Prints available dataset splits
316
+
317
+ from datasets import load_dataset
318
+
319
+ # Load dataset
320
+ dataset = load_dataset("and89/fine_tuned_llama2")
321
+
322
+ # Split dataset (90% train, 10% test)
323
+ train_test_split = dataset["train"].train_test_split(test_size=0.1)
324
+
325
+ # Verify new splits
326
+ print(train_test_split)
327
+
328
+ from datasets import DatasetDict
329
+
330
+ # Split dataset into train and test (90% train, 10% test)
331
+ train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1)
332
+
333
+ # Convert to DatasetDict
334
+ tokenized_datasets = DatasetDict({
335
+ "train": train_test_split["train"],
336
+ "test": train_test_split["test"]
337
+ })
338
+
339
+ print("✅ Train-Test split created:", tokenized_datasets)
340
+
341
+ print(tokenized_datasets["train"][0])
342
+
343
+ training_args = TrainingArguments(
344
+ output_dir="./results",
345
+ evaluation_strategy="epoch",
346
+ save_strategy="epoch",
347
+ per_device_train_batch_size=8,
348
+ per_device_eval_batch_size=8,
349
+ num_train_epochs=3,
350
+ weight_decay=0.01,
351
+ push_to_hub=True,
352
+ hub_model_id="your_username/your_model_name",
353
+ remove_unused_columns=False # Ensure input columns are kept
354
+ )
355
+
356
+ from huggingface_hub import notebook_login
357
+
358
+ # Authenticate with Hugging Face
359
+ notebook_login()
360
+
361
+ # Push model and tokenizer
362
+ model.push_to_hub("and89/fine_tuned_llama2")
363
+ tokenizer.push_to_hub("and89/fine_tuned_llama2")
364
+
365
+ from transformers import pipeline
366
+
367
+ # Load model from Hugging Face
368
+ classifier = pipeline("text-classification", model="and89/fine_tuned_llama2")
369
+
370
+ # Run inference
371
+ result = classifier("Your input text here")
372
+ print(result)
373
+
374
+ !pip install gradio
375
+
376
+ import gradio as gr
377
+
378
+ def predict(text):
379
+ return classifier(text)
380
+
381
+ demo = gr.Interface(fn=predict, inputs="text", outputs="text")
382
+ demo.launch()
383
+
384
+ from transformers import pipeline
385
+
386
+ # Load the fine-tuned model
387
+ model_name = "and89/fine_tuned_llama2" # Replace with your actual model name
388
+ classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
389
+
390
+ def predict(text):
391
+ return classifier(text)[0]["label"] # Extracts the predicted label
392
+
393
+ # Test the function
394
+ print("✅ Model loaded successfully!")
395
+ print(predict("Help me to analyze this case: employee filed complaint against supervisor terminated fine imposed"))
396
+
397
+ from huggingface_hub import login
398
+ login() # This will automatically use the HF_TOKEN secret
399
+
400
+ from google.colab import runtime
401
+ runtime.unassign()
402
+
403
+ import gradio as gr
404
+ from transformers import pipeline
405
+
406
+ # Load the fine-tuned model
407
+ model_name = "and89/fine_tuned_llama2" # Replace with your actual model name
408
+ classifier = pipeline("text-classification", model=model_name, tokenizer=model_name)
409
+
410
+ # Define label mapping (adjust based on your dataset)
411
+ label_mapping = {
412
+ "LABEL_0": "Not Guilty",
413
+ "LABEL_1": "Guilty"
414
+ }
415
+
416
+ def predict(text):
417
+ result = classifier(text)[0] # Extract the first result
418
+ label = result["label"] # Get the predicted label (e.g., "LABEL_1")
419
+ score = result["score"] # Confidence score
420
+
421
+ # Map label to meaningful text
422
+ label_text = label_mapping.get(label, "Unknown")
423
+
424
+ return f"Prediction: {label_text} (Confidence: {score:.2f})"
425
+
426
+ # Gradio UI
427
+ demo = gr.Interface(
428
+ fn=predict,
429
+ inputs="text",
430
+ outputs="text",
431
+ title="Legal Case Decision Predictor",
432
+ description="Enter a legal case scenario, and the model will predict whether the decision is 'Guilty' or 'Not Guilty'."
433
+ )
434
+
435
+ # Launch the Gradio app
436
+ demo.launch()