SivaMallikarjun committed
Commit 81b935b (verified)
1 parent: 979d940

Upload 8 files

README.md ADDED
File without changes
app.py ADDED
@@ -0,0 +1,26 @@
+ import gradio as gr
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ import torch
+
+ # Load the fine-tuned (quantized) classifier and its tokenizer
+ model_path = "./models/fine_tuned_xlm_roberta_quantized"
+ model = AutoModelForSequenceClassification.from_pretrained(model_path)
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ model.eval()
+
+ def classify_text(text):
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+     with torch.no_grad():  # inference only, no gradients needed
+         outputs = model(**inputs)
+     prediction = torch.argmax(outputs.logits, dim=1).item()
+     return "Correct" if prediction == 1 else "Incorrect"
+
+ iface = gr.Interface(
+     fn=classify_text,
+     inputs="text",
+     outputs="text",
+     title="Multi-Language RL Text Classifier",
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
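
A caveat on app.py's model loading: dynamic quantization swaps nn.Linear modules in memory, so a checkpoint saved from a quantized model is generally not restored as int8 by from_pretrained alone. A minimal loading sketch, assuming the app instead starts from train.py's full-precision save and re-applies quantization at startup:

    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    # Load the full-precision fine-tuned model, then quantize it in memory
    base_path = "./models/fine_tuned_xlm_roberta"  # full-precision save from train.py
    model = AutoModelForSequenceClassification.from_pretrained(base_path)
    model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
    tokenizer = AutoTokenizer.from_pretrained(base_path)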
checkpoints/checkpoint_episode_100/pytorch_model.bin ADDED
File without changes
data/raw_data.csv ADDED
@@ -0,0 +1,6 @@
+ text,label
+ "Bonjour tout le monde",1
+ "Hola mundo",1
+ "Hello world",1
+ "Das ist falsch",0
+ "यह गलत है",0
logs/Untitledtraining_log.txt ADDED
@@ -0,0 +1,4 @@
+ Epoch 1/3 - Loss: 0.456 - Accuracy: 88%
+ Epoch 2/3 - Loss: 0.320 - Accuracy: 91%
+ Epoch 3/3 - Loss: 0.278 - Accuracy: 93%
+ Final Model saved to ./models/fine_tuned_xlm_roberta_quantized/
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ torch
+ gradio
+ datasets
+ huggingface_hub
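
The requirements are unpinned, so upstream API changes can break these scripts; newer transformers releases, for instance, renamed TrainingArguments' evaluation_strategy (used in train.py) to eval_strategy. A pinned variant might look like the following; the exact versions are illustrative assumptions, not tested pins:

    transformers==4.38.2
    torch==2.2.1
    gradio==4.26.0
    datasets==2.18.0
    huggingface_hub==0.22.2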
train.py ADDED
@@ -0,0 +1,64 @@
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+ from datasets import load_dataset
+ import torch
+ import os
+
+ # Load the labeled dataset (columns: text, label)
+ dataset = load_dataset('csv', data_files={'train': './data/raw_data.csv'}, delimiter=",")
+
+ # Load the pretrained tokenizer and model with a binary classification head
+ model_name = "xlm-roberta-base"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+
+ # Tokenization; Trainer drops the unused raw 'text' column by default
+ def preprocess_function(examples):
+     return tokenizer(examples['text'], truncation=True, padding=True)
+
+ encoded_dataset = dataset.map(preprocess_function, batched=True)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     output_dir="./checkpoints",
+     num_train_epochs=3,
+     per_device_train_batch_size=8,
+     save_steps=100,
+     save_total_limit=1,
+     logging_dir="./logs",
+     logging_steps=10,
+     evaluation_strategy="no",
+     push_to_hub=False,
+     load_best_model_at_end=False
+ )
+
+ # Trainer setup (train split only; evaluation disabled above)
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=encoded_dataset['train']
+ )
+
+ # Start training
+ trainer.train()
+
+ # Save the final fine-tuned model and tokenizer
+ save_directory = "./models/fine_tuned_xlm_roberta"
+ os.makedirs(save_directory, exist_ok=True)
+ model.save_pretrained(save_directory)
+ tokenizer.save_pretrained(save_directory)
+
+ # Quantize the model (dynamic int8 on Linear layers) to shrink it for CPU inference.
+ # Uses the module-level `tokenizer` defined above.
+ def quantize_model(model_path):
+     model = AutoModelForSequenceClassification.from_pretrained(model_path)
+     model.to(torch.device('cpu'))  # dynamic quantization runs on CPU
+     model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
+     quantized_model_path = model_path + "_quantized"
+     os.makedirs(quantized_model_path, exist_ok=True)
+     # Note: a model saved after quantize_dynamic is generally not restored as int8
+     # by a plain from_pretrained call; loaders may need to re-apply quantization.
+     model.save_pretrained(quantized_model_path)
+     tokenizer.save_pretrained(quantized_model_path)
+     print(f"Quantized model saved to {quantized_model_path}")
+
+ quantize_model(save_directory)
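
A quick sanity check on the artifacts train.py writes is to run one dataset sentence through the saved model, mirroring app.py's classify_text logic. A minimal sketch, assuming the full-precision save directory:

    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    path = "./models/fine_tuned_xlm_roberta"  # written by train.py
    model = AutoModelForSequenceClassification.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.eval()

    inputs = tokenizer("Hello world", return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    print("Correct" if logits.argmax(dim=1).item() == 1 else "Incorrect")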
utils/data_preprocessing.py ADDED
File without changes