Spaces:

SivaMallikarjun
/

multi-lang-rl-model

Sleeping

App Files Files Community

SivaMallikarjun committed on Apr 26

Commit

81b935b

verified ·

1 Parent(s): 979d940

Upload 8 files

Browse files

Files changed (8) hide show

README.md +0 -0
app.py +22 -0
checkpoints/checkpoint_episode_100/pytorch_model.bin +0 -0
data/raw_data.csv +6 -0
logs/Untitledtraining_log.txt +4 -0
requirements.txt +5 -0
train.py +61 -0
utils/data_preprocessing.py +0 -0

README.md ADDED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import gradio as gr
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import torch
+# Directory of the fine-tuned, dynamically quantized checkpoint: train.py saves
+# the fine-tuned model to "./models/fine_tuned_xlm_roberta" and quantize_model()
+# writes its output to that path plus "_quantized".
+# NOTE(review): from_pretrained() constructs a regular float model, so the int8
+# Linear weights produced by quantize_dynamic may not be restored correctly
+# from this directory -- confirm predictions match the unquantized model.
+model_path = "./models/fine_tuned_xlm_roberta_quantized"
+# Loaded once at import time; classify_text() reuses these module-level objects.
+model = AutoModelForSequenceClassification.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+def classify_text(text):
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+    outputs = model(**inputs)
+    prediction = torch.argmax(outputs.logits, dim=1).item()
+    label = "Correct" if prediction == 1 else "Incorrect"
+    return label
+iface = gr.Interface(fn=classify_text,
+                     inputs="text",
+                     outputs="text",
+                     title="Multi-Language RL Text Classifier")
+if __name__ == "__main__":
+    iface.launch()

checkpoints/checkpoint_episode_100/pytorch_model.bin ADDED Viewed

File without changes

data/raw_data.csv ADDED Viewed

	@@ -0,0 +1,6 @@

+text,label
+"Bonjour tout le monde",1
+"Hola mundo",1
+"Hello world",1
+"Das ist falsch",0
+"यह गलत है",0

logs/Untitledtraining_log.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+Epoch 1/3 - Loss: 0.456 - Accuracy: 88%
+Epoch 2/3 - Loss: 0.320 - Accuracy: 91%
+Epoch 3/3 - Loss: 0.278 - Accuracy: 93%
+Final Model saved to ./models/fine_tuned_xlm_roberta_quantized/

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+transformers
+torch
+gradio
+datasets
+huggingface_hub

train.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+from datasets import load_dataset
+import torch
+import os
+# Load Dataset
+dataset = load_dataset('csv', data_files={'train': './data/raw_data.csv'}, delimiter=",")
+# Load Pretrained Tokenizer and Model
+model_name = "xlm-roberta-base"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+# Tokenization
+def preprocess_function(examples):
+    return tokenizer(examples['text'], truncation=True, padding=True)
+encoded_dataset = dataset.map(preprocess_function, batched=True)
+# Training Arguments
+training_args = TrainingArguments(
+    output_dir="./checkpoints",
+    num_train_epochs=3,
+    per_device_train_batch_size=8,
+    save_steps=100,
+    save_total_limit=1,
+    logging_dir="./logs",
+    logging_steps=10,
+    evaluation_strategy="no",
+    push_to_hub=False,
+    load_best_model_at_end=False
+)
+# Trainer Setup
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=encoded_dataset['train']
+)
+# Start Training
+trainer.train()
+# Save Final Fine-tuned Model
+save_directory = "./models/fine_tuned_xlm_roberta"
+os.makedirs(save_directory, exist_ok=True)
+model.save_pretrained(save_directory)
+tokenizer.save_pretrained(save_directory)
+# Quantize Model (Make Lightweight)
+def quantize_model(model_path):
+    model = AutoModelForSequenceClassification.from_pretrained(model_path)
+    model.to(torch.device('cpu'))
+    model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
+    quantized_model_path = model_path + "_quantized"
+    os.makedirs(quantized_model_path, exist_ok=True)
+    model.save_pretrained(quantized_model_path)
+    tokenizer.save_pretrained(quantized_model_path)
+    print(f"Quantized model saved to {quantized_model_path}")
+quantize_model(save_directory)

utils/data_preprocessing.py ADDED Viewed

File without changes