File size: 2,758 Bytes
38f7c66
 
6281c34
 
38f7c66
6281c34
38f7c66
 
 
 
6281c34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38f7c66
 
6281c34
 
38f7c66
 
 
 
 
 
 
 
 
 
6281c34
38f7c66
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gradio as gr
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW

# Load model and tokenizer
# `model_name` is passed to both `from_pretrained` calls below, so the
# tokenizer and the model are loaded from the same checkpoint identifier.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 requests a two-class sequence-classification model; this script
# uses label 1 for "Correct" and label 0 for "Incorrect".
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Prepare a custom dataset
# Each record pairs a statement with its truth label
# (1 = Correct, 0 = Incorrect); the two parallel lists used by the rest of
# the script are derived from these records.
_labelled_statements = (
    ("Water freezes at 0 degrees Celsius.", 1),
    ("The sun rises in the west.", 0),
    ("Dogs can fly in the sky.", 0),
    ("Birds lay eggs.", 1),
    ("The earth is flat.", 0),
    ("Fish can swim in water.", 1),
    ("Humans can live without oxygen.", 0),
    ("Plants need sunlight to grow.", 1),
    ("Cars run on milk.", 0),
    ("The moon orbits the earth.", 1),
)
train_texts = [statement for statement, _ in _labelled_statements]
train_labels = [verdict for _, verdict in _labelled_statements]

# Create Dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one labelled text per item.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of labels, one per entry in ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")`` that returns a mapping of
            tensors carrying a leading batch dimension of size 1.
        max_length: Length every example is padded/truncated to. Defaults to
            128, the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch, or silently ignore surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` plus its ``'labels'`` tensor."""
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" yields a leading batch dimension of 1; drop it so
        # the DataLoader can stack the items into a batch itself.
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Load Dataset
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Define optimizer
# Use PyTorch's own AdamW: the `AdamW` class exported by `transformers` is
# deprecated and has been removed from recent releases, so the name imported
# from `transformers` at the top of the file is no longer used here.
# eps and weight_decay are pinned to the defaults of the deprecated class
# (torch's defaults are eps=1e-8, weight_decay=0.01) so training is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Fine-tuning loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

for epoch in range(5):  # Train for 5 epochs
    for batch in train_loader:
        # Move every tensor of the batch (inputs and labels) to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains 'labels', so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1} completed")

# Now model is fine-tuned!
# Leave training mode so dropout is disabled for every later prediction.
model.eval()

# Define prediction function
def classify_text(text):
    """Classify a statement with the fine-tuned model.

    Args:
        text: The statement to classify.

    Returns:
        "Correct" if the highest-scoring class is 1, otherwise "Incorrect".
    """
    # The training loop puts the model in train mode; without switching to
    # eval mode dropout stays active and the same input can yield different
    # predictions from call to call.
    model.eval()
    # A single sequence needs no padding, so it is only truncated to the
    # 128-token limit instead of being padded out to it.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        output = model(**inputs)
    # logits has one row per input; pick the index of the larger class score.
    label = torch.argmax(output.logits, dim=1).item()
    return "Correct" if label == 1 else "Incorrect"

# Gradio UI
# Collect the interface settings first, then build and start the app:
# a single text box feeds `classify_text`, whose string result is shown as text.
interface_options = {
    "fn": classify_text,
    "inputs": gr.Textbox(label="Enter Text"),
    "outputs": "text",
    "title": "Multi-Language RL Model (Trained)",
}
gradio_app = gr.Interface(**interface_options)

gradio_app.launch()