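"""Fine-tune DistilBERT on the IMDB sentiment dataset with the Hugging Face
Trainer, then write per-class prediction probabilities to submission.csv."""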
import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
)

def tokenize_function(examples):
    # Pad/truncate each review to DistilBERT's 512-token maximum input length
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

if __name__ == "__main__":
    imdb = load_dataset("imdb")
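    # load_dataset("imdb") returns 'train', 'test', and 'unsupervised' splits;
    # only train and test are used below.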

    # Preprocess data
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    tokenized_imdb = imdb.map(tokenize_function, batched=True)
    tokenized_imdb.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
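    # Note: the Trainer's default data collator renames the 'label' column to
    # the 'labels' keyword argument that the model's forward pass expects.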

    # Define model
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
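    # The sequence-classification head is newly initialized on top of the
    # pretrained encoder (transformers warns about untrained weights here);
    # fine-tuning below trains that head.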

    # Set up TrainingArguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )
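    # Note: with these arguments the Trainer does not evaluate during training
    # (evaluation_strategy/eval_strategy defaults to "no"), and checkpoints
    # are written under output_dir.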

    # Create a Trainer instance. Trainer handles device placement itself (and
    # uses Accelerate internally for distributed training when launched via
    # `accelerate launch` or `torchrun`), so no manual Accelerator or
    # DataLoader preparation is needed; the original accelerator.prepare call
    # produced dataloaders that were never used.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_imdb['train'],
        eval_dataset=tokenized_imdb['test'],
    )

    # Start training
    trainer.train()

    # Use the Trainer's predict method to get predictions
    predictions = trainer.predict(tokenized_imdb['test'])

    # Extract the logits from the predictions
    logits = predictions.predictions

    # Convert logits to probabilities with a softmax over the class dimension
    probs = torch.softmax(torch.from_numpy(logits), dim=1).numpy()

    # Create a DataFrame with the probabilities
    submission = pd.DataFrame(probs, columns=['negative', 'positive'])

    # Save the DataFrame to a CSV file
    submission.to_csv('submission.csv', index=False)
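    # Sanity check (optional): each row of submission.csv holds the softmax
    # probabilities for one test example, so every row should sum to ~1.0.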

    # Print a message confirming the save
    print('Predictions saved to submission.csv')

    # Original per-example evaluation loop, superseded by trainer.predict
    # above; kept commented out for reference.
    '''
    #evaluate model and print accuracy on test set, also save the predictions of probabilities per class to submission.csv
    submission = pd.DataFrame(columns=list(range(2)), index=range(len(imdb["test"])))
    acc = 0
    for idx, data in enumerate(imdb["test"]):
        text = data["text"]
        label = data["label"]
        pred = model(text) # TODO: replace with proper prediction
        pred = torch.softmax(pred, dim=0)
        submission.loc[idx] = pred.tolist()
        acc += int(torch.argmax(pred).item() == label)
    print("Accuracy: ", acc/len(imdb["test"]))
    
    submission.to_csv('submission.csv', index_label='idx')
    '''