import gradio as gr
import spacy
import math
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
import torch.nn.functional as F
import numpy as np
import evaluate


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# answer = "Pizza"
guesses = []
answer = "Pizza"


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def tokenize_function(examples):
    # GLUE CoLA stores the raw text in the "sentence" column
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = evaluate.load("accuracy")
    return metric.compute(predictions=predictions, references=labels)


def training():
    # Load the full GLUE CoLA DatasetDict (train / validation / test splits)
    dataset = load_dataset("glue", "cola")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Use small subsets to keep training quick; the GLUE test split is
    # unlabeled, so evaluate on the validation split instead
    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1000))

    finetune(small_train_dataset, small_eval_dataset)


def finetune(train, eval):
    # CoLA is a binary acceptability task, so the classification head needs two labels
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
    
    # Reference for training sentence-transformers models:
    # https://huggingface.co/blog/how-to-train-sentence-transformers
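    # Hedged sketch (an addition, not part of the original app): the blog post
    # above fine-tunes a SentenceTransformer on paired examples with a loss such
    # as CosineSimilarityLoss. The pairs and scores below are invented purely
    # for illustration; the block is commented out so it does not run here.
    #
    # from torch.utils.data import DataLoader
    # from sentence_transformers import InputExample, losses
    #
    # st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    # train_examples = [
    #     InputExample(texts=["A cheesy pizza", "A pizza with cheese"], label=0.9),
    #     InputExample(texts=["A cheesy pizza", "A cloudy night sky"], label=0.1),
    # ]
    # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
    # train_loss = losses.CosineSimilarityLoss(st_model)
    # st_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)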
    
    
    # Accuracy is reported by the Trainer via compute_metrics below
    
    training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=eval,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()
    
    sentences = ["This is an example sentence", "Each sentence is converted"]

    # model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode(sentences)
    print(embeddings)
    
    # Compute the same sentence embeddings manually with transformers + mean pooling

    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    print("Sentence embeddings:")
    print(sentence_embeddings)
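
    # Illustrative addition (not from the original app): the embeddings above are
    # L2-normalized, so cosine similarity reduces to a dot product. This shows
    # how two sentences could be compared numerically.
    similarity = torch.dot(sentence_embeddings[0], sentence_embeddings[1])
    print("Cosine similarity between the example sentences:", similarity.item())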

 

def greet(name):
    return "Hello " + name + "!!"

def check_answer(guess: str):
    global guesses
    global answer
    guesses.append(guess)

    # List every guess so far as a bullet, then drop the trailing newline
    output = ""
    for previous_guess in guesses:
        output += "- " + previous_guess + "\n"
    output = output[:-1]

    if guess.lower() == answer.lower():
        return "Correct!", output
    else:
        return "Try again!", output

def main():
    word1 = "Black"
    word2 = "White"
    word3 = "Sun"
    global answer
    answer = "Moon"
    global guesses
    
    prompt = f"{word1} is to {word2} as {word3} is to ____"
    with gr.Blocks() as iface:
        gr.Markdown(prompt)
        with gr.Tab("Guess"):
            text_input = gr.Textbox()
            text_output = gr.Textbox()
            text_button = gr.Button("Submit")
        with gr.Accordion("Open for previous guesses"):
            text_guesses = gr.Textbox()
        text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
    # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
    iface.launch()


    
if __name__ == "__main__":
    main()