import gradio as gr
import spacy
import math
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
import torch.nn.functional as F
import numpy as np
import evaluate
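# Overview: this script fine-tunes bert-base-cased on GLUE CoLA, computes
# sentence embeddings with sentence-transformers/all-MiniLM-L6-v2, and serves
# a small Gradio word-analogy guessing game.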


# Tokenizer shared by tokenize_function below; matches the bert-base-cased classifier
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

answer = "Pizza"  # default answer for check_answer; main() overrides it


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
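# (mean_pooling mirrors the pooling snippet published on the
# sentence-transformers/all-MiniLM-L6-v2 model card.)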


def tokenize_function(examples):
    # CoLA stores its text in the "sentence" column (not "text")
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = evaluate.load("accuracy")
    return metric.compute(predictions=predictions, references=labels)
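# compute_metrics is passed to the Trainer below, which calls it on every
# evaluation pass (evaluation_strategy="epoch").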


def training():
    # GLUE CoLA: binary sentence-acceptability judgements
    dataset = load_dataset("glue", "cola")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Keep the demo fast with 1k-example subsets; CoLA's "test" split is
    # unlabeled, so evaluate on "validation" instead.
    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1000))

    finetune(small_train_dataset, small_eval_dataset)


def finetune(train_dataset, eval_dataset):
    # CoLA is a binary task (acceptable / unacceptable), so num_labels=2
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

    # Reference: https://huggingface.co/blog/how-to-train-sentence-transformers

    training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()
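    # Optional final evaluation (illustrative addition, not in the original flow):
    # metrics = trainer.evaluate()
    # print(metrics)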
    
    sentences = ["This is an example sentence", "Each sentence is converted"]

    # The fine-tuned classification model has no .encode(); use a
    # SentenceTransformer model for the high-level embedding API instead.
    st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = st_model.encode(sentences)
    print(embeddings)
    
    # Sentences we want sentence embeddings for
    sentences = ['This is an example sentence', 'Each sentence is converted']

    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    print("Sentence embeddings:")
    print(sentence_embeddings)
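
    # Because the embeddings are L2-normalized above, cosine similarity reduces to
    # a dot product (illustrative addition, not part of the original script):
    cosine_sim = torch.dot(sentence_embeddings[0], sentence_embeddings[1])
    print("Cosine similarity between the two sentences:", cosine_sim.item())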

 

def greet(name):
    return "Hello " + name + "!!"

def check_answer(guess: str):
    # Compare against the module-level `answer` (case-insensitive)
    if guess.lower() == answer.lower():
        return "Correct!"
    else:
        return "Try again!"

def main():
    word1 = "Black"
    word2 = "White"
    word3 = "Sun"
    # Override the module-level default ("Pizza") so check_answer accepts "Moon"
    global answer
    answer = "Moon"
    guesses = []

    prompt = f"{word1} is to {word2} as {word3} is to ____"
    with gr.Blocks() as iface:
        gr.Markdown(prompt)
        with gr.Tab("Guess"):
            text_input = gr.Textbox()
            text_output = gr.Textbox()
            text_button = gr.Button("Submit")
        with gr.Accordion("Open for previous guesses"):
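            # guesses starts empty and is not yet updated by check_answer,
            # so nothing is listed here on first load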
            for guess in guesses:
                gr.Markdown(guess)
        text_button.click(check_answer, inputs=[text_input], outputs=text_output)
    # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
    iface.launch()


    
if __name__ == "__main__":
    main()