smhavens committed fd49958 (parent: 2c3c9f7): "Edits for testing"

app.py CHANGED
@@ -3,10 +3,13 @@ import spacy
 import math
 from datasets import load_dataset
 from sentence_transformers import SentenceTransformer
+from sentence_transformers import InputExample
+from sentence_transformers import losses
 from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
 from transformers import TrainingArguments, Trainer
 import torch
 import torch.nn.functional as F
+from torch.utils.data import DataLoader
 import numpy as np
 import evaluate
 
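The three imports added here (InputExample, losses, DataLoader) are exactly what the rewritten training path below needs: sentence-transformers' own fit() loop consumes a DataLoader of InputExample objects plus a loss from sentence_transformers.losses, replacing the transformers Trainer that this commit comments out.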
@@ -40,23 +43,34 @@ def training():
     dataset_id = "glue-cola"
     dataset = load_dataset("glue", "cola")
     dataset = dataset["train"]
-    tokenized_datasets = dataset.map(tokenize_function, batched=True)
+    # tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
     print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
     print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['set'])} as value.")
     print(f"- Examples look like this: {dataset['train'][0]}")
 
-    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
-    small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+    # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+    # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
 
+    train_examples = []
+    train_data = dataset['train']['set']
+    # For agility we only use 1/2 of our available data
+    n_examples = dataset['train'].num_rows // 2
 
+    for i in range(n_examples):
+        example = train_data[i]
+        train_examples.append(InputExample(texts=[example['query'], example['pos'][0], example['neg'][0]]))
+
+    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
 
-    embeddings = finetune(small_train_dataset, small_eval_dataset)
+
+
+    embeddings = finetune(train_dataloader)
 
     return (dataset['train'].num_rows, type(dataset['train'][0]), type(dataset['train'][0]['set']), dataset['train'][0], embeddings)
 
 
-def finetune(train, eval):
+def finetune(train_dataloader):
     # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
     model_id = "sentence-transformers/all-MiniLM-L6-v2"
     model = SentenceTransformer(model_id)
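A note on the new data-building loop: InputExample(texts=[anchor, positive, negative]) is the triplet format that sentence-transformers' TripletLoss expects. The dataset['train']['set'] / example['query'] / example['pos'][0] / example['neg'][0] layout matches the triplet datasets used in the blog post linked below (e.g. embedding-data/QQP_triplets), not glue/cola, whose rows only carry 'sentence', 'label', and 'idx' fields, so as committed these lookups would fail at runtime (as would dataset['train'] after dataset has already been reassigned to the train split). A minimal sketch of the record shape the loop assumes, with hypothetical data:

from sentence_transformers import InputExample

# Hypothetical record in the shape the loop expects: an anchor query,
# a list of similar ("pos") texts, and a list of dissimilar ("neg") texts.
example = {
    "query": "how do planes fly",
    "pos": ["Lift is generated by airflow over the wing."],
    "neg": ["The stock market closed higher on Tuesday."],
}

# One training triplet: [anchor, positive, negative]
triplet = InputExample(texts=[example["query"], example["pos"][0], example["neg"][0]])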
@@ -66,20 +80,23 @@ def finetune(train, eval):
     # USE THIS LINK
     # https://huggingface.co/blog/how-to-train-sentence-transformers
 
+    train_loss = losses.TripletLoss(model=model)
+
+    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
 
     # accuracy = compute_metrics(eval, metric)
 
-    training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+    # training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
 
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train,
-        eval_dataset=eval,
-        compute_metrics=compute_metrics,
-    )
+    # trainer = Trainer(
+    #     model=model,
+    #     args=training_args,
+    #     train_dataset=train,
+    #     eval_dataset=eval,
+    #     compute_metrics=compute_metrics,
+    # )
 
-    trainer.train()
+    # trainer.train()
 
     sentences = ["This is an example sentence", "Each sentence is converted"]
 
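For context on the new objective: TripletLoss pulls the anchor embedding toward the positive and pushes it away from the negative by at least a margin, using Euclidean distance and a margin of 5 by default in sentence-transformers. A sketch of roughly what it computes per batch (not the library's exact code):

import torch
import torch.nn.functional as F

def triplet_loss(anchor, positive, negative, margin=5.0):
    # distance(anchor, positive) should be smaller than
    # distance(anchor, negative) by at least `margin`
    d_pos = F.pairwise_distance(anchor, positive, p=2)
    d_neg = F.pairwise_distance(anchor, negative, p=2)
    return F.relu(d_pos - d_neg + margin).mean()

With the Trainer path commented out, model.fit() is now the only training call; TrainingArguments and compute_metrics survive only as dead comments.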
@@ -91,8 +108,8 @@ def finetune(train, eval):
     sentences = ['This is an example sentence', 'Each sentence is converted']
 
     # Load model from HuggingFace Hub
-    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
-    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+    # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+    # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
 
     # Tokenize sentences
     encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
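One loose end this hunk leaves behind: the tokenizer and AutoModel loads are commented out, but the very next statement still calls tokenizer(...), so finetune() would now raise a NameError unless tokenizer is defined elsewhere. Since the function already holds a SentenceTransformer, a simpler way to get the embeddings would be (a sketch, not what the commit does):

# Sketch: SentenceTransformer.encode handles tokenization and pooling itself,
# replacing the manual tokenizer / AutoModel / mean-pooling path.
sentences = ["This is an example sentence", "Each sentence is converted"]
embeddings = model.encode(sentences)  # one embedding row per sentence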
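Putting the commit's pieces together, the new training flow reduces to the standard sentence-transformers recipe from the linked blog post: build InputExample triplets, wrap them in a DataLoader, fit with TripletLoss, then encode. A condensed, self-contained sketch (the triplet data here is hypothetical, since glue/cola provides no query/pos/neg fields):

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Hypothetical triplets; a real run would build these from a dataset
# that actually provides anchor/positive/negative texts.
train_examples = [
    InputExample(texts=["how do planes fly",
                        "Lift is generated by airflow over the wing.",
                        "The stock market closed higher on Tuesday."]),
]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.TripletLoss(model=model)

# Fine-tune, then embed new sentences with the tuned model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
embeddings = model.encode(["This is an example sentence", "Each sentence is converted"])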