smhavens committed
Commit fd49958 · 1 Parent(s): 2c3c9f7

Edits for testing

Files changed (1): app.py (+33, -16)
app.py CHANGED
@@ -3,10 +3,13 @@ import spacy
 import math
 from datasets import load_dataset
 from sentence_transformers import SentenceTransformer
+from sentence_transformers import InputExample
+from sentence_transformers import losses
 from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
 from transformers import TrainingArguments, Trainer
 import torch
 import torch.nn.functional as F
+from torch.utils.data import DataLoader
 import numpy as np
 import evaluate

@@ -40,23 +43,34 @@ def training():
     dataset_id = "glue-cola"
     dataset = load_dataset("glue", "cola")
     dataset = dataset["train"]
-    tokenized_datasets = dataset.map(tokenize_function, batched=True)
+    # tokenized_datasets = dataset.map(tokenize_function, batched=True)

     print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
     print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['set'])} as value.")
     print(f"- Examples look like this: {dataset['train'][0]}")

-    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
-    small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+    # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+    # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

+    train_examples = []
+    train_data = dataset['train']['set']
+    # For agility we only use 1/2 of our available data
+    n_examples = dataset['train'].num_rows // 2

+    for i in range(n_examples):
+        example = train_data[i]
+        train_examples.append(InputExample(texts=[example['query'], example['pos'][0], example['neg'][0]]))
+
+    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

-    embeddings = finetune(small_train_dataset, small_eval_dataset)
+
+
+    embeddings = finetune(train_dataloader)

     return (dataset['train'].num_rows, type(dataset['train'][0]), type(dataset['train'][0]['set']), dataset['train'][0], embeddings)


-def finetune(train, eval):
+def finetune(train_dataloader):
     # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
     model_id = "sentence-transformers/all-MiniLM-L6-v2"
     model = SentenceTransformer(model_id)
@@ -66,20 +80,23 @@ def finetune(train, eval):
     # USE THIS LINK
     # https://huggingface.co/blog/how-to-train-sentence-transformers

+    train_loss = losses.TripletLoss(model=model)
+
+    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)

     # accuracy = compute_metrics(eval, metric)

-    training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+    # training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train,
-        eval_dataset=eval,
-        compute_metrics=compute_metrics,
-    )
+    # trainer = Trainer(
+    #     model=model,
+    #     args=training_args,
+    #     train_dataset=train,
+    #     eval_dataset=eval,
+    #     compute_metrics=compute_metrics,
+    # )

-    trainer.train()
+    # trainer.train()

     sentences = ["This is an example sentence", "Each sentence is converted"]

@@ -91,8 +108,8 @@ def finetune(train, eval):
     sentences = ['This is an example sentence', 'Each sentence is converted']

     # Load model from HuggingFace Hub
-    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
-    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+    # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+    # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

     # Tokenize sentences
     encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
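Note: in training(), dataset is reassigned to the "train" split, but the unchanged inspection prints still index dataset['train'], which raises a KeyError on a datasets Dataset, and CoLA rows have no 'set' column (each row looks like {'sentence': ..., 'label': ..., 'idx': ...}). A minimal sketch of the intended inspection, assuming only the glue/cola split loaded above:

    from datasets import load_dataset

    # A datasets.Dataset is indexed by row or by column name, not by split name.
    dataset = load_dataset("glue", "cola")["train"]

    print(f"- The glue-cola dataset has {dataset.num_rows} examples.")
    print(f"- Examples look like this: {dataset[0]}")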
 
66
+
67
+
68
+ embeddings = finetune(train_dataloader)
69
 
70
  return (dataset['train'].num_rows, type(dataset['train'][0]), type(dataset['train'][0]['set']), dataset['train'][0], embeddings)
71
 
72
 
73
+ def finetune(train_dataloader):
74
  # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
75
  model_id = "sentence-transformers/all-MiniLM-L6-v2"
76
  model = SentenceTransformer(model_id)
 
80
  # USE THIS LINK
81
  # https://huggingface.co/blog/how-to-train-sentence-transformers
82
 
83
+ train_loss = losses.TripletLoss(model=model)
84
+
85
+ model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
86
 
87
  # accuracy = compute_metrics(eval, metric)
88
 
89
+ # training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
90
 
91
+ # trainer = Trainer(
92
+ # model=model,
93
+ # args=training_args,
94
+ # train_dataset=train,
95
+ # eval_dataset=eval,
96
+ # compute_metrics=compute_metrics,
97
+ # )
98
 
99
+ # trainer.train()
100
 
101
  sentences = ["This is an example sentence", "Each sentence is converted"]
102
 
 
108
  sentences = ['This is an example sentence', 'Each sentence is converted']
109
 
110
  # Load model from HuggingFace Hub
111
+ # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
112
+ # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
113
 
114
  # Tokenize sentences
115
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
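The tail of finetune() now comments out the AutoTokenizer/AutoModel loading but still calls tokenizer(...), which would raise a NameError. Since finetune() already holds a SentenceTransformer, the manual tokenize-and-mean-pool steps can be collapsed into encode(); a minimal sketch, assuming model is the fine-tuned SentenceTransformer from above:

    sentences = ["This is an example sentence", "Each sentence is converted"]

    # encode() handles tokenization, the forward pass, and the model's
    # configured pooling (mean pooling for all-MiniLM-L6-v2) in one call.
    embeddings = model.encode(sentences)
    print(embeddings.shape)  # (2, 384)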