Spaces:
Sleeping
Sleeping
Commit
·
3945f15
1
Parent(s):
9317198
app.py
Browse files
app.py
ADDED
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import re
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
import random
|
7 |
+
import transformers
|
8 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
9 |
+
from transformers import RobertaTokenizer, RobertaForSequenceClassification
|
10 |
+
import torch
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
|
13 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
14 |
+
import gradio as gr
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
def greet(co):
    """Classify one code snippet with the four-model ensemble.

    The snippet is stripped of C-style comments, scored by three
    sequence-classification checkpoints and one T5 checkpoint loaded from
    ``<cwd>/models``, and the weighted sum of the scores is reduced with
    ``argmax``.

    Args:
        co: Source code text entered in the Gradio text box.

    Returns:
        The winning class index (0 or 1) as an ``int``, or ``None`` when
        ``co`` is empty.
    """
    # The previous ``while True`` loop re-read the same argument on every
    # pass and could never consume it; a single guard is all that is needed.
    # (An earlier CLI variant that prompted for snippets via ``input()`` was
    # kept only as a dead string literal and has been dropped.)
    if not co:
        return None

    code_text = _strip_comments(co)
    models_dir = os.getcwd() + '/models'

    # Inference only: no autograd graph is needed for any forward pass.
    with torch.no_grad():
        # 1. CFA-CodeBERTa-small.pt -> CodeBERTa-small-v1 fine-tuned model
        path = models_dir + '/CFA-CodeBERTa-small.pt'
        tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")
        input_ids = tokenizer.encode(
            code_text, max_length=512, truncation=True, padding='max_length')
        input_ids = torch.tensor([input_ids])
        model = RobertaForSequenceClassification.from_pretrained(
            path, num_labels=2)
        model.to('cpu')
        pred_1 = model(input_ids)[0].detach().cpu().numpy()[0]

        # 2. CFA-codebert-c.pt -> codebert-c fine-tuned model
        path = models_dir + '/CFA-codebert-c.pt'
        tokenizer = AutoTokenizer.from_pretrained(path)
        input_ids = tokenizer(code_text, padding=True, max_length=512,
                              truncation=True, return_token_type_ids=True)['input_ids']
        input_ids = torch.tensor([input_ids])
        model = AutoModelForSequenceClassification.from_pretrained(
            path, num_labels=2)
        pred_2 = model(input_ids)[0].detach().cpu().numpy()[0]

        # 3. CFA-codebert-c-v2.pt -> undersampling + codebert-c fine-tuned model
        path = models_dir + '/CFA-codebert-c-v2.pt'
        tokenizer = RobertaTokenizer.from_pretrained(path)
        input_ids = tokenizer(code_text, padding=True, max_length=512,
                              truncation=True, return_token_type_ids=True)['input_ids']
        input_ids = torch.tensor([input_ids])
        model = RobertaForSequenceClassification.from_pretrained(
            path, num_labels=2)
        # Take row 0 like the other classifiers so all score vectors share a shape.
        pred_3 = model(input_ids)[0].detach().cpu().numpy()[0]

    # 4. codeT5 fine-tuned model
    path = models_dir + '/CFA-codeT5'
    model_params = {
        "MODEL": path,                   # checkpoint directory
        "TRAIN_BATCH_SIZE": 8,           # training batch size
        "VALID_BATCH_SIZE": 8,           # validation batch size
        "VAL_EPOCHS": 1,                 # number of validation epochs
        "MAX_SOURCE_TEXT_LENGTH": 512,   # max length of source text
        "MAX_TARGET_TEXT_LENGTH": 3,     # max length of target text
        "SEED": 2022,                    # set seed for reproducibility
    }
    data = pd.DataFrame({'code': [code_text]})
    pred_4 = T5Trainer(
        dataframe=data,
        source_text="code",
        model_params=model_params,
    )
    pred_4 = int(pred_4[0])

    # Ensemble.
    # NOTE(review): pred_4 is a scalar label added equally to both class
    # scores, so it cannot change the argmax; confirm whether a per-class
    # vote was intended before changing the weighting.
    tot_result = (pred_1 * 0.8 + pred_2 * 0.1 +
                  pred_3 * 0.1 + pred_4 * 0.1).argmax()

    # Plain int instead of numpy.int64 for the Gradio "number" output.
    return int(tot_result)


def _strip_comments(code_text):
    """Remove C-style comments and collapse escaped newlines in *code_text*."""
    # Non-greedy match: a greedy one would also delete any code sitting
    # between the first "/*" and the last "*/" in the snippet.
    code_text = re.sub(r'/\*[\S\s]*?\*/', '', code_text)
    code_text = re.sub(r'//.*', '', code_text)
    # Runs of the literal two-character sequence backslash + "n" become a
    # single real newline.
    return re.sub(r'(\\n)+', '\n', code_text)
|
96 |
+
|
97 |
+
|
98 |
+
|
99 |
+
# codeT5
|
100 |
+
class YourDataSetClass(Dataset):
    """Dataset that tokenizes one text column of a DataFrame on demand.

    Each item is a dict with ``"source_ids"`` and ``"source_mask"`` tensors
    of length ``source_len``.
    """

    def __init__(
            self, dataframe, tokenizer, source_len, source_text):
        """Store the tokenizer and select the text column.

        Args:
            dataframe: pandas DataFrame holding the rows to encode.
            tokenizer: Object providing ``batch_encode_plus``.
            source_len: Length every sequence is padded/truncated to.
            source_text: Name of the column containing the input text.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of rows in the text column."""
        return len(self.source_text)

    def __getitem__(self, index):
        """Tokenize the row at position *index*.

        Returns:
            Dict with ``"source_ids"`` and ``"source_mask"`` as long tensors.
        """
        # Positional lookup: a DataLoader hands out positions 0..len-1, which
        # label-based ``[]`` would mis-resolve on a non-default index.
        source_text = str(self.source_text.iloc[index])
        # Collapse every run of whitespace into a single space.
        source_text = " ".join(source_text.split())
        # ``padding="max_length"`` already covers what the deprecated
        # ``pad_to_max_length=True`` flag used to request.
        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # Drop only the batch dimension so a length-1 sequence stays 1-D.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
        }
|
133 |
+
|
134 |
+
|
135 |
+
def validate(epoch, tokenizer, model, device, loader):
    """Generate a '0'/'1' label string for every row yielded by *loader*.

    Args:
        epoch: Unused; accepted so existing callers keep working.
        tokenizer: Object whose ``decode`` turns generated ids into text.
        model: Object providing ``eval()`` and ``generate()``.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches, each a dict with ``'source_ids'`` and
            ``'source_mask'`` tensors.

    Returns:
        A list with one string per input row. Any decoded text other than
        '0' or '1' is replaced by '0'.
    """
    valid_labels = ('0', '1')
    model.eval()
    predictions = []
    with torch.no_grad():
        for data in loader:
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )

            # Normalise each prediction on its own. The former check compared
            # the whole list against '0' and '1' with "|", which is always
            # true, so every batch collapsed to a single '0'.
            for g in generated_ids:
                text = tokenizer.decode(
                    g, skip_special_tokens=True,
                    clean_up_tokenization_spaces=True).strip()
                predictions.append(text if text in valid_labels else '0')
    return predictions
|
160 |
+
|
161 |
+
|
162 |
+
def T5Trainer(dataframe, source_text, model_params, step="test"):
    """Run the T5 checkpoint over one text column and return its predictions.

    Args:
        dataframe: pandas DataFrame holding the input rows.
        source_text: Name of the column containing the text to classify.
        model_params: Dict providing "MODEL", "SEED",
            "MAX_SOURCE_TEXT_LENGTH", "VALID_BATCH_SIZE" and "VAL_EPOCHS".
        step: Unused; kept so existing callers keep working.

    Returns:
        The list returned by the last ``validate`` pass, or an empty list
        when ``model_params["VAL_EPOCHS"]`` is 0.
    """
    # Seed torch and numpy for reproducibility.
    torch.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True

    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to('cpu')

    # Keep only the column that is going to be encoded.
    dataframe = dataframe[[source_text]]

    val_set = YourDataSetClass(
        dataframe, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], source_text)

    val_params = {
        'batch_size': model_params["VALID_BATCH_SIZE"],
        'shuffle': False,
        'num_workers': 0,
    }

    val_loader = DataLoader(val_set, **val_params)

    # Pre-bind the result so a zero-epoch configuration returns an empty
    # list instead of raising UnboundLocalError.
    predictions = []
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions = validate(epoch, tokenizer, model, 'cpu', val_loader)

    return predictions
|
191 |
+
|
192 |
+
|
193 |
+
#################################################################################
|
194 |
+
|
195 |
+
# Gradio front end: one free-text input is passed to ``greet`` and the
# returned value is shown in a numeric output component.
demo = gr.Interface(fn=greet, inputs="text", outputs="number")

# Start the web server, asking Gradio for a public share link.
demo.launch(share=True)
|