Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
import re | |
import os | |
import sys | |
import random | |
import transformers | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
from transformers import RobertaTokenizer, RobertaForSequenceClassification | |
import torch | |
import torch.nn.functional as F | |
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler | |
from transformers import T5Tokenizer, T5ForConditionalGeneration | |
import gradio as gr | |
# Sample input offered under the text box: the source of one C function
# (Fax3SetupState) stored as a single string. The \n and \t escapes below are
# real newline/tab characters in the resulting value, and the snippet contains
# several /* ... */ comments.
examples = 'static int\nFax3SetupState(TIFF* tif)\n{\n\tstatic const char module[] = "Fax3SetupState";\n\tTIFFDirectory* td = &tif->tif_dir;\n\tFax3BaseState* sp = Fax3State(tif);\n\tint needsRefLine;\n\tFax3CodecState* dsp = (Fax3CodecState*) Fax3State(tif);\n\ttmsize_t rowbytes;\n\tuint32 rowpixels, nruns;\n\n\tif (td->td_bitspersample != 1) {\n\t\tTIFFErrorExt(tif->tif_clientdata, module,\n\t\t "Bits/sample must be 1 for Group 3/4 encoding/decoding");\n\t\treturn (0);\n\t}\n\t/*\n\t * Calculate the scanline/tile widths.\n\t */\n\tif (isTiled(tif)) {\n\t\trowbytes = TIFFTileRowSize(tif);\n\t\trowpixels = td->td_tilewidth;\n\t} else {\n\t\trowbytes = TIFFScanlineSize(tif);\n\t\trowpixels = td->td_imagewidth;\n\t}\n\tsp->rowbytes = rowbytes;\n\tsp->rowpixels = rowpixels;\n\t/*\n\t * Allocate any additional space required for decoding/encoding.\n\t */\n\tneedsRefLine = (\n\t (sp->groupoptions & GROUP3OPT_2DENCODING) ||\n\t td->td_compression == COMPRESSION_CCITTFAX4\n\t);\n\n\tnruns = needsRefLine ? 2*TIFFroundup_32(rowpixels,32) : rowpixels;\n\tnruns += 3;\n\tdsp->runs = (uint32*) _TIFFCheckMalloc(tif, 2*nruns, sizeof (uint32),\n\t\t\t\t\t "for Group 3/4 run arrays");\n\tif (dsp->runs == NULL)\n\t\treturn (0);\n\tdsp->curruns = dsp->runs;\n\tif (needsRefLine)\n\t\tdsp->refruns = dsp->runs + nruns;\n\telse\n\t\tdsp->refruns = NULL;\n\tif (td->td_compression == COMPRESSION_CCITTFAX3\n\t && is2DEncoding(dsp)) {\t/* NB: default is 1D routine */\n\t\ttif->tif_decoderow = Fax3Decode2D;\n\t\ttif->tif_decodestrip = Fax3Decode2D;\n\t\ttif->tif_decodetile = Fax3Decode2D;\n\t}\n\n\tif (needsRefLine) {\t\t/* 2d encoding */\n\t\tFax3CodecState* esp = EncoderState(tif);\n\t\t/*\n\t\t * 2d encoding requires a scanline\n\t\t * buffer for the ``reference line\'\'; the\n\t\t * scanline against which delta encoding\n\t\t * is referenced. The reference line must\n\t\t * be initialized to be ``white\'\' (done elsewhere).\n\t\t */\n\t\tesp->refline = (unsigned char*) _TIFFmalloc(rowbytes);\n\t\tif (esp->refline == NULL) {\n\t\t\tTIFFErrorExt(tif->tif_clientdata, module,\n\t\t\t "No space for Group 3/4 reference line");\n\t\t\treturn (0);\n\t\t}\n\t} else\t\t\t\t\t/* 1d encoding */\n\t\tEncoderState(tif)->refline = NULL;\n\n\treturn (1);\n}'
def _strip_c_comments(code):
    """Remove C comments from *code* and normalise escaped newlines."""
    # Non-greedy on purpose: each /* ... */ comment is removed on its own.
    # A greedy match would delete all code between the first "/*" and the
    # last "*/" of the snippet.
    # NOTE(review): if the checkpoints were fine-tuned on text produced by the
    # old greedy pattern, confirm that inputs still match the training data.
    code = re.sub(r'/\*[\s\S]*?\*/', '', code)
    code = re.sub(r'//.*', '', code)
    # Runs of a literal backslash followed by "n" (escaped text pasted into
    # the box) collapse into one real newline.
    return re.sub(r'(\\n)+', '\n', code)


def _class_logits(tokenizer, model, code_text, padding):
    """Return the model's output row for *code_text* as a 1-D numpy array.

    Args:
        tokenizer: Hugging Face tokenizer matching *model*.
        model: sequence-classification model (loaded with ``num_labels=2``).
        code_text: preprocessed source text.
        padding: padding strategy forwarded to the tokenizer.
    """
    input_ids = tokenizer(
        code_text, max_length=512, truncation=True, padding=padding)['input_ids']
    model.to('cpu')
    with torch.no_grad():  # inference only, no autograd graph needed
        logits = model(torch.tensor([input_ids]))[0]
    # Drop the batch dimension so every ensemble member has the same shape.
    return logits[0].cpu().numpy()


def greet(co):
    """Classify a static-analyzer report as a true or a false positive.

    The text is stripped of C comments, scored by three fine-tuned
    sequence classifiers and one fine-tuned T5 model, and the weighted
    sum of the class scores decides the answer.

    Args:
        co: source text of the reported code, as typed into the UI.

    Returns:
        "false positive !!" when the ensemble picks class 0,
        otherwise "true positive !!".
    """
    code_text = _strip_c_comments(co)
    models_dir = os.path.join(os.getcwd(), 'models')
    # NOTE(review): all four checkpoints are reloaded from disk on every
    # call; consider loading them once at start-up if latency matters.

    # 1. CFA-CodeBERTa-small.pt -> fine-tuned CodeBERTa-small-v1
    pred_1 = _class_logits(
        AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1"),
        RobertaForSequenceClassification.from_pretrained(
            os.path.join(models_dir, 'CFA-CodeBERTa-small.pt'), num_labels=2),
        code_text,
        padding='max_length',
    )

    # 2. CFA-codebert-c.pt -> fine-tuned codebert-c
    path = os.path.join(models_dir, 'CFA-codebert-c.pt')
    pred_2 = _class_logits(
        AutoTokenizer.from_pretrained(path),
        AutoModelForSequenceClassification.from_pretrained(path, num_labels=2),
        code_text,
        padding=True,
    )

    # 3. CFA-codebert-c-v2.pt -> undersampling + fine-tuned codebert-c
    path = os.path.join(models_dir, 'CFA-codebert-c-v2.pt')
    pred_3 = _class_logits(
        RobertaTokenizer.from_pretrained(path),
        RobertaForSequenceClassification.from_pretrained(path, num_labels=2),
        code_text,
        padding=True,
    )

    # 4. fine-tuned codeT5 (generates the label as text)
    model_params = {
        "MODEL": os.path.join(models_dir, 'CFA-codeT5'),
        "TRAIN_BATCH_SIZE": 8,  # training batch size
        "VALID_BATCH_SIZE": 8,  # validation batch size
        "VAL_EPOCHS": 1,  # number of validation epochs
        "MAX_SOURCE_TEXT_LENGTH": 512,  # max length of source text
        "MAX_TARGET_TEXT_LENGTH": 3,  # max length of target text
        "SEED": 2022,  # set seed for reproducibility
    }
    t5_labels = T5Trainer(
        dataframe=pd.DataFrame({'code': [code_text]}),
        source_text="code",
        model_params=model_params,
    )
    # Turn the generated label into a one-hot vote. Adding the bare 0/1 label
    # shifts both class scores by the same amount and never changes argmax.
    pred_4 = np.zeros(2)
    pred_4[1 if str(t5_labels[0]).strip() == '1' else 0] = 1.0

    # ensemble
    scores = pred_1 * 0.8 + pred_2 * 0.1 + pred_3 * 0.1 + pred_4 * 0.1
    if scores.argmax() == 0:
        return "false positive !!"
    return "true positive !!"
# codeT5 | |
class YourDataSetClass(Dataset):
    """Dataset that tokenizes one text column of a DataFrame on access.

    Each item is a dict with two ``torch.long`` tensors, ``"source_ids"``
    and ``"source_mask"``, taken from the tokenizer's ``input_ids`` and
    ``attention_mask`` outputs.
    """

    def __init__(
            self, dataframe, tokenizer, source_len, source_text):
        """Store the tokenizer and select the column to read.

        Args:
            dataframe: pandas DataFrame holding the inputs.
            tokenizer: object exposing ``batch_encode_plus``.
            source_len: ``max_length`` passed to the tokenizer.
            source_text: name of the column that contains the text.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of rows in the selected column."""
        return len(self.source_text)

    def __getitem__(self, index):
        """Tokenize the row at position *index* and return its tensors."""
        # Positional lookup: samplers yield 0..len-1, which need not match the
        # frame's index labels (e.g. after filtering or a custom index).
        source_text = str(self.source_text.iloc[index])
        # Collapse every run of whitespace into a single space.
        source_text = " ".join(source_text.split())
        # `padding="max_length"` alone selects the padding strategy; the
        # deprecated `pad_to_max_length` flag is not needed alongside it.
        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # Remove only the batch dimension added by the one-element batch.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
        }
def validate(epoch, tokenizer, model, device, loader):
    """Generate a '0'/'1' label for every item served by *loader*.

    Args:
        epoch: unused; kept so existing callers keep working.
        tokenizer: object whose ``decode`` turns generated ids into text.
        model: object exposing ``eval()`` and ``generate(...)``.
        device: device the input tensors are moved to.
        loader: iterable of batches, each a dict with ``'source_ids'`` and
            ``'source_mask'`` tensors.

    Returns:
        A list with one string per input item; any decoded text other than
        '0' or '1' is replaced by '0'.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for data in loader:
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)
            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )
            for g in generated_ids:
                text = tokenizer.decode(
                    g, skip_special_tokens=True,
                    clean_up_tokenization_spaces=True).strip()
                # Check each decoded label on its own. Comparing the whole
                # list against '0'/'1' is always true and would discard
                # every prediction of the batch.
                predictions.append(text if text in ('0', '1') else '0')
    return predictions
def T5Trainer(dataframe, source_text, model_params, step="test",):
    """Run the T5 checkpoint over one DataFrame column and return its labels.

    Despite the name, nothing is trained here: the model is loaded on the
    CPU and only used for generation through ``validate``.

    Args:
        dataframe: pandas DataFrame containing the inputs.
        source_text: name of the column holding the text to classify.
        model_params: dict providing "SEED", "MODEL" (checkpoint path or
            name), "MAX_SOURCE_TEXT_LENGTH", "VALID_BATCH_SIZE" and
            "VAL_EPOCHS".
        step: unused; kept for backward compatibility.

    Returns:
        The list produced by ``validate`` in the last epoch, or an empty
        list when "VAL_EPOCHS" is 0.
    """
    # Seed torch and numpy so repeated runs behave the same way.
    torch.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True

    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    model = T5ForConditionalGeneration.from_pretrained(
        model_params["MODEL"]).to('cpu')

    val_set = YourDataSetClass(
        dataframe[[source_text]],
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        source_text,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=model_params["VALID_BATCH_SIZE"],
        shuffle=False,  # keep predictions aligned with the input rows
        num_workers=0,
    )

    # Initialised up front so that zero epochs yields [] instead of an
    # UnboundLocalError at the return statement.
    predictions = []
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions = validate(epoch, tokenizer, model, 'cpu', val_loader)
    return predictions
################################################################################# | |
'''demo = gr.Interface( | |
fn = greet, | |
inputs = "text", | |
outputs= "number") | |
demo.launch(share=True) | |
''' | |
# Gradio UI: title, description, a collapsible model overview, then a code
# input with a submit button on the left and the result box on the right.
# NOTE(review): the non-ASCII UI strings below look mis-decoded (probably
# UTF-8 Korean read through another code page) - restore them from the
# original source file; they are reproduced here unchanged.
with gr.Blocks() as demo1:
    gr.Markdown(
        """
<h1 align="center">
False-Alarm-Detector
</h1>
""")
    gr.Markdown(
        """
์ ์ ๋ถ์๊ธฐ๋ก ์ค๋ฅ๋ผ๊ณ ๋ณด๊ณ ๋ ์ฝ๋๋ฅผ ์ ๋ ฅํ๋ฉด,
์ค๋ฅ๊ฐ True-positive ์ธ์ง False-positive ์ธ์ง ๋ถ๋ฅ ํด ์ฃผ๋ ํ๋ก๊ทธ๋จ์ด๋ค.
""")
    with gr.Accordion(label='๋ชจ๋ธ์ ๋ํ ์ค๋ช ( ์ฌ๊ธฐ๋ฅผ ํด๋ฆญ ํ์์ค. )',open=False):
        gr.Markdown(
            """
์ด 3๊ฐ์ ๋ชจ๋ธ์ ์ฌ์ฉํ์๋ค.
1. codeBERTa-small-v1
- codeBERTa-small-v1 ์ค๋ช
2. codeBERT - C
- codeBERT - C ์ค๋ช
3. codeT5
- codeT5 ์ค๋ช
"""
        )
    with gr.Row():
        with gr.Column():
            inputs_1 = gr.Textbox(placeholder="์ฝ๋๋ฅผ ์ ๋ ฅํ์์ค.", label='Code')
            with gr.Row():
                btn = gr.Button("๊ฒฐ๊ณผ ์ถ๋ ฅ")
        with gr.Column():
            outputs_1 = gr.Text(label = 'Result')
    btn.click(fn = greet, inputs = inputs_1, outputs= outputs_1)
    # gr.Examples treats a bare string as the path of an examples directory,
    # so the single sample has to be wrapped in a list.
    gr.Examples([examples], inputs = inputs_1)

if __name__ == "__main__":
    demo1.launch()