Spaces:

minseokKoo
/

Auto_Classifier

Sleeping

App Files Files Community

Auto_Classifier / app.py

minseokKoo

Update app.py

0c1d006 over 2 years ago

raw

history blame

9.86 kB

	import pandas as pd
	import numpy as np
	import re
	import os
	import sys
	import random
	import transformers
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	from transformers import RobertaTokenizer, RobertaForSequenceClassification
	import torch
	import torch.nn.functional as F
	from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
	from transformers import T5Tokenizer, T5ForConditionalGeneration
	import gradio as gr

	# Sample C function (named Fax3SetupState in the text below) offered as example
	# input for the code textbox; it is handed to gr.Examples at the end of this file.
	# NOTE(review): the literal below is broken across two physical lines, which a
	# single-quoted string cannot do -- confirm against the original file.
	examples = 'static int\nFax3SetupState(TIFF* tif)\n{\n\tstatic const char module[] = "Fax3SetupState";\n\tTIFFDirectory* td = &tif->tif_dir;\n\tFax3BaseState* sp = Fax3State(tif);\n\tint needsRefLine;\n\tFax3CodecState* dsp = (Fax3CodecState) Fax3State(tif);\n\ttmsize_t rowbytes;\n\tuint32 rowpixels, nruns;\n\n\tif (td->td_bitspersample != 1) {\n\t\tTIFFErrorExt(tif->tif_clientdata, module,\n\t\t "Bits/sample must be 1 for Group 3/4 encoding/decoding");\n\t\treturn (0);\n\t}\n\t/\n\t * Calculate the scanline/tile widths.\n\t /\n\tif (isTiled(tif)) {\n\t\trowbytes = TIFFTileRowSize(tif);\n\t\trowpixels = td->td_tilewidth;\n\t} else {\n\t\trowbytes = TIFFScanlineSize(tif);\n\t\trowpixels = td->td_imagewidth;\n\t}\n\tsp->rowbytes = rowbytes;\n\tsp->rowpixels = rowpixels;\n\t/\n\t * Allocate any additional space required for decoding/encoding.\n\t /\n\tneedsRefLine = (\n\t (sp->groupoptions & GROUP3OPT_2DENCODING) \|\|\n\t td->td_compression == COMPRESSION_CCITTFAX4\n\t);\n\n\tnruns = needsRefLine ? 2TIFFroundup_32(rowpixels,32) : rowpixels;\n\tnruns += 3;\n\tdsp->runs = (uint32) _TIFFCheckMalloc(tif, 2nruns, sizeof (uint32),\n\t\t\t\t\t "for Group 3/4 run arrays");\n\tif (dsp->runs == NULL)\n\t\treturn (0);\n\tdsp->curruns = dsp->runs;\n\tif (needsRefLine)\n\t\tdsp->refruns = dsp->runs + nruns;\n\telse\n\t\tdsp->refruns = NULL;\n\tif (td->td_compression == COMPRESSION_CCITTFAX3\n\t && is2DEncoding(dsp)) {\t/* NB: default is 1D routine /\n\t\ttif->tif_decoderow = Fax3Decode2D;\n\t\ttif->tif_decodestrip = Fax3Decode2D;\n\t\ttif->tif_decodetile = Fax3Decode2D;\n\t}\n\n\tif (needsRefLine) {\t\t/ 2d encoding /\n\t\tFax3CodecState esp = EncoderState(tif);\n\t\t/\n\t\t 2d encoding requires a scanline\n\t\t * buffer for the ``reference line\'\'; the\n\t\t * scanline against which delta encoding\n\t\t * is referenced. 
The reference line must\n\t\t * be initialized to be ``white\'\' (done elsewhere).\n\t\t /\n\t\tesp->refline = (unsigned char) _TIFFmalloc(rowbytes);\n\t\tif (esp->refline == NULL) {\n\t\t\tTIFFErrorExt(tif->tif_clientdata, module,\n\t\t\t "No space for Group 3/4 reference line");\n\t\t\treturn (0);\n\t\t}\n\t} else\t\t\t\t\t/* 1d encoding */\n\t\tEncoderState(tif)->refline = NULL;\n\n\treturn (1);\n}'

	def _strip_c_comments(code_text):
	    """Remove C comments from *code_text* and expand literal backslash-n pairs.

	    Block comments are matched non-greedily so that code sitting between two
	    separate comments is kept.
	    """
	    code_text = re.sub(r'/\*[\s\S]*?\*/', '', code_text)
	    code_text = re.sub(r'//.*', '', code_text)
	    # A run of literal two-character backslash-n sequences becomes one newline.
	    return re.sub(r'(?:\\n)+', '\n', code_text)


	def _classifier_logits(code_text, tokenizer_cls, tokenizer_source,
	                       model_cls, model_path, padding):
	    """Return the two class scores one sequence classifier gives *code_text*.

	    The tokenizer is loaded from *tokenizer_source* and the model from
	    *model_path*; the text is truncated to 512 tokens and padded according
	    to *padding*. Only the input ids are fed to the model, as before.
	    """
	    tokenizer = tokenizer_cls.from_pretrained(tokenizer_source)
	    input_ids = tokenizer(
	        code_text, max_length=512, truncation=True, padding=padding)['input_ids']
	    model = model_cls.from_pretrained(model_path, num_labels=2)
	    model.to('cpu')
	    # Inference only: skip building the autograd graph.
	    with torch.no_grad():
	        logits = model(torch.tensor([input_ids]))[0]
	    return logits.cpu().numpy()[0]


	def greet(co):
	    """Classify a reported static-analysis alarm as true or false positive.

	    Args:
	        co: Source code text entered by the user.

	    Returns:
	        "false positive !!" when the weighted ensemble picks class 0,
	        otherwise "true positive !!".
	    """
	    code_text = _strip_c_comments(co or '')
	    model_dir = os.path.join(os.getcwd(), 'models')

	    # 1. CFA-CodeBERTa-small.pt -> fine-tuned CodeBERTa-small-v1
	    pred_1 = _classifier_logits(
	        code_text,
	        AutoTokenizer, "huggingface/CodeBERTa-small-v1",
	        RobertaForSequenceClassification,
	        os.path.join(model_dir, 'CFA-CodeBERTa-small.pt'),
	        padding='max_length',
	    )

	    # 2. CFA-codebert-c.pt -> fine-tuned codebert-c
	    path = os.path.join(model_dir, 'CFA-codebert-c.pt')
	    pred_2 = _classifier_logits(
	        code_text,
	        AutoTokenizer, path,
	        AutoModelForSequenceClassification, path,
	        padding=True,
	    )

	    # 3. CFA-codebert-c-v2.pt -> undersampling + fine-tuned codebert-c
	    path = os.path.join(model_dir, 'CFA-codebert-c-v2.pt')
	    pred_3 = _classifier_logits(
	        code_text,
	        RobertaTokenizer, path,
	        RobertaForSequenceClassification, path,
	        padding=True,
	    )

	    # 4. fine-tuned codeT5 (text generation of the label)
	    model_params = {
	        "MODEL": os.path.join(model_dir, 'CFA-codeT5'),
	        "TRAIN_BATCH_SIZE": 8,  # training batch size
	        "VALID_BATCH_SIZE": 8,  # validation batch size
	        "VAL_EPOCHS": 1,  # number of validation epochs
	        "MAX_SOURCE_TEXT_LENGTH": 512,  # max length of source text
	        "MAX_TARGET_TEXT_LENGTH": 3,  # max length of target text
	        "SEED": 2022,  # set seed for reproducibility
	    }
	    data = pd.DataFrame({'code': [code_text]})
	    pred_4 = T5Trainer(
	        dataframe=data,
	        source_text="code",
	        model_params=model_params,
	    )
	    pred_4 = int(pred_4[0])

	    # Weighted ensemble over the two class scores.
	    # NOTE(review): pred_4 is a scalar, so pred_4 * 0.1 is added to both class
	    # scores alike and cannot change the argmax; the arithmetic is kept as it
	    # was so results stay the same -- confirm how the T5 vote should count.
	    scores = pred_1 * 0.8 + pred_2 * 0.1 + pred_3 * 0.1 + pred_4 * 0.1
	    if scores.argmax() == 0:
	        return "false positive !!"
	    return "true positive !!"




	# codeT5
	class YourDataSetClass(Dataset):
	    """Dataset that tokenizes one text column of a DataFrame on access.

	    Args:
	        dataframe: Frame holding the texts.
	        tokenizer: Object exposing ``batch_encode_plus``.
	        source_len: Length every encoded text is padded/truncated to.
	        source_text: Name of the column in *dataframe* to read.
	    """

	    def __init__(
	            self, dataframe, tokenizer, source_len, source_text):
	        self.tokenizer = tokenizer
	        self.data = dataframe
	        self.source_len = source_len
	        self.source_text = self.data[source_text]

	    def __len__(self):
	        """Return the number of rows in the selected column."""
	        return len(self.source_text)

	    def __getitem__(self, index):
	        """Return ``source_ids`` and ``source_mask`` tensors for row *index*."""
	        # Positional lookup: the frame's index labels need not be 0..n-1.
	        source_text = str(self.source_text.iloc[index])
	        # Collapse every run of whitespace into a single space.
	        source_text = " ".join(source_text.split())
	        source = self.tokenizer.batch_encode_plus(
	            [source_text],
	            max_length=self.source_len,
	            truncation=True,
	            padding="max_length",
	            return_tensors="pt",
	        )
	        # Drop only the leading batch dimension added by batch encoding.
	        source_ids = source["input_ids"].squeeze(0)
	        source_mask = source["attention_mask"].squeeze(0)
	        return {
	            "source_ids": source_ids.to(dtype=torch.long),
	            "source_mask": source_mask.to(dtype=torch.long),
	        }


	def validate(epoch, tokenizer, model, device, loader):
	    """Generate a '0'/'1' label string for every sample yielded by *loader*.

	    Args:
	        epoch: Unused; kept so existing callers keep working.
	        tokenizer: Object whose ``decode`` turns generated ids into text.
	        model: Object exposing ``eval`` and ``generate``.
	        device: Device the batch tensors are moved to.
	        loader: Iterable of batches with ``source_ids`` and ``source_mask``.

	    Returns:
	        A list with one string per sample; any decoded text other than
	        '0' or '1' is reported as '0'.
	    """
	    model.eval()
	    predictions = []
	    with torch.no_grad():
	        for batch in loader:
	            ids = batch['source_ids'].to(device, dtype=torch.long)
	            mask = batch['source_mask'].to(device, dtype=torch.long)

	            generated_ids = model.generate(
	                input_ids=ids,
	                attention_mask=mask,
	                max_length=150,
	                num_beams=2,
	                repetition_penalty=2.5,
	                length_penalty=1.0,
	                early_stopping=True,
	            )

	            for generated in generated_ids:
	                text = tokenizer.decode(
	                    generated,
	                    skip_special_tokens=True,
	                    clean_up_tokenization_spaces=True,
	                ).strip()
	                # Judge each sample on its own; the old check compared the
	                # whole list with a string and was therefore always true.
	                predictions.append(text if text in ('0', '1') else '0')
	    return predictions


	def T5Trainer(dataframe, source_text, model_params, step="test"):
	    """Run the T5 model named in *model_params* over one DataFrame column.

	    Args:
	        dataframe: Frame containing the column *source_text*.
	        source_text: Name of the column holding the input texts.
	        model_params: Dict providing "SEED", "MODEL", "MAX_SOURCE_TEXT_LENGTH",
	            "VALID_BATCH_SIZE" and "VAL_EPOCHS".
	        step: Unused; kept so existing callers keep working.

	    Returns:
	        The predictions of the last validation pass, or an empty list when
	        "VAL_EPOCHS" is 0.
	    """
	    seed = model_params["SEED"]
	    torch.manual_seed(seed)  # pytorch random seed
	    np.random.seed(seed)  # numpy random seed
	    torch.backends.cudnn.deterministic = True

	    model_source = model_params["MODEL"]
	    tokenizer = T5Tokenizer.from_pretrained(model_source)
	    model = T5ForConditionalGeneration.from_pretrained(model_source)
	    model = model.to('cpu')

	    val_set = YourDataSetClass(
	        dataframe[[source_text]],
	        tokenizer,
	        model_params["MAX_SOURCE_TEXT_LENGTH"],
	        source_text,
	    )
	    val_loader = DataLoader(
	        val_set,
	        batch_size=model_params["VALID_BATCH_SIZE"],
	        shuffle=False,
	        num_workers=0,
	    )

	    # Start from an empty result so a zero-epoch configuration returns
	    # cleanly instead of hitting an unbound local.
	    predictions = []
	    for epoch in range(model_params["VAL_EPOCHS"]):
	        predictions = validate(epoch, tokenizer, model, 'cpu', val_loader)

	    return predictions


	#################################################################################

	'''demo = gr.Interface(
	fn = greet,
	inputs = "text",
	outputs= "number")
	demo.launch(share=True)
	'''
	# Gradio UI: a code textbox, a button that runs greet(), and a text result.
	with gr.Blocks() as demo1:
	    gr.Markdown(
	        """
	<h1 align="center">
	False-Alarm-Detector
	</h1>
	""")

	    gr.Markdown(
	        """
	정적 분석기로 오류라고 보고된 코드를 입력하면,
	오류가 True-positive 인지 False-positive 인지 분류 해 주는 프로그램이다.
	""")

	    with gr.Accordion(label='모델에 대한 설명 ( 여기를 클릭 하시오. )', open=False):
	        gr.Markdown(
	            """
	총 3개의 모델을 사용하였다.
	1. codeBERTa-small-v1
	- codeBERTa-small-v1 설명
	2. codeBERT - C
	- codeBERT - C 설명
	3. codeT5
	- codeT5 설명
	""")
	    with gr.Row():
	        with gr.Column():
	            inputs_1 = gr.Textbox(placeholder="코드를 입력하시오.", label='Code')
	            with gr.Row():
	                btn = gr.Button("결과 출력")
	        with gr.Column():
	            outputs_1 = gr.Text(label='Result')
	    btn.click(fn=greet, inputs=inputs_1, outputs=outputs_1)
	    # Examples must be a list of samples; a bare string is taken by Gradio
	    # as the path of an examples directory.
	    gr.Examples([examples], inputs=inputs_1)

	if __name__ == "__main__":
	    demo1.launch()