Spaces:

ejschwartz
/

resym

Running on Zero

App Files Files Community

resym / app.py

ejschwartz

description

edab27e 3 months ago

raw

history blame

3.55 kB

	import gradio as gr
	import json
	import os
	import spaces
	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM

	import huggingface_hub

	import prep_decompiled

	# Markdown blurb for the web UI; it is handed to gr.Interface through the
	# `description=` argument where the interface is built further down.
	description = """# ReSym Test Space

	This is a test space of the models from the [ReSym
	artifacts](https://github.com/lt-asset/resym). Sadly, at the time I am writing
	this, not all of ReSym is publicly available; specifically, the Prolog component
	is [not available](https://github.com/lt-asset/resym/issues/2).

	This space simply performs inference on the two pretrained models available as
	part of the ReSym artifacts. It takes a variable name and some decompiled code
	as input, and outputs the variable type and other information.

	## Todo

	* Add support for FieldDecoder model

	"""

	# Authenticate against the Hugging Face Hub before any download is attempted.
	# Indexing os.environ directly raises KeyError when HF_TOKEN is not set.
	hf_key = os.environ["HF_TOKEN"]
	huggingface_hub.login(token=hf_key)

	# Tokenizer comes from the StarCoder base checkpoint; the weights come from the
	# ReSym VarDecoder checkpoint, loaded in bfloat16 with automatic device mapping.
	tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")
	vardecoder_model = AutoModelForCausalLM.from_pretrained(
	    "ejschwartz/resym-vardecoder",
	    device_map="auto",
	    torch_dtype=torch.bfloat16,
	)

	# Sample record (JSON text) used to pre-fill the code textbox in the UI, which
	# reads its "input" field via json.loads.
	#
	# This must be a *raw* string: the backslash-n sequences have to reach
	# json.loads as JSON escapes. In a regular Python literal they would become
	# real newline characters inside JSON string values, which the JSON parser
	# rejects as invalid control characters. The bitwise-or operators are written
	# as a plain `|` because backslash-pipe is not a legal JSON escape.
	#
	# NOTE(review): some pointer `*` characters in the decompiled text look to be
	# missing (e.g. `(4 v5)`); confirm against the original dataset sample.
	example = r"""{
	"input": "What are the original name and data type of variables `a1`, `a2`, `k`, `j`, `i`?\n```\n_BYTE __fastcall sub_4022CD(_BYTE a1, __int64 a2)\n{\n_BYTE result; // rax\n__int16 v4; // [rsp+1Ch] [rbp-14h]\nunsigned __int16 v5; // [rsp+1Eh] [rbp-12h]\nunsigned __int16 v6; // [rsp+20h] [rbp-10h]\nunsigned __int16 v7; // [rsp+22h] [rbp-Eh]\nunsigned int k; // [rsp+24h] [rbp-Ch]\nunsigned int j; // [rsp+28h] [rbp-8h]\nunsigned int i; // [rsp+2Ch] [rbp-4h]\n\nfor ( i = 0; i <= 2; ++i )\n{\nfor ( j = 0; j <= 0x3F; ++j )\n{\nfor ( k = 0; k <= 3; ++k )\n{\n(&v4 + k) = (_WORD )(a2 + 2 * (k + 4 * j + ((unsigned __int64)i << 8)));\n(&v4 + k) += ((&v4 + k) >> 15) & 0xD01;\n(&v4 + k) = ((((unsigned __int16)(&v4 + k) << 10) + 1664) / 0xD01u) & 0x3FF;\n}\na1 = v4;\na1[1] = (4 v5) | HIBYTE(v4);\na1[2] = (16 * v6) | (v5 >> 6);\na1[3] = ((_BYTE)v7 << 6) | (v6 >> 4);\nresult = a1 + 4;\na1[4] = v7 >> 2;\na1 += 5;\n}\n}\nreturn result;\n}\n```",
	"output": "a1: r, uint8_t\na2: a, const polyvec\nk: t, uint16_t\nj: -, -\ni: k, unsigned int",
	"funname": "pqcrystals_kyber768_ref_polyvec_compress",
	"bin": "6ea440a6c772bc0d6a6089c9ff33ae31da13daf3b72acbe175674b0bb21987ed",
	"proj": "pq-crystals/kyber",
	"cluster_var": {
	"array": [
	[
	"k",
	"j"
	]
	]
	}
	}"""


	@spaces.GPU
	def infer(var_name: str, code: str) -> str:
	    """Query the VarDecoder model about one variable of a decompiled function.

	    The prompt is ``code`` immediately followed by ``var_name + ":"``. The
	    model's continuation is decoded and returned with the same ``var_name:``
	    prefix put back in front of it.

	    Args:
	        var_name: Name of the variable to ask about.
	        code: Prompt text containing the decompiled code.

	    Returns:
	        ``var_name + ":"`` followed by the decoded model continuation.
	    """
	    # Token budget: 8192 total minus the 1024 tokens reserved for generation.
	    # NOTE(review): 8192 is presumably the model's context window - confirm.
	    max_new_tokens = 1024
	    max_prompt_tokens = 8192 - max_new_tokens

	    # Diagnostic output only: the parsed comments/signature are printed for
	    # inspection and are not used to build the prompt.
	    splitcode = code.splitlines()
	    comments = prep_decompiled.extract_comments(splitcode)
	    sig = prep_decompiled.parse_signature(splitcode)
	    print(f"comments={comments} sig={sig}")

	    suffix = var_name + ":"
	    input_ids = tokenizer.encode(code + suffix, return_tensors="pt")
	    if input_ids.size(1) > max_prompt_tokens:
	        # Cutting the over-long prompt from the right would discard the
	        # trailing "<var_name>:" query, leaving the model to continue
	        # arbitrary code. Trim the code portion instead and keep the query.
	        # Assumes the tokenizer adds no special tokens around each piece.
	        suffix_ids = tokenizer.encode(suffix)[:max_prompt_tokens]
	        code_ids = tokenizer.encode(code)[: max_prompt_tokens - len(suffix_ids)]
	        input_ids = torch.tensor([code_ids + suffix_ids], dtype=torch.long)
	    input_ids = input_ids.cuda()

	    # Deterministic beam search; token id 0 serves as both pad and EOS.
	    generated = vardecoder_model.generate(
	        input_ids=input_ids,
	        max_new_tokens=max_new_tokens,
	        num_beams=4,
	        num_return_sequences=1,
	        do_sample=False,
	        early_stopping=False,
	        pad_token_id=0,
	        eos_token_id=0,
	    )[0]

	    # Decode only the newly generated tokens, skipping the echoed prompt.
	    completion = tokenizer.decode(
	        generated[input_ids.size(1) :],
	        skip_special_tokens=True,
	        clean_up_tokenization_spaces=True,
	    )
	    return suffix + completion


	# Build the web UI: a short text field for the variable name and a ten-line
	# textbox pre-filled with the sample record's "input" text, both fed to
	# `infer`; the result is shown in a single text output.
	demo = gr.Interface(
	    description=description,
	    fn=infer,
	    inputs=[
	        gr.Text(value="a1", label="First Token"),
	        gr.Textbox(value=json.loads(example)["input"], lines=10),
	    ],
	    outputs=gr.Text(label="Var Decoder Output"),
	)
	demo.launch()