# resym / app.py — ejschwartz's Hugging Face Space
# commit d2a677d ("try again"), 3.38 kB
import gradio as gr
import json
import os
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import huggingface_hub
import prep_decompiled
# Markdown blurb rendered at the top of the Gradio interface (see
# `gr.Interface(description=...)` at the bottom of this file).
description = """# ReSym Test Space
This is a test space of the models from the [ReSym
artifacts](https://github.com/lt-asset/resym). Sadly, at the time I am writing
this, not all of ReSym is publicly available; specifically, the Prolog component
is [not available](https://github.com/lt-asset/resym/issues/2).
This space simply performs inference on the two pretrained models available as
part of the ReSym artifacts. It takes a variable name and some decompiled code
as input, and outputs the variable type and other information.
## Disclaimer
I'm not a ReSym developer and I may have messed something up. In particular,
you must prompt the variable names in the decompiled code as part of the prompt,
and I reused some of their own code to do this.
## Todo
* Add support for FieldDecoder model
"""
# Authenticate with the Hugging Face Hub so the model repos below can be
# downloaded. HF_TOKEN must be set in the Space's environment; a missing
# token raises KeyError at startup.
hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)
# Tokenizer comes from the base model ReSym's VarDecoder was fine-tuned
# from (StarCoderBase-3B), not from the fine-tuned checkpoint itself.
tokenizer = AutoTokenizer.from_pretrained(
    "bigcode/starcoderbase-3b"
)
# ReSym VarDecoder checkpoint; bfloat16 with device_map="auto" so the
# weights are placed on the available accelerator automatically.
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder", torch_dtype=torch.bfloat16, device_map="auto"
)
example = r"""{
"input": "What are the original name and data type of variables `a1`, `a2`, `a3`, `v4`, `v5`?\n```\n__int64 __fastcall sub_410D81(__int64 a1, __int64 a2, __int64 a3)\n{\nint v4; // [rsp+20h] [rbp-20h] BYREF\n__int64 v5; // [rsp+28h] [rbp-18h]\n\nif ( !a1 || !a2 || !a3 )\nreturn 0LL;\nv4 = 5;\nv5 = a3;\nreturn sub_411142(a1, a2, &v4);\n}\n```",
"output": "a1: dict, pdfio_dict_t*\na2: key, const char*\na3: value, pdfio_dict_t*\nv4: temp, struct _pdfio_value_t\nv5: -, -",
"funname": "pdfioDictSetDict",
"bin": "d0ebaa77558783765df381826c961e95c83338e285011d501a62bea474e93451",
"proj": "michaelrsweet/pdfio",
"cluster_var": {
"struct": [
[
"v4",
"v5"
]
]
}
}"""
@spaces.GPU
def infer(var_name, code):
    """Predict the original name and type of `var_name` in decompiled `code`.

    Runs the ReSym VarDecoder model with the decompiled function as context,
    prompting it with "<var_name>:" so it completes the prediction for that
    variable.

    Args:
        var_name: Name of the decompiled variable to query (e.g. "a1").
        code: Decompiled C-like source of a single function.

    Returns:
        The model's prediction, prefixed with "<var_name>:" —
        e.g. "a1: dict, pdfio_dict_t*".
    """
    splitcode = code.splitlines()
    # Collect the variable names visible in the decompiled code: locals from
    # the trailing "// [rsp+...]" style comments, arguments from the signature.
    body_vars = [
        v["name"] for v in prep_decompiled.extract_comments(splitcode) if "name" in v
    ]
    arg_vars = [
        v["name"] for v in prep_decompiled.parse_signature(splitcode) if "name" in v
    ]
    # NOTE(review): these names are only printed, never fed to the model —
    # presumably a debugging aid / sanity check that `var_name` appears in
    # the code; confirm whether they should gate or build the prompt.
    var_names = arg_vars + body_vars  # renamed from `vars` (shadowed builtin)
    print(f"vars {var_names}")

    # The model expects the code followed by "<var>:" and completes the rest.
    prompt = code + var_name + ":"
    # Truncate the prompt so it plus 1024 generated tokens fits the
    # 8192-token context window.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()[:, : 8192 - 1024]
    output = vardecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    # Drop the prompt tokens and decode only the newly generated ones.
    output = tokenizer.decode(
        output[input_ids.size(1) :],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return var_name + ":" + output
# Build the Gradio UI: one box for the variable name, one for the decompiled
# code (pre-filled with the example record's prompt), wired to `infer`.
example_input = json.loads(example)["input"]
demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Text(label="First Token", value="a1"),
        gr.Textbox(lines=10, value=example_input),
    ],
    outputs=gr.Text(label="Var Decoder Output"),
    description=description,
)
demo.launch()