"""Gradio Space that runs inference with the ReSym VarDecoder model.

Reconstructs variable name/type predictions for decompiled code using the
pretrained model from the ReSym artifacts (https://github.com/lt-asset/resym).
"""

import json
import os

import gradio as gr
import huggingface_hub
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import prep_decompiled

description = """# ReSym Test Space

This is a test space of the models from the [ReSym artifacts](https://github.com/lt-asset/resym). Sadly, at the time I am writing this, not all of ReSym is publicly available; specifically, the Prolog component is [not available](https://github.com/lt-asset/resym/issues/2). This space simply performs inference on the two pretrained models available as part of the ReSym artifacts. It takes a variable name and some decompiled code as input, and outputs the variable type and other information.

## Disclaimer

I'm not a ReSym developer and I may have messed something up. In particular, you must provide the variable names in the decompiled code as part of the prompt, and I reused some of their own code to do this.

## Todo

* Add support for FieldDecoder model
"""

# HF_TOKEN must be set in the Space's secrets; login is required because the
# base/model repos are gated. Raises KeyError if the secret is missing.
hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)

tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# One record from the ReSym dataset, used to pre-populate the demo input.
# Raw string so the \n escapes survive for json.loads to interpret.
example = r"""{
    "input": "What are the original name and data type of variables `a1`, `a2`, `a3`, `v4`, `v5`?\n```\n__int64 __fastcall sub_410D81(__int64 a1, __int64 a2, __int64 a3)\n{\nint v4; // [rsp+20h] [rbp-20h] BYREF\n__int64 v5; // [rsp+28h] [rbp-18h]\n\nif ( !a1 || !a2 || !a3 )\nreturn 0LL;\nv4 = 5;\nv5 = a3;\nreturn sub_411142(a1, a2, &v4);\n}\n```",
    "output": "a1: dict, pdfio_dict_t*\na2: key, const char*\na3: value, pdfio_dict_t*\nv4: temp, struct _pdfio_value_t\nv5: -, -",
    "funname": "pdfioDictSetDict",
    "bin": "d0ebaa77558783765df381826c961e95c83338e285011d501a62bea474e93451",
    "proj": "michaelrsweet/pdfio",
    "cluster_var": {
        "struct": [
            [
                "v4",
                "v5"
            ]
        ]
    }
}"""


@spaces.GPU
def infer(var_name, code):
    """Predict the original name and type of `var_name` in decompiled `code`.

    Args:
        var_name: The decompiler-assigned variable name to query (e.g. "a1").
        code: The decompiled function text, including the prompt question.

    Returns:
        A string of the form "<var_name>:<model completion>".
    """
    split_code = code.splitlines()
    # Collect variables from the function body (decompiler comments) and from
    # the signature, reusing ReSym's own preprocessing helpers.
    # NOTE(review): `"name" in v` followed by `v.name` suggests these helpers
    # return objects supporting both membership tests and attribute access —
    # TODO confirm against prep_decompiled.
    body_vars = [
        v.name for v in prep_decompiled.extract_comments(split_code) if "name" in v
    ]
    arg_vars = [
        v.name for v in prep_decompiled.parse_signature(split_code) if "name" in v
    ]
    # `all_vars` avoids shadowing the builtin `vars`.
    all_vars = arg_vars + body_vars
    print(f"vars {all_vars}")

    # The model completes "<code><var>:" with the predicted name and type.
    prompt = code + var_name + ":"
    # Truncate the prompt so prompt + 1024 generated tokens fit in the
    # model's 8192-token context window.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()[:, : 8192 - 1024]
    output = vardecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    # Decode only the newly generated tokens, not the echoed prompt.
    output = tokenizer.decode(
        output[input_ids.size(1):],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return var_name + ":" + output


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Text(label="First Token", value="a1"),
        gr.Textbox(lines=10, value=json.loads(example)["input"]),
    ],
    outputs=gr.Text(label="Var Decoder Output"),
    description=description,
)

demo.launch()