import json
import os

import gradio as gr
import huggingface_hub
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import prep_decompiled

description = """# ReSym Test Space

This is a test space of the models from the
[ReSym artifacts](https://github.com/lt-asset/resym).

Sadly, at the time I am writing this, not all of ReSym is publicly available;
specifically, the Prolog component is
[not available](https://github.com/lt-asset/resym/issues/2).

This space simply performs inference on the two pretrained models available as
part of the ReSym artifacts. It takes a variable name and some decompiled code
as input, and outputs the variable type and other information.

## Disclaimer

I'm not a ReSym developer and I may have messed something up. In particular,
you must prompt the variable names in the decompiled code as part of the
prompt, and I reused some of their own code to do this.

## Todo

* Add support for FieldDecoder model
"""

# Authenticate against the Hugging Face Hub so the gated model repos below
# can be downloaded. Raises KeyError if HF_TOKEN is not configured, which is
# the desired fail-fast behavior for a Space.
hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)

# ReSym's decoders are fine-tunes of StarCoderBase-3B, so its tokenizer is
# shared by both models.
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")

vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder", torch_dtype=torch.bfloat16, device_map="auto"
)
# NOTE(review): this loads the *vardecoder* checkpoint a second time — almost
# certainly a copy-paste slip (a fielddecoder repo id was probably intended;
# see the Todo above). Kept as-is to preserve behavior, but it currently
# duplicates the same weights in memory and is never used.
fielddecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder", torch_dtype=torch.bfloat16, device_map="auto"
)

# Sample Hex-Rays-style decompiled function shown in the input textbox.
example = r"""__int64 __fastcall sub_410D81(__int64 a1, __int64 a2, __int64 a3)
{
int v4; // [rsp+20h] [rbp-20h] BYREF
__int64 v5; // [rsp+28h] [rbp-18h]

if ( !a1 || !a2 || !a3 )
return 0LL;
v4 = 5;
v5 = a3;
return sub_411142(a1, a2, &v4);
}"""


@spaces.GPU
def infer(code):
    """Run the ReSym VarDecoder model on a piece of decompiled code.

    Args:
        code: Decompiled C code (function signature plus body) as produced
            by a decompiler such as Hex-Rays.

    Returns:
        A pair ``(prediction, varstring)`` where ``prediction`` is the
        model's output for the first detected variable (prefixed with the
        variable's name) and ``varstring`` is a backtick-quoted,
        comma-separated list of every variable found in ``code``.
    """
    split_lines = code.splitlines()
    # Variables come from two places: the declaration comments in the body
    # and the parameters in the function signature (helpers reused from the
    # ReSym artifacts).
    bodyvars = [
        v["name"]
        for v in prep_decompiled.extract_comments(split_lines)
        if "name" in v
    ]
    argvars = [
        v["name"]
        for v in prep_decompiled.parse_signature(split_lines)
        if "name" in v
    ]
    # Renamed from `vars`, which shadowed the builtin.
    var_names = argvars + bodyvars
    varstring = ", ".join(f"`{v}`" for v in var_names)

    if not var_names:
        # Guard: the original indexed var_names[0] unconditionally, raising
        # IndexError when no variables could be extracted.
        return "No variables found in the decompiled code.", varstring

    # ReSym's prompt format: the decompiled code followed by "<var>:".
    # (A dead, immediately-overwritten f-string prompt was removed here.)
    var_name = var_names[0]
    prompt = code + var_name + ":"
    print(prompt)

    # Truncate the prompt so that it plus the 1024 generated tokens fit in
    # the model's 8192-token context window.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()[
        :, : 8192 - 1024
    ]
    output = vardecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    # Decode only the newly generated tokens, not the echoed prompt.
    output = tokenizer.decode(
        output[input_ids.size(1):],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    output = var_name + ":" + output
    return output, varstring


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(lines=10, value=example),
    ],
    outputs=[
        gr.Text(label="Var Decoder Output"),
        gr.Text(label="Generated Variable List"),
    ],
    description=description,
)

demo.launch()