Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,372 Bytes
01c3073 bbc1fe3 01c3073 d3eb07d b1dd808 3f47af7 edab27e fb5ce4e edab27e 8ea9eda b1dd808 d3eb07d b1dd808 49a522b b1dd808 09910fb b1dd808 d3eb07d 0416598 374968b 0416598 374968b 0416598 374968b b1dd808 01c3073 894ff3a 3f47af7 565f3fd fb5ce4e 3f47af7 894ff3a bbc1fe3 4954b56 bbc1fe3 4954b56 bbc1fe3 4954b56 bbc1fe3 894ff3a bbc1fe3 4954b56 894ff3a 4954b56 edab27e 4954b56 01c3073 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import gradio as gr
import json
import os
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import huggingface_hub
import prep_decompiled
# Markdown-formatted blurb for the web page; it is handed to gr.Interface via
# its `description=` argument further down in this file.
description = """# ReSym Test Space
This is a test space of the models from the [ReSym
artifacts](https://github.com/lt-asset/resym). Sadly, at the time I am writing
this, not all of ReSym is publicly available; specifically, the Prolog component
is [not available](https://github.com/lt-asset/resym/issues/2).
This space simply performs inference on the two pretrained models available as
part of the ReSym artifacts. It takes a variable name and some decompiled code
as input, and outputs the variable type and other information.
## Disclaimer
I'm not a ReSym developer and I may have messed something up. In particular,
you must prompt the variable names in the decompiled code as part of the prompt,
and I reused some of their own code to do this.
## Todo
* Add support for FieldDecoder model
"""
# Authenticate against the Hugging Face Hub before any download is attempted.
# A missing HF_TOKEN environment variable raises KeyError here, at import time.
hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)

# Tokenizer is loaded from the StarCoderBase-3B repository.
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")

# VarDecoder checkpoint, loaded in bfloat16; device placement is delegated to
# `device_map="auto"`.
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# Sample record kept as a raw JSON string (raw so that the `\n` sequences stay
# JSON escapes and are decoded by json.loads rather than by Python). Its
# "input" field is used as the default text of the code textbox in the UI.
example = r"""{
"input": "What are the original name and data type of variables `a1`, `a2`, `a3`, `v4`, `v5`?\n```\n__int64 __fastcall sub_410D81(__int64 a1, __int64 a2, __int64 a3)\n{\nint v4; // [rsp+20h] [rbp-20h] BYREF\n__int64 v5; // [rsp+28h] [rbp-18h]\n\nif ( !a1 || !a2 || !a3 )\nreturn 0LL;\nv4 = 5;\nv5 = a3;\nreturn sub_411142(a1, a2, &v4);\n}\n```",
"output": "a1: dict, pdfio_dict_t*\na2: key, const char*\na3: value, pdfio_dict_t*\nv4: temp, struct _pdfio_value_t\nv5: -, -",
"funname": "pdfioDictSetDict",
"bin": "d0ebaa77558783765df381826c961e95c83338e285011d501a62bea474e93451",
"proj": "michaelrsweet/pdfio",
"cluster_var": {
"struct": [
[
"v4",
"v5"
]
]
}
}"""
def _entry_name(entry):
    """Return the ``name`` of one parsed variable entry.

    Entries are filtered with ``"name" in entry`` by the caller, which is a
    key-membership test on mappings, so the name is read by key first.
    Entries that do not support string subscription fall back to attribute
    access, which is what the code did originally.
    """
    try:
        return entry["name"]
    except TypeError:
        return entry.name


@spaces.GPU
def infer(var_name, code):
    """Generate the VarDecoder model's answer for one variable.

    The prompt is ``code`` immediately followed by ``var_name`` and a colon.
    The model continues that prompt with beam search, and only the newly
    generated tokens are decoded.

    Args:
        var_name: Text that seeds the answer (the UI labels it "First Token"),
            e.g. ``"a1"``.
        code: Prompt text containing the decompiled function.

    Returns:
        ``var_name + ":"`` followed by the decoded continuation.
    """
    splitcode = code.splitlines()

    # NOTE(review): the `"name" in v` filter implies dict-like entries, for
    # which the previous `v.name` raised AttributeError; confirm the entry
    # type against prep_decompiled.
    bodyvars = [
        _entry_name(v)
        for v in prep_decompiled.extract_comments(splitcode)
        if "name" in v
    ]
    argvars = [
        _entry_name(v)
        for v in prep_decompiled.parse_signature(splitcode)
        if "name" in v
    ]
    # Only logged for now; the names are not yet fed into the prompt.
    var_names = argvars + bodyvars
    print(f"vars {var_names}")

    prompt = code + var_name + ":"

    # Leave room for the generated tokens inside an 8192-token budget
    # (presumably the model's context length; verify against the checkpoint).
    max_new_tokens = 1024
    max_prompt_tokens = 8192 - max_new_tokens

    # NOTE(review): the slice keeps the *first* tokens, so an over-long prompt
    # loses its trailing `var_name:` suffix.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()[
        :, :max_prompt_tokens
    ]

    generated = vardecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]

    # Drop the echoed prompt tokens and decode only the continuation.
    completion = tokenizer.decode(
        generated[input_ids.size(1):],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return var_name + ":" + completion
# Two inputs feed infer(var_name, code): a short text field for the variable
# name and a multi-line box pre-filled with the sample record's "input" prompt.
demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Text(label="First Token", value="a1"),
        gr.Textbox(lines=10, value=json.loads(example)["input"]),
    ],
    outputs=gr.Text(label="Var Decoder Output"),
    description=description,
)

# Start the web app as soon as the module is executed.
demo.launch()
|