import gradio as gr
import json
import os
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

import huggingface_hub

import prep_decompiled
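# prep_decompiled contains helper code reused from the ReSym artifacts; it
# pulls variable names out of Hex-Rays-style decompiled code (the function
# signature and the per-variable declaration comments).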

description = """# ReSym Test Space

This is a test space for the models from the [ReSym
artifacts](https://github.com/lt-asset/resym).  Sadly, at the time I am writing
this, not all of ReSym is publicly available; specifically, the Prolog component
is [not available](https://github.com/lt-asset/resym/issues/2).

This space simply performs inference with the two pretrained models released as
part of the ReSym artifacts. It takes a variable name and some decompiled code
as input, and outputs the predicted original name and type of that variable.

## Disclaimer

I'm not a ReSym developer and I may have messed something up.  In particular,
the variable names in the decompiled code must be identified and supplied as
part of the prompt, and I reused some of ReSym's own code to do this.

## Todo

* Add support for the FieldDecoder model

"""

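# Log in to the Hugging Face Hub with the token stored in the Space's HF_TOKEN
# secret (the model repositories may require authentication).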
hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)

tokenizer = AutoTokenizer.from_pretrained(
    "bigcode/starcoderbase-3b"
)
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder", torch_dtype=torch.bfloat16, device_map="auto"
)
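
# The Todo in the description mentions adding the FieldDecoder model.  A
# commented-out sketch of how it might be loaded is below; the repository id
# used here is a guess and may not match the actual ReSym artifact.
# fielddecoder_model = AutoModelForCausalLM.from_pretrained(
#     "ejschwartz/resym-fielddecoder",  # hypothetical repo id
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
# )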

example = r"""{
  "input": "What are the original name and data type of variables `a1`, `a2`, `a3`, `v4`, `v5`?\n```\n__int64 __fastcall sub_410D81(__int64 a1, __int64 a2, __int64 a3)\n{\nint v4; // [rsp+20h] [rbp-20h] BYREF\n__int64 v5; // [rsp+28h] [rbp-18h]\n\nif ( !a1 || !a2 || !a3 )\nreturn 0LL;\nv4 = 5;\nv5 = a3;\nreturn sub_411142(a1, a2, &v4);\n}\n```",
  "output": "a1: dict, pdfio_dict_t*\na2: key, const char*\na3: value, pdfio_dict_t*\nv4: temp, struct _pdfio_value_t\nv5: -, -",
  "funname": "pdfioDictSetDict",
  "bin": "d0ebaa77558783765df381826c961e95c83338e285011d501a62bea474e93451",
  "proj": "michaelrsweet/pdfio",
  "cluster_var": {
    "struct": [
      [
        "v4",
        "v5"
      ]
    ]
  }
}"""


@spaces.GPU
def infer(var_name, code):

    splitcode = code.splitlines()
    # Extract variable names from the decompiled code: body variables come from
    # the Hex-Rays declaration comments, argument variables from the signature.
    bodyvars = [v["name"] for v in prep_decompiled.extract_comments(splitcode) if "name" in v]
    argvars = [v["name"] for v in prep_decompiled.parse_signature(splitcode) if "name" in v]
    var_names = argvars + bodyvars
    #comments = prep_decompiled.extract_comments(splitcode)
    #sig = prep_decompiled.parse_signature(splitcode)
    # The extracted names are currently only printed for debugging; the prompt
    # text supplied by the user is expected to already list the variables.
    print(f"var_names {var_names}")

    #line = json.loads(input)
    #first_token = line["output"].split(":")[0]
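    # The model is prompted with the decompiled code followed by the target
    # variable name and a colon; it completes the line with its prediction
    # (the example output above shows the "name, type" format).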
    prompt = code + var_name + ":"

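    # Truncate the prompt so that it, plus the 1024 generated tokens, fits in
    # StarCoder's 8192-token context window.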
    input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()[:, : 8192 - 1024]
    output = vardecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )[0]
    output = tokenizer.decode(
        output[input_ids.size(1) :],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    output = var_name + ":" + output
    return output
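
# Example of calling the inference function directly (commented out so nothing
# runs at import time); this mirrors what the interface does with its defaults:
# print(infer("a1", json.loads(example)["input"]))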


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Text(label="First Token", value="a1"),
        gr.Textbox(lines=10, value=json.loads(example)['input']),
    ],
    outputs=gr.Text(label="Var Decoder Output"),
    description=description
)
demo.launch()