import gradio as gr
import os
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import huggingface_hub

import prep_decompiled

description = """# ReSym Test Space

This is a test space for the models from the [ReSym artifacts](https://github.com/lt-asset/resym). Sadly, at the time I am writing this, not all of ReSym is publicly available; specifically, the Prolog component is [not available](https://github.com/lt-asset/resym/issues/2).

This space simply performs inference with the two pretrained models available as part of the ReSym artifacts. It takes decompiled code as input and predicts the original names and types of the variables it contains.

The examples are randomly selected from `vardecoder_test.jsonl`.

## Disclaimer

I'm not a ReSym developer, and I may have messed something up. In particular, the variable names in the decompiled code must be supplied as part of the prompt, and I reused some of the ReSym code to extract them.

## Todo

* Add field decoding (probably needs Docker)
"""

hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)

# The ReSym models are fine-tunes of StarCoderBase-3B, so they share its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder", torch_dtype=torch.bfloat16, device_map="auto"
)
# fielddecoder_model = AutoModelForCausalLM.from_pretrained(
#     "ejschwartz/resym-fielddecoder", torch_dtype=torch.bfloat16, device_map="auto"
# )

example = r"""__int64 __fastcall sub_410D81(__int64 a1, __int64 a2, __int64 a3)
{
  int v4; // [rsp+20h] [rbp-20h] BYREF
  __int64 v5; // [rsp+28h] [rbp-18h]

  if ( !a1 || !a2 || !a3 )
    return 0LL;
  v4 = 5;
  v5 = a3;
  return sub_411142(a1, a2, &v4);
}"""

# examples.txt stores one function per line with escaped newlines; unescape them.
with open("examples.txt", "r") as f:
    examples = [ex.encode().decode("unicode_escape") for ex in f]


@spaces.GPU
def infer(code):
    splitcode = [s.strip() for s in code.splitlines()]
    code = "\n".join(splitcode)

    # Collect variable names from the local-variable comments and the function
    # signature, reusing ReSym's own parsing helpers.
    bodyvars = [
        v["name"] for v in prep_decompiled.extract_comments(splitcode) if "name" in v
    ]
    argvars = [
        v["name"] for v in prep_decompiled.parse_signature(splitcode) if "name" in v
    ]
    varnames = argvars + bodyvars
    if not varnames:
        raise gr.Error("Could not parse any variables from the decompiled code.")

    varstring = ", ".join([f"`{v}`" for v in varnames])
    var_name = varnames[0]

    # ejs: Yeah, this var_name thing is really bizarre. But look at
    # https://github.com/lt-asset/resym/blob/main/training_src/fielddecoder_inf.py
    var_prompt = f"What are the original name and data types of variables {varstring}?\n```\n{code}\n```{var_name}"

    print(f"Prompt:\n{var_prompt}")

    # Truncate so the prompt plus up to 1024 generated tokens fit in the
    # 8192-token context window.
    input_ids = tokenizer.encode(var_prompt, return_tensors="pt").cuda()[
        :, : 8192 - 1024
    ]
    var_output = vardecoder_model.generate(
        input_ids=input_ids,
        max_new_tokens=1024,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,  # 0 is <|endoftext|> in the StarCoder tokenizer
    )[0]
    # Decode only the newly generated tokens, not the echoed prompt.
    var_output = tokenizer.decode(
        var_output[input_ids.size(1) :],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # field_output = fielddecoder_model.generate(
    #     input_ids=input_ids,
    #     max_new_tokens=1024,
    #     num_beams=4,
    #     num_return_sequences=1,
    #     do_sample=False,
    #     early_stopping=False,
    #     pad_token_id=0,
    #     eos_token_id=0,
    # )[0]
    # field_output = tokenizer.decode(
    #     field_output[input_ids.size(1) :],
    #     skip_special_tokens=True,
    #     clean_up_tokenization_spaces=True,
    # )

    var_output = var_name + ":" + var_output
    # field_output = var_name + ":" + field_output
    return var_output, varstring


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(lines=10, value=example, label="Hex-Rays Decompilation"),
    ],
    outputs=[
        gr.Text(label="Var Decoder Output"),
        # gr.Text(label="Field Decoder Output"),
        gr.Text(label="Generated Variable List"),
    ],
    description=description,
    examples=examples,
)
demo.launch()
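
# The Interface above can also be exercised programmatically. A minimal sketch
# with gradio_client, assuming Gradio's default "/predict" endpoint for an
# Interface; the Space id below is a hypothetical placeholder, not the real
# deployment name:
#
#   from gradio_client import Client
#
#   client = Client("user/resym-test-space")  # hypothetical Space id
#   var_output, varstring = client.predict(example, api_name="/predict")
#   print(var_output)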