# Part-of-speech tag vocabulary. Element order is kept as-is because callers
# may rely on a tag's position in this list (TODO confirm against usage).
list_of_pos_tags = [
    "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET",
    "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN",
    "PUNCT", "SCONJ", "SYM", "VERB", "X",
]

# Realis label vocabulary; "O" comes first, followed by the realis values.
realis_list = ["O", "Generic", "Other", "Actual"]


# Tag vocabulary for event arguments ("B-"/"I-" prefixed argument types plus
# "O"). The element order is intentionally left untouched -- including the
# places where a type's "B-" and "I-" tags are not adjacent -- because callers
# may depend on each tag's position (TODO confirm against usage).
event_args_list = [
    "O",
    "B-System", "I-System",
    "B-Organization",
    "B-Money", "I-Money",
    "B-Device",
    "B-Person", "I-Person",
    "B-Vulnerability", "I-Vulnerability",
    "B-Capabilities", "I-Capabilities",
    "I-Organization",
    "B-PaymentMethod", "I-PaymentMethod",
    "B-Data", "I-Data",
    "B-Number", "I-Number",
    "B-Malware", "I-Malware",
    "B-PII", "I-PII",
    "B-CVE", "I-CVE",
    "B-Purpose", "I-Purpose",
    "B-File", "I-File",
    "I-Device",
    "B-Time", "I-Time",
    "B-Software", "I-Software",
    "B-Patch", "I-Patch",
    "B-Version", "I-Version",
    "B-Website", "I-Website",
    "B-GPE", "I-GPE",
]

# Tag vocabulary for event nuggets: "O" followed by a "B-"/"I-" pair for each
# event subtype. Order is preserved from the original definition.
event_nugget_list = [
    "O",
    "B-Ransom", "I-Ransom",
    "B-DiscoverVulnerability", "I-DiscoverVulnerability",
    "B-PatchVulnerability", "I-PatchVulnerability",
    "B-Databreach", "I-Databreach",
    "B-Phishing", "I-Phishing",
]

# Maps each argument type (the part after "B-"/"I-" in the argument tags) to
# the list of role names associated with it. Key order and the order of roles
# inside each list are preserved from the original definition.
arg_2_role = {
    "File": ["Tool", "Trusted-Entity"],
    "Person": [
        "Victim",
        "Attacker",
        "Discoverer",
        "Releaser",
        "Trusted-Entity",
        "Vulnerable_System_Owner",
    ],
    "Capabilities": ["Attack-Pattern", "Capabilities", "Issues-Addressed"],
    "Purpose": ["Purpose"],
    "Time": ["Time"],
    "PII": ["Compromised-Data", "Trusted-Entity"],
    "Data": ["Compromised-Data", "Trusted-Entity"],
    "Organization": [
        "Victim",
        "Releaser",
        "Discoverer",
        "Attacker",
        "Vulnerable_System_Owner",
        "Trusted-Entity",
    ],
    "Patch": ["Patch"],
    "Software": [
        "Vulnerable_System",
        "Victim",
        "Trusted-Entity",
        "Supported_Platform",
    ],
    "Vulnerability": ["Vulnerability"],
    "Version": ["Patch-Number", "Vulnerable_System_Version"],
    "Device": ["Vulnerable_System", "Victim", "Supported_Platform"],
    "CVE": ["CVE"],
    "Number": ["Number-of-Data", "Number-of-Victim"],
    "System": [
        "Victim",
        "Supported_Platform",
        "Vulnerable_System",
        "Trusted-Entity",
    ],
    "Malware": ["Tool"],
    "Money": ["Price", "Damage-Amount"],
    "PaymentMethod": ["Payment-Method"],
    "GPE": ["Place"],
    "Website": [
        "Trusted-Entity",
        "Tool",
        "Vulnerable_System",
        "Victim",
        "Supported_Platform",
    ],
}

def get_content(data):
    """Return the value stored under the ``"content"`` key of ``data``.

    Raises:
        KeyError: if ``data`` has no ``"content"`` key.
    """
    content = data["content"]
    return content

def get_event_nugget(data):
    """Collect a trimmed-down record for every event in ``data``.

    Walks ``data["cyberevent"]["hopper"]`` and, for each entry of every
    hopper's ``"events"`` list, keeps only the ``"nugget"``, ``"type"``,
    ``"subtype"`` and ``"realis"`` fields.

    Returns:
        A flat list of dicts, in hopper order and then event order.

    Raises:
        KeyError: if any of the accessed keys is missing.
    """
    kept_fields = ("nugget", "type", "subtype", "realis")
    summaries = []
    for hopper in data["cyberevent"]["hopper"]:
        for event in hopper["events"]:
            summaries.append({field: event[field] for field in kept_fields})
    return summaries
def get_event_args(data):
    """Gather the arguments of every event in ``data`` into one flat list.

    Iterates over ``data["cyberevent"]["hopper"]`` and each hopper's
    ``"events"``; events without an ``"argument"`` key contribute nothing.

    Returns:
        A list containing the items of every event's ``"argument"`` value,
        in hopper order and then event order.
    """
    collected = []
    for hopper in data["cyberevent"]["hopper"]:
        for event in hopper["events"]:
            if "argument" in event:
                collected += event["argument"]
    return collected

def get_idxs_from_text(text, text_tokenized):
    """Locate each token of ``text_tokenized`` inside ``text``, left to right.

    Every token is searched for starting at the end of the previously
    matched token, so repeated tokens map to successive occurrences.

    Args:
        text: The string the tokens were taken from.
        text_tokenized: Iterable of token strings, in text order.

    Returns:
        A list with one dict per token, in input order, with the keys
        ``"word"`` (the token), ``"start_idx"`` (offset of its first
        character in ``text``) and ``"end_idx"`` (``start_idx`` plus the
        token length). A token that cannot be found from the current
        position onwards gets ``start_idx == end_idx == -1``.
    """
    cursor = 0  # offset in ``text`` where the search for the next token begins
    spans = []

    for substring in text_tokenized:
        start = text.find(substring, cursor)
        if start == -1:
            # The token does not occur in the remaining text (e.g. the
            # tokenizer rewrote it). Previously the -1 from ``find`` was used
            # as a real offset, which produced a bogus span and moved the
            # cursor past text that had not been matched. Flag the token
            # instead and keep the cursor where it is, so following tokens
            # are still aligned and the output stays one-entry-per-token.
            spans.append({"word": substring, "start_idx": -1, "end_idx": -1})
            continue

        end = start + len(substring)
        spans.append({"word": substring, "start_idx": start, "end_idx": end})
        cursor = end
    return spans

def get_entity_from_idx(start_idx, end_idx, event_nuggets):
    """Return the event-nugget tag for the token span ``start_idx``..``end_idx``.

    Nuggets are examined in list order and the first one the token matches
    decides the tag.

    Args:
        start_idx: Start offset of the token.
        end_idx: End offset of the token.
        event_nuggets: Sequence of dicts, each with a ``"subtype"`` and a
            ``"nugget"`` dict holding ``"startOffset"`` / ``"endOffset"``.

    Returns:
        ``"B-<subtype>"`` when the token starts at the nugget start, or
        starts before it and reaches into (or past) the nugget;
        ``"I-<subtype>"`` when the token starts after the nugget start and
        lies within / overlaps the nugget; ``"O"`` when no nugget matches.
    """
    for nugget in event_nuggets:
        span = nugget["nugget"]
        nugget_start, nugget_end = span["startOffset"], span["endOffset"]

        # Token covers the beginning of the nugget. ``end_idx > nugget_start``
        # also accepts a token that sticks out on BOTH sides of the nugget;
        # that case used to fall through to "O" although one-sided overhangs
        # were already tagged "B-". The explicit ``end_idx == nugget_end``
        # keeps the previous result for nuggets whose end is not after
        # their start.
        if start_idx == nugget_start or (
            start_idx < nugget_start
            and (end_idx > nugget_start or end_idx == nugget_end)
        ):
            return "B-" + nugget["subtype"]

        # Token begins after the nugget start but still inside the nugget.
        if start_idx > nugget_start and (
            end_idx <= nugget_end or start_idx < nugget_end
        ):
            return "I-" + nugget["subtype"]
    return "O"

def get_entity_and_realis_from_idx(start_idx, end_idx, event_nuggets):
    """Return the (subtype tag, realis tag) pair for a token span.

    Nuggets are examined in list order and the first one the token matches
    decides both tags, which always share the same ``"B-"`` / ``"I-"`` prefix.

    Args:
        start_idx: Start offset of the token.
        end_idx: End offset of the token.
        event_nuggets: Sequence of dicts, each with ``"subtype"``,
            ``"realis"`` and a ``"nugget"`` dict holding ``"startOffset"`` /
            ``"endOffset"``.

    Returns:
        ``("B-<subtype>", "B-<realis>")`` when the token starts at the nugget
        start, or starts before it and reaches into (or past) the nugget;
        ``("I-<subtype>", "I-<realis>")`` when the token starts after the
        nugget start and lies within / overlaps the nugget; ``("O", "O")``
        when no nugget matches.
    """
    for nugget in event_nuggets:
        span = nugget["nugget"]
        nugget_start, nugget_end = span["startOffset"], span["endOffset"]

        # Token covers the beginning of the nugget. ``end_idx > nugget_start``
        # also accepts a token that sticks out on BOTH sides of the nugget;
        # that case used to fall through to "O" although one-sided overhangs
        # were already tagged "B-". The explicit ``end_idx == nugget_end``
        # keeps the previous result for nuggets whose end is not after
        # their start.
        if start_idx == nugget_start or (
            start_idx < nugget_start
            and (end_idx > nugget_start or end_idx == nugget_end)
        ):
            return "B-" + nugget["subtype"], "B-" + nugget["realis"]

        # Token begins after the nugget start but still inside the nugget.
        if start_idx > nugget_start and (
            end_idx <= nugget_end or start_idx < nugget_end
        ):
            return "I-" + nugget["subtype"], "I-" + nugget["realis"]
    return "O", "O"

def get_args_entity_from_idx(start_idx, end_idx, event_args):
    """Return the event-argument tag for the token span ``start_idx``..``end_idx``.

    Arguments are examined in list order and the first one the token matches
    decides the tag.

    Args:
        start_idx: Start offset of the token.
        end_idx: End offset of the token.
        event_args: Sequence of dicts, each with ``"startOffset"``,
            ``"endOffset"`` and ``"type"``.

    Returns:
        ``"B-<type>"`` when the token starts at the argument start, or starts
        before it and reaches into (or past) the argument; ``"I-<type>"``
        when the token starts after the argument start and lies within /
        overlaps the argument; ``"O"`` when no argument matches.
    """
    for argument in event_args:
        arg_start, arg_end = argument["startOffset"], argument["endOffset"]

        # Token covers the beginning of the argument. ``end_idx > arg_start``
        # also accepts a token that sticks out on BOTH sides of the argument;
        # that case used to fall through to "O" although one-sided overhangs
        # were already tagged "B-". The explicit ``end_idx == arg_end`` keeps
        # the previous result for arguments whose end is not after their
        # start.
        if start_idx == arg_start or (
            start_idx < arg_start
            and (end_idx > arg_start or end_idx == arg_end)
        ):
            return "B-" + argument["type"]

        # Token begins after the argument start but still inside the argument.
        if start_idx > arg_start and (
            end_idx <= arg_end or start_idx < arg_end
        ):
            return "I-" + argument["type"]
    return "O"

def split_with_character(string, char):
    """Split ``string`` at every occurrence of ``char``, keeping the separators.

    ``char`` is compared against individual characters of ``string``, so only
    a single-character ``char`` can ever match.

    Returns:
        The pieces of ``string`` in order, with each matched ``char`` as its
        own element and without any empty strings.
    """
    pieces = []
    segment_start = 0
    for position, current in enumerate(string):
        if current != char:
            continue
        # Emit the text collected since the previous separator, if any.
        if position > segment_start:
            pieces.append(string[segment_start:position])
        pieces.append(char)
        segment_start = position + 1

    tail = string[segment_start:]
    if tail != '':
        pieces.append(tail)
    return pieces

def extend_list_with_character(content_list, character):
    """Expand every word that contains ``character`` into its split pieces.

    Words containing ``character`` are replaced by the result of
    ``split_with_character(word, character)``; all other words are copied
    through unchanged.

    Returns:
        A new flat list; ``content_list`` itself is not modified.
    """
    expanded = []
    for token in content_list:
        pieces = (
            split_with_character(token, character)
            if character in token
            else [token]
        )
        expanded += pieces
    return expanded

def find_dict_by_overlap(list_of_dicts, key_value_pairs):
    """Return the first dict whose ``start``/``end`` range overlaps the query.

    Both the candidates and ``key_value_pairs`` are read through their
    ``"start"`` and ``"end"`` keys. The two values are treated as an
    unordered pair of bounds, and ranges that merely touch at a bound count
    as overlapping.

    Returns:
        The first matching element of ``list_of_dicts``, or ``None`` when
        nothing overlaps.
    """
    for candidate in list_of_dicts:
        candidate_low = min(candidate["start"], candidate["end"])
        candidate_high = max(candidate["start"], candidate["end"])
        query_low = min(key_value_pairs["start"], key_value_pairs["end"])
        query_high = max(key_value_pairs["start"], key_value_pairs["end"])

        if candidate_high >= query_low and query_high >= candidate_low:
            return candidate
    return None