|
# Part-of-speech tag inventory. The 17 values coincide with the Universal
# Dependencies UPOS tag set. Order is kept exactly as originally written;
# presumably positions are used as label ids elsewhere -- verify before
# reordering.
list_of_pos_tags = [
    "ADJ",
    "ADP",
    "ADV",
    "AUX",
    "CCONJ",
    "DET",
    "INTJ",
    "NOUN",
    "NUM",
    "PART",
    "PRON",
    "PROPN",
    "PUNCT",
    "SCONJ",
    "SYM",
    "VERB",
    "X",
]
|
|
|
# Realis label inventory. "O" comes first; presumably it is the
# "no event" label -- confirm against the code that consumes this list.
realis_list = [
    "O",
    "Generic",
    "Other",
    "Actual",
]
|
|
|
|
|
# BIO-style labels for event arguments: 'O' plus a 'B-'/'I-' pair for each of
# the 21 argument types. The order is irregular (e.g. 'I-Organization' and
# 'I-Device' do not follow their 'B-' counterparts) and is kept exactly as
# originally written; presumably positions are used as label ids -- verify
# before reordering.
event_args_list = [
    'O',
    'B-System',
    'I-System',
    'B-Organization',
    'B-Money',
    'I-Money',
    'B-Device',
    'B-Person',
    'I-Person',
    'B-Vulnerability',
    'I-Vulnerability',
    'B-Capabilities',
    'I-Capabilities',
    'I-Organization',
    'B-PaymentMethod',
    'I-PaymentMethod',
    'B-Data',
    'I-Data',
    'B-Number',
    'I-Number',
    'B-Malware',
    'I-Malware',
    'B-PII',
    'I-PII',
    'B-CVE',
    'I-CVE',
    'B-Purpose',
    'I-Purpose',
    'B-File',
    'I-File',
    'I-Device',
    'B-Time',
    'I-Time',
    'B-Software',
    'I-Software',
    'B-Patch',
    'I-Patch',
    'B-Version',
    'I-Version',
    'B-Website',
    'I-Website',
    'B-GPE',
    'I-GPE',
]
|
|
|
# BIO-style labels for event nuggets: 'O' plus a 'B-'/'I-' pair for each of
# the five event subtypes. Order is kept exactly as originally written.
event_nugget_list = [
    'O',
    'B-Ransom',
    'I-Ransom',
    'B-DiscoverVulnerability',
    'I-DiscoverVulnerability',
    'B-PatchVulnerability',
    'I-PatchVulnerability',
    'B-Databreach',
    'I-Databreach',
    'B-Phishing',
    'I-Phishing',
]
|
|
|
# Maps each argument type (the suffix of the labels in event_args_list) to
# the list of role names associated with it. Value order is kept exactly as
# originally written.
arg_2_role = {
    "File": ['Tool', 'Trusted-Entity'],
    "Person": [
        'Victim', 'Attacker', 'Discoverer', 'Releaser',
        'Trusted-Entity', 'Vulnerable_System_Owner',
    ],
    "Capabilities": ['Attack-Pattern', 'Capabilities', 'Issues-Addressed'],
    "Purpose": ['Purpose'],
    "Time": ['Time'],
    "PII": ['Compromised-Data', 'Trusted-Entity'],
    "Data": ['Compromised-Data', 'Trusted-Entity'],
    "Organization": [
        'Victim', 'Releaser', 'Discoverer', 'Attacker',
        'Vulnerable_System_Owner', 'Trusted-Entity',
    ],
    "Patch": ['Patch'],
    "Software": ['Vulnerable_System', 'Victim', 'Trusted-Entity', 'Supported_Platform'],
    "Vulnerability": ['Vulnerability'],
    "Version": ['Patch-Number', 'Vulnerable_System_Version'],
    "Device": ['Vulnerable_System', 'Victim', 'Supported_Platform'],
    "CVE": ['CVE'],
    "Number": ['Number-of-Data', 'Number-of-Victim'],
    "System": ['Victim', 'Supported_Platform', 'Vulnerable_System', 'Trusted-Entity'],
    "Malware": ['Tool'],
    "Money": ['Price', 'Damage-Amount'],
    "PaymentMethod": ['Payment-Method'],
    "GPE": ['Place'],
    "Website": [
        'Trusted-Entity', 'Tool', 'Vulnerable_System', 'Victim', 'Supported_Platform',
    ],
}
|
|
|
def get_content(data):
    """Return the value stored under the ``"content"`` key of *data*.

    Raises:
        KeyError: if *data* has no ``"content"`` key.
    """
    return data["content"]
|
|
|
def get_event_nugget(data):
    """Collect a trimmed description of every event in *data*.

    Iterates ``data["cyberevent"]["hopper"]`` and, for every entry of each
    hopper's ``"events"`` list, keeps only the ``"nugget"``, ``"type"``,
    ``"subtype"`` and ``"realis"`` fields.

    Returns:
        A list of dicts with exactly those four keys, in iteration order
        (hopper by hopper, event by event).

    Raises:
        KeyError: if any of the keys read above is missing.
    """
    kept_fields = ("nugget", "type", "subtype", "realis")
    return [
        {field: event[field] for field in kept_fields}
        for hopper in data["cyberevent"]["hopper"]
        for event in hopper["events"]
    ]
|
def get_event_args(data):
    """Return the arguments of every event in *data* as one flat list.

    Iterates ``data["cyberevent"]["hopper"]`` and each hopper's ``"events"``
    list; events without an ``"argument"`` key contribute nothing.

    Returns:
        A list with the items of every event's ``"argument"`` value,
        concatenated in iteration order.

    Raises:
        KeyError: if ``"cyberevent"``, ``"hopper"`` or ``"events"`` is missing.
    """
    args = []
    for hopper in data["cyberevent"]["hopper"]:
        for event in hopper["events"]:
            # Arguments are optional on an event; skip events that have none.
            args.extend(event.get("argument", ()))
    return args
|
|
|
def get_idxs_from_text(text, text_tokenized):
    """Locate each token of *text_tokenized* inside *text*, left to right.

    Every token is searched for starting at the end of the previously
    located token, so repeated tokens map to successive occurrences.

    Args:
        text: the string the tokens come from.
        text_tokenized: iterable of token strings, in text order.

    Returns:
        A list with one dict per token, in input order, with keys
        ``"word"`` (the token), ``"start_idx"`` and ``"end_idx"``
        (character offsets into *text*, end exclusive). A token that
        cannot be found at or after the current position gets
        ``start_idx == end_idx == -1`` and does not move the search
        position, so the output stays aligned with the token list and later
        tokens are still located correctly.
    """
    spans = []
    cursor = 0  # offset in `text` just past the last token that was found

    for token in text_tokenized:
        # Searching from `cursor` avoids re-slicing the remaining text.
        start = text.find(token, cursor)
        if start == -1:
            # Do not fold find()'s -1 into the offsets: that would yield a
            # bogus span and skip part of the text.
            spans.append({"word": token, "start_idx": -1, "end_idx": -1})
            continue

        end = start + len(token)
        spans.append({"word": token, "start_idx": start, "end_idx": end})
        cursor = end

    return spans
|
|
|
def _bio_prefix(start_idx, end_idx, span_start, span_end):
    """Return ``"B-"``, ``"I-"`` or ``None`` for a token against one span.

    ``"B-"`` means the token starts at, or reaches across, the start of the
    span; ``"I-"`` means the token starts strictly after the span start but
    still falls inside it; ``None`` means the token does not match the span.
    """
    if start_idx == span_start:
        return "B-"

    if start_idx < span_start:
        # The token begins before the span: it opens the span when it reaches
        # into it (this includes a token that extends past the span end on
        # the right as well) or when it ends exactly on the span end.
        if end_idx > span_start or end_idx == span_end:
            return "B-"
        return None

    # From here on start_idx > span_start.
    if start_idx < span_end or end_idx <= span_end:
        return "I-"
    return None


def get_entity_from_idx(start_idx, end_idx, event_nuggets):
    """Return the BIO event label for the token ``[start_idx, end_idx)``.

    Args:
        start_idx: start offset of the token.
        end_idx: end offset of the token.
        event_nuggets: sequence of dicts carrying ``"subtype"`` and a
            ``"nugget"`` dict with ``"startOffset"`` / ``"endOffset"``.

    Returns:
        ``"B-<subtype>"`` or ``"I-<subtype>"`` for the first nugget (in list
        order) the token matches, otherwise ``"O"``.
    """
    for nugget in event_nuggets:
        span = nugget["nugget"]
        prefix = _bio_prefix(start_idx, end_idx, span["startOffset"], span["endOffset"])
        if prefix is not None:
            return prefix + nugget["subtype"]
    return "O"


def get_entity_and_realis_from_idx(start_idx, end_idx, event_nuggets):
    """Return BIO event and realis labels for the token ``[start_idx, end_idx)``.

    Same matching as ``get_entity_from_idx``; each nugget dict must also
    carry a ``"realis"`` key.

    Returns:
        A ``(subtype_label, realis_label)`` pair, both with the same ``"B-"``
        or ``"I-"`` prefix, or ``("O", "O")`` when no nugget matches.
    """
    # NOTE(review): the realis label is prefixed here, while realis_list holds
    # unprefixed values -- confirm the consumer expects the prefixed form.
    for nugget in event_nuggets:
        span = nugget["nugget"]
        prefix = _bio_prefix(start_idx, end_idx, span["startOffset"], span["endOffset"])
        if prefix is not None:
            return prefix + nugget["subtype"], prefix + nugget["realis"]
    return "O", "O"


def get_args_entity_from_idx(start_idx, end_idx, event_args):
    """Return the BIO argument label for the token ``[start_idx, end_idx)``.

    Args:
        start_idx: start offset of the token.
        end_idx: end offset of the token.
        event_args: sequence of dicts with ``"startOffset"``,
            ``"endOffset"`` and ``"type"`` keys.

    Returns:
        ``"B-<type>"`` or ``"I-<type>"`` for the first argument (in list
        order) the token matches, otherwise ``"O"``.
    """
    for argument in event_args:
        prefix = _bio_prefix(
            start_idx, end_idx, argument["startOffset"], argument["endOffset"]
        )
        if prefix is not None:
            return prefix + argument["type"]
    return "O"
|
|
|
def split_with_character(string, char):
    """Split *string* on *char*, keeping every separator as its own item.

    Args:
        string: the text to split.
        char: the separator. Normally a single character; a longer
            separator is split on as a whole.

    Returns:
        The pieces and separators in their original order, with empty
        pieces dropped, e.g. ``split_with_character("a,b,", ",")`` gives
        ``["a", ",", "b", ","]``. An empty *char* never matches, so the
        result is ``[string]`` (or ``[]`` for an empty *string*).
    """
    if not char:
        # str.split("") raises ValueError; an empty separator matches nothing.
        return [string] if string else []

    pieces = []
    for position, chunk in enumerate(string.split(char)):
        if position:
            # Exactly one separator sits between consecutive chunks.
            pieces.append(char)
        if chunk:
            pieces.append(chunk)
    return pieces
|
|
|
def extend_list_with_character(content_list, character):
    """Split every word of *content_list* that contains *character*.

    Words containing *character* are replaced by the result of
    ``split_with_character(word, character)``; all other words, including
    empty strings, are passed through unchanged.

    Returns:
        A new flat list; *content_list* itself is not modified.
    """
    content_as_words = []
    for word in content_list:
        if character in word:
            content_as_words.extend(split_with_character(word, character))
        else:
            content_as_words.append(word)
    return content_as_words
|
|
|
def find_dict_by_overlap(list_of_dicts, key_value_pairs):
    """Return the first dict whose range overlaps that of *key_value_pairs*.

    Each dict, and *key_value_pairs* itself, must provide ``"start"`` and
    ``"end"`` keys. Ranges are treated as closed intervals (ranges that
    merely touch count as overlapping) and the two bounds may be given in
    either order.

    Returns:
        The first overlapping element of *list_of_dicts* (the same object,
        not a copy), or ``None`` when nothing overlaps.
    """
    # The query bounds do not change between candidates; compute them once.
    query_low = min(key_value_pairs["start"], key_value_pairs["end"])
    query_high = max(key_value_pairs["start"], key_value_pairs["end"])

    for candidate in list_of_dicts:
        low = min(candidate["start"], candidate["end"])
        high = max(candidate["start"], candidate["end"])
        if high >= query_low and query_high >= low:
            return candidate
    return None
|
|