# Keys permitted in ``instance["schema"]`` and ``instance["ans"]`` respectively.
_SCHEMA_KEYS = ("cls", "ent", "rel", "event")
_ANSWER_KEYS = ("cls", "ent", "rel", "event", "span")


def _require(condition, message="invalid UDI instance"):
    """Raise ``AssertionError`` with ``message`` unless ``condition`` is truthy.

    Used in place of ``assert`` statements so that validation is not
    stripped when Python runs with ``-O``; the exception type is kept as
    ``AssertionError`` so existing callers keep working.
    """
    if not condition:
        raise AssertionError(message)


def _is_list_of(value, item_type):
    """Return True if ``value`` is a list whose items are all ``item_type``."""
    return isinstance(value, list) and all(isinstance(x, item_type) for x in value)


def _span_matches(text, item):
    """Return True if ``item["text"]`` is a str equal to ``text`` sliced by ``item["span"]``."""
    if not isinstance(item["text"], str):
        return False
    start, end = item["span"][0], item["span"][1]
    return text[start:end] == item["text"]


def _check_schema(schema):
    """Validate the value stored under every key of the schema dict."""
    for key, value in schema.items():
        _require(key in _SCHEMA_KEYS, "unknown schema key")
        if key == "event":
            # Event schema maps each event type to its list of role names.
            _require(isinstance(value, dict), "schema['event'] must be a dict")
            for roles in value.values():
                _require(
                    _is_list_of(roles, str),
                    "event roles must be a list of strings",
                )
        else:
            _require(
                _is_list_of(value, str),
                "schema 'cls'/'ent'/'rel' must be lists of strings",
            )


def _check_entity(ent, schema, text):
    """Validate one entity answer: known type, well-formed span, matching text."""
    _require(
        isinstance(ent["type"], str) and ent["type"] in schema["ent"],
        "entity type must be a string listed in schema['ent']",
    )
    span = ent["span"]
    # Check the span's shape before it is used to slice the text.
    _require(
        isinstance(span, list)
        and len(span) == 2
        and all(isinstance(x, int) for x in span),
        "entity span must be a list of two ints",
    )
    _require(_span_matches(text, ent), "entity text does not match its span")


def _check_relation(rel, schema, text):
    """Validate one relation answer: known relation, head and tail match the text."""
    _require(
        isinstance(rel["relation"], str) and rel["relation"] in schema["rel"],
        "relation must be a string listed in schema['rel']",
    )
    for side in ("head", "tail"):
        mention = rel[side]
        _require(
            isinstance(mention, dict) and _span_matches(text, mention),
            "relation head/tail must be a dict whose text matches its span",
        )


def _check_event(event, schema, text):
    """Validate one event answer: known type, trigger and arguments match the text."""
    event_type = event["event_type"]
    _require(
        event_type in schema["event"],
        "event_type must be a key of schema['event']",
    )
    trigger = event["trigger"]
    _require(
        isinstance(trigger, dict) and _span_matches(text, trigger),
        "event trigger must be a dict whose text matches its span",
    )
    roles = schema["event"][event_type]
    for arg in event["args"]:
        _require(
            arg["role"] in roles,
            "argument role is not declared for this event type",
        )
        _require(_span_matches(text, arg), "argument text does not match its span")


def _check_span_item(span, schema, text):
    """Validate one free span answer: its text must match its span."""
    _require(_span_matches(text, span), "span text does not match its span")


# Per-item validators for the answer kinds whose value is a list of dicts.
_ITEM_CHECKERS = {
    "ent": _check_entity,
    "rel": _check_relation,
    "event": _check_event,
    "span": _check_span_item,
}


def _check_answers(answers, schema, text):
    """Validate the value stored under every key of the answer dict."""
    for key, value in answers.items():
        _require(key in _ANSWER_KEYS, "unknown answer key")
        if key == "cls":
            _require(_is_list_of(value, str), "ans['cls'] must be a list of strings")
            continue
        _require(_is_list_of(value, dict), "answers must be a list of dicts")
        check_item = _ITEM_CHECKERS[key]
        for item in value:
            check_item(item, schema, text)


def check_udi_instance(instance: dict) -> None:
    """Check that ``instance`` is a well-formed UDI instance.

    The instance must provide the keys ``id``, ``instruction``, ``text`` and
    ``bg`` (all strings) plus ``schema`` and ``ans`` (both dicts):

    * ``schema`` may contain ``cls``, ``ent`` and ``rel`` (lists of strings)
      and ``event`` (dict mapping an event type to a list of role strings).
    * ``ans`` may contain ``cls`` (list of strings) and ``ent``, ``rel``,
      ``event`` and ``span`` (lists of dicts). Every answer that carries a
      ``text``/``span`` pair must satisfy
      ``instance["text"][span[0]:span[1]] == text``, and every type, relation,
      event type and role must be declared in the schema.
    * ``text`` must be non-empty when the schema declares a non-empty
      ``ent``, ``rel`` or ``event`` or when ``ans`` has a ``span`` key, and
      at least one of ``instruction``, ``text`` and ``bg`` must be non-empty.

    Args:
        instance: The instance to validate.

    Raises:
        AssertionError: If any of the checks above fails.
        KeyError, TypeError, IndexError: If a required key is missing or a
            value has an unexpected shape.
    """
    _require(isinstance(instance["id"], str), "'id' must be a string")
    _require(isinstance(instance["instruction"], str), "'instruction' must be a string")
    text = instance["text"]
    _require(isinstance(text, str), "'text' must be a string")
    _require(isinstance(instance["bg"], str), "'bg' must be a string")
    schema = instance["schema"]
    _require(isinstance(schema, dict), "'schema' must be a dict")
    answers = instance["ans"]
    _require(isinstance(answers, dict), "'ans' must be a dict")

    _check_schema(schema)
    _check_answers(answers, schema, text)

    # Span-based tasks cannot be answered against an empty text.
    needs_text = "span" in answers or any(
        schema.get(key) for key in ("ent", "rel", "event")
    )
    if needs_text:
        _require(len(text) > 0, "'text' must be non-empty for span-based tasks")
    _require(
        instance["instruction"] or text or instance["bg"],
        "'instruction', 'text' and 'bg' must not all be empty",
    )


def is_valid_udi_instance(instance: dict) -> bool:
    """Return True if ``instance`` passes ``check_udi_instance``, else False.

    Any ``Exception`` raised by the check counts as "invalid". Exceptions that
    do not derive from ``Exception`` (``KeyboardInterrupt``, ``SystemExit``)
    are deliberately left to propagate.
    """
    try:
        check_udi_instance(instance)
    except Exception:
        return False
    return True


def main(filepaths=None):
    """Print the path of every JSON-lines file containing an invalid instance.

    Args:
        filepaths: Iterable of file paths to check. Defaults to no files, so
            calling ``main()`` without arguments prints nothing.
    """
    # Imported here, next to its only use, so that the validators above can
    # be imported without requiring the project's I/O helper.
    from rex.utils.io import load_jsonlines

    for filepath in filepaths or []:
        data = load_jsonlines(filepath)
        # ``all`` stops at the first invalid instance, like an early ``break``.
        if not all(is_valid_udi_instance(ins) for ins in data):
            print(filepath)


if __name__ == "__main__":
    main()