vitvit committed
Commit deaf77e (parent: 24a1b51)

Create README.md

Files changed (1):
  1. README.md +48 -0
README.md ADDED
```python
from transformers import RobertaTokenizerFast, AutoModelForTokenClassification
from datasets import load_dataset


def split_into_windows(examples):
    # Merge each batch of `window_size` sentences into a single windowed example.
    return {'sentences': [examples['sentence']], 'labels': [examples['label']]}


def concatenate_dict_value(dict_obj):
    # Flatten the per-sentence tokenizer outputs into one sequence per window,
    # stopping before the concatenation would exceed the 512-token limit.
    concatenated_dict = {}
    for key, value in dict_obj.items():
        flattened_list = []
        for sublist in value:
            if len(flattened_list) + len(sublist) <= 512:
                for item in sublist:
                    flattened_list.append(item)
            else:
                print("Not all sentences were processed due to length")
                break
        concatenated_dict[key] = flattened_list
    return concatenated_dict


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["sentences"], truncation=True, max_length=512)
    tokenized_inp_concat = concatenate_dict_value(tokenized_inputs)
    tokenized_inputs["input_ids"] = tokenized_inp_concat['input_ids']
    tokenized_inputs["attention_mask"] = tokenized_inp_concat['attention_mask']
    # Assign each sentence-level label to its sentence-final separator token
    # (id 2, the tokenizer's </s>); every other token gets -100 so the loss ignores it.
    labels = []
    count = 0
    for token_id in tokenized_inputs["input_ids"]:
        if token_id == 2:
            labels.append(examples["labels"][count])
            count += 1
        else:
            labels.append(-100)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


model = AutoModelForTokenClassification.from_pretrained('HeTree/HeConE')
tokenizer = RobertaTokenizerFast.from_pretrained('HeTree/HeConE')
raw_dataset = load_dataset('HeTree/MevakerConcSen')
window_size = 5
raw_dataset_window = raw_dataset.map(split_into_windows, batched=True, batch_size=window_size,
                                     remove_columns=raw_dataset['train'].column_names)
tokenized_dataset = raw_dataset_window.map(tokenize_and_align_labels, batched=False)
```
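
The snippet above stops after preprocessing. Below is a minimal fine-tuning sketch for the resulting `tokenized_dataset`; the hyperparameters, output directory, and use of only the `train` split are illustrative assumptions, not part of the original example.

```python
# Minimal fine-tuning sketch (assumptions: hyperparameters, output_dir,
# and training on the 'train' split only are illustrative).
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments

# Pads input_ids/attention_mask per batch and pads labels with -100,
# so padded positions are ignored by the loss.
data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir="hecone-mevaker",        # hypothetical output directory
    per_device_train_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
```

Because the collator handles padding batch by batch, `tokenize_and_align_labels` does not need to pad the windows itself.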