---
license: apache-2.0
language:
- he
library_name: transformers
pipeline_tag: token-classification
datasets:
- HeTree/MevakerConcSen
---

## Hebrew Conclusion Extraction Model (based on token classification)

#### How to use

```python
from transformers import RobertaTokenizerFast, AutoModelForTokenClassification
from datasets import load_dataset


def split_into_windows(examples):
    # Collect a batch of `window_size` sentences (and their labels) into one window example.
    return {'sentences': [examples['sentence']], 'labels': [examples['label']]}


def concatenate_dict_value(dict_obj):
    # Flatten the per-sentence encodings into a single sequence, stopping
    # before the 512-token model limit would be exceeded.
    concatenated_dict = {}
    for key, value in dict_obj.items():
        flattened_list = []
        for sublist in value:
            if len(flattened_list) + len(sublist) <= 512:
                flattened_list.extend(sublist)
            else:
                print("Not all sentences were processed due to length")
                break
        concatenated_dict[key] = flattened_list
    return concatenated_dict


def tokenize_and_align_labels(examples):
    # Tokenize each sentence in the window separately, then concatenate the
    # encodings into one sequence of at most 512 tokens.
    tokenized_inputs = tokenizer(examples["sentences"], truncation=True, max_length=512)
    tokenized_inp_concat = concatenate_dict_value(tokenized_inputs)
    tokenized_inputs["input_ids"] = tokenized_inp_concat['input_ids']
    tokenized_inputs["attention_mask"] = tokenized_inp_concat['attention_mask']
    # Each sentence ends with the RoBERTa `</s>` separator (token id 2).
    # Put the sentence-level label on that token and mask all other tokens with -100.
    labels = []
    count = 0
    for token_id in tokenized_inputs["input_ids"]:
        if token_id == 2:
            labels.append(examples["labels"][count])
            count += 1
        else:
            labels.append(-100)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


model = AutoModelForTokenClassification.from_pretrained('HeTree/HeConE')
tokenizer = RobertaTokenizerFast.from_pretrained('HeTree/HeConE')
raw_dataset = load_dataset('HeTree/MevakerConcSen')
window_size = 5
raw_dataset_window = raw_dataset.map(split_into_windows, batched=True, batch_size=window_size,
                                     remove_columns=raw_dataset['train'].column_names)
tokenized_dataset = raw_dataset_window.map(tokenize_and_align_labels, batched=False)
```
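
The snippet above only prepares the data. The following is a minimal inference sketch, assuming the preprocessing has already run and that `model` and `tokenizer` are loaded as above; decoding with `torch.argmax` over the logits and reading one prediction per `</s>` separator (token id 2) are illustrative choices, not a prescribed API of this model.

```python
import torch

# Take one preprocessed window from the dataset built above.
example = tokenized_dataset['train'][0]
input_ids = torch.tensor([example['input_ids']])
attention_mask = torch.tensor([example['attention_mask']])

model.eval()
with torch.no_grad():
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

# The label for each sentence sits on its closing `</s>` separator (token id 2),
# so keep only the predictions at those positions.
predicted_ids = torch.argmax(logits, dim=-1)[0].tolist()
sentence_preds = [pred for tok, pred in zip(example['input_ids'], predicted_ids) if tok == 2]
print(sentence_preds)  # one predicted class per sentence in the window
```

Placing each sentence's label on its separator token gives the model exactly one classification slot per sentence, while every other token is ignored by the loss via the -100 label.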

### Citing

If you use HeConE in your research, please cite [Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language](https://arxiv.org/abs/2403.09719).

```
@article{shalumov2024mevaker,
      title={Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language},
      author={Vitaly Shalumov and Harel Haskey and Yuval Solaz},
      year={2024},
      eprint={2403.09719},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
```