arubenruben
/

NER-PT-BERT-CRF-HAREM-Default

@@ -14,14 +14,13 @@ class TokenizeAndAlignLabelsStep():
     # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
     def tokenize_and_align_labels(self, examples, tokenizer):
-        tokenized_inputs = tokenizer(examples, padding='max_length', truncation=True, max_length=128)
         # Map tokens to their respective word.
         word_ids = tokenized_inputs.word_ids()
         previous_word_idx = None
-        tokens= []
         labels_mask = []
         for word_idx in word_ids:  # Set the special tokens to -100.
@@ -29,20 +28,17 @@ class TokenizeAndAlignLabelsStep():
                 labels_mask.append(False)
             # Only label the first token of a given word.
             elif word_idx != previous_word_idx:
-                labels_mask.append(True)
-                tokens.append(tokenized_inputs["input_ids"][word_idx])
             else:
                 labels_mask.append(False)
             previous_word_idx = word_idx
-        tokenized_inputs["tokens"] = tokenizer.decode(tokens, skip_special_tokens=True)
         tokenized_inputs["labels_mask"] = labels_mask
         return tokenized_inputs
 class BERT_CRF_Pipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):
@@ -76,18 +72,17 @@ class BERT_CRF_Pipeline(Pipeline):
         outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
                              attention_mask=attention_mask, labels=None, labels_mask=labels_mask)
-        return {'outputs': outputs, 'tokens': tokenizer_results['tokens']}
-    def postprocess(self, outputs):
-        model_outputs = outputs['outputs']
-        tokens = outputs['tokens']
         # From Ner_tags to Ner_labels
         for i, label in enumerate(model_outputs[0]):
             model_outputs[0][i] = self.model.config.id2label[label]
-        return model_outputs[0], tokens
 def main():

     # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
     def tokenize_and_align_labels(self, examples, tokenizer):
+        tokenized_inputs = tokenizer(examples, padding='max_length', truncation=True, max_length=128, is_split_into_words=True)
         # Map tokens to their respective word.
         word_ids = tokenized_inputs.word_ids()
         previous_word_idx = None
         labels_mask = []
         for word_idx in word_ids:  # Set the special tokens to -100.
                 labels_mask.append(False)
             # Only label the first token of a given word.
             elif word_idx != previous_word_idx:
+                labels_mask.append(True)
             else:
                 labels_mask.append(False)
             previous_word_idx = word_idx
         tokenized_inputs["labels_mask"] = labels_mask
         return tokenized_inputs
 class BERT_CRF_Pipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):
         outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
                              attention_mask=attention_mask, labels=None, labels_mask=labels_mask)
+        return outputs
+    def postprocess(self, model_outputs):
         # From Ner_tags to Ner_labels
         for i, label in enumerate(model_outputs[0]):
             model_outputs[0][i] = self.model.config.id2label[label]
+        return model_outputs[0]
 def main():