WenqingZhang committed on
Commit
a03163b
1 Parent(s): 1acdd75

Upload transformer_vectorizer.py

Browse files
Files changed (1) hide show
  1. transformer_vectorizer.py +58 -0
transformer_vectorizer.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Let's import a few requirements
2
+ import torch
3
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
4
+ import numpy
5
+
6
class TransformerVectorizer:
    """Turn raw texts into fixed-size vectors using a pre-trained transformer.

    Each text is tokenized, passed through the sequence-classification model,
    and represented by the mean of the last hidden layer over its tokens.
    """

    # Hugging Face hub identifier shared by the tokenizer and the model.
    MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

    def __init__(self):
        # Load the tokenizer (converts text to tokens)
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)

        # Load the pre-trained model
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            self.MODEL_NAME
        )
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

    def text_to_tensor(
        self,
        texts: list,
        batch_size: int = 50,
    ) -> numpy.ndarray:
        """Transform a list of texts into their learned representation.

        Args:
            texts (list): List of texts to be transformed.
            batch_size (int): Maximum number of texts sent through the model
                in one forward pass. Lower it if the hardware runs out of
                memory. Defaults to 50.

        Returns:
            numpy.ndarray: One row per input text, holding the mean of the
                last hidden layer over that text's tokens. The result is also
                stored in ``self.encodings``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Nothing to encode: return an empty matrix instead of letting
        # numpy.concatenate fail on an empty list of batches.
        if len(texts) == 0:
            hidden_size = self.transformer_model.config.hidden_size
            # NOTE(review): float32 is presumably the dtype the model outputs;
            # confirm if the model is ever loaded in another precision.
            self.encodings = numpy.empty((0, hidden_size), dtype=numpy.float32)
            return self.encodings

        # Tokenize all texts at once. Padding is required so that texts with
        # different token counts can be stacked into a single tensor.
        # NOTE(review): no truncation is requested here, so very long texts
        # are passed to the model at full length.
        encoded = self.tokenizer(texts, padding=True, return_tensors="pt")
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        # Send the model to the device
        transformer_model = self.transformer_model.to(self.device)
        pooled_batches = []

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            batches = zip(
                torch.split(input_ids, batch_size),
                torch.split(attention_mask, batch_size),
            )
            for batch_ids, batch_mask in batches:
                batch_ids = batch_ids.to(self.device)
                batch_mask = batch_mask.to(self.device)

                # Pass the tokens through the transformer model and keep only
                # the last hidden layer state.
                outputs = transformer_model(
                    batch_ids,
                    attention_mask=batch_mask,
                    output_hidden_states=True,
                )
                last_hidden = outputs.hidden_states[-1]

                # Average over the tokens axis to get a representation at the
                # text level, counting real tokens only (padding is masked out).
                weights = batch_mask.unsqueeze(-1).to(last_hidden.dtype)
                token_counts = weights.sum(dim=1).clamp(min=1)
                pooled = (last_hidden * weights).sum(dim=1) / token_counts

                pooled_batches.append(pooled.cpu().numpy())

        self.encodings = numpy.concatenate(pooled_batches, axis=0)
        return self.encodings

    def transform(self, texts: list) -> numpy.ndarray:
        """Alias of :meth:`text_to_tensor` using the default batch size."""
        return self.text_to_tensor(texts)
58
+