Update README.md
Browse files
README.md
CHANGED
@@ -22,12 +22,13 @@ Replace usernames and links with placeholders: "@user" and "http".
|
|
22 |
If you're interested in retaining verified users who were also retained during training, you may keep the users listed [here](https://github.com/cardiffnlp/timelms/tree/main/data).
|
23 |
```python
|
24 |
def preprocess(text):
|
25 |
-
|
26 |
-
for t in text.split(
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
31 |
```
|
32 |
|
33 |
## Example Masked Language Model
|
@@ -39,8 +40,8 @@ MODEL = "cardiffnlp/twitter-roberta-base-jun2022-15M-incr"
|
|
39 |
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
|
40 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
41 |
|
42 |
-
def
|
43 |
-
for i in range(
|
44 |
token = tokenizer.decode(candidates[i]['token'])
|
45 |
score = candidates[i]['score']
|
46 |
print("%d) %.5f %s" % (i+1, score, token))
|
@@ -54,7 +55,7 @@ for text in texts:
|
|
54 |
t = preprocess(text)
|
55 |
print(f"{'-'*30}\n{t}")
|
56 |
candidates = fill_mask(t)
|
57 |
-
|
58 |
```
|
59 |
|
60 |
Output:
|
@@ -90,13 +91,12 @@ import numpy as np
|
|
90 |
from scipy.spatial.distance import cosine
|
91 |
from collections import Counter
|
92 |
|
93 |
-
def get_embedding(text):
|
94 |
text = preprocess(text)
|
95 |
encoded_input = tokenizer(text, return_tensors='pt')
|
96 |
features = model(**encoded_input)
|
97 |
features = features[0].detach().cpu().numpy()
|
98 |
-
|
99 |
-
return features_mean
|
100 |
|
101 |
|
102 |
MODEL = "cardiffnlp/twitter-roberta-base-jun2022-15M-incr"
|
|
|
22 |
If you're interested in retaining verified users who were also retained during training, you may keep the users listed [here](https://github.com/cardiffnlp/timelms/tree/main/data).
|
23 |
```python
|
24 |
def preprocess(text):
    """Replace user mentions and links in ``text`` with placeholders.

    The text is split on whitespace and each token is checked on its own:

    * a token longer than one character that starts with ``@`` and contains
      exactly one ``@`` becomes ``'@user'``;
    * a token longer than one character that starts with ``http`` becomes
      ``'http'``;
    * every other token (including single-character ones) is kept as is.

    Args:
        text: Input string; expects whitespace tokenization.

    Returns:
        The processed tokens joined by single spaces.
    """
    def _mask(token):
        # Single-character tokens (e.g. a lone "@") are never rewritten.
        if len(token) <= 1:
            return token
        if token[0] == '@' and token.count('@') == 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return ' '.join(_mask(token) for token in text.split())
|
32 |
```
|
33 |
|
34 |
## Example Masked Language Model
|
|
|
40 |
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
|
41 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
42 |
|
43 |
+
def pprint(candidates, n):
    """Print up to ``n`` fill-mask candidates, one ranked line each.

    Each printed line has the form ``"<rank>) <score> <token>"`` with the
    score shown to five decimal places.

    Args:
        candidates: Sequence of dicts, each holding a ``'token'`` entry
            (decoded with the module-level ``tokenizer``) and a ``'score'``
            entry.
        n: Maximum number of candidates to print. When fewer than ``n``
            candidates are available, all of them are printed instead of
            raising ``IndexError``.
    """
    # Slicing bounds the loop by the candidates actually present; max() keeps
    # a negative n printing nothing, as range(n) did.
    for rank, candidate in enumerate(candidates[:max(n, 0)], start=1):
        token = tokenizer.decode(candidate['token'])
        score = candidate['score']
        print("%d) %.5f %s" % (rank, score, token))
|
|
|
55 |
t = preprocess(text)
|
56 |
print(f"{'-'*30}\n{t}")
|
57 |
candidates = fill_mask(t)
|
58 |
+
pprint(candidates, 5)
|
59 |
```
|
60 |
|
61 |
Output:
|
|
|
91 |
from scipy.spatial.distance import cosine
|
92 |
from collections import Counter
|
93 |
|
94 |
+
def get_embedding(text):  # naive approach for demonstration
    """Return a single averaged feature vector for ``text``.

    The text is run through ``preprocess``, encoded with the module-level
    ``tokenizer`` and passed to the module-level ``model``. The first element
    of the model output is converted to a NumPy array, and its first entry is
    averaged along axis 0.

    NOTE(review): presumably the first model output is the last hidden state
    shaped (batch, tokens, hidden), making this a mean over tokens - confirm
    against the model class in use.

    Args:
        text: Raw input string.

    Returns:
        NumPy array holding the mean over axis 0 of the first output entry.
    """
    model_inputs = tokenizer(preprocess(text), return_tensors='pt')
    model_outputs = model(**model_inputs)
    # Detach from the graph and move to host memory before handing to NumPy.
    hidden = model_outputs[0].detach().cpu().numpy()
    first_entry = hidden[0]
    return np.mean(first_entry, axis=0)
|
|
|
100 |
|
101 |
|
102 |
MODEL = "cardiffnlp/twitter-roberta-base-jun2022-15M-incr"
|