danlou committed
Commit fe3e65b
1 Parent(s): bd443fb

Update README.md

Files changed (1)
  1. README.md +12 -12
README.md CHANGED
@@ -22,12 +22,13 @@ Replace usernames and links for placeholders: "@user" and "http".
 If you're interested in retaining verified users which were also retained during training, you may keep the users listed [here](https://github.com/cardiffnlp/timelms/tree/main/data).
 ```python
 def preprocess(text):
-    new_text = []
-    for t in text.split(" "):
-        t = '@user' if t.startswith('@') and len(t) > 1 else t
-        t = 'http' if t.startswith('http') else t
-        new_text.append(t)
-    return " ".join(new_text)
+    preprocessed_text = []
+    for t in text.split():  # expects whitespace tokenization
+        if len(t) > 1:
+            t = '@user' if t[0] == '@' and t.count('@') == 1 else t
+            t = 'http' if t.startswith('http') else t
+        preprocessed_text.append(t)
+    return ' '.join(preprocessed_text)
 ```
 
 ## Example Masked Language Model
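A quick sanity check of the new `preprocess` (a hedged sketch; the sample inputs are made up for illustration). The rewrite changes three things: a bare `split()` handles any whitespace rather than only single spaces, a token becomes `@user` only when it starts with `@` and contains no other `@` (so e-mail-like strings survive), and single-character tokens pass through untouched.

```python
def preprocess(text):  # as introduced in the hunk above
    preprocessed_text = []
    for t in text.split():  # expects whitespace tokenization
        if len(t) > 1:
            t = '@user' if t[0] == '@' and t.count('@') == 1 else t
            t = 'http' if t.startswith('http') else t
        preprocessed_text.append(t)
    return ' '.join(preprocessed_text)

# Hypothetical inputs chosen to exercise the changed cases:
print(preprocess("@dan check https://example.com"))  # -> "@user check http"
print(preprocess("mail me@host.com\ttoday"))         # -> "mail me@host.com today"
```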
@@ -39,8 +40,8 @@ MODEL = "cardiffnlp/twitter-roberta-base-jun2022-15M-incr"
 fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 
-def print_candidates():
-    for i in range(5):
+def pprint(candidates, n):
+    for i in range(n):
         token = tokenizer.decode(candidates[i]['token'])
         score = candidates[i]['score']
         print("%d) %.5f %s" % (i+1, score, token))
@@ -54,7 +55,7 @@ for text in texts:
     t = preprocess(text)
     print(f"{'-'*30}\n{t}")
     candidates = fill_mask(t)
-    print_candidates()
+    pprint(candidates, 5)
 ```
 
 Output:
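The two hunks above replace `print_candidates`, which read the candidate list from enclosing scope and hard-coded the count, with `pprint(candidates, n)`, which takes both explicitly. A minimal end-to-end sketch of the new helper (hedged: the example tweet is illustrative, not taken from the commit):

```python
from transformers import pipeline, AutoTokenizer

MODEL = "cardiffnlp/twitter-roberta-base-jun2022-15M-incr"
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def pprint(candidates, n):
    # Print the top-n fill-mask candidates with their scores.
    for i in range(n):
        token = tokenizer.decode(candidates[i]['token'])
        score = candidates[i]['score']
        print("%d) %.5f %s" % (i+1, score, token))

candidates = fill_mask("So glad I'm <mask> vaccinated.")  # illustrative input
pprint(candidates, 5)  # top 5; the count is now an argument, not hard-coded
```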
@@ -90,13 +91,12 @@ import numpy as np
 from scipy.spatial.distance import cosine
 from collections import Counter
 
-def get_embedding(text):
+def get_embedding(text):  # naive approach for demonstration
     text = preprocess(text)
     encoded_input = tokenizer(text, return_tensors='pt')
     features = model(**encoded_input)
     features = features[0].detach().cpu().numpy()
-    features_mean = np.mean(features[0], axis=0)
-    return features_mean
+    return np.mean(features[0], axis=0)
 
 
 MODEL = "cardiffnlp/twitter-roberta-base-jun2022-15M-incr"
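For context, a hedged sketch of the simplified `get_embedding` in use for tweet similarity, mirroring the README's feature-extraction section; the model setup and sample sentences are assumptions for illustration (the commit's version also runs `preprocess` on the text first, omitted here to keep the sketch self-contained):

```python
import numpy as np
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer, AutoModel

MODEL = "cardiffnlp/twitter-roberta-base-jun2022-15M-incr"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)

def get_embedding(text):  # naive approach for demonstration
    encoded_input = tokenizer(text, return_tensors='pt')
    features = model(**encoded_input)
    features = features[0].detach().cpu().numpy()  # (1, seq_len, hidden)
    return np.mean(features[0], axis=0)            # mean-pool over tokens

# Cosine similarity = 1 - cosine distance; higher means more similar.
sim = 1 - cosine(get_embedding("I love this movie"),
                 get_embedding("this film is great"))
print(f"{sim:.4f}")
```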
 