s-a-malik committed on
Commit bc61ed1 · 1 Parent(s): bf84689

sentence level highlighting, remove acc probe for now

Files changed (3)
  1. __pycache__/app.cpython-311.pyc +0 -0
  2. app.py +108 -56
  3. debug.ipynb +171 -18
__pycache__/app.cpython-311.pyc ADDED
Binary file (11.7 kB).
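For orientation before the diffs: the new app.py below switches from per-token to per-sentence probing. Here is a minimal sketch of that loop, assuming a Hugging Face causal LM and sklearn-style probes exposing predict_proba; the names model, tokenizer, se_probe and se_layer_range mirror identifiers in the diff, while the helper probe_by_sentence and its exact control flow are illustrative rather than the committed code.

import torch

def probe_by_sentence(model, tokenizer, se_probe, se_layer_range, input_ids, max_new_tokens=100):
    """Generate one token at a time; score each completed sentence with the probe."""
    scored_sentences = []   # list of (sentence, signed uncertainty score)
    sentence_ids = []       # token ids of the sentence currently being built
    with torch.no_grad():
        for _ in range(max_new_tokens):
            out = model.generate(
                input_ids=input_ids,
                max_new_tokens=1,
                do_sample=True,
                output_hidden_states=True,
                return_dict_in_generate=True,
            )
            new_id = out.sequences[0, -1].item()
            input_ids = out.sequences          # feed the extended sequence back in
            if new_id == tokenizer.eos_token_id:
                break
            sentence_ids.append(new_id)
            sentence = tokenizer.decode(sentence_ids, skip_special_tokens=True)
            if sentence.endswith((".", "!", "?", ";")):
                # Stack the last token's embedding from every layer, keep the probe's
                # layer range, flatten, and map P(uncertain) from [0, 1] to [-1, 1].
                layers = torch.stack([h[0, -1, :].cpu() for h in out.hidden_states[-1]]).numpy()
                feats = layers[se_layer_range[0]:se_layer_range[1]].reshape(1, -1)
                score = se_probe.predict_proba(feats)[0][1] * 2 - 1
                scored_sentences.append((sentence, score))
                sentence_ids = []
    return scored_sentences

In the committed code the same logic streams partial results with yield so Gradio can render each sentence's highlight as soon as it completes.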
 
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import pickle as pkl
3
  from pathlib import Path
4
  from threading import Thread
5
- from typing import List, Tuple, Iterator, Optional
6
  from queue import Queue
7
 
8
  import spaces
@@ -10,7 +10,7 @@ import gradio as gr
10
  import torch
11
  from transformers import AutoModelForCausalLM, AutoTokenizer
12
 
13
- # TODO Sentence level highlighting instead (prediction after every word is not what it was trained on). Also solves token-level highlighting issues.
14
  # TODO log prob output scaling highlighting instead?
15
  # TODO make it look nicer
16
  # TODO better examples.
@@ -18,8 +18,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
18
  # TODO add options to switch between models, SLT/TBG, layers?
19
  # TODO full semantic entropy calculation
20
 
21
- MAX_MAX_NEW_TOKENS = 2048
22
- DEFAULT_MAX_NEW_TOKENS = 1024
23
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
24
 
25
  DESCRIPTION = """
@@ -31,11 +31,6 @@ DESCRIPTION = """
31
  <li><span style="background-color: #00FF00; color: black">Green</span> indicates more certain generations</li>
32
  <li><span style="background-color: #FF0000; color: black">Red</span> indicates more uncertain generations</li>
33
  </ul>
34
- <p>The demo compares the model's uncertainty with two different probes:</p>
35
- <ul>
36
- <li><b>Semantic Uncertainty Probe:</b> Predicts the semantic uncertainty of the model's generations.</li>
37
- <li><b>Accuracy Probe:</b> Predicts the accuracy of the model's generations.</li>
38
- </ul>
39
  <p>Please see our paper for more details. NOTE: This demo is a work in progress.</p>
40
  """
41
 
@@ -49,7 +44,7 @@ EXAMPLES = [
49
  if torch.cuda.is_available():
50
  model_id = "meta-llama/Llama-2-7b-chat-hf"
51
  # TODO load the full model not the 8bit one?
52
- model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
53
  tokenizer = AutoTokenizer.from_pretrained(model_id)
54
  tokenizer.use_default_system_prompt = False
55
 
@@ -62,6 +57,7 @@ if torch.cuda.is_available():
62
  se_layer_range = probe_data['sep_layer_range']
63
  acc_probe = probe_data['t_amodel']
64
  acc_layer_range = probe_data['ap_layer_range']
 
65
  else:
66
  DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
67
 
@@ -74,7 +70,7 @@ def generate(
74
  top_p: float = 0.9,
75
  top_k: int = 50,
76
  repetition_penalty: float = 1.2,
77
- ) -> Tuple[str, str]:
78
  conversation = []
79
  if system_prompt:
80
  conversation.append({"role": "system", "content": system_prompt})
@@ -86,10 +82,7 @@ def generate(
86
  gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
87
  input_ids = input_ids.to(model.device)
88
 
89
- #### Generate without threading
90
  generation_kwargs = dict(
91
- input_ids=input_ids,
92
- max_new_tokens=max_new_tokens,
93
  do_sample=True,
94
  top_p=top_p,
95
  top_k=top_k,
@@ -98,40 +91,91 @@ def generate(
98
  output_hidden_states=True,
99
  return_dict_in_generate=True,
100
  )
101
- with torch.no_grad():
102
- outputs = model.generate(**generation_kwargs)
103
- generated_tokens = outputs.sequences[0, input_ids.shape[1]:]
104
- generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
105
- print(generated_text)
106
- # hidden states
107
- hidden = outputs.hidden_states # list of tensors, one for each token, then (batch size, sequence length, hidden size)
108
-
109
- se_highlighted_text = ""
110
- acc_highlighted_text = ""
111
 
112
- # skip the first hidden state as it is the prompt
113
- for i in range(1, len(hidden)):
114
 
115
- # Semantic Uncertainty Probe
116
- token_embeddings = torch.stack([generated_token[0, 0, :].cpu() for generated_token in hidden[i]]).numpy() # (num_layers, hidden_size)
117
  se_concat_layers = token_embeddings[se_layer_range[0]:se_layer_range[1]].reshape(-1)
118
  se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
119
-
120
- # Accuracy Probe
121
  acc_concat_layers = token_embeddings[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
122
- acc_probe_pred = (1 - acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][1]) * 2 - 1
123
-
124
- output_id = outputs.sequences[0, input_ids.shape[1]+i]
125
- output_word = tokenizer.decode(output_id)
126
- print(output_id, output_word, se_probe_pred, acc_probe_pred)
127
-
128
- se_new_highlighted_text = highlight_text(output_word, se_probe_pred)
129
- acc_new_highlighted_text = highlight_text(output_word, acc_probe_pred)
130
- se_highlighted_text += f" {se_new_highlighted_text}"
131
- acc_highlighted_text += f" {acc_new_highlighted_text}"
132
-
133
- return se_highlighted_text, acc_highlighted_text
134
-
135
 
136
 
137
  def highlight_text(text: str, uncertainty_score: float) -> str:
@@ -151,7 +195,8 @@ def highlight_text(text: str, uncertainty_score: float) -> str:
151
  html_color, text
152
  )
153
 
154
- with gr.Blocks(title="Llama-2 7B Chat with Dual Probes", css="footer {visibility: hidden}") as demo:
 
155
  gr.HTML(DESCRIPTION)
156
 
157
  with gr.Row():
@@ -168,34 +213,41 @@ with gr.Blocks(title="Llama-2 7B Chat with Dual Probes", css="footer {visibility
168
 
169
  with gr.Row():
170
  generate_btn = gr.Button("Generate")
 
171
  # Add spacing between probes
172
  gr.HTML("<br><br>")
173
 
174
- with gr.Row():
175
- with gr.Column():
176
  # make a box
177
- title = gr.HTML("<h2>Semantic Uncertainty Probe</h2>")
178
- se_output = gr.HTML(label="Semantic Uncertainty Probe")
179
-
180
  # Add spacing between columns
181
- gr.HTML("<div style='width: 20px;'></div>")
182
 
183
- with gr.Column():
184
- title = gr.HTML("<h2>Accuracy Probe</h2>")
185
- acc_output = gr.HTML(label="Accuracy Probe")
186
 
187
  gr.Examples(
188
  examples=EXAMPLES,
189
  inputs=[message, system_prompt],
190
- outputs=[se_output, acc_output],
 
191
  fn=generate,
192
  )
193
 
194
- generate_btn.click(
195
  generate,
196
  inputs=[message, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
197
- outputs=[se_output, acc_output]
 
198
  )
 
199
 
200
 
201
  if __name__ == "__main__":
 
2
  import pickle as pkl
3
  from pathlib import Path
4
  from threading import Thread
5
+ from typing import List, Tuple, Iterator, Optional, Generator
6
  from queue import Queue
7
 
8
  import spaces
 
10
  import torch
11
  from transformers import AutoModelForCausalLM, AutoTokenizer
12
 
13
+ # TODO this is not as fast as it could be using generate function with 1 token at a time
14
  # TODO log prob output scaling highlighting instead?
15
  # TODO make it look nicer
16
  # TODO better examples.
 
18
  # TODO add options to switch between models, SLT/TBG, layers?
19
  # TODO full semantic entropy calculation
20
 
21
+ MAX_MAX_NEW_TOKENS = 1024
22
+ DEFAULT_MAX_NEW_TOKENS = 100
23
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
24
 
25
  DESCRIPTION = """
 
31
  <li><span style="background-color: #00FF00; color: black">Green</span> indicates more certain generations</li>
32
  <li><span style="background-color: #FF0000; color: black">Red</span> indicates more uncertain generations</li>
33
  </ul>
34
  <p>Please see our paper for more details. NOTE: This demo is a work in progress.</p>
35
  """
36
 
 
44
  if torch.cuda.is_available():
45
  model_id = "meta-llama/Llama-2-7b-chat-hf"
46
  # TODO load the full model not the 8bit one?
47
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
48
  tokenizer = AutoTokenizer.from_pretrained(model_id)
49
  tokenizer.use_default_system_prompt = False
50
 
 
57
  se_layer_range = probe_data['sep_layer_range']
58
  acc_probe = probe_data['t_amodel']
59
  acc_layer_range = probe_data['ap_layer_range']
60
+ print(f"Loaded probes with layer ranges: {se_layer_range}, {acc_layer_range}")
61
  else:
62
  DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
63
 
 
70
  top_p: float = 0.9,
71
  top_k: int = 50,
72
  repetition_penalty: float = 1.2,
73
+ ) -> Generator[Tuple[str, str], None, None]:
74
  conversation = []
75
  if system_prompt:
76
  conversation.append({"role": "system", "content": system_prompt})
 
82
  gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
83
  input_ids = input_ids.to(model.device)
84
 
 
85
  generation_kwargs = dict(
86
  do_sample=True,
87
  top_p=top_p,
88
  top_k=top_k,
 
91
  output_hidden_states=True,
92
  return_dict_in_generate=True,
93
  )
94
+ sentence_start_idx = input_ids.shape[1]
95
+ sentence_token_count = 0
96
+ finished = False
+ generated_text = ""  # initialise before the loop; the EOS branch below reads it on the first iteration
97
 
 
 
98
 
99
+ with torch.no_grad():
100
+ # highlight and return the prompt
101
+ outputs = model.generate(**generation_kwargs, input_ids=input_ids, max_new_tokens=1)
102
+ prompt_tokens = outputs.sequences[0, :input_ids.shape[1]]
103
+ prompt_text = tokenizer.decode(prompt_tokens, skip_special_tokens=True)
104
+ print(prompt_tokens, prompt_text)
105
+ # hidden states
106
+ hidden = outputs.hidden_states
107
+ # last token embeddings (note this is the same as the token before generation given this is the prompt)
108
+ token_embeddings = torch.stack([generated_token[0, -1, :].cpu() for generated_token in hidden[0]]).numpy()
109
  se_concat_layers = token_embeddings[se_layer_range[0]:se_layer_range[1]].reshape(-1)
110
  se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
111
  acc_concat_layers = token_embeddings[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
112
+ acc_probe_pred = acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][0] * 2 - 1 # accuracy probe is inverted wrt uncertainty
113
+ se_new_highlighted_text = highlight_text(prompt_text, se_probe_pred)
114
+ acc_new_highlighted_text = highlight_text(prompt_text, acc_probe_pred)
115
+ se_highlighted_text = f"{se_new_highlighted_text}<br>"
116
+ acc_highlighted_text = f"{acc_new_highlighted_text}<br>"
117
+
118
+ while not finished:
119
+ outputs = model.generate(**generation_kwargs, input_ids=input_ids, max_new_tokens=1)
120
+ # this should only be the one extra token (equivalent to -1)
121
+ generated_tokens = outputs.sequences[0, input_ids.shape[1]:]
122
+ print(f"generated_tokens {generated_tokens}" )
123
+ # add to the conversation
124
+ input_ids = torch.cat([input_ids, generated_tokens.unsqueeze(0)], dim=-1)
125
+ # stop at the end of a sequence
126
+ if generated_tokens[-1] == tokenizer.eos_token_id or input_ids.shape[1] > max_new_tokens:
127
+ print("Finished")
128
+ finished = True
129
+ if generated_text != "":
130
+ # do final prediction on the last generated text (one before the eos token)
131
+ print("Predicting probes")
132
+ hidden = outputs.hidden_states # hidden states = (num generated tokens, num layers, batch size, num tokens, hidden size)
133
+ # last token embeddings
134
+ token_embeddings = torch.stack([generated_token[0, -2, :].cpu() for generated_token in hidden[-1]]).numpy()
135
+
136
+ se_concat_layers = token_embeddings[se_layer_range[0]:se_layer_range[1]].reshape(-1)
137
+ se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
138
+
139
+ acc_concat_layers = token_embeddings[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
140
+ acc_probe_pred = acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][0] * 2 - 1
141
+ print(f"se_probe_pred {se_probe_pred}, acc_probe_pred {acc_probe_pred}")
142
+
143
+ se_new_highlighted_text = highlight_text(generated_text, se_probe_pred)
144
+ acc_new_highlighted_text = highlight_text(generated_text, acc_probe_pred)
145
+ se_highlighted_text += f" {se_new_highlighted_text}"
146
+ acc_highlighted_text += f" {acc_new_highlighted_text}"
147
+ sentence_start_idx += sentence_token_count
148
+ sentence_token_count = 0
149
+
150
+ # decode the full generated text
151
+ generated_text = tokenizer.decode(outputs.sequences[0, sentence_start_idx:], skip_special_tokens=True)
152
+ print(f"generated_text: {generated_text}")
153
+ sentence_token_count += 1
154
+
155
+ # TODO this should be when a factoid is detected rather than just punctuation. Is the SLT token always basically a period for the probes?
156
+ if generated_text.endswith(('.', '!', '?', ';', '."', '!"', '?"')):
157
+ print("Predicting probes")
158
+ hidden = outputs.hidden_states # hidden states = (num generated tokens, num layers, batch size, num tokens, hidden size)
159
+ # last token embeddings
160
+ token_embeddings = torch.stack([generated_token[0, -1, :].cpu() for generated_token in hidden[-1]]).numpy()
161
+
162
+ se_concat_layers = token_embeddings[se_layer_range[0]:se_layer_range[1]].reshape(-1)
163
+ se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
164
+
165
+ acc_concat_layers = token_embeddings[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
166
+ acc_probe_pred = acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][0] * 2 - 1
167
+ print(f"se_probe_pred {se_probe_pred}, acc_probe_pred {acc_probe_pred}")
168
+
169
+ se_new_highlighted_text = highlight_text(generated_text, se_probe_pred)
170
+ acc_new_highlighted_text = highlight_text(generated_text, acc_probe_pred)
171
+ se_highlighted_text += f" {se_new_highlighted_text}"
172
+ acc_highlighted_text += f" {acc_new_highlighted_text}"
173
+ sentence_start_idx += sentence_token_count
174
+ sentence_token_count = 0
175
+ generated_text = ""
176
+
177
+ # yield se_highlighted_text + generated_text, acc_highlighted_text + generated_text
178
+ yield se_highlighted_text + generated_text #, acc_highlighted_text + generated_text
179
 
180
 
181
  def highlight_text(text: str, uncertainty_score: float) -> str:
 
195
  html_color, text
196
  )
197
 
198
+
199
+ with gr.Blocks(title="Llama-2 7B Chat with Semantic Uncertainty Probes", css="footer {visibility: hidden}") as demo:
200
  gr.HTML(DESCRIPTION)
201
 
202
  with gr.Row():
 
213
 
214
  with gr.Row():
215
  generate_btn = gr.Button("Generate")
216
+ stop_btn = gr.Button("Stop")
217
  # Add spacing between probes
218
  gr.HTML("<br><br>")
219
 
220
+ # with gr.Row():
221
+ with gr.Column():
222
+ title = gr.HTML("<h2>Semantic Uncertainty Probe</h2>")
223
+ se_output = gr.HTML(label="Semantic Uncertainty Probe")
224
+ # with gr.Column():
225
  # make a box
226
+ # title = gr.HTML("<h2>Semantic Uncertainty Probe</h2>")
227
+ # se_output = gr.HTML(label="Semantic Uncertainty Probe")
228
+
229
  # Add spacing between columns
230
+ # gr.HTML("<div style='width: 20px;'></div>")
231
 
232
+ # with gr.Column():
233
+ # title = gr.HTML("<h2>Accuracy Probe</h2>")
234
+ # acc_output = gr.HTML(label="Accuracy Probe")
235
 
236
  gr.Examples(
237
  examples=EXAMPLES,
238
  inputs=[message, system_prompt],
239
+ # outputs=[se_output, acc_output],
240
+ outputs=[se_output],
241
  fn=generate,
242
  )
243
 
244
+ generate_event = generate_btn.click(
245
  generate,
246
  inputs=[message, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
247
+ # outputs=[se_output, acc_output]
248
+ outputs=[se_output]
249
  )
250
+ stop_btn.click(fn=None, inputs=None, outputs=None, cancels=[generate_event])
251
 
252
 
253
  if __name__ == "__main__":
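As a quick check on the score-to-colour mapping that highlight_text applies to each sentence (a self-contained copy of the same arithmetic, added here only for illustration): positive scores, i.e. more uncertain, shade towards red, and negative scores shade towards green.

def score_to_hex(score: float) -> str:
    # Same arithmetic as highlight_text: +1 -> pure red, -1 -> pure green.
    if score > 0:
        return "#%02X%02X%02X" % (255, int(255 * (1 - score)), int(255 * (1 - score)))
    return "#%02X%02X%02X" % (int(255 * (1 + score)), 255, int(255 * (1 + score)))

print(score_to_hex(1.0))   # #FF0000  (maximally uncertain)
print(score_to_hex(0.5))   # #FF7F7F
print(score_to_hex(-1.0))  # #00FF00  (maximally certain)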
debug.ipynb CHANGED
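For context on what the probes consume in both app.py and this notebook: each probe is an sklearn-style classifier over a flattened slice of one token's per-layer hidden states. A minimal shape check follows; 33 layers (embeddings plus 32 blocks) and hidden size 4096 correspond to Llama-2-7B, while the layer range (13, 23) is purely hypothetical since the real ranges come from probe_data.

import numpy as np

# Stand-in for the stacked per-layer embeddings of one token.
token_embeddings = np.random.randn(33, 4096).astype(np.float32)

layer_range = (13, 23)  # illustrative; the committed code reads sep_layer_range / ap_layer_range
features = token_embeddings[layer_range[0]:layer_range[1]].reshape(1, -1)
print(features.shape)   # (1, 40960) -- the 2-D input predict_proba expects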
@@ -67,19 +67,21 @@
67
  "metadata": {},
68
  "outputs": [],
69
  "source": [
70
- "probe = probe_data['t_bmodel']\n",
71
- "layer_range = probe_data['sep_layer_range']"
72
  ]
73
  },
74
  {
75
  "cell_type": "code",
76
- "execution_count": 5,
77
  "metadata": {},
78
  "outputs": [
79
  {
80
  "data": {
81
  "application/vnd.jupyter.widget-view+json": {
82
- "model_id": "1c0e30b73cab48069e985203c598a9b0",
83
  "version_major": 2,
84
  "version_minor": 0
85
  },
@@ -89,6 +91,13 @@
89
  },
90
  "metadata": {},
91
  "output_type": "display_data"
92
  }
93
  ],
94
  "source": [
@@ -96,14 +105,145 @@
96
  "from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer\n",
97
  "\n",
98
  "model_id = \"meta-llama/Llama-2-7b-chat-hf\"\n",
99
- "model = AutoModelForCausalLM.from_pretrained(model_id, device_map=\"cpu\")\n",
100
  "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
101
  "tokenizer.use_default_system_prompt = False"
102
  ]
103
  },
104
  {
105
  "cell_type": "code",
106
- "execution_count": 8,
107
  "metadata": {},
108
  "outputs": [
109
  {
@@ -113,14 +253,23 @@
113
  "tensor([[ 1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,\n",
114
  " 526, 263, 8444, 20255, 29889, 13, 29966, 829, 14816, 29903,\n",
115
  " 6778, 13, 13, 5816, 338, 278, 7483, 310, 3444, 29973,\n",
116
- " 518, 29914, 25580, 29962]]) torch.Size([1, 34])\n"
117
  ]
118
  },
119
  {
120
- "name": "stderr",
121
- "output_type": "stream",
122
- "text": [
123
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)\n"
124
  ]
125
  }
126
  ],
@@ -161,17 +310,22 @@
161
  "\n",
162
  "generated_text = \"\"\n",
163
  "highlighted_text = \"\"\n",
164
  "\n",
 
165
  "for new_text in streamer:\n",
166
  " print(new_text)\n",
167
  " generated_text += new_text\n",
 
168
  " current_input_ids = tokenizer.encode(generated_text, return_tensors=\"pt\").to(model.device)\n",
169
  " print(current_input_ids, current_input_ids.shape)\n",
170
  " with torch.no_grad():\n",
171
  " outputs = model(current_input_ids, output_hidden_states=True)\n",
172
- " print(outputs)\n",
173
  " hidden = outputs.hidden_states \n",
174
- " print(hidden.shape)\n",
 
175
  " # Stack second last token embeddings from all layers \n",
176
  " # if len(hidden) == 1: # FIX: runtime error for mistral-7b on bioasq\n",
177
  " # sec_last_input = hidden[0]\n",
@@ -179,9 +333,9 @@
179
  " # sec_last_input = hidden[-2]\n",
180
  " # else:\n",
181
  " # sec_last_input = hidden[n_generated - 2]\n",
182
- " # sec_last_token_embedding = torch.stack([layer[:, -1, :].cpu() for layer in sec_last_input])\n",
183
- " # print(sec_last_token_embedding.shape)\n",
184
- " last_hidden_state = outputs.hidden_states[-1][:, -1, :].cpu().numpy()\n",
185
  " print(last_hidden_state.shape) \n",
186
  " # TODO potentially need to only compute uncertainty for the last token in sentence?\n"
187
  ]
@@ -194,8 +348,7 @@
194
  "source": [
195
  "# concat hidden states\n",
196
  "\n",
197
- "\n",
198
- "hidden_states = np.concatenate(np.array(hidden_states)[layer_range], axis=1)\n",
199
  "# predict with probe\n",
200
  "pred = probe.predict(hidden_states)\n",
201
  "print(pred)"
 
67
  "metadata": {},
68
  "outputs": [],
69
  "source": [
70
+ "se_probe = probe_data['t_bmodel']\n",
71
+ "se_layer_range = probe_data['sep_layer_range']\n",
72
+ "acc_probe = probe_data['t_amodel']\n",
73
+ "acc_layer_range = probe_data['ap_layer_range']"
74
  ]
75
  },
76
  {
77
  "cell_type": "code",
78
+ "execution_count": 3,
79
  "metadata": {},
80
  "outputs": [
81
  {
82
  "data": {
83
  "application/vnd.jupyter.widget-view+json": {
84
+ "model_id": "30a1c8e576f6448bb228b4ae9a3a8a48",
85
  "version_major": 2,
86
  "version_minor": 0
87
  },
 
91
  },
92
  "metadata": {},
93
  "output_type": "display_data"
94
+ },
95
+ {
96
+ "name": "stderr",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "Some parameters are on the meta device device because they were offloaded to the disk.\n"
100
+ ]
101
  }
102
  ],
103
  "source": [
 
105
  "from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer\n",
106
  "\n",
107
  "model_id = \"meta-llama/Llama-2-7b-chat-hf\"\n",
108
+ "model = AutoModelForCausalLM.from_pretrained(model_id, device_map=\"auto\")\n",
109
  "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
110
  "tokenizer.use_default_system_prompt = False"
111
  ]
112
  },
113
  {
114
  "cell_type": "code",
115
+ "execution_count": 6,
116
+ "metadata": {},
117
+ "outputs": [
118
+ {
119
+ "name": "stdout",
120
+ "output_type": "stream",
121
+ "text": [
122
+ "Љ ( \"ass\n",
123
+ "ЪЏ\n",
124
+ "հ MO-OC\n",
125
+ "tensor(30488, device='mps:0') Љ 1.0 -0.014414779243550946\n",
126
+ "tensor(313, device='mps:0') ( -0.9998164331881116 0.9597905489862286\n",
127
+ "tensor(376, device='mps:0') \" 0.9999998197256226 -0.9792630307582237\n",
128
+ "tensor(465, device='mps:0') ass -0.9999994897301452 0.9680999957882863\n",
129
+ "tensor(13, device='mps:0') \n",
130
+ " -0.99999964561314 0.9983907264450047\n",
131
+ "tensor(31147, device='mps:0') Ъ 1.0 -0.9999976710226259\n",
132
+ "tensor(30282, device='mps:0') Џ 1.0 0.9999912572082477\n",
133
+ "tensor(13, device='mps:0') \n",
134
+ " 0.9999999999869607 0.9999964462206883\n",
135
+ "tensor(31488, device='mps:0') հ 1.0 -1.0\n",
136
+ "tensor(341, device='mps:0') M 0.9045896738793786 0.5590883316684834\n",
137
+ "tensor(29949, device='mps:0') O -0.9999999803476437 -0.5270551643185932\n",
138
+ "tensor(29899, device='mps:0') - 0.9992488974195408 0.9987826119127319\n",
139
+ "tensor(29949, device='mps:0') O -0.9713693636571169 0.9993573968241007\n",
140
+ "tensor(29907, device='mps:0') C -0.9999999701427968 0.9904799691607524\n",
141
+ " <span style=\"background-color: #FF0000; color: black\">Љ</span> <span style=\"background-color: #00FF00; color: black\">(</span> <span style=\"background-color: #FF0000; color: black\">\"</span> <span style=\"background-color: #00FF00; color: black\">ass</span> <span style=\"background-color: #00FF00; color: black\">\n",
142
+ "</span> <span style=\"background-color: #FF0000; color: black\">Ъ</span> <span style=\"background-color: #FF0000; color: black\">Џ</span> <span style=\"background-color: #FF0000; color: black\">\n",
143
+ "</span> <span style=\"background-color: #FF0000; color: black\">հ</span> <span style=\"background-color: #FF1818; color: black\">M</span> <span style=\"background-color: #00FF00; color: black\">O</span> <span style=\"background-color: #FF0000; color: black\">-</span> <span style=\"background-color: #07FF07; color: black\">O</span> <span style=\"background-color: #00FF00; color: black\">C</span>\n"
144
+ ]
145
+ }
146
+ ],
147
+ "source": [
148
+ "from typing import Tuple\n",
149
+ "\n",
150
+ "MAX_INPUT_TOKEN_LENGTH = 512\n",
151
+ "\n",
152
+ "\n",
153
+ "def generate(\n",
154
+ " message: str,\n",
155
+ " system_prompt: str,\n",
156
+ " max_new_tokens: int = 10,\n",
157
+ " temperature: float = 0.6,\n",
158
+ " top_p: float = 0.9,\n",
159
+ " top_k: int = 50,\n",
160
+ " repetition_penalty: float = 1.2,\n",
161
+ ") -> Tuple[str, str]:\n",
162
+ " conversation = []\n",
163
+ " if system_prompt:\n",
164
+ " conversation.append({\"role\": \"system\", \"content\": system_prompt})\n",
165
+ " conversation.append({\"role\": \"user\", \"content\": message})\n",
166
+ "\n",
167
+ " input_ids = tokenizer.apply_chat_template(conversation, return_tensors=\"pt\")\n",
168
+ " if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:\n",
169
+ " input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]\n",
170
+ " input_ids = input_ids.to(model.device)\n",
171
+ "\n",
172
+ " #### Generate without threading\n",
173
+ " generation_kwargs = dict(\n",
174
+ " input_ids=input_ids,\n",
175
+ " max_new_tokens=max_new_tokens,\n",
176
+ " do_sample=True,\n",
177
+ " top_p=top_p,\n",
178
+ " top_k=top_k,\n",
179
+ " temperature=temperature,\n",
180
+ " repetition_penalty=repetition_penalty,\n",
181
+ " output_hidden_states=True,\n",
182
+ " return_dict_in_generate=True,\n",
183
+ " attention_mask=torch.ones_like(input_ids),\n",
184
+ " )\n",
185
+ " with torch.no_grad():\n",
186
+ " outputs = model.generate(**generation_kwargs)\n",
187
+ " generated_tokens = outputs.sequences[0, input_ids.shape[1]:]\n",
188
+ " generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
189
+ " print(generated_text)\n",
190
+ " # hidden states\n",
191
+ " hidden = outputs.hidden_states # list of tensors, one for each token, then (batch size, sequence length, hidden size)\n",
192
+ "\n",
193
+ " se_highlighted_text = \"\"\n",
194
+ " acc_highlighted_text = \"\"\n",
195
+ "\n",
196
+ " # skip the first hidden state as it is the prompt\n",
197
+ " for i in range(1, len(hidden)):\n",
198
+ "\n",
199
+ " # Semantic Uncertainty Probe\n",
200
+ " token_embeddings = torch.stack([generated_token[0, 0, :].cpu() for generated_token in hidden[i]]).numpy() # (num_layers, hidden_size)\n",
201
+ " se_concat_layers = token_embeddings[se_layer_range[0]:se_layer_range[1]].reshape(-1)\n",
202
+ " se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1\n",
203
+ " \n",
204
+ " # Accuracy Probe\n",
205
+ " acc_concat_layers = token_embeddings[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)\n",
206
+ " acc_probe_pred = (1 - acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][1]) * 2 - 1\n",
207
+ " \n",
208
+ " output_id = outputs.sequences[0, input_ids.shape[1]+i]\n",
209
+ " output_word = tokenizer.decode(output_id)\n",
210
+ " print(output_id, output_word, se_probe_pred, acc_probe_pred) \n",
211
+ "\n",
212
+ " se_new_highlighted_text = highlight_text(output_word, se_probe_pred)\n",
213
+ " acc_new_highlighted_text = highlight_text(output_word, acc_probe_pred)\n",
214
+ " se_highlighted_text += f\" {se_new_highlighted_text}\"\n",
215
+ " acc_highlighted_text += f\" {acc_new_highlighted_text}\"\n",
216
+ " \n",
217
+ " return se_highlighted_text, acc_highlighted_text\n",
218
+ "\n",
219
+ "\n",
220
+ "def highlight_text(text: str, uncertainty_score: float) -> str:\n",
221
+ " if uncertainty_score > 0:\n",
222
+ " html_color = \"#%02X%02X%02X\" % (\n",
223
+ " 255,\n",
224
+ " int(255 * (1 - uncertainty_score)),\n",
225
+ " int(255 * (1 - uncertainty_score)),\n",
226
+ " )\n",
227
+ " else:\n",
228
+ " html_color = \"#%02X%02X%02X\" % (\n",
229
+ " int(255 * (1 + uncertainty_score)),\n",
230
+ " 255,\n",
231
+ " int(255 * (1 + uncertainty_score)),\n",
232
+ " )\n",
233
+ " return '<span style=\"background-color: {}; color: black\">{}</span>'.format(\n",
234
+ " html_color, text\n",
235
+ " )\n",
236
+ "\n",
237
+ "message = \"What is the capital of France?\"\n",
238
+ "system_prompt = \"\"\n",
239
+ "se_highlighted_text, acc_highlighted_text = generate(message, system_prompt)\n",
240
+ "print(se_highlighted_text)\n",
241
+ " "
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 13,
247
  "metadata": {},
248
  "outputs": [
249
  {
 
253
  "tensor([[ 1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,\n",
254
  " 526, 263, 8444, 20255, 29889, 13, 29966, 829, 14816, 29903,\n",
255
  " 6778, 13, 13, 5816, 338, 278, 7483, 310, 3444, 29973,\n",
256
+ " 518, 29914, 25580, 29962]]) torch.Size([1, 34])\n",
257
+ "\n",
258
+ " \n"
259
  ]
260
  },
261
  {
262
+ "ename": "KeyboardInterrupt",
263
+ "evalue": "",
264
+ "output_type": "error",
265
+ "traceback": [
266
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
267
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
268
+ "Cell \u001b[0;32mIn[13], line 37\u001b[0m\n\u001b[1;32m 35\u001b[0m generated_text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 36\u001b[0m highlighted_text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 37\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m output \u001b[38;5;129;01min\u001b[39;00m streamer:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28mprint\u001b[39m(output)\n\u001b[1;32m 39\u001b[0m generated_text \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m output\n",
269
+ "File \u001b[0;32m~/anaconda3/envs/llm-test/lib/python3.11/site-packages/transformers/generation/streamers.py:223\u001b[0m, in \u001b[0;36mTextIteratorStreamer.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__next__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 223\u001b[0m value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtext_queue\u001b[38;5;241m.\u001b[39mget(timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout)\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m value \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstop_signal:\n\u001b[1;32m 225\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m()\n",
270
+ "File \u001b[0;32m~/anaconda3/envs/llm-test/lib/python3.11/queue.py:180\u001b[0m, in \u001b[0;36mQueue.get\u001b[0;34m(self, block, timeout)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m remaining \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.0\u001b[39m:\n\u001b[1;32m 179\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Empty\n\u001b[0;32m--> 180\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnot_empty\u001b[38;5;241m.\u001b[39mwait(remaining)\n\u001b[1;32m 181\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get()\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnot_full\u001b[38;5;241m.\u001b[39mnotify()\n",
271
+ "File \u001b[0;32m~/anaconda3/envs/llm-test/lib/python3.11/threading.py:324\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 323\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 324\u001b[0m gotit \u001b[38;5;241m=\u001b[39m waiter\u001b[38;5;241m.\u001b[39macquire(\u001b[38;5;28;01mTrue\u001b[39;00m, timeout)\n\u001b[1;32m 325\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 326\u001b[0m gotit \u001b[38;5;241m=\u001b[39m waiter\u001b[38;5;241m.\u001b[39macquire(\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
272
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
273
  ]
274
  }
275
  ],
 
310
  "\n",
311
  "generated_text = \"\"\n",
312
  "highlighted_text = \"\"\n",
313
+ "for output in streamer:\n",
314
+ " print(output)\n",
315
+ " generated_text += output\n",
316
  "\n",
317
+ " # yield generated_text\n",
318
  "for new_text in streamer:\n",
319
  " print(new_text)\n",
320
  " generated_text += new_text\n",
321
+ " print(generated_text)\n",
322
  " current_input_ids = tokenizer.encode(generated_text, return_tensors=\"pt\").to(model.device)\n",
323
  " print(current_input_ids, current_input_ids.shape)\n",
324
  " with torch.no_grad():\n",
325
  " outputs = model(current_input_ids, output_hidden_states=True)\n",
 
326
  " hidden = outputs.hidden_states \n",
327
+ " print(len(hidden))\n",
328
+ " print(hidden[-1].shape)\n",
329
  " # Stack second last token embeddings from all layers \n",
330
  " # if len(hidden) == 1: # FIX: runtime error for mistral-7b on bioasq\n",
331
  " # sec_last_input = hidden[0]\n",
 
333
  " # sec_last_input = hidden[-2]\n",
334
  " # else:\n",
335
  " # sec_last_input = hidden[n_generated - 2]\n",
336
+ " sec_last_token_embedding = torch.stack([layer[:, -1, :].cpu() for layer in hidden])\n",
337
+ " print(sec_last_token_embedding.shape)\n",
338
+ " last_hidden_state = hidden[-1][:, -1, :].cpu().numpy()\n",
339
  " print(last_hidden_state.shape) \n",
340
  " # TODO potentially need to only compute uncertainty for the last token in sentence?\n"
341
  ]
 
348
  "source": [
349
  "# concat hidden states\n",
350
  "\n",
351
+ "sec_last_token_embedding = np.concatenate(sec_last_token_embedding.cpu().numpy()[layer_range], axis=1)\n",
 
352
  "# predict with probe\n",
353
  "pred = probe.predict(hidden_states)\n",
354
  "print(pred)"