momergul committed
Commit 14eba99
1 Parent(s): 554adbb
Update

Files changed:
- app.py +153 -89
- joint_inference.py +1 -6
- models.py +2 -2
app.py CHANGED

@@ -21,9 +21,9 @@ css="""
 """
 
 def initialize_game() -> List[List[str]]:
-    context_dicts = [generate_complete_game() for _ in range(
+    context_dicts = [generate_complete_game() for _ in range(4)]
 
-    roles = ["speaker"] * 3 + ["listener"] * 3
+    roles = ["listener"] * 3 + ["speaker"] * 3 + ["listener"] * 3 + ["speaker"] * 3
     speaker_images = []
     listener_images = []
     targets = []

@@ -71,6 +71,7 @@ def get_model_response(
 @spaces.GPU(duration=20)
 def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths, processor, img_dir, index_to_token, adapter_name):
     model.model.set_adapter(adapter_name)
+    print(adapter_name)
     model = model.cuda()
     with torch.no_grad():
         captions, _, _, _, _ = model.generate(

@@ -85,6 +86,7 @@ def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask
 def get_listener_response(model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
                           s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths, adapter_name):
     model.model.set_adapter(adapter_name)
+    print(adapter_name)
     model = model.cuda()
     with torch.no_grad():
         _, _, joint_log_probs = model.comprehension_side([

@@ -95,71 +97,118 @@ def get_listener_response(model, images, l_input_tokens, l_attn_mask, l_image_at
     response = image_paths[target_idx]
     return response
 
-def …
-    … (removed function body not shown in the diff view)
+def initialize_interaction(model_iteration):
+    # initialize the overall history
+    new_history = {
+        'adapter_name' : 'initial' if model_iteration == "Initial System" else "final",
+        'image_role_pairs' : initialize_game(),
+        'conversation' : [],
+        'turn' : 0,
+        'num_correct' : 0,
+    }
+
+    # Initialize the first turn (always a listener)
+    turn = new_history['turn']
+    image_role_pairs = new_history['image_role_pairs']
+    speaker_image, listener_image, target_image, _ = image_role_pairs[turn]
+    target_idx = speaker_image.index(target_image)
+    new_history['conversation'].extend([
+        f"TURN: {turn + 1}/12",
+        f"Generate a description for the target image. Your target is Image {target_idx + 1}"
+    ])
+
+    return new_history
+
+def progress_game(user_message, model, processor, index_to_token, current_state):
+    # First get the game state
+    turn = current_state['turn']
+    image_role_pairs = current_state['image_role_pairs']
+    speaker_image, listener_image, target_image, model_role = image_role_pairs[turn]
+    human_role = "Speaker" if model_role == "listener" else "Listener"
+
+    # Next, move on with current turn
+    if model_role == "listener":
+        human_context = speaker_image
+        model_context = listener_image
+
+        # If model is a listener, the human must have sent a message
+        current_state['conversation'].append(f"You: {user_message}")
+        model_message = get_model_response(
+            model, current_state['adapter_name'], processor, index_to_token, model_role,
+            model_context, user_message=user_message
+        )
+        model_idx = human_context.index(model_message)
+        target_idx = human_context.index(target_image)
+
+        if int(model_idx) == int(target_idx):
+            current_state['conversation'].append("The model guessed correctly!\n")
+            current_state['num_correct'] += 1
+        else:
+            current_state['conversation'].append(f"The model guessed incorrectly.\n")
+    else:
+        human_context = listener_image
+        model_context = speaker_image
+
+        # If model is a speaker, the human must have made a guess
+        target_idx = human_context.index(target_image)
+        current_state['conversation'][-1] += f"{user_message}"
+        if int(user_message) == target_idx + 1:
+            current_state['conversation'].append("Correct!\n")
+            current_state['num_correct'] += 1
+        else:
+            current_state['conversation'].append(f"Incorrect!\n")
+
+    # We move on to the next turn
+    current_state['turn'] += 1
+    acc_message = f"{current_state['num_correct']}/{current_state['turn']}"
+    turn_message = f"{current_state['turn'] + 1}/12"
+    if current_state['turn'] == len(image_role_pairs):
+        current_state['conversation'].append('The game is over!')
+        return human_context, current_state['conversation'], human_role, turn_message, acc_message, {}
+
+    speaker_image, listener_image, target_image, model_role = image_role_pairs[current_state['turn']]
+    human_role = "Listener" if model_role == "speaker" else "Speaker"
+    if model_role == "speaker":
+        human_context = listener_image
+        model_context = speaker_image
+
+        current_state['conversation'].extend([
+            f"TURN: {current_state['turn'] + 1}/12",
+            f"Guess the target image given the speaker's description. ",
+        ])
+        model_message = get_model_response(model, current_state['adapter_name'], processor, index_to_token,
+                                           model_role, model_context, target_image=target_image)
+        current_state['conversation'].append(f"Model: {model_message}")
+        current_state['conversation'].append("You: The target is Image ")
+    else:
+        human_context = speaker_image
+        model_context = listener_image
+        target_idx = human_context.index(target_image)
 
+        current_state['conversation'].extend([
+            f"TURN: {current_state['turn'] + 1}/12",
+            f"Generate a description for the target image. Your target is Image {target_idx + 1}",
+        ])
+
+    return human_context, current_state['conversation'], human_role, turn_message, acc_message, current_state
+
+def get_current_images(current_history):
+    turn = current_history['turn']
+    image_role_pairs = current_history['image_role_pairs']
+    speaker_image, listener_image, target_image, model_role = image_role_pairs[turn]
+    human_context = listener_image if model_role == "speaker" else speaker_image
+    return human_context
+
+def get_human_role(current_history):
+    turn = current_history['turn']
+    image_role_pairs = current_history['image_role_pairs']
+    speaker_image, listener_image, target_image, model_role = image_role_pairs[turn]
+    return "Listener" if model_role == "speaker" else "Speaker"
 
 def create_app():
     with gr.Blocks(css=css) as app:
+        game_history = gr.State(value={})
+
         gr.Markdown("# Tangram Reference Game")
         gr.Markdown(
             '### You will be playing a sequence of reference games against a model. To start a game, first select whether ' +\

@@ -207,51 +256,66 @@ def create_app():
             interactive=False,
         )
 
-        send_btn = gr.Button("Send")
-
-        interaction_generator = None
+        send_btn = gr.Button("Send", interactive=False)
         model = get_model()
         processor = get_processor()
         index_to_token = get_index_to_token()
 
-        print("Heyo!")
         def start_interaction(model_iteration):
+            # Initialize the interaction
             if model_iteration is None:
                 return [], "Please select a model iteration.", "", "", "", gr.update(interactive=False), \
-                    gr.update(interactive=False), gr.update(interactive=False)
+                    gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=True), {}
+            current_history = initialize_interaction(model_iteration)
+
+            # Unpack the relevant items
+            images = get_current_images(current_history)
+            conversation = current_history["conversation"]
+            role = get_human_role(current_history)
+            human_listener = role == "Listener"
+
+            current_turn = current_history['turn'] + 1
+            turn_msg = f"{current_turn}/12"
+            acc_msg = "0/0"
+            return [(f"tangram_pngs/{img}", f"Image {i+1}") for i, img in enumerate(images)], "\n".join(conversation), role, turn_msg, acc_msg, \
+                gr.update(interactive=not human_listener), gr.update(interactive=human_listener), gr.update(interactive=True), gr.update(interactive=False), current_history
 
+        def send_message(message, radio_choice, current_state):
             nonlocal model
             nonlocal processor
             nonlocal index_to_token
-            interaction_generator = interaction(model, processor, index_to_token, model_iteration)
-            images, conversation, role, turn, acc_message = next(interaction_generator)
-            human_listener = role == "Listener"
-            return [(f"tangram_pngs/{img}", f"Image {i+1}") for i, img in enumerate(images)], "\n".join(conversation), role, turn, acc_message, \
-                gr.update(interactive=not human_listener), gr.update(interactive=human_listener), gr.update(interactive=True)
 
-            … (removed lines not shown in the diff view)
-            return [], conversation_output.value, current_role.value, current_turn.value, accuracy.value, gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
+            # Game ended
+            if current_state['turn'] == len(current_state['image_role_pairs']):
+                return [], conversation_output.value, current_role.value, current_turn.value, accuracy.value, gr.update(interactive=False), \
+                    gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=True, value=None), {}
+
+            # Regular game progress
+            user_output = message if radio_choice is None else radio_choice
+            images, conversation, role, turn, acc_message, current_state = progress_game(user_output, model, processor, index_to_token, current_state)
+            human_listener = role == "Listener"
+            return [(f"tangram_pngs/{img}", f"Image {i+1}") for i, img in enumerate(images)], "\n".join(conversation), role, turn, \
+                acc_message, gr.update(interactive=not human_listener, value=""), gr.update(interactive=human_listener, value=None), \
+                gr.update(interactive=True), gr.update(interactive=False), current_state
 
         start_btn.click(
             start_interaction,
             inputs=[model_iteration],
-            outputs=[
-                …
+            outputs=[
+                image_output, conversation_output, current_role, current_turn, accuracy,
+                user_input, radio_buttons, send_btn, model_iteration, game_history],
+            queue=False
+        )
+        send_btn.click(
+            send_message,
+            inputs=[user_input, radio_buttons, game_history],
+            outputs=[image_output, conversation_output, current_role, current_turn, accuracy, user_input,
+                     radio_buttons, send_btn, model_iteration, game_history],
+            queue=True
         )
-        send_btn.click(send_message, inputs=[user_input, radio_buttons], outputs=[image_output, conversation_output, current_role, current_turn, accuracy, user_input, radio_buttons, send_btn])
 
     return app
 
 app = create_app()
+app.queue()
 app.launch()
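The substance of the app.py change: the old generator-based interaction loop (interaction_generator, driven by next()) is replaced by an explicit game_history dict held in gr.State, so each browser session carries its own game instead of sharing module-level state, and the Send button is gated with gr.update(interactive=...). Below is a minimal sketch of that gr.State pattern; the component and function names here are illustrative, not the app's own.

# Hedged sketch of per-session state in Gradio, the pattern app.py adopts.
# gr.State is passed in as an input and returned as an output, so every
# click reads and writes this session's copy, not a shared module variable.
import gradio as gr

def start(state):
    state = {"turn": 0, "num_correct": 0}   # fresh dict for this session
    return f"Turn {state['turn'] + 1}", state

def step(guess, state):
    state["turn"] += 1                      # advance only this session's game
    return f"Turn {state['turn'] + 1}", state

with gr.Blocks() as demo:
    state = gr.State(value={})              # one value per connected session
    out = gr.Textbox(label="status")
    guess = gr.Textbox(label="guess")
    gr.Button("Start").click(start, inputs=[state], outputs=[out, state])
    gr.Button("Send").click(step, inputs=[guess, state], outputs=[out, state])

demo.queue()   # mirrors the added app.queue(): requests line up for GPU work
demo.launch()

Because the state component appears in both inputs and outputs of each click handler, concurrent players can no longer overwrite each other's game, which the old interaction_generator module variable allowed.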
joint_inference.py CHANGED

@@ -346,7 +346,6 @@ class IdeficsJointInferenceModel(nn.Module):
         speaker = self.get_speaker()
         generation_config = GenerationConfig(
             max_new_tokens=max_steps,
-            min_new_tokens=1,
             do_sample=True,
             temperature=temperature,
             top_k=top_k, top_p=top_p,

@@ -429,6 +428,7 @@ class IdeficsJointInferenceModel(nn.Module):
         speaker = self.get_speaker()
         generation_config = GenerationConfig(
             max_new_tokens=max_steps,
+            min_new_tokens=1,
             do_sample=True,
             temperature=temperature,
             top_k=top_k, top_p=top_p,

@@ -438,11 +438,6 @@ class IdeficsJointInferenceModel(nn.Module):
             return_dict_in_generate=True
         )
 
-        print(torch.any(torch.isnan(s_input_tokens)))
-        print(torch.any(torch.isnan(s_attn_mask)))
-        print(torch.any(torch.isnan(images)))
-        print(torch.any(torch.isnan(s_image_attn_mask)))
-
         outputs = speaker.generate(
             input_ids=s_input_tokens,
             attention_mask=s_attn_mask,
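The joint_inference.py change is a relocation plus cleanup: min_new_tokens=1 moves from the generation config around line 348 to the one around line 431, and four NaN-probe prints are deleted. In transformers, min_new_tokens suppresses the EOS token for the first N generated steps, so generate() cannot return an empty caption. A sketch of the resulting config follows; every value except min_new_tokens is a placeholder.

# Sketch: GenerationConfig as used in the edited generation path.
# min_new_tokens=1 blocks EOS on the first step, guaranteeing at least
# one generated token.
from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_new_tokens=20,           # placeholder for max_steps
    min_new_tokens=1,            # the flag this commit relocates
    do_sample=True,
    temperature=0.7,             # placeholder
    top_k=50, top_p=0.95,        # placeholders
    return_dict_in_generate=True,
)
# outputs = speaker.generate(input_ids=s_input_tokens,
#                            attention_mask=s_attn_mask,
#                            generation_config=generation_config)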
models.py CHANGED

@@ -11,7 +11,7 @@ def get_model():
     # Initialize the model
     repo = 'lil-lab/cogen'
     checkpoint = "HuggingFaceM4/idefics2-8b"
-    model = Idefics2ForConditionalGeneration.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
+    model = Idefics2ForConditionalGeneration.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
 
     # Add LoRA adapters
     target_modules=r'(.*(vision_model|modality_projection|perceiver_resampler).*(out_proj|fc1|fc2|down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)|(.*(k_proj|q_proj|v_proj).*$)'

@@ -39,7 +39,7 @@ def get_model():
     )
     model.add_adapter('final', lora_config)
     model.load_adapter(repo, "final", revision="r3_full")
-    model = IdeficsJointInferenceModel(0.5, 0, model=model)
+    model = IdeficsJointInferenceModel(0.5, 0, model=model)
     model.eval()
 
     return model
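For context, the surrounding get_model() builds one bfloat16 Idefics2 base and attaches named LoRA adapters ('initial' and 'final'); app.py then selects one per request via model.model.set_adapter(adapter_name). Below is a minimal sketch of that multi-adapter flow, mirroring the call order above; the LoraConfig values are placeholders, and add_adapter/load_adapter argument orders vary across transformers/peft versions.

# Sketch of the multi-adapter setup: one frozen base model, named LoRA
# adapters loaded from the same hub repo, switched at inference time.
import torch
from peft import LoraConfig
from transformers import Idefics2ForConditionalGeneration

repo = "lil-lab/cogen"
checkpoint = "HuggingFaceM4/idefics2-8b"
model = Idefics2ForConditionalGeneration.from_pretrained(
    checkpoint, torch_dtype=torch.bfloat16
)

# Placeholder hyperparameters; the real config targets the regex above.
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"])

model.add_adapter("final", lora_config)                # register adapter slots
model.load_adapter(repo, "final", revision="r3_full")  # pull trained weights

model.set_adapter("final")  # what get_speaker_response/get_listener_response do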