File size: 6,991 Bytes
9de012e
 
 
0cfc205
9de012e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cfc205
9de012e
 
 
 
 
 
 
 
 
 
 
 
0cfc205
9de012e
 
 
 
 
 
 
 
 
 
 
 
 
 
0cfc205
9de012e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f046265
 
 
 
 
 
 
 
 
 
9de012e
 
 
 
 
 
 
 
31cc188
 
 
 
 
 
 
9de012e
31cc188
 
9de012e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55cc62e
9de012e
 
31cc188
9de012e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import gradio as gr
import os
import re
import spaces

from leo.inference import inference

# Directory holding one .glb mesh per selectable scene.
MESH_DIR = 'assets/mesh'
# Scene names for the dropdown: mesh filenames without extension, sorted.
MESH_NAMES = sorted([os.path.splitext(fname)[0] for fname in os.listdir(MESH_DIR)])
# Fixed number of step/mask output slots rendered in the UI; shorter plans
# are padded up to this count, longer ones are truncated on display.
STEP_COUNTS = 6

def change_scene(dropdown_scene: str) -> str:
    """Map the selected scene name to its mesh file path.

    Fired on dropdown change; the returned .glb path reloads the
    Model3D viewer with the newly chosen scene.
    """
    return os.path.join(MESH_DIR, dropdown_scene + '.glb')

with gr.Blocks(title='LEO Demo') as demo:
    # Page header.
    gr.HTML(value="<h1 align='center'>Task-oriented Sequential Grounding in 3D Scenes </h1>")

    with gr.Row():
        with gr.Column(scale=5):
            # Scene selector; choices are derived from the .glb files in MESH_DIR.
            dropdown_scene = gr.Dropdown(
                choices=MESH_NAMES,
                value='scene0050_00',
                interactive=True,
                label='Select a 3D scene',
            )
            # Main 3D viewer, initialized to the default scene.
            # NOTE(review): the f-string below has no placeholder — a plain
            # string literal would be equivalent.
            model_3d = gr.Model3D(
                value=os.path.join(MESH_DIR, f'scene0050_00.glb'),
                clear_color=[0.0, 0.0, 0.0, 0.0],
                label='3D Scene',
                camera_position=(80, 100, 6),
                height=659,
            )
            # Usage hint shown under the viewer.
            gr.HTML(
                """<center><strong>
                👆 SCROLL and DRAG on the 3D Scene
                to zoom in/out and rotate. Press CTRL and DRAG to pan.
                </strong></center>
                """
            )
    
    # Reload the viewer whenever a different scene is picked.
    dropdown_scene.change(
        fn=change_scene,
        inputs=[dropdown_scene],
        outputs=[model_3d],
        queue=False
    )

    # LEO task-to-plan inference wrapper
    @spaces.GPU
    def leo_task_to_plan(task_description):
        """Generate a step-by-step plan for `task_description`.

        The scene is hard-coded to scene0050_00 (the demo scene). Returns
        the raw predicted plan text — numbered steps in a single string.
        """
        task_input = {
            "task_description": task_description,
            "scan_id": "scene0050_00"
        }
        # predict_mode=True: generate plan text (contrast with
        # leo_plan_to_masks, which uses predict_mode=False for grounding).
        plan = inference("scene0050_00", task_input, predict_mode=True)
        # Removed dead commented-out step-splitting code; the same parsing
        # lives in leo_task_to_plan_and_masks where it is actually used.
        return plan[0]['pred_plan_text']

    # LEO ground inference wrapper
    @spaces.GPU
    def leo_plan_to_masks(task_description, *action_steps):
        """Ground each non-empty action step to a target-object mask mesh.

        Returns exactly STEP_COUNTS .glb paths so the fixed set of Model3D
        outputs is always fully populated; slots beyond the predicted masks
        are filled with an empty placeholder mesh.
        """
        formatted_steps = []
        for step in action_steps:
            if step != "":
                formatted_steps.append(
                    {"action": step, "target_id": "unknown", "label": "unknown"}
                )
        task_input = {
            "task_description": task_description,
            "action_steps": formatted_steps,
            "scan_id": "scene0050_00"
        }
        # predict_mode=False: grounding — returns one object-id tensor per step.
        predictions = inference("scene0050_00", task_input, predict_mode=False)
        object_ids = [t.item() for t in predictions]
        mask_paths = [
            f"assets/mask/scene0050_00/scene0050_00_obj_{obj}.glb"
            for obj in object_ids
        ]
        padding = ["assets/mask/scene0050_00/scene0050_00_obj_empty.glb"] * (STEP_COUNTS - len(object_ids))
        return mask_paths + padding

    # LEO task-to-plan and ground inference wrapper
    @spaces.GPU
    def leo_task_to_plan_and_masks(task_description):
        """Generate a plan and a grounding mask for every step.

        Returns a flat list interleaving STEP_COUNTS step texts with
        STEP_COUNTS mask mesh paths: [step1, mask1, step2, mask2, ...],
        matching the alternating Textbox/Model3D outputs of the tab.
        """
        result = inference(
            "scene0050_00",
            {"task_description": task_description, "scan_id": "scene0050_00"},
            predict_mode=True,
        )
        plan_text = result[0]['pred_plan_text']

        # Split the plan on its "1." / "2." markers; with one capture group
        # re.split alternates text/marker, so pair each marker back with the
        # step text that follows it.
        pieces = re.split(r'(\d+\.)', plan_text)[1:]
        steps = [marker + text.rstrip() for marker, text in zip(pieces[::2], pieces[1::2])]
        steps += ["### PLANNING HAS ENDED, SEE ABOVE FOR DETAILS ###"] * (STEP_COUNTS - len(steps))

        object_ids = result[0]['predict_object_id']
        mask_paths = [f"assets/mask/scene0050_00/scene0050_00_obj_{obj}.glb" for obj in object_ids]
        # Pad with the empty placeholder mesh so the output count is fixed.
        mask_paths += ["assets/mask/scene0050_00/scene0050_00_obj_empty.glb"] * (STEP_COUNTS - len(mask_paths))

        interleaved = []
        for step_text, mask_path in zip(steps[:STEP_COUNTS], mask_paths[:STEP_COUNTS]):
            interleaved.append(step_text)
            interleaved.append(mask_path)
        return interleaved
    
    # Disabled tab: plan text only, no masks (kept for reference).
    # with gr.Tab("LEO Task-to-Plan"):
    #     gr.Interface(
    #         fn=leo_task_to_plan,
    #         inputs=[gr.Textbox(label="Task Description")],
    #         outputs=["text"],
    #         examples=[
    #             ["Freshen up in the bathroom."]
    #         ],
    #         title="LEO Task-to-Plan: Input task, Output plan text"
    #     )

    # Tab 1: the user supplies the plan steps; the model only grounds each
    # step to a target-object mask in the scene.
    with gr.Tab("LEO Plan-to-Masks"):
        gr.Interface(
            fn=leo_plan_to_masks,
            inputs=[gr.Textbox(label="Task Description")] + [gr.Textbox(label=f"Action Step {i+1}") for i in range(STEP_COUNTS)],
            outputs=[gr.Model3D(
                clear_color=[0.0, 0.0, 0.0, 0.0], camera_position=(80, 100, 6), label=f"3D Model for Step {i+1} (if the step exists)") for i in range(STEP_COUNTS)],
            examples=[
                [
                    "Start Working at the desk.", 
                    "1. Walk to the desk.", 
                    "2. Sit on the brown leather sofa chair in front of the desk.", 
                    "3. Turn on the opened laptop in front of you on the desk.", 
                    "4. Grab the cup beside the laptop to drink."
                    ] + [""] * (STEP_COUNTS - 4)
            ],
            title="LEO Plan-to-Masks: Input plan, Output 3D Masks for each step, Red denotes predicted target object",
            description="Please input a task description and action steps. Examples can be found at the bottom of the interface."
        )

    # Tab 2: end-to-end — the model both generates the plan and grounds it.
    # Outputs alternate a step Textbox with its mask Model3D, matching the
    # flat [step, mask, step, mask, ...] list the handler returns.
    with gr.Tab("LEO Task-to-Plan and Masks"):
        gr.Interface(
            fn=leo_task_to_plan_and_masks,
            inputs=[gr.Textbox(label="Task Description")],
            outputs=[
                item for sublist in zip(
                    [gr.Textbox(label=f"Action Step {i+1}") for i in range(STEP_COUNTS)],
                    [gr.Model3D(
                        clear_color=[0.0, 0.0, 0.0, 0.0],
                        camera_position=(80, 100, 6),
                        label=f"3D Model for Step {i+1} (if the step exists)"
                    ) for i in range(STEP_COUNTS)]
                ) for item in sublist
            ],
            examples=[
                ["Start Working at the desk."]
            ],
            title="LEO Task-to-Plan and Masks: Input task, Output plan text and 3D Masks for each step, Red denotes predicted target object",
            description="Please input a task description. Examples can be found at the bottom of the interface."
            # Disabled JS that would hide padded/empty step slots client-side.
            # js="""
            # function() {
            #     const stepCounts = """ + str(STEP_COUNTS) + """;
            #     const stepElems = document.querySelectorAll('.output_interface .textbox_output');
            #     const modelElems = document.querySelectorAll('.output_interface .model3d_output');
            #     for (let i = 0; i < stepCounts; i++) {
            #         if (stepElems[i].value === '### PLANNING HAS ENDED, SEE ABOVE FOR DETAILS ###' || modelElems[i].src.includes('scene0050_00_obj_empty.glb')) {
            #             stepElems[i].style.display = 'none';
            #             modelElems[i].style.display = 'none';
            #         }
            #     }
            # }
            # """
        )

# Serve the demo; allowed_paths lets gradio read mesh/mask files from assets/.
demo.queue().launch(share=True, allowed_paths=['assets'])