File size: 8,677 Bytes
d2dd1cd
 
 
 
 
 
 
 
 
6dcf9e0
fc1d077
6dcf9e0
d2dd1cd
e584282
1675c32
fc1d077
1675c32
fc1d077
1675c32
b74e139
e584282
 
6cfd4ab
 
 
097f543
7a54de6
 
 
e584282
 
d2dd1cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1820b1
 
891cfab
7d26797
 
8079dae
7d26797
8079dae
7d26797
d2dd1cd
 
 
 
 
 
d1820b1
 
 
 
 
 
d2dd1cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65516a6
d2dd1cd
65516a6
 
d2dd1cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1820b1
d2dd1cd
 
 
6dcf9e0
 
 
 
 
 
 
 
 
 
 
 
 
 
d2dd1cd
 
 
 
 
d1820b1
d2dd1cd
 
 
 
 
3a8c535
f4c8778
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#!/usr/bin/env python

from __future__ import annotations

import os

import gradio as gr

from inference_followyourpose import merge_config_then_run
import sys

sys.path.append('FollowYourPose')


# Unpack the bundled example videos into ./data before the UI is built.
# Extraction uses the standard-library zipfile module instead of spawning an
# external `unzip` process: the exit status of that process was never checked,
# so a failed extraction only showed up later as a FileNotFoundError from the
# directory listings below. zipfile raises at the point of failure and does
# not depend on an `unzip` binary being installed.
import subprocess  # kept: pre-existing file-level import
import zipfile

zip_file = './example_video.zip'
output_dir = './data'
with zipfile.ZipFile(zip_file) as archive:
    archive.extractall(output_dir)

# Startup diagnostics: print the working directory and the extracted layout.
# The last listing raises if ./data/example_video was not produced above.
current_dir = os.getcwd()
print("path is :", current_dir)
print("current_dir is :", os.listdir(current_dir))
print("dir is :", os.listdir(os.path.join(current_dir, 'data')))
print("data/example_video is :", os.listdir(os.path.join(current_dir, 'data', 'example_video')))

# Read from the environment; not referenced again in this script.
HF_TOKEN = os.getenv('HF_TOKEN')
# Object whose `run` method is used as the callback for the examples, the
# prompt submit event and the Generate button.
pipe = merge_config_then_run()



with gr.Blocks(css='style.css') as demo:
    # Page banner: title, author list, links (arXiv, code, homepage) and a
    # one-line summary. The title heading uses a plain <br> between its two
    # lines; centering is provided by the enclosing <div> (text-align: center),
    # so no <center>/<font> tags are needed inside the <h1>.
    gr.HTML(
    """
    <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
    <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
        🕺🕺🕺 Follow Your Pose 💃💃💃 <br> Pose-Guided Text-to-Video Generation using Pose-Free Videos
    </h1>
    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <a href="https://mayuelala.github.io/">Yue Ma*</a>
            <a href="https://github.com/YingqingHe">Yingqing He*</a> , <a href="http://vinthony.github.io/">Xiaodong Cun</a>, 
            <a href="https://xinntao.github.io/"> Xintao Wang </a>,
            <a href="https://scholar.google.com/citations?user=4oXBp9UAAAAJ&hl=zh-CN">Ying Shan</a>,
            <a href="https://scholar.google.com/citations?user=Xrh1OIUAAAAJ&hl=zh-CN">Xiu Li</a>,
            <a href="http://cqf.io">Qifeng Chen</a>
    </h2>

    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                  <span class="link-block">
                    [<a href="https://arxiv.org/abs/2304.01186" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>]
                </span>

                  <!-- Github link -->
                  <span class="link-block">
                    [<a href="https://github.com/mayuelala/FollowYourPose" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>]
                </span>

                <!-- Github link -->
                  <span class="link-block">
                    [<a href="https://follow-your-pose.github.io/" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Homepage</span>
                  </a>]
                </span>
    </h2>
    <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        TL;DR: We tune 2D stable-diffusion to generate the character videos from pose and text description.
    </h2>
    </div>
    """)


    # Usage notes shown under the banner, followed by a "Duplicate Space" badge
    # that links to the Space's duplicate URL.
    gr.HTML("""
    <p>In order to run the demo successfully, we recommend the length of video is about <b>3~5 seconds</b>.
    The temporal crop offset and sampling stride are used to adjust the starting point and interval of video samples.
    Due to the GPU limit of this demo, it currently generates 8-frame videos. For generating longer videos (e.g. 32 frames) shown on our webpage, we recommend trying our GitHub <a href=https://github.com/mayuelala/FollowYourPose> code  </a> on your own GPU.
    </p>
    <p>You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue.</p>
    <br/>
    <a href="https://huggingface.co/spaces/YueMafighting/FollowYourPose?duplicate=true">
    <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
    """)

    with gr.Row():
        # Left column: source video, sampling/cropping controls, prompt and
        # the Generate button.
        with gr.Column():
            with gr.Accordion('Input Video', open=True):
                # NOTE(review): `type='numpy'` does not look like a documented
                # gr.Video argument - confirm it has any effect.
                user_input_video = gr.Video(
                    label='Input Source Video',
                    source='upload',
                    type='numpy',
                    format="mp4",
                    visible=True,
                ).style(height="auto")
                video_type = gr.Dropdown(
                    label='The type of input video',
                    choices=[
                        "Raw Video",
                        "Skeleton Video",
                    ],
                    value="Raw Video",
                )
                with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
                    # Both sliders start at 1 rather than 0: a clip of zero
                    # frames or a sampling interval of zero does not describe
                    # a usable frame sequence.
                    # NOTE(review): confirm pipe.run does not treat 0 as a
                    # sentinel value for either setting.
                    n_sample_frame = gr.Slider(
                        label='Number of Frames',
                        minimum=1,
                        maximum=32,
                        step=1,
                        value=8,
                    )
                    stride = gr.Slider(
                        label='Temporal stride',
                        minimum=1,
                        maximum=20,
                        step=1,
                        value=1,
                    )

                with gr.Accordion('Spatial Crop offset', open=False):
                    # Integer-valued crop amounts (precision=0), default 0.
                    left_crop = gr.Number(label='Left crop', value=0, precision=0)
                    right_crop = gr.Number(label='Right crop', value=0, precision=0)
                    top_crop = gr.Number(label='Top crop', value=0, precision=0)
                    bottom_crop = gr.Number(label='Bottom crop', value=0, precision=0)
                    offset_list = [
                        left_crop,
                        right_crop,
                        top_crop,
                        bottom_crop,
                    ]

                # Ordered as: frame count, stride, then left/right/top/bottom
                # crop. This list is unpacked at the end of the inputs handed
                # to pipe.run, so the order here is part of that call contract.
                ImageSequenceDataset_list = [
                    n_sample_frame,
                    stride,
                ] + offset_list

            with gr.Accordion('Text Prompt', open=True):
                target_prompt = gr.Textbox(
                    label='Target Prompt',
                    info='The simple background may achieve better results(e.g., "beach", "moon" prompt is better than "street" and "market")',
                    max_lines=1,
                    placeholder='Example: "Iron man on the beach"',
                    value='Iron man on the beach',
                )

            run_button = gr.Button('Generate')

        # Right column: generated video and the sampler settings.
        with gr.Column():
            result = gr.Video(label='Result')
            with gr.Accordion('DDIM Parameters', open=True):
                # Starts at 1: a schedule with zero denoising steps cannot
                # produce a sample.
                # NOTE(review): confirm against the sampler used by pipe.run.
                num_steps = gr.Slider(
                    label='Number of Steps',
                    info='larger value has better editing capacity, but takes more time and memory.',
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=50,
                )
                guidance_scale = gr.Slider(
                    label='CFG Scale',
                    minimum=0,
                    maximum=50,
                    step=0.1,
                    value=12.0,
                )
    # Components passed to pipe.run, in positional order. The same list backs
    # the example gallery, the prompt's submit event and the Generate button,
    # so it is built once and shared.
    inputs = [
        user_input_video,
        target_prompt,
        num_steps,
        guidance_scale,
        video_type,
    ]
    inputs.extend(ImageSequenceDataset_list)

    with gr.Row():
        from example import style_example
        examples = style_example

        # cache_examples=True is passed together with fn/outputs so that
        # Gradio can store each example's result via pipe.run.
        gr.Examples(
            examples=examples,
            inputs=inputs,
            outputs=result,
            fn=pipe.run,
            cache_examples=True,
        )

    # Pressing Enter in the prompt box and clicking Generate both run the
    # pipeline with identical inputs and write to the same output video.
    for register_listener in (target_prompt.submit, run_button.click):
        register_listener(fn=pipe.run, inputs=inputs, outputs=result)

# Enable request queuing, then start the app with default launch settings.
# For a self-hosted deployment, launch() can instead be given e.g.
# share=False, server_name='0.0.0.0', server_port=80.
queued_demo = demo.queue()
queued_demo.launch()