Commit: video not really work that well

Files changed:
- __pycache__/clip_transform.cpython-39.pyc  +0 -0
- app.py  +120 -2
- clip_transform.py  +51 -0
- requirements.txt  +4 -1

__pycache__/clip_transform.cpython-39.pyc
ADDED
Binary file (1.9 kB)
app.py
CHANGED

@@ -7,6 +7,7 @@ import numpy as np
 import streamlit as st
 from streamlit_webrtc import WebRtcMode, webrtc_streamer
 import pydub
+import torch
 # import av
 # import cv2
 from sample_utils.turn import get_ice_servers
@@ -23,8 +24,65 @@ system_one = {
     "audio_bit_rate": 16000,
     # "audio_bit_rate": 32000,
     # "audio_bit_rate": 48000,
+
+    # "vision_embeddings_fps": 5,
+    "vision_embeddings_fps": 2,
 }
 
+system_one["video_detection_emotions"] = [
+    "Happiness",
+    "Sadness",
+    "Fear",
+    "Disgust",
+    "Anger",
+    "Surprise",
+    "Boredom",
+    "Interest",
+    "Excitement",
+    "Guilt",
+    "Shame",
+    "Relief",
+    "Love",
+    "Embarrassment",
+    "Pride",
+    "Envy",
+    "Jealousy",
+    "Anxiety",
+    "Hope",
+    "Despair",
+    "Frustration",
+    "Confusion",
+    "Curiosity",
+    "Contentment",
+    "Indifference",
+    "Anticipation",
+    "Gratitude",
+    "Bitterness"
+]
+system_one["video_detection_engement"] = [
+    "Facial_Expressions",
+    "Open_Body_Language",
+    "Closed_Body_Language",
+    "Eye_Contact",
+    "Interest",
+    "Boredom",
+    "Confusion",
+    "Frustration",
+    "Question_Asking",
+    "Engaged_Language",
+    "Short_Responses",
+    "Distraction_Signs"
+]
+system_one["video_detection_present"] = [
+    "a person",
+    "no one",
+    " ",
+    "multiple people",
+    "a group of people",
+]
+
+system_one_audio_status = st.empty()
+
 
 playing = st.checkbox("Playing", value=True)
 
@@ -94,6 +152,22 @@ async def queued_audio_frames_callback(
 
     return new_frames
 
+system_one_audio_status.write("Initializing CLIP model")
+from clip_transform import CLIPTransform
+clip_transform = CLIPTransform()
+
+system_one_audio_status.write("Initializing CLIP templates")
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_emotions"])
+system_one["video_detection_emotions_embeddings"] = embeddings
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_engement"])
+system_one["video_detection_engement_embeddings"] = embeddings
+
+embeddings = clip_transform.text_to_embeddings(system_one["video_detection_present"])
+system_one["video_detection_present_embeddings"] = embeddings
+
+system_one_audio_status.write("Initializing webrtc_streamer")
 webrtc_ctx = webrtc_streamer(
     key="charles",
     desired_playing_state=playing,
@@ -105,18 +179,31 @@ webrtc_ctx = webrtc_streamer(
     async_processing=True,
 )
 
-system_one_audio_status = st.empty()
 
 if not webrtc_ctx.state.playing:
     exit
 
-system_one_audio_status.write("Initializing
+system_one_audio_status.write("Initializing streaming")
 system_one_audio_output = st.empty()
+
+system_one_video_output = st.empty()
+
 system_one_audio_history = []
 system_one_audio_history_output = st.empty()
 
 
 sound_chunk = pydub.AudioSegment.empty()
+current_video_embedding = None
+current_video_embedding_timestamp = time.monotonic()
+
+
+def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
+    dot_product = torch.mm(embeddings, video_embedding.T)
+    similarity_image_label = [(float("{:.4f}".format(dot_product[i][0])), embeddings_labels[i]) for i in range(len(embeddings_labels))]
+    similarity_image_label.sort(reverse=True)
+    return similarity_image_label
+
+
 while True:
     if webrtc_ctx.state.playing:
         # handle video
@@ -125,6 +212,37 @@ while True:
         while len(video_frames_deque) > 0:
             frame = video_frames_deque.popleft()
             video_frames.append(frame)
+        get_embeddings = False
+        get_embeddings |= current_video_embedding is None
+        current_time = time.monotonic()
+        elapsed_time = current_time - current_video_embedding_timestamp
+        get_embeddings |= elapsed_time > 1. / system_one['vision_embeddings_fps']
+        if get_embeddings and len(video_frames) > 0:
+            current_video_embedding_timestamp = current_time
+            current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
+
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+            emotions_top_3 = ""
+            for i in range(3):
+                emotions_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+            engagement_top_3 = ""
+            for i in range(3):
+                engagement_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
+            present_top_3 = ""
+            for i in range(3):
+                present_top_3 += f"'{similarities[i][1]}' ({similarities[i][0]}), "
+
+            # table_content = "**System 1 Video:**\n\n"
+            table_content = "| System 1 Video | |\n| --- | --- |\n"
+            table_content += f"| Present | {present_top_3} |\n"
+            table_content += f"| Emotion | {emotions_top_3} |\n"
+            table_content += f"| Engagement | {engagement_top_3} |\n"
+            system_one_video_output.markdown(table_content)
+            # system_one_video_output.markdown(f"**System 1 Video:** \n [Emotion: {emotions_top_3}], \n [Engagement: {engagement_top_3}], \n [Present: {present_top_3}] ")
+            # for similarity, image_label in similarity_image_label:
+            #     print (f"{similarity} {image_label}")
 
         # handle audio
         audio_frames = []
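The heart of the new app.py code is a zero-shot ranking: because CLIPTransform returns L2-normalized embeddings, the torch.mm in get_dot_similarities is a cosine-similarity score between the latest video frame and each label prompt. Below is a minimal, self-contained sketch of that pattern; the label names, embedding dimension, and random tensors are placeholders standing in for real CLIP embeddings, not the Space's actual data.

import torch

# Placeholder stand-ins for CLIP embeddings: rows are L2-normalized,
# so a plain dot product is the cosine similarity used in app.py.
label_names = ["a person", "no one", "a group of people"]
label_embeddings = torch.nn.functional.normalize(torch.randn(len(label_names), 512), dim=-1)
image_embedding = torch.nn.functional.normalize(torch.randn(1, 512), dim=-1)

def rank_labels(image_embedding, label_embeddings, label_names):
    # (num_labels, dim) @ (dim, 1) -> one similarity score per label
    scores = torch.mm(label_embeddings, image_embedding.T).squeeze(1)
    ranked = sorted(zip(scores.tolist(), label_names), reverse=True)
    return [(round(score, 4), name) for score, name in ranked]

print(rank_labels(image_embedding, label_embeddings, label_names))

app.py applies the same ranking three times per frame (presence, emotion, engagement) and throttles it to at most vision_embeddings_fps embedding computations per second.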
clip_transform.py
ADDED

@@ -0,0 +1,51 @@
+import json
+import os
+import numpy as np
+import torch
+from PIL import Image
+from clip_retrieval.load_clip import load_clip, get_tokenizer
+# from clip_retrieval.clip_client import ClipClient, Modality
+
+class CLIPTransform:
+    def __init__(self):
+        # os.environ["OMP_NUM_THREADS"] = "20"
+        # torch.set_num_threads(20)
+        # Load model
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        # if self.device == "cpu" and torch.backends.mps.is_available():
+        #     self.device = torch.device("mps")
+        # self._clip_model="ViT-L/14"
+        self._clip_model="open_clip:ViT-H-14"
+        # self._clip_model="open_clip:ViT-L-14"
+        # self._clip_model="open_clip:datacomp_xl_s13b_b90k"
+        # import open_clip
+        # pretrained = dict(open_clip.list_pretrained())
+        # checkpoint = pretrained[self._clip_model]
+        self.model, self.preprocess = load_clip(self._clip_model, use_jit=True, device=self.device)
+        self.tokenizer = get_tokenizer(self._clip_model)
+
+        print ("using device", self.device)
+
+    def text_to_embeddings(self, prompts):
+        # if prompt is a string, convert to list
+        if type(prompts) is str:
+            prompts = [prompts]
+        text = self.tokenizer(prompts).to(self.device)
+        with torch.no_grad():
+            prompt_embededdings = self.model.encode_text(text)
+            prompt_embededdings /= prompt_embededdings.norm(dim=-1, keepdim=True)
+        return(prompt_embededdings)
+
+    def image_to_embeddings(self, input_im):
+        input_im = Image.fromarray(input_im)
+        prepro = self.preprocess(input_im).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            image_embeddings = self.model.encode_image(prepro)
+            image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+        return(image_embeddings)
+
+    def preprocessed_image_to_emdeddings(self, prepro):
+        with torch.no_grad():
+            image_embeddings = self.model.encode_image(prepro)
+            image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
+        return(image_embeddings)
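For orientation, a hedged usage sketch of the new class: the dummy frame below is synthetic (in the Space the frame comes from video_frames[-1].to_ndarray() in app.py), and loading open_clip:ViT-H-14 through clip-retrieval downloads a large checkpoint and runs much faster on a GPU.

import numpy as np

from clip_transform import CLIPTransform

clip_transform = CLIPTransform()

# Text side: one normalized embedding per prompt, shape (num_prompts, dim).
text_embeddings = clip_transform.text_to_embeddings(["a person", "no one"])

# Image side: expects an RGB ndarray; a black 640x480 frame as a placeholder.
dummy_frame = np.zeros((480, 640, 3), dtype=np.uint8)
image_embedding = clip_transform.image_to_embeddings(dummy_frame)

print(text_embeddings.shape, image_embedding.shape)

The embedding dimension depends on the chosen model (1024 for ViT-H-14), which is why app.py never hard-codes it and only compares normalized vectors.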
requirements.txt
CHANGED

@@ -10,4 +10,7 @@ streamlit_webrtc
 twilio
 python-dotenv
 watchdog
-pydub
+pydub
+torch
+numpy
+clip-retrieval == 2.36.1