switch to open_clip
- __pycache__/clip_transform.cpython-39.pyc +0 -0
- app.py +80 -57
- clip_transform.py +16 -11
- debug.py +4 -0
- requirements.txt +1 -1
__pycache__/clip_transform.cpython-39.pyc
CHANGED
Binary files a/__pycache__/clip_transform.cpython-39.pyc and b/__pycache__/clip_transform.cpython-39.pyc differ
app.py
CHANGED
@@ -29,56 +29,81 @@ system_one = {
     "vision_embeddings_fps": 2,
 }
 
+
 system_one["video_detection_emotions"] = [
-    "Happiness",
-    "Sadness",
-    "Fear",
-    "Disgust",
-    "Anger",
-    "Surprise",
-    "Boredom",
-    "Interest",
-    "Excitement",
-    "Guilt",
-    "Shame",
-    "Relief",
-    "Love",
-    "Embarrassment",
-    "Pride",
-    "Envy",
-    "Jealousy",
-    "Anxiety",
-    "Hope",
-    "Despair",
-    "Frustration",
-    "Confusion",
-    "Curiosity",
-    "Contentment",
-    "Indifference",
-    "Anticipation",
-    "Gratitude",
-    "Bitterness"
+    "a happy person",
+    "the person is happy",
+    "the person's emotional state is happy",
+    "a sad person",
+    "a scared person",
+    "a disgusted person",
+    "an angry person",
+    "a suprised person",
+    "a bored person",
+    "an interested person",
+    "a guilty person",
+    "an indiffert person",
+    "a distracted person",
 ]
+
+
+# system_one["video_detection_emotions"] = [
+# "Happiness",
+# "Sadness",
+# "Fear",
+# "Disgust",
+# "Anger",
+# "Surprise",
+# "Boredom",
+# "Interest",
+# "Excitement",
+# "Guilt",
+# "Shame",
+# "Relief",
+# "Love",
+# "Embarrassment",
+# "Pride",
+# "Envy",
+# "Jealousy",
+# "Anxiety",
+# "Hope",
+# "Despair",
+# "Frustration",
+# "Confusion",
+# "Curiosity",
+# "Contentment",
+# "Indifference",
+# "Anticipation",
+# "Gratitude",
+# "Bitterness"
+# ]
 system_one["video_detection_engement"] = [
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "Engaged_Language",
-    "Short_Responses",
-    "Distraction_Signs"
+    "the person is engaged in the conversation",
+    "the person is not engaged in the conversation",
+    "the person is looking at me",
+    "the person is not looking at me",
+    "the person is talking to me",
+    "the person is not talking to me",
+    "the person is engaged",
+    "the person is talking",
+    "the person is listening",
 ]
 system_one["video_detection_present"] = [
-    "a
-    "
-    " ",
-    "
-    "
+    "the view from a webcam",
+    "the view from a webcam we see a person",
+    # "the view from a webcam. I see a person",
+    # "the view from a webcam. The person is looking at the camera",
+    # "i am a webcam",
+    # "i am a webcam and i see a person",
+    # "i am a webcam and i see a person. The person is looking at me",
+    # "a person",
+    # "a person on a Zoom call",
+    # "a person on a FaceTime call",
+    # "a person on a WebCam call",
+    # "no one",
+    # " ",
+    # "multiple people",
+    # "a group of people",
 ]
 
 system_one_audio_status = st.empty()

@@ -203,6 +228,13 @@ def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
     similarity_image_label.sort(reverse=True)
     return similarity_image_label
 
+def get_top_3_similarities_as_a_string(video_embedding, embeddings, embeddings_labels):
+    similarities = get_dot_similarities(video_embedding, embeddings, embeddings_labels)
+    top_3 = ""
+    range_len = 3 if len(similarities) > 3 else len(similarities)
+    for i in range(range_len):
+        top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+    return top_3
 
 while True:
     if webrtc_ctx.state.playing:

@@ -221,18 +253,9 @@ while True:
             current_video_embedding_timestamp = current_time
             current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
 
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
-            emotions_top_3 = ""
-            for i in range(3):
-                emotions_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
-            engagement_top_3 = ""
-            for i in range(3):
-                engagement_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
-            present_top_3 = ""
-            for i in range(3):
-                present_top_3 += f"'{similarities[i][1]}' ({similarities[i][0]}), "
+            emotions_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+            engagement_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+            present_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
 
             # table_content = "**System 1 Video:**\n\n"
             table_content = "| System 1 Video | |\n| --- | --- |\n"
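
Note: the hunks above show only the tail of get_dot_similarities (its sort and return). For orientation, the helper appears to take a dot product between the frame embedding and each precomputed label embedding and return (score, label) pairs, best first. The sketch below illustrates that shape under assumed details (1-D, L2-normalized vectors); it is not the file's actual body.

import numpy as np

# Illustrative sketch only; the real get_dot_similarities in app.py is not
# fully shown in this diff. Assumes video_embedding and each entry of
# embeddings are 1-D, L2-normalized vectors, so the dot product is the
# cosine similarity.
def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
    similarity_image_label = []
    for embedding, label in zip(embeddings, embeddings_labels):
        similarity = float(np.dot(np.ravel(video_embedding), np.ravel(embedding)))
        similarity_image_label.append((similarity, label))
    similarity_image_label.sort(reverse=True)  # matches the context lines above
    return similarity_image_label

With that return shape, the new get_top_3_similarities_as_a_string helper added above simply formats the first three (score, label) pairs.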
clip_transform.py
CHANGED
@@ -3,8 +3,7 @@ import os
 import numpy as np
 import torch
 from PIL import Image
-
-# from clip_retrieval.clip_client import ClipClient, Modality
+import open_clip
 
 class CLIPTransform:
     def __init__(self):

@@ -14,15 +13,21 @@ class CLIPTransform:
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         # if self.device == "cpu" and torch.backends.mps.is_available():
         # self.device = torch.device("mps")
-
-
-        # self._clip_model="
-        # self.
-
-        #
-        #
-
-
+
+        # # ViT-H-14
+        # self._clip_model="ViT-H-14"
+        # self._pretrained='laion2B-s32B-b79K'
+
+        # # ViT-B-32
+        # self._clip_model="ViT-B-32"
+        # self._pretrained='laion2b_s34b_b79k'
+
+        # ViT-L/14 1.71gb
+        self._clip_model="ViT-L-14"
+        self._pretrained='datacomp_xl_s13b_b90k'
+
+        self.model, _, self.preprocess = open_clip.create_model_and_transforms(self._clip_model, pretrained=self._pretrained)
+        self.tokenizer = open_clip.get_tokenizer(self._clip_model)
 
         print ("using device", self.device)
 
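
Note: this diff only touches the constructor. For orientation, below is a minimal sketch of how image and text embeddings are typically produced from the open_clip objects created above. image_to_embeddings is the method app.py calls; its body here, and the text_to_embeddings name, are assumptions rather than the repo's actual implementation.

import torch
from PIL import Image
import open_clip

# Hedged sketch, not the repo's code: the usual open_clip pattern for turning
# a video frame and a list of label prompts into L2-normalized embeddings.
class CLIPTransformSketch:
    def __init__(self, clip_model="ViT-L-14", pretrained="datacomp_xl_s13b_b90k"):
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(clip_model, pretrained=pretrained)
        self.model = self.model.to(self.device).eval()
        self.tokenizer = open_clip.get_tokenizer(clip_model)

    def image_to_embeddings(self, frame_ndarray):
        # app.py passes av.VideoFrame.to_ndarray(); assumed to be an RGB HxWx3 array
        image = Image.fromarray(frame_ndarray)
        image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            features = self.model.encode_image(image_tensor)
        features = features / features.norm(dim=-1, keepdim=True)  # normalize so dot product = cosine
        return features[0].cpu().numpy()

    def text_to_embeddings(self, prompts):
        # hypothetical helper for building the *_embeddings lists used in app.py
        tokens = self.tokenizer(prompts).to(self.device)
        with torch.no_grad():
            features = self.model.encode_text(tokens)
        features = features / features.norm(dim=-1, keepdim=True)
        return features.cpu().numpy()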
debug.py
ADDED
@@ -0,0 +1,4 @@
+from clip_transform import CLIPTransform
+clip_transform = CLIPTransform()
+
+print ("Initializing CLIP templates")
requirements.txt
CHANGED
@@ -13,4 +13,4 @@ watchdog
 pydub
 torch
 numpy
-
+open_clip_torch
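
Note: open_clip_torch is the PyPI package that provides the open_clip module imported in clip_transform.py. As an optional sanity check that the checkpoint tag chosen there is known to the installed version, open_clip.list_pretrained() returns the available (architecture, pretrained-tag) pairs:

import open_clip

# Check that the model/checkpoint combination used in clip_transform.py is
# available in the installed open_clip_torch build.
available = open_clip.list_pretrained()
print(("ViT-L-14", "datacomp_xl_s13b_b90k") in available)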