cocktailpeanut committed
Commit 9dfb729
1 Parent(s): c32e4c7
Files changed (2)
  1. app.py +40 -39
  2. requirements.txt +4 -4
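Summary of the change: app.py comments out the `spaces` import and the `@spaces.GPU` decorators, replaces the hard-coded `.cuda()` calls with `.to(device)` using a device resolved by `devicetorch.get(torch)`, enters `torch.cuda.amp.autocast()` only when that device is "cuda", and drops the title and license markdown from the Gradio UI; requirements.txt comments out the pinned torch, torchvision, spaces, and decord entries.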
app.py CHANGED
@@ -11,23 +11,10 @@ import torchvision.transforms as transforms
 from decord import VideoReader
 from PIL import Image, ImageDraw, ImageFont
 from transformers import AutoModel, AutoTokenizer
+import devicetorch
+
+#import spaces
 
-import spaces
-
-title_markdown = ("""
-<div style="display: flex; justify-content: flex-start; align-items: center; text-align: center;">
-<div style="margin-right: 20px; display: flex; align-items: center;">
-<a href="https://github.com/ShareGPT4Omni/ShareGPT4Video" style="text-decoration: none; display: flex; align-items: center;">
-<img src="https://raw.githubusercontent.com/ShareGPT4V/ShareGPT4V-Resources/master/images/share4video_tight.png" alt="ShareGPT4Video🚀" style="max-width: 120px; height: auto;">
-</a>
-</div>
-<div>
-<h1>ShareGPT4Video: Improving Video Understanding and Generation with Better Captions</h1>
-<h5 style="margin: 0;">If you like our project, please give us a star ✨ on Github for the latest update.</h5>
-<h5 style="margin: 0;"> <a href="https://sharegpt4video.github.io/">[Project Page]</a> <a href="https://github.com/ShareGPT4Omni/ShareGPT4Video">[Code]</a> <a href="https://arxiv.org/abs/2406.04325v1">[Paper]</a>
-</div>
-</div>
-""")
 
 block_css = """
 #buttons button {
@@ -35,17 +22,14 @@ block_css = """
 }
 """
 
-learn_more_markdown = ("""
-### License
-The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
-""")
-
-
+device = devicetorch.get(torch)
 new_path = 'Lin-Chen/ShareCaptioner-Video'
 tokenizer = AutoTokenizer.from_pretrained(new_path, trust_remote_code=True)
 model = AutoModel.from_pretrained(
-new_path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
-model.cuda()
+#new_path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
+new_path, torch_dtype=torch.float16, trust_remote_code=True).to(device).eval()
+#model.cuda()
+model.to(device)
 model.tokenizer = tokenizer
 
 
@@ -120,7 +104,8 @@ def model_gen(model, text, images, need_bos=True, hd_num=25, max_new_token=2048,
 text_embeds = model.encode_text(
 subtext, add_special_tokens=need_bos)
 embeds.append(text_embeds)
-im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
+#im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
+im_mask.append(torch.zeros(text_embeds.shape[:2]).to(device))
 need_bos = False
 if i < len(images):
 try:
@@ -129,11 +114,13 @@
 image = images[i].convert('RGB')
 
 image = HD_transform(image, hd_num=hd_num)
-image = model.vis_processor(image).unsqueeze(0).cuda()
+#image = model.vis_processor(image).unsqueeze(0).cuda()
+image = model.vis_processor(image).unsqueeze(0).to(device)
 image_embeds = model.encode_img(image)
 print(image_embeds.shape)
 embeds.append(image_embeds)
-im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
+#im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
+im_mask.append(torch.ones(image_embeds.shape[:2]).to(device))
 pt1 = pts
 embeds = torch.cat(embeds, dim=1)
 im_mask = torch.cat(im_mask, dim=1)
@@ -232,14 +219,17 @@ def encode_resized_image(image_path, max_size=1024):
 return base64.b64encode(buffer.getvalue()).decode('utf-8')
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_slidingcaptioning(video_path):
 imgs = load_quota_video(video_path)
 q = 'This is the first frame of a video, describe it in detail.'
 query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
 img = imgs[0]
-with torch.cuda.amp.autocast():
-response = model_gen(model, query, img, hd_num=9)
+if device == "cuda":
+with torch.cuda.amp.autocast():
+response = model_gen(model, query, img, hd_num=9)
+else:
+response = model_gen(model, query, img, hd_num=9)
 print(response)
 responses = [response]
 images = [img]
@@ -253,7 +243,10 @@ def generate_slidingcaptioning(video_path):
 new_img.paste(image1, (0, 0))
 new_img.paste(image2, (0, height+50))
 query = f'[UNUSED_TOKEN_146]user\n{prompt}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
-with torch.cuda.amp.autocast():
+if device == "cuda":
+with torch.cuda.amp.autocast():
+response = model_gen(model, query, new_img, hd_num=9)
+else:
 response = model_gen(model, query, new_img, hd_num=9)
 responses.append(response)
 images.append(new_img)
@@ -263,29 +256,39 @@ def generate_slidingcaptioning(video_path):
 idx+1, idx*2, txt)
 query = f'[UNUSED_TOKEN_146]user\n{prompt}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
 print(query)
-with torch.cuda.amp.autocast():
+if device == "cuda":
+with torch.cuda.amp.autocast():
+summ = model_gen(model, query, None, hd_num=16)
+else:
 summ = model_gen(model, query, None, hd_num=16)
 print(summ)
 return summ
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_fastcaptioning(video_path):
 q = 'Here are a few key frames of a video, discribe this video in detail.'
 query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
 imgs = load_quota_video(video_path)
 img = img_process(imgs)
-with torch.cuda.amp.autocast():
+if device == "cuda":
+with torch.cuda.amp.autocast():
+response = model_gen(model, query, img, hd_num=16,
+do_sample=False, beam=3)
+else:
 response = model_gen(model, query, img, hd_num=16,
-do_sample=False, beam=3)
+do_sample=False, beam=3)
 return response
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_promptrecaptioning(text):
 q = f'Translate this brief generation prompt into a detailed caption: {text}'
 query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
-with torch.cuda.amp.autocast():
+if device == "cuda":
+with torch.cuda.amp.autocast():
+response = model_gen(model, query, None)
+else:
 response = model_gen(model, query, None)
 return response
 
@@ -298,7 +301,6 @@ def save_video_to_local(video_path):
 
 
 with gr.Blocks(title='ShareCaptioner-Video', theme=gr.themes.Default(), css=block_css) as demo:
-gr.Markdown(title_markdown)
 state = gr.State()
 state_ = gr.State()
 first_run = gr.State()
@@ -333,7 +335,6 @@ with gr.Blocks(title='ShareCaptioner-Video', theme=gr.themes.Default(), css=bloc
 textbox_out = gr.Textbox(
 show_label=False, placeholder="Output", container=False
 )
-gr.Markdown(learn_more_markdown)
 
 submit_btn_sc.click(generate_slidingcaptioning, [video], [textbox_out])
 submit_btn_fc.click(generate_fastcaptioning, [video], [textbox_out])
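Note on the pattern above: every `model_gen` call is now wrapped in an `if device == "cuda":` branch so that `torch.cuda.amp.autocast()` is only entered on CUDA machines. A small helper could express the same idea once instead of repeating the branch at each call site; the sketch below is only an illustration of that alternative (the `maybe_autocast` name is hypothetical and not part of this commit):

import contextlib
import torch

def maybe_autocast(device):
    # Enter CUDA mixed-precision autocast only when running on a CUDA device;
    # otherwise return a no-op context manager so call sites stay identical.
    if device == "cuda":
        return torch.cuda.amp.autocast()
    return contextlib.nullcontext()

# Hypothetical usage, mirroring the calls in app.py:
# with maybe_autocast(device):
#     response = model_gen(model, query, img, hd_num=9)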
requirements.txt CHANGED
@@ -1,5 +1,5 @@
-torch==2.1.2
-torchvision==0.16.2
+#torch==2.1.2
+#torchvision==0.16.2
 transformers==4.37.2
 tokenizers==0.15.1
 sentencepiece==0.1.99
@@ -13,12 +13,12 @@ scikit-learn==1.2.2
 gradio==4.16.0
 gradio_client==0.8.1
 openai
-spaces
+#spaces
 requests
 httpx==0.24.0
 uvicorn
 fastapi
-decord
+#decord
 einops==0.6.1
 einops-exts==0.0.4
 timm==0.6.13
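With torch, torchvision, spaces, and decord commented out of requirements.txt, device handling is delegated to `devicetorch.get(torch)` in app.py, and those heavyweight packages are presumably installed separately for the target platform. If `devicetorch` were unavailable, a dependency-free stand-in could look roughly like the following; this is a sketch of the general idea, not the package's actual implementation:

import torch

def pick_device():
    # Prefer CUDA, then Apple Silicon MPS, then CPU. This only approximates
    # what a helper like devicetorch.get(torch) returns as a device string.
    if torch.cuda.is_available():
        return "cuda"
    if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        return "mps"
    return "cpu"

device = pick_device()
print(device)  # e.g. "cuda", "mps", or "cpu"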