sooolee committed on
Commit
b89b5ed
1 Parent(s): 370fa95

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +13 -6
handler.py CHANGED
@@ -2,18 +2,19 @@ from typing import List, Any
2
  import torch
3
  from peft import PeftModel, PeftConfig
4
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 
5
 
6
  def preprocessing(data):
7
  texts = list()
8
  i = 0
9
- if len(data) <= i+4000:
10
  texts = data
11
  else:
12
  while len(data[i:]) != 0:
13
- if len(data[i:]) > 4000:
14
- string = str(data[i:i+4000])
15
  texts.append(string)
16
- i = i + 3800
17
  else:
18
  string = str(data[i:])
19
  texts.append(string)
@@ -33,10 +34,16 @@ class EndpointHandler:
33
 
34
  def __call__(self, data: Any) -> List[str]:
35
 
36
- inputs = data.pop("inputs", data)
 
 
 
 
 
 
37
 
38
  # process input
39
- texts = preprocessing(inputs)
40
  inputs = self.tokenizer(texts, return_tensors="pt", padding=True, ) # truncation=True
41
 
42
  with torch.no_grad():
 
2
  import torch
3
  from peft import PeftModel, PeftConfig
4
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
5
+ from youtube_transcript_api import YouTubeTranscriptApi
6
 
7
  def preprocessing(data):
8
  texts = list()
9
  i = 0
10
+ if len(data) <= i+3000:
11
  texts = data
12
  else:
13
  while len(data[i:]) != 0:
14
+ if len(data[i:]) > 3000:
15
+ string = str(data[i:i+3000])
16
  texts.append(string)
17
+ i = i + 2800
18
  else:
19
  string = str(data[i:])
20
  texts.append(string)
 
34
 
35
  def __call__(self, data: Any) -> List[str]:
36
 
37
+ video_id = data.pop("inputs", data)
38
+ dict = YouTubeTranscriptApi.get_transcript(video_id)
39
+
40
+ transcript = ""
41
+
42
+ for i in range(len(dict)):
43
+ transcript += dict[i]['text']
44
 
45
  # process input
46
+ texts = preprocessing(transcript)
47
  inputs = self.tokenizer(texts, return_tensors="pt", padding=True, ) # truncation=True
48
 
49
  with torch.no_grad():