Update handler.py
Browse files - handler.py +13 -6
handler.py
CHANGED
@@ -2,18 +2,19 @@ from typing import List, Any
|
|
2 |
import torch
|
3 |
from peft import PeftModel, PeftConfig
|
4 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
|
|
5 |
|
6 |
def preprocessing(data):
|
7 |
texts = list()
|
8 |
i = 0
|
9 |
-
if len(data) <= i+
|
10 |
texts = data
|
11 |
else:
|
12 |
while len(data[i:]) != 0:
|
13 |
-
if len(data[i:]) >
|
14 |
-
string = str(data[i:i+
|
15 |
texts.append(string)
|
16 |
-
i = i +
|
17 |
else:
|
18 |
string = str(data[i:])
|
19 |
texts.append(string)
|
@@ -33,10 +34,16 @@ class EndpointHandler:
|
|
33 |
|
34 |
def __call__(self, data: Any) -> List[str]:
|
35 |
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
# process input
|
39 |
-
texts = preprocessing(
|
40 |
inputs = self.tokenizer(texts, return_tensors="pt", padding=True, ) # truncation=True
|
41 |
|
42 |
with torch.no_grad():
|
|
|
2 |
import torch
|
3 |
from peft import PeftModel, PeftConfig
|
4 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
5 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
6 |
|
7 |
def preprocessing(data):
|
8 |
texts = list()
|
9 |
i = 0
|
10 |
+
if len(data) <= i+3000:
|
11 |
texts = data
|
12 |
else:
|
13 |
while len(data[i:]) != 0:
|
14 |
+
if len(data[i:]) > 3000:
|
15 |
+
string = str(data[i:i+3000])
|
16 |
texts.append(string)
|
17 |
+
i = i + 2800
|
18 |
else:
|
19 |
string = str(data[i:])
|
20 |
texts.append(string)
|
|
|
34 |
|
35 |
def __call__(self, data: Any) -> List[str]:
|
36 |
|
37 |
+
video_id = data.pop("inputs", data)
|
38 |
+
dict = YouTubeTranscriptApi.get_transcript(video_id)
|
39 |
+
|
40 |
+
transcript = ""
|
41 |
+
|
42 |
+
for i in range(len(dict)):
|
43 |
+
transcript += dict[i]['text']
|
44 |
|
45 |
# process input
|
46 |
+
texts = preprocessing(transcript)
|
47 |
inputs = self.tokenizer(texts, return_tensors="pt", padding=True, ) # truncation=True
|
48 |
|
49 |
with torch.no_grad():
|