Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,18 +2,25 @@ import gradio as gr
|
|
2 |
from langchain_community.vectorstores import Qdrant
|
3 |
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
5 |
import re
|
6 |
-
from
|
7 |
-
from langchain_community.document_loaders.youtube import TranscriptFormat
|
8 |
|
9 |
|
10 |
def get_text(video_id):
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
def create_qdrant_database(url):
|
19 |
|
|
|
2 |
from langchain_community.vectorstores import Qdrant
|
3 |
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
6 |
import re
|
7 |
+
from pytubefix import YouTube
|
|
|
8 |
|
9 |
|
10 |
def _srt_to_text(srt):
    """Return the caption text of an SRT document as one space-joined string.

    An SRT document is a sequence of cues separated by blank lines; each cue
    is a counter line, a ``start --> end`` timing line, then one or more text
    lines. Parsing cue by cue (rather than taking every 4th line) keeps the
    result correct when a cue carries several text lines or none at all.
    """
    pieces = []
    in_text = False  # True once the current cue's timing line has been seen
    for raw_line in srt.splitlines():
        line = raw_line.strip()
        if not line:
            # A blank line terminates the current cue.
            in_text = False
        elif not in_text and "-->" in line:
            # Timing line: everything up to the next blank line is cue text.
            in_text = True
        elif in_text:
            pieces.append(line)
        # Otherwise this is the cue counter line, which is not spoken text.
    return " ".join(pieces)


def get_text(video_id):
    """Fetch the English captions of a YouTube video as plain text.

    Args:
        video_id: Value passed unchanged to ``YouTube(...)``.
            NOTE(review): despite the name this is presumably a full video
            URL, since it is handed straight to the pytubefix constructor --
            confirm against the caller.

    Returns:
        The caption text of every cue joined by single spaces, without cue
        numbers or timestamps. The text is also printed to stdout.

    Raises:
        ValueError: If the video has no caption track with language code
            ``'en'``.
    """
    yt = YouTube(video_id)

    caption = yt.captions.get_by_language_code('en')
    if caption is None:
        # Fail with an actionable message instead of an AttributeError on None.
        raise ValueError(f"No English ('en') captions available for {video_id!r}")
    transcript = caption.generate_srt_captions()

    # Keep only the text lines of each SRT cue (drop counters and timestamps).
    extracted_text = _srt_to_text(transcript)

    print(extracted_text)
    return extracted_text
|
24 |
|
25 |
def create_qdrant_database(url):
|
26 |
|