Saim-11 committed on
Commit
e997c7d
1 Parent(s): d12d928

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -8
app.py CHANGED
@@ -2,18 +2,25 @@ import gradio as gr
2
  from langchain_community.vectorstores import Qdrant
3
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
5
  import re
6
- from langchain_community.document_loaders import YoutubeLoader
7
- from langchain_community.document_loaders.youtube import TranscriptFormat
8
 
9
 
10
  def get_text(video_id):
11
- loader = YoutubeLoader.from_youtube_url(
12
- video_id,
13
- transcript_format=TranscriptFormat.TEXT,
14
- chunk_size_seconds=30,
15
- )
16
- return "\n\n".join(map(repr, loader.load()))
 
 
 
 
 
 
 
17
 
18
  def create_qdrant_database(url):
19
 
 
2
  from langchain_community.vectorstores import Qdrant
3
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from youtube_transcript_api import YouTubeTranscriptApi
6
  import re
7
+ from pytubefix import YouTube
 
8
 
9
 
10
  def get_text(video_id):
11
+ yt = YouTube(video_id)
12
+
13
+ caption = yt.captions.get_by_language_code('en')
14
+ transcript = caption.generate_srt_captions()
15
+
16
+ # Split the transcript into lines
17
+ lines = transcript.splitlines()
18
+
19
+ # Extract text from every third line (lines 3, 6, 9, ...)
20
+ extracted_text = " ".join(lines[i] for i in range(2, len(lines), 4))
21
+
22
+ print(extracted_text)
23
+ return extracted_text
24
 
25
  def create_qdrant_database(url):
26