"""Gradio app: index a YouTube video's transcript in Qdrant and answer questions about it.

Configuration is read from environment variables so that no credentials live
in the source tree:

* ``QDRANT_API_KEY`` (required) - API key for the Qdrant cluster.
* ``GROQ_API_KEY``   (required) - API key for the Groq chat model.
* ``QDRANT_URL``     (optional) - Qdrant endpoint; defaults to ``DEFAULT_QDRANT_URL``.
* ``GROQ_MODEL``     (optional) - Groq model name; defaults to ``DEFAULT_GROQ_MODEL``.
"""

import os
import re
from functools import lru_cache

import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_groq import ChatGroq
from pytubefix import YouTube
from qdrant_client import QdrantClient

COLLECTION_NAME = "Youtube_Videos"
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en"
DEFAULT_QDRANT_URL = (
    "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333"
)
DEFAULT_GROQ_MODEL = "llama-3.1-70b-versatile"
QDRANT_TIMEOUT_SECONDS = 50

# NOTE(review): chunks of 10000 characters are probably longer than what the
# embedding model can encode in one pass, in which case the tail of each chunk
# would be ignored at embedding time - confirm against the model card before
# tuning. The values are kept as they were to preserve existing behaviour.
CHUNK_SIZE = 10000
CHUNK_OVERLAP = 1000

# Matches an SRT timing line such as "00:00:01,500 --> 00:00:04,000".
_SRT_TIMESTAMP = re.compile(r"^\d[\d:,.]*\s*-->\s*\d")


def _require_env(name: str) -> str:
    """Return the value of environment variable *name*, or raise ``gr.Error`` if unset/blank."""
    value = os.environ.get(name, "").strip()
    if not value:
        raise gr.Error(f"Environment variable {name} must be set.")
    return value


def _qdrant_url() -> str:
    """Return the Qdrant endpoint, honouring the ``QDRANT_URL`` override."""
    return os.environ.get("QDRANT_URL", DEFAULT_QDRANT_URL)


@lru_cache(maxsize=1)
def _get_embeddings() -> HuggingFaceBgeEmbeddings:
    """Build the embedding model once and reuse it for every request.

    Both the indexing and the query path need the same embeddings object;
    caching avoids re-instantiating the model on every button click.
    """
    return HuggingFaceBgeEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )


def _srt_to_text(srt: str) -> str:
    """Return only the caption text of an SRT document, joined by single spaces.

    An SRT cue is: a sequence number, a timing line, one or more text lines,
    then a blank line. Instead of assuming a fixed stride of four lines, the
    text is taken as every non-blank line between a timing line and the next
    blank line, so multi-line and empty cues are handled correctly.
    """
    pieces = []
    in_cue_text = False
    for line in srt.splitlines():
        stripped = line.strip()
        if _SRT_TIMESTAMP.match(stripped):
            in_cue_text = True
        elif not stripped:
            in_cue_text = False
        elif in_cue_text:
            pieces.append(stripped)
    return " ".join(pieces)


def get_text(video_id: str) -> str:
    """Return the English transcript of a YouTube video as plain text.

    Args:
        video_id: Value passed straight to ``pytubefix.YouTube``; the UI
            supplies the full video URL here.

    Returns:
        The caption text with SRT numbering and timing lines removed.

    Raises:
        ValueError: If the video exposes no English caption track.
    """
    yt = YouTube(video_id)
    caption = yt.captions.get_by_language_code("en")
    if caption is None:
        # Fall back to the auto-generated English track, which is presumably
        # listed under the code "a.en" - verify against the pytubefix version in use.
        caption = yt.captions.get_by_language_code("a.en")
    if caption is None:
        raise ValueError("No English captions are available for this video.")
    return _srt_to_text(caption.generate_srt_captions())


def create_qdrant_database(url: str) -> str:
    """Fetch a video's transcript, split it into chunks and index them in Qdrant.

    Args:
        url: YouTube video URL entered by the user.

    Returns:
        A status message for display in the UI.

    Raises:
        gr.Error: If the URL is empty, required configuration is missing, or
            the video has no usable English transcript.
    """
    if not url or not url.strip():
        raise gr.Error("Please enter a YouTube video URL.")

    # Resolve configuration first so a missing key fails before any slow work.
    qdrant_api_key = _require_env("QDRANT_API_KEY")

    try:
        text = get_text(url.strip())
    except ValueError as err:
        raise gr.Error(str(err)) from err

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    docs = text_splitter.split_text(text)
    if not docs:
        raise gr.Error("The video's transcript is empty; nothing to index.")

    Qdrant.from_texts(
        texts=docs,
        embedding=_get_embeddings(),
        url=_qdrant_url(),
        prefer_grpc=False,
        collection_name=COLLECTION_NAME,
        api_key=qdrant_api_key,
        timeout=QDRANT_TIMEOUT_SECONDS,
    )
    return "Qdrant database created"


def get_answer(question: str) -> str:
    """Answer a question using the best-matching transcript chunk as context.

    Args:
        question: The user's question.

    Returns:
        The text of the model's reply, or an explanatory message when the
        similarity search returns no chunk.

    Raises:
        gr.Error: If the question is empty or required configuration is missing.
    """
    if not question or not question.strip():
        raise gr.Error("Please enter a question.")

    qdrant_api_key = _require_env("QDRANT_API_KEY")
    groq_api_key = _require_env("GROQ_API_KEY")

    client = QdrantClient(
        url=_qdrant_url(),
        prefer_grpc=False,
        api_key=qdrant_api_key,
        timeout=QDRANT_TIMEOUT_SECONDS,
    )
    db = Qdrant(
        client=client,
        embeddings=_get_embeddings(),
        collection_name=COLLECTION_NAME,
    )
    model = ChatGroq(
        api_key=groq_api_key,
        model=os.environ.get("GROQ_MODEL", DEFAULT_GROQ_MODEL),
        temperature=0,
    )

    # Only the single closest chunk is used as context for the model.
    matches = db.similarity_search_with_score(query=question, k=1)
    if not matches:
        return "No relevant transcript content found. Create the Qdrant database first."

    doc, _score = matches[0]
    response = model.invoke(f"{question} : {doc.page_content}")
    # Return the reply text rather than the message object so the Textbox
    # shows the answer instead of the object's repr.
    return response.content


# Gradio interface: left column indexes a video, right column asks questions.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="YouTube Video URL")
            output_text = gr.Textbox(label="Result")
            run_button = gr.Button("Create Qdrant Database")
            run_button.click(
                fn=create_qdrant_database,
                inputs=url_input,
                outputs=output_text,
            )
        with gr.Column():
            question_input = gr.Textbox(label="Ask a Question")
            answer_output = gr.Textbox(label="Answer")
            ask_button = gr.Button("Get Answer")
            ask_button.click(
                fn=get_answer,
                inputs=question_input,
                outputs=answer_output,
            )

if __name__ == "__main__":
    demo.launch()