Saim-11 committed on
Commit
f95208e
1 Parent(s): 6b6dada

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -24
app.py CHANGED
@@ -2,14 +2,14 @@ import gradio as gr
2
  from langchain_community.vectorstores import Qdrant
3
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from youtube_transcript_api import YouTubeTranscriptApi
6
- import re
7
  from pytubefix import YouTube
 
 
 
8
 
9
-
10
- def get_text(url):
11
- yt = YouTube(url)
12
-
13
  caption = yt.captions.get_by_language_code('en')
14
  transcript = caption.generate_srt_captions()
15
 
@@ -19,19 +19,18 @@ def get_text(url):
19
  # Extract text from every fourth line, starting at the third (lines 3, 7, 11, ...)
20
  extracted_text = " ".join(lines[i] for i in range(2, len(lines), 4))
21
 
22
- #print(extracted_text)
23
  return extracted_text
24
 
 
25
  def create_qdrant_database(url):
26
-
27
  text = get_text(url)
28
 
29
  text_splitter = RecursiveCharacterTextSplitter(
30
- chunk_size=10000, # Adjusted chunk size for better processing
31
- chunk_overlap=1000 # Adjusted overlap for context preservation
32
  )
33
 
34
- docs = text_splitter.split_text(text) # Split into list of text chunks
35
 
36
  model_name = 'BAAI/bge-large-en'
37
  model_kwargs = {'device': 'cpu'}
@@ -44,11 +43,11 @@ def create_qdrant_database(url):
44
 
45
  collection_name = "Youtube_Videos"
46
 
47
- qdrant_url="https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333"
48
- api_key="zIUUg_1QTtjSmCLNEpKnxJZeedKuh635c-YgGkDbI5EJ0ITjpOSyqw"
49
 
50
  qdrant = Qdrant.from_texts(
51
- texts=docs, # Pass the list of documents
52
  embedding=embeddings,
53
  url=qdrant_url,
54
  prefer_grpc=False,
@@ -59,16 +58,58 @@ def create_qdrant_database(url):
59
 
60
  return "Qdrant database created"
61
 
62
- # Gradio Interface
63
- def gradio_interface(url):
64
- result = create_qdrant_database(url)
65
- return result
66
 
67
- with gr.Blocks() as demo:
68
- url_input = gr.Textbox(label="YouTube Video URL")
69
- output_text = gr.Textbox(label="Result")
70
- run_button = gr.Button("Create Qdrant Database")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
- run_button.click(fn=gradio_interface, inputs=url_input, outputs=output_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
- demo.launch()
 
2
  from langchain_community.vectorstores import Qdrant
3
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
5
  from pytubefix import YouTube
6
+ from qdrant_client import QdrantClient
7
+ from langchain_groq import ChatGroq
8
+ import re
9
 
10
+ # Function to extract the transcript text
11
+ def get_text(video_id):
12
+ yt = YouTube(video_id)
 
13
  caption = yt.captions.get_by_language_code('en')
14
  transcript = caption.generate_srt_captions()
15
 
 
19
  # Extract text from every fourth line, starting at the third (lines 3, 7, 11, ...)
20
  extracted_text = " ".join(lines[i] for i in range(2, len(lines), 4))
21
 
 
22
  return extracted_text
23
 
24
+ # Function to create the Qdrant database
25
  def create_qdrant_database(url):
 
26
  text = get_text(url)
27
 
28
  text_splitter = RecursiveCharacterTextSplitter(
29
+ chunk_size=10000,
30
+ chunk_overlap=1000
31
  )
32
 
33
+ docs = text_splitter.split_text(text)
34
 
35
  model_name = 'BAAI/bge-large-en'
36
  model_kwargs = {'device': 'cpu'}
 
43
 
44
  collection_name = "Youtube_Videos"
45
 
46
+ qdrant_url = "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333"
47
+ api_key = "zIUUg_1QTtjSmCLNEpKnxJZeedKuh635c-YgGkDbI5EJ0ITjpOSyqw"
48
 
49
  qdrant = Qdrant.from_texts(
50
+ texts=docs,
51
  embedding=embeddings,
52
  url=qdrant_url,
53
  prefer_grpc=False,
 
58
 
59
  return "Qdrant database created"
60
 
61
# Function to answer questions based on the created Qdrant database
def get_answer(question):
    """Answer a question from the best-matching chunk in the Qdrant collection.

    The question is embedded with the 'BAAI/bge-large-en' model, the single
    closest chunk (k=1) is fetched from the "Youtube_Videos" collection, and
    the question plus that chunk are sent to the Groq chat model.

    Credentials are read from the environment instead of being embedded in
    the source:
        QDRANT_API_KEY  - required
        GROQ_API_KEY    - required
        QDRANT_URL      - optional, defaults to the cluster URL used so far

    Args:
        question: The user's question as typed into the Gradio textbox.

    Returns:
        The model's answer as plain text, or a short explanatory message when
        the question is blank or the collection returns no match.

    Raises:
        RuntimeError: If a required API key is missing from the environment.
    """
    import os  # function-scope import keeps this block self-contained

    # Guard first so blank input never triggers model loading or network calls.
    if not question or not question.strip():
        return "Please enter a question."

    qdrant_api_key = os.environ.get("QDRANT_API_KEY")
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not qdrant_api_key or not groq_api_key:
        raise RuntimeError(
            "QDRANT_API_KEY and GROQ_API_KEY must be set in the environment."
        )
    qdrant_url = os.environ.get(
        "QDRANT_URL",
        "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333",
    )

    # Initialize the embeddings and Qdrant client
    embeddings = HuggingFaceBgeEmbeddings(
        model_name='BAAI/bge-large-en',
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )
    client = QdrantClient(
        url=qdrant_url,
        prefer_grpc=False,
        api_key=qdrant_api_key,
        timeout=50,
    )
    db = Qdrant(
        client=client,
        embeddings=embeddings,
        collection_name="Youtube_Videos",
    )

    # Initialize ChatGroq model
    model = ChatGroq(
        api_key=groq_api_key,
        model="llama-3.1-70b-versatile",
        temperature=0,
    )

    # Search for the relevant document and generate the answer
    matches = db.similarity_search_with_score(query=question, k=1)
    if not matches:
        # Previously this path fell through and returned None.
        return "No relevant transcript content was found. Create the database first."

    doc, _score = matches[0]
    response = model.invoke(f"{question} : {doc.page_content}")
    # Return only the text so the Textbox shows the answer, not the message object.
    return response.content
99
+
100
# Gradio Interface
# One row with two columns: the first builds the Qdrant database from a
# YouTube URL, the second asks questions against that database.
with gr.Blocks() as demo:
    with gr.Row():
        # Database-creation panel
        with gr.Column():
            url_input = gr.Textbox(label="YouTube Video URL")
            output_text = gr.Textbox(label="Result")
            run_button = gr.Button("Create Qdrant Database")

        # Question-answering panel
        with gr.Column():
            question_input = gr.Textbox(label="Ask a Question")
            answer_output = gr.Textbox(label="Answer")
            ask_button = gr.Button("Get Answer")

    # Event wiring is grouped here, after the layout; listeners only need to
    # be registered inside the Blocks context.
    run_button.click(create_qdrant_database, url_input, output_text)
    ask_button.click(get_answer, question_input, answer_output)

demo.launch()