Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,14 +2,14 @@ import gradio as gr
|
|
2 |
from langchain_community.vectorstores import Qdrant
|
3 |
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
-
from youtube_transcript_api import YouTubeTranscriptApi
|
6 |
-
import re
|
7 |
from pytubefix import YouTube
|
|
|
|
|
|
|
8 |
|
9 |
-
|
10 |
-
def get_text(
|
11 |
-
yt = YouTube(
|
12 |
-
|
13 |
caption = yt.captions.get_by_language_code('en')
|
14 |
transcript = caption.generate_srt_captions()
|
15 |
|
@@ -19,19 +19,18 @@ def get_text(url):
|
|
19 |
# Extract the caption text: every fourth line starting at the third (lines 3, 7, 11, ...)
|
20 |
extracted_text = " ".join(lines[i] for i in range(2, len(lines), 4))
|
21 |
|
22 |
-
#print(extracted_text)
|
23 |
return extracted_text
|
24 |
|
|
|
25 |
def create_qdrant_database(url):
|
26 |
-
|
27 |
text = get_text(url)
|
28 |
|
29 |
text_splitter = RecursiveCharacterTextSplitter(
|
30 |
-
chunk_size=10000,
|
31 |
-
chunk_overlap=1000
|
32 |
)
|
33 |
|
34 |
-
docs = text_splitter.split_text(text)
|
35 |
|
36 |
model_name = 'BAAI/bge-large-en'
|
37 |
model_kwargs = {'device': 'cpu'}
|
@@ -44,11 +43,11 @@ def create_qdrant_database(url):
|
|
44 |
|
45 |
collection_name = "Youtube_Videos"
|
46 |
|
47 |
-
qdrant_url="https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333"
|
48 |
-
api_key="zIUUg_1QTtjSmCLNEpKnxJZeedKuh635c-YgGkDbI5EJ0ITjpOSyqw"
|
49 |
|
50 |
qdrant = Qdrant.from_texts(
|
51 |
-
texts=docs,
|
52 |
embedding=embeddings,
|
53 |
url=qdrant_url,
|
54 |
prefer_grpc=False,
|
@@ -59,16 +58,58 @@ def create_qdrant_database(url):
|
|
59 |
|
60 |
return "Qdrant database created"
|
61 |
|
62 |
-
#
|
63 |
-
def
|
64 |
-
|
65 |
-
|
66 |
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
-
demo.launch()
|
|
|
2 |
from langchain_community.vectorstores import Qdrant
|
3 |
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
|
5 |
from pytubefix import YouTube
|
6 |
+
from qdrant_client import QdrantClient
|
7 |
+
from langchain_groq import ChatGroq
|
8 |
+
import re
|
9 |
|
10 |
+
# Function to extract the transcript text
|
11 |
+
def get_text(video_id):
|
12 |
+
yt = YouTube(video_id)
|
|
|
13 |
caption = yt.captions.get_by_language_code('en')
|
14 |
transcript = caption.generate_srt_captions()
|
15 |
|
|
|
19 |
# Extract the caption text: every fourth line starting at the third (lines 3, 7, 11, ...)
|
20 |
extracted_text = " ".join(lines[i] for i in range(2, len(lines), 4))
|
21 |
|
|
|
22 |
return extracted_text
|
23 |
|
24 |
+
# Function to create the Qdrant database
|
25 |
def create_qdrant_database(url):
|
|
|
26 |
text = get_text(url)
|
27 |
|
28 |
text_splitter = RecursiveCharacterTextSplitter(
|
29 |
+
chunk_size=10000,
|
30 |
+
chunk_overlap=1000
|
31 |
)
|
32 |
|
33 |
+
docs = text_splitter.split_text(text)
|
34 |
|
35 |
model_name = 'BAAI/bge-large-en'
|
36 |
model_kwargs = {'device': 'cpu'}
|
|
|
43 |
|
44 |
collection_name = "Youtube_Videos"
|
45 |
|
46 |
+
qdrant_url = "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333"
|
47 |
+
api_key = "zIUUg_1QTtjSmCLNEpKnxJZeedKuh635c-YgGkDbI5EJ0ITjpOSyqw"
|
48 |
|
49 |
qdrant = Qdrant.from_texts(
|
50 |
+
texts=docs,
|
51 |
embedding=embeddings,
|
52 |
url=qdrant_url,
|
53 |
prefer_grpc=False,
|
|
|
58 |
|
59 |
return "Qdrant database created"
|
60 |
|
61 |
+
# Function to answer questions based on the created Qdrant database
|
62 |
+
def get_answer(question):
    """Answer a question from the closest transcript chunk stored in Qdrant.

    The question is embedded with the ``BAAI/bge-large-en`` model, the single
    most similar chunk is fetched from the ``Youtube_Videos`` collection, and
    the question plus that chunk are sent to the Groq-hosted chat model.

    Credentials are read from the environment instead of being hard-coded, so
    they never end up in the repository:

    * ``QDRANT_API_KEY`` -- API key of the Qdrant cluster (required).
    * ``GROQ_API_KEY``   -- API key for Groq (required).
    * ``QDRANT_URL``     -- cluster endpoint (optional; defaults to the
      endpoint this app was originally written against).

    Args:
        question: The user's question as plain text.

    Returns:
        The model's answer text, or a short explanatory message when the
        question is blank or the collection returns no match.

    Raises:
        RuntimeError: If a required API key is not set in the environment.
    """
    import os

    # Nothing to search for: avoid loading the embedding model and spending
    # an LLM call on an empty prompt.
    if not question or not question.strip():
        return "Please enter a question."

    # Validate configuration before any expensive work is started.
    qdrant_api_key = os.environ.get("QDRANT_API_KEY")
    groq_api_key = os.environ.get("GROQ_API_KEY")
    missing = [
        name
        for name, value in (
            ("QDRANT_API_KEY", qdrant_api_key),
            ("GROQ_API_KEY", groq_api_key),
        )
        if not value
    ]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    qdrant_url = os.environ.get(
        "QDRANT_URL",
        "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333",
    )

    # Initialize the embeddings and Qdrant client.
    # NOTE(review): the embedding model is re-created on every call; consider
    # building it once at module level if request latency matters.
    embeddings = HuggingFaceBgeEmbeddings(
        model_name='BAAI/bge-large-en',
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )

    client = QdrantClient(
        url=qdrant_url,
        prefer_grpc=False,
        api_key=qdrant_api_key,
        timeout=50,
    )
    db = Qdrant(
        client=client,
        embeddings=embeddings,
        collection_name="Youtube_Videos",
    )

    # Initialize the ChatGroq model; temperature=0 is kept from the original.
    model = ChatGroq(
        api_key=groq_api_key,
        model="llama-3.1-70b-versatile",
        temperature=0,
    )

    # Search for the single most relevant chunk (k=1).
    matches = db.similarity_search_with_score(query=question, k=1)
    if not matches:
        # The original fell off the end of the loop and returned None here.
        return "No relevant content found. Create the Qdrant database first."

    doc, _score = matches[0]
    response = model.invoke(f"{question} : {doc.page_content}")

    # Return only the message text so the Gradio Textbox shows the answer
    # rather than the repr of the whole message object.
    return getattr(response, "content", response)
|
99 |
+
|
100 |
+
# Gradio Interface
|
101 |
+
# Two side-by-side columns inside one row: the left column indexes a video's
# transcript into Qdrant, the right column asks questions against that index.
with gr.Blocks() as demo, gr.Row():
    # Left column: build the vector database from a YouTube URL.
    with gr.Column():
        url_input = gr.Textbox(label="YouTube Video URL")
        output_text = gr.Textbox(label="Result")
        run_button = gr.Button("Create Qdrant Database")
        run_button.click(
            fn=create_qdrant_database,
            inputs=url_input,
            outputs=output_text,
        )

    # Right column: question answering over the stored transcript.
    with gr.Column():
        question_input = gr.Textbox(label="Ask a Question")
        answer_output = gr.Textbox(label="Answer")
        ask_button = gr.Button("Get Answer")
        ask_button.click(
            fn=get_answer,
            inputs=question_input,
            outputs=answer_output,
        )

# Start the Gradio server.
demo.launch()
|