adriiita committed on
Commit
44b8e54
1 Parent(s): 261f810

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -109
app.py CHANGED
@@ -1,117 +1,74 @@
1
- import gradio as gr
2
- from processors.input_processor import ContentProcessor
3
- from core.note_generator import NoteGenerator
4
- from core.quiz_generator import QuizGenerator
5
- import os
6
- from dotenv import load_dotenv
 
 
 
 
7
 
8
# Populate the process environment from the local .env file.
load_dotenv()

# Fail fast at import time when the OpenAI credential is absent or empty.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

# Shared pipeline objects used by the request handlers below.
processor = ContentProcessor()
note_gen, quiz_gen = NoteGenerator(api_key), QuizGenerator(api_key)
19
-
20
def process_pdf(pdf_file, num_questions):
    """Generate study notes and a quiz from an uploaded PDF.

    Args:
        pdf_file: Upload object from the file input. Only its ``name``
            attribute is read, and it is used as the path of the PDF.
            ``None`` means nothing was uploaded.
        num_questions: Number of quiz questions to request.

    Returns:
        A ``(notes, quiz)`` tuple of strings. When no file was supplied or
        processing fails, the first item is a human-readable message and the
        second is an empty string, mirroring ``process_youtube``.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", ""

    try:
        # No copy is made here: the upload's ``name`` is handed to the
        # processor as the path to read.
        pdf_path = pdf_file.name

        documents = processor.process_pdf(pdf_path)
        content = "\n".join(doc.page_content for doc in documents)

        notes = note_gen.generate_notes(content)
        quiz = quiz_gen.generate_quiz(content, num_questions)
    except Exception as e:
        # UI boundary: report the failure in the notes box instead of letting
        # the handler crash, the same way the YouTube handler does.
        return f"Error processing PDF: {e}", ""

    return notes, quiz
36
-
37
def process_youtube(youtube_url, num_questions, progress=gr.Progress()):
    """Generate study notes and a quiz from a YouTube video.

    Args:
        youtube_url: URL of the video; an empty value short-circuits with a
            prompt message.
        num_questions: Number of quiz questions to request.
        progress: Gradio progress tracker, called with a fraction and a
            description before each stage.

    Returns:
        A ``(notes, quiz)`` tuple of strings. When the URL is missing or any
        stage raises, the first item is a message and the second is "".
    """
    # Guard clause: nothing to do without a URL.
    if not youtube_url:
        return "Please enter a YouTube URL.", ""

    try:
        progress(0, desc="Starting video processing...")
        progress(0.2, desc="Attempting to get subtitles...")
        chunks = processor.process_youtube(youtube_url)

        progress(0.4, desc="Processing content...")
        transcript_text = "\n".join(chunk.page_content for chunk in chunks)

        progress(0.6, desc="Generating notes...")
        study_notes = note_gen.generate_notes(transcript_text)

        progress(0.8, desc="Generating quiz...")
        quiz_text = quiz_gen.generate_quiz(transcript_text, num_questions)

        progress(1.0, desc="Done!")
    except Exception as err:
        return f"Error processing YouTube URL: {err!s}", ""

    return study_notes, quiz_text
59
-
60
def _question_slider():
    """Build the 1-10 "number of questions" slider used on both tabs."""
    return gr.Slider(
        minimum=1,
        maximum=10,
        value=5,
        step=1,
        label="Number of Quiz Questions",
    )


# Two-tab interface: each tab has its own input, slider, button and a pair of
# output boxes wired to the matching handler.
with gr.Blocks(title="AI Teaching Assistant") as demo:
    gr.Markdown("# AI Teaching Assistant")
    gr.Markdown("Generate study notes and quizzes from PDFs or YouTube videos")

    with gr.Tabs():
        with gr.TabItem("PDF Processing"):
            # Input row: file picker, question count, trigger button.
            with gr.Row(), gr.Column():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                pdf_num_questions = _question_slider()
                pdf_button = gr.Button("Process PDF")

            # Output row: notes on the left, quiz on the right.
            with gr.Row():
                with gr.Column():
                    pdf_notes_output = gr.Textbox(label="Generated Notes", lines=10)
                with gr.Column():
                    pdf_quiz_output = gr.Textbox(label="Generated Quiz", lines=10)

            pdf_button.click(
                fn=process_pdf,
                inputs=[pdf_input, pdf_num_questions],
                outputs=[pdf_notes_output, pdf_quiz_output],
            )

        with gr.TabItem("YouTube Processing"):
            # Input row: URL box, question count, trigger button.
            with gr.Row(), gr.Column():
                youtube_input = gr.Textbox(label="YouTube URL")
                youtube_num_questions = _question_slider()
                youtube_button = gr.Button("Process YouTube Video")

            # Output row: notes on the left, quiz on the right.
            with gr.Row():
                with gr.Column():
                    youtube_notes_output = gr.Textbox(label="Generated Notes", lines=10)
                with gr.Column():
                    youtube_quiz_output = gr.Textbox(label="Generated Quiz", lines=10)

            youtube_button.click(
                fn=process_youtube,
                inputs=[youtube_input, youtube_num_questions],
                outputs=[youtube_notes_output, youtube_quiz_output],
            )

# Start the web UI only when executed as a script.
if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import (
2
+ PyPDFLoader,
3
+ UnstructuredWordDocumentLoader,
4
+ YoutubeLoader
5
+ )
6
+ from langchain_community.document_loaders.generic import GenericLoader
7
+ from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from youtube_transcript_api import YouTubeTranscriptApi
10
+ import re
11
 
12
class ContentProcessor:
    """Turn PDFs, Word files, YouTube transcripts and audio into text chunks.

    Every ``process_*`` method returns the list produced by the shared
    ``RecursiveCharacterTextSplitter`` stored in ``self.text_splitter``.
    """

    # URL layouts that carry a video id. Dots are escaped so look-alike hosts
    # such as "youtuXbe" are rejected, the watch form accepts other query
    # parameters before "v=", and the id stops at "&", "?", "#" or a newline.
    _VIDEO_ID_PATTERNS = (
        re.compile(
            r'(?:youtube\.com/watch\?(?:[^#\n]*&)?v=|youtu\.be/|youtube\.com/embed/)'
            r'([^&\n?#]*)'
        ),
        re.compile(r'youtube\.com/shorts/([^&\n?#]*)'),
    )

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        """Create the splitter shared by all ``process_*`` methods.

        Args:
            chunk_size: Value passed to the splitter as ``chunk_size``.
            chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def process_pdf(self, file_path):
        """Load the PDF at ``file_path`` and return it split into chunks."""
        loader = PyPDFLoader(file_path)
        return loader.load_and_split(self.text_splitter)

    def process_docx(self, file_path):
        """Load the Word document at ``file_path`` and return it split into chunks."""
        loader = UnstructuredWordDocumentLoader(file_path)
        return loader.load_and_split(self.text_splitter)

    def process_youtube(self, video_url):
        """Fetch a video's transcript and return it split into chunks.

        Args:
            video_url: YouTube URL in watch, short-link, embed or shorts form.

        Returns:
            The chunks produced by splitting one document whose content is
            the joined transcript text and whose ``source`` metadata is
            ``video_url``.

        Raises:
            ValueError: If no video id can be extracted from ``video_url``.
            Exception: If fetching or splitting the transcript fails; the
                original error is attached as ``__cause__``.
        """
        video_id = self._extract_video_id(video_url)
        if not video_id:
            raise ValueError("Invalid YouTube URL")

        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)

            # The transcript arrives as entries with a 'text' field; join
            # them into one string before splitting.
            full_transcript = " ".join(entry['text'] for entry in transcript_list)

            # Kept as a local import so the module's import list is unchanged.
            from langchain.schema import Document
            doc = Document(
                page_content=full_transcript,
                metadata={"source": video_url},
            )

            return self.text_splitter.split_documents([doc])
        except Exception as e:
            # Same exception type and message as before, now chained so the
            # underlying traceback is preserved.
            raise Exception(f"Error getting transcript: {str(e)}") from e

    def _extract_video_id(self, url):
        """Return the video id found in ``url``, or ``None`` if there is none.

        An empty id (for example ``watch?v=``) is treated as not found.
        """
        if not url:
            return None

        for pattern in self._VIDEO_ID_PATTERNS:
            match = pattern.search(url)
            if match and match.group(1):
                return match.group(1)
        return None

    def process_audio(self, audio_file):
        """Transcribe ``audio_file`` with Whisper and return the split transcript.

        Args:
            audio_file: Path handed to the loader.
        """
        # GenericLoader's constructor expects a blob loader and a blob parser,
        # so a path plus ``parser=`` has to go through the from_filesystem
        # factory instead.
        # NOTE(review): assumes the installed langchain_community accepts a
        # single file path here as well as a directory; confirm.
        loader = GenericLoader.from_filesystem(
            audio_file,
            parser=OpenAIWhisperParser(),
        )
        transcript = loader.load()
        return self.text_splitter.split_documents(transcript)