Spaces:

adriiita
/

AITeachingAssistant

Sleeping

App Files Community

adriiita committed 9 days ago

Commit

44b8e54

•

1 Parent(s): 261f810

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -109

app.py CHANGED Viewed

@@ -1,117 +1,74 @@
-import gradio as gr
-from processors.input_processor import ContentProcessor
-from core.note_generator import NoteGenerator
-from core.quiz_generator import QuizGenerator
-import os
-from dotenv import load_dotenv
-# Load environment variables from .env file
-load_dotenv()
-# Verify API key is loaded
-api_key = os.getenv("OPENAI_API_KEY")
-if not api_key:
-    raise ValueError("OPENAI_API_KEY not found in environment variables")
-processor = ContentProcessor()
-note_gen = NoteGenerator(api_key)
-quiz_gen = QuizGenerator(api_key)
-def process_pdf(pdf_file, num_questions):
-    if pdf_file is None:
-        return "Please upload a PDF file.", ""
-    # Save uploaded file temporarily
-    temp_path = pdf_file.name
-    # Process content
-    documents = processor.process_pdf(temp_path)
-    content = "\n".join([doc.page_content for doc in documents])
-    # Generate outputs
-    notes = note_gen.generate_notes(content)
-    quiz = quiz_gen.generate_quiz(content, num_questions)
-    return notes, quiz
-def process_youtube(youtube_url, num_questions, progress=gr.Progress()):
-    if not youtube_url:
-        return "Please enter a YouTube URL.", ""
-    try:
-        progress(0, desc="Starting video processing...")
-        progress(0.2, desc="Attempting to get subtitles...")
-        documents = processor.process_youtube(youtube_url)
-        progress(0.4, desc="Processing content...")
-        content = "\n".join([doc.page_content for doc in documents])
-        progress(0.6, desc="Generating notes...")
-        notes = note_gen.generate_notes(content)
-        progress(0.8, desc="Generating quiz...")
-        quiz = quiz_gen.generate_quiz(content, num_questions)
-        progress(1.0, desc="Done!")
-        return notes, quiz
-    except Exception as e:
-        return f"Error processing YouTube URL: {str(e)}", ""
-# Create Gradio interface
-with gr.Blocks(title="AI Teaching Assistant") as demo:
-    gr.Markdown("# AI Teaching Assistant")
-    gr.Markdown("Generate study notes and quizzes from PDFs or YouTube videos")
-    with gr.Tabs():
-        with gr.TabItem("PDF Processing"):
-            with gr.Row():
-                with gr.Column():
-                    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-                    pdf_num_questions = gr.Slider(
-                        minimum=1,
-                        maximum=10,
-                        value=5,
-                        step=1,
-                        label="Number of Quiz Questions"
-                    )
-                    pdf_button = gr.Button("Process PDF")
-            with gr.Row():
-                with gr.Column():
-                    pdf_notes_output = gr.Textbox(label="Generated Notes", lines=10)
-                with gr.Column():
-                    pdf_quiz_output = gr.Textbox(label="Generated Quiz", lines=10)
-            pdf_button.click(
-                fn=process_pdf,
-                inputs=[pdf_input, pdf_num_questions],
-                outputs=[pdf_notes_output, pdf_quiz_output]
-            )
-        with gr.TabItem("YouTube Processing"):
-            with gr.Row():
-                with gr.Column():
-                    youtube_input = gr.Textbox(label="YouTube URL")
-                    youtube_num_questions = gr.Slider(
-                        minimum=1,
-                        maximum=10,
-                        value=5,
-                        step=1,
-                        label="Number of Quiz Questions"
-                    )
-                    youtube_button = gr.Button("Process YouTube Video")
-            with gr.Row():
-                with gr.Column():
-                    youtube_notes_output = gr.Textbox(label="Generated Notes", lines=10)
-                with gr.Column():
-                    youtube_quiz_output = gr.Textbox(label="Generated Quiz", lines=10)
-            youtube_button.click(
-                fn=process_youtube,
-                inputs=[youtube_input, youtube_num_questions],
-                outputs=[youtube_notes_output, youtube_quiz_output]
             )
-if __name__ == "__main__":
-    demo.launch()

+from langchain_community.document_loaders import (
+    PyPDFLoader,
+    UnstructuredWordDocumentLoader,
+    YoutubeLoader
+)
+from langchain_community.document_loaders.generic import GenericLoader
+from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from youtube_transcript_api import YouTubeTranscriptApi
+import re
+class ContentProcessor:
+    def __init__(self):
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200
+        )
+    def process_pdf(self, file_path):
+        loader = PyPDFLoader(file_path)
+        pages = loader.load_and_split(self.text_splitter)
+        return pages
+    def process_docx(self, file_path):
+        loader = UnstructuredWordDocumentLoader(file_path)
+        pages = loader.load_and_split(self.text_splitter)
+        return pages
+    def process_youtube(self, video_url):
+        # Extract video ID from URL
+        video_id = self._extract_video_id(video_url)
+        if not video_id:
+            raise ValueError("Invalid YouTube URL")
+        try:
+            # Get transcript directly using youtube_transcript_api
+            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
+            # Combine all transcript pieces
+            full_transcript = " ".join([entry['text'] for entry in transcript_list])
+            # Create a document-like structure
+            from langchain.schema import Document
+            doc = Document(
+                page_content=full_transcript,
+                metadata={"source": video_url}
             )
+            # Split the document
+            return self.text_splitter.split_documents([doc])
+        except Exception as e:
+            raise Exception(f"Error getting transcript: {str(e)}")
+    def _extract_video_id(self, url):
+        # Handle different YouTube URL formats
+        patterns = [
+            r'(?:youtube\.com\/watch\?v=|youtu.be\/|youtube.com\/embed\/)([^&\n?]*)',
+            r'(?:youtube\.com\/shorts\/)([^&\n?]*)'
+        ]
+        for pattern in patterns:
+            match = re.search(pattern, url)
+            if match:
+                return match.group(1)
+        return None
+    def process_audio(self, audio_file):
+        loader = GenericLoader(
+            audio_file,
+            parser=OpenAIWhisperParser()
+        )
+        transcript = loader.load()
+        return self.text_splitter.split_documents(transcript)