adriiita committed on
Commit
c347d26
1 Parent(s): 55bc5a4

Initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ .env
5
+ .venv
6
+ env/
7
+ venv/
8
+ ENV/
9
+ *.pdf
10
+ *.docx
11
+ *.txt
12
+ !requirements.txt
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from processors.input_processor import ContentProcessor
3
+ from core.note_generator import NoteGenerator
4
+ from core.quiz_generator import QuizGenerator
5
+ import os
6
+ from dotenv import load_dotenv
7
+
8
# Load environment variables from .env file
load_dotenv()

# A single lookup is enough: values from the local .env file (loaded above)
# and secrets injected by the hosting platform are both read from the process
# environment, so a second identical getenv call can never succeed where the
# first one failed.
api_key = os.getenv("GROQ_API_KEY")
if not api_key:
    raise ValueError("GROQ_API_KEY not found in environment variables")

# Shared, module-level collaborators used by the UI callbacks.
processor = ContentProcessor()
note_gen = NoteGenerator(api_key)
quiz_gen = QuizGenerator(api_key)
22
+
23
def process_pdf(pdf_file, num_questions):
    """Generate study notes and a quiz from an uploaded PDF.

    Args:
        pdf_file: Value delivered by the upload component. Either an object
            exposing the file's location as ``.name`` or the path itself;
            ``None`` means nothing was uploaded.
        num_questions: Number of quiz questions to request.

    Returns:
        A ``(notes, quiz)`` tuple of strings. When no file was uploaded, or
        when any processing step fails, the first element carries a message
        for the user and the second element is empty.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", ""

    # Nothing is copied here: the upload already lives on disk and only its
    # path is needed. Fall back to the value itself when it has no ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Report failures the same way process_youtube does, so a loader or LLM
    # error shows up as a message in the UI instead of an unhandled exception.
    try:
        documents = processor.process_pdf(pdf_path)
        content = "\n".join(doc.page_content for doc in documents)

        notes = note_gen.generate_notes(content)
        quiz = quiz_gen.generate_quiz(content, num_questions)

        return notes, quiz
    except Exception as e:
        return f"Error processing PDF: {str(e)}", ""
39
+
40
def process_youtube(youtube_url, num_questions):
    """Generate study notes and a quiz from a YouTube video's transcript.

    Args:
        youtube_url: URL typed into the text box; empty or ``None`` when the
            user entered nothing.
        num_questions: Number of quiz questions to request.

    Returns:
        A ``(notes, quiz)`` tuple of strings. When the URL is missing or any
        step raises, the first element is a message for the user and the
        second element is empty.
    """
    if youtube_url:
        try:
            chunks = processor.process_youtube(youtube_url)
            transcript_text = "\n".join(chunk.page_content for chunk in chunks)

            generated_notes = note_gen.generate_notes(transcript_text)
            generated_quiz = quiz_gen.generate_quiz(transcript_text, num_questions)
        except Exception as err:
            return f"Error processing YouTube URL: {str(err)}", ""
        return generated_notes, generated_quiz

    return "Please enter a YouTube URL.", ""
54
+
55
# Create Gradio interface
# The page has two tabs with the same layout: a column holding the source
# input, a question-count slider and a button, followed by a row with one
# text box for the generated notes and one for the generated quiz.
with gr.Blocks(title="AI Teaching Assistant") as demo:
    gr.Markdown("# AI Teaching Assistant")
    gr.Markdown("Generate study notes and quizzes from PDFs or YouTube videos")

    with gr.Tabs():
        # --- PDF tab -------------------------------------------------------
        with gr.TabItem("PDF Processing"):
            with gr.Row():
                with gr.Column():
                    # Upload widget restricted to .pdf files.
                    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                    # Whole-number slider, 1 to 10 questions, starting at 5.
                    pdf_num_questions = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=5,
                        step=1,
                        label="Number of Quiz Questions"
                    )
                    pdf_button = gr.Button("Process PDF")

            with gr.Row():
                with gr.Column():
                    pdf_notes_output = gr.Textbox(label="Generated Notes", lines=10)
                with gr.Column():
                    pdf_quiz_output = gr.Textbox(label="Generated Quiz", lines=10)

            # process_pdf receives (file, question count) and its two return
            # values fill the notes and quiz boxes, in that order.
            pdf_button.click(
                fn=process_pdf,
                inputs=[pdf_input, pdf_num_questions],
                outputs=[pdf_notes_output, pdf_quiz_output]
            )

        # --- YouTube tab ---------------------------------------------------
        with gr.TabItem("YouTube Processing"):
            with gr.Row():
                with gr.Column():
                    youtube_input = gr.Textbox(label="YouTube URL")
                    # Same slider configuration as on the PDF tab.
                    youtube_num_questions = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=5,
                        step=1,
                        label="Number of Quiz Questions"
                    )
                    youtube_button = gr.Button("Process YouTube Video")

            with gr.Row():
                with gr.Column():
                    youtube_notes_output = gr.Textbox(label="Generated Notes", lines=10)
                with gr.Column():
                    youtube_quiz_output = gr.Textbox(label="Generated Quiz", lines=10)

            # process_youtube receives (URL, question count) and its two
            # return values fill the notes and quiz boxes, in that order.
            youtube_button.click(
                fn=process_youtube,
                inputs=[youtube_input, youtube_num_questions],
                outputs=[youtube_notes_output, youtube_quiz_output]
            )

# Start the app only when this file is run as a script; share=False is passed
# explicitly to launch().
if __name__ == "__main__":
    demo.launch(share=False)
core/note_generator.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_groq import ChatGroq
2
+ from langchain.prompts import PromptTemplate
3
+
4
class NoteGenerator:
    """Produce structured study notes from raw text with a Groq-hosted chat model.

    Args:
        api_key: Groq API key handed to ``ChatGroq``.
        model_name: Identifier of the Groq model to use. The default keeps the
            value this class has always used.
        temperature: Sampling temperature handed to ``ChatGroq``.
    """

    def __init__(self, api_key, model_name="llama2-70b-4096", temperature=0.7):
        # NOTE(review): the set of models Groq serves changes over time --
        # confirm the default ID is still available and pass ``model_name``
        # to override it without editing this class.
        self.llm = ChatGroq(
            temperature=temperature,
            groq_api_key=api_key,
            model_name=model_name
        )

        self.note_prompt = PromptTemplate(
            input_variables=["content"],
            template="""
            Create detailed, structured notes from the following content:
            {content}

            Format the notes with:
            1. Main topics and subtopics
            2. Key points and definitions
            3. Important examples
            4. Summary
            """
        )

        # Prompt piped into the model: invoking the chain fills the template
        # and sends the result to the LLM.
        self.chain = self.note_prompt | self.llm

    def generate_notes(self, content):
        """Return the model's notes for ``content``.

        Args:
            content: Source text substituted into the prompt's ``{content}``
                placeholder. The whole text is sent in a single request.

        Returns:
            The ``.content`` attribute of the chain's response.
        """
        return self.chain.invoke({"content": content}).content
core/quiz_generator.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_groq import ChatGroq
2
+ from langchain.prompts import PromptTemplate
3
+
4
class QuizGenerator:
    """Produce multiple-choice quizzes from raw text with a Groq-hosted chat model.

    Args:
        api_key: Groq API key handed to ``ChatGroq``.
        model_name: Identifier of the Groq model to use. The default keeps the
            value this class has always used.
        temperature: Sampling temperature handed to ``ChatGroq``.
    """

    def __init__(self, api_key, model_name="llama2-70b-4096", temperature=0.7):
        # NOTE(review): the set of models Groq serves changes over time --
        # confirm the default ID is still available and pass ``model_name``
        # to override it without editing this class.
        self.llm = ChatGroq(
            temperature=temperature,
            groq_api_key=api_key,
            model_name=model_name
        )

        self.quiz_prompt = PromptTemplate(
            input_variables=["content", "num_questions"],
            template="""
            Create {num_questions} multiple-choice questions based on this content:
            {content}

            For each question:
            1. Provide the question
            2. List 4 possible answers
            3. Indicate the correct answer
            4. Add a brief explanation
            """
        )

        # Prompt piped into the model: invoking the chain fills the template
        # and sends the result to the LLM.
        self.chain = self.quiz_prompt | self.llm

    def generate_quiz(self, content, num_questions=5):
        """Return the model's quiz for ``content``.

        Args:
            content: Source text substituted into the prompt's ``{content}``
                placeholder.
            num_questions: How many questions to ask for. Coerced to an
                integer of at least 1 so that a value such as ``5.0`` is
                rendered in the prompt as ``5``.

        Returns:
            The ``.content`` attribute of the chain's response.
        """
        question_count = max(1, int(num_questions))
        return self.chain.invoke({
            "content": content,
            "num_questions": question_count
        }).content
processors/input_processor.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import (
2
+ PyPDFLoader,
3
+ UnstructuredWordDocumentLoader,
4
+ YoutubeLoader
5
+ )
6
+ from langchain_community.document_loaders.generic import GenericLoader
7
+ from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from youtube_transcript_api import YouTubeTranscriptApi
10
+ import re
11
+
12
class ContentProcessor:
    """Turn PDFs, Word documents, YouTube videos and audio into text chunks.

    Every ``process_*`` method returns the documents produced by one shared
    ``RecursiveCharacterTextSplitter`` configured for 1000-character chunks
    with a 200-character overlap.
    """

    # Recognised URL shapes; group 1 captures the video ID. Dots are escaped
    # so that only the real host names match, ``v=`` may appear anywhere in
    # the query string, and the ID stops at ``&``, ``?``, ``#`` or a newline.
    _VIDEO_ID_PATTERNS = (
        re.compile(
            r'(?:youtube\.com/watch\?(?:v=|[^#\s]*?&v=)'
            r'|youtu\.be/'
            r'|youtube\.com/embed/)'
            r'([^&\n?#]+)'
        ),
        re.compile(r'youtube\.com/shorts/([^&\n?#]+)'),
    )

    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

    def process_pdf(self, file_path):
        """Load the PDF at ``file_path`` and return it split into chunks."""
        loader = PyPDFLoader(file_path)
        return loader.load_and_split(self.text_splitter)

    def process_docx(self, file_path):
        """Load the Word document at ``file_path`` and return it split into chunks."""
        loader = UnstructuredWordDocumentLoader(file_path)
        return loader.load_and_split(self.text_splitter)

    def process_youtube(self, video_url):
        """Fetch a video's transcript and return it split into chunks.

        Args:
            video_url: A YouTube watch, short-link, embed or shorts URL.

        Returns:
            The split documents; each carries ``{"source": video_url}`` as
            metadata.

        Raises:
            ValueError: If no video ID can be extracted from ``video_url``.
            Exception: If the transcript cannot be fetched; the underlying
                error is attached as ``__cause__``.
        """
        video_id = self._extract_video_id(video_url)
        if not video_id:
            raise ValueError("Invalid YouTube URL")

        # Only the transcript fetch is guarded, so the "Error getting
        # transcript" message is never attached to an unrelated failure.
        # NOTE(review): ``get_transcript`` is the class-level API of
        # youtube-transcript-api; confirm it exists in the installed version.
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
            full_transcript = " ".join(entry['text'] for entry in transcript_list)
        except Exception as e:
            raise Exception(f"Error getting transcript: {str(e)}") from e

        # Wrap the text in a Document so the splitter can process it.
        from langchain.schema import Document
        doc = Document(
            page_content=full_transcript,
            metadata={"source": video_url}
        )

        return self.text_splitter.split_documents([doc])

    def _extract_video_id(self, url):
        """Return the video ID found in ``url``, or ``None`` if there is none.

        Empty input and URLs whose ID part is empty (for example
        ``https://youtu.be/``) yield ``None``.
        """
        if not url:
            return None

        candidate = url.strip()
        for pattern in self._VIDEO_ID_PATTERNS:
            match = pattern.search(candidate)
            if match:
                return match.group(1)
        return None

    def process_audio(self, audio_file):
        """Parse the audio at ``audio_file`` with ``OpenAIWhisperParser`` and split the result.

        ``GenericLoader``'s constructor expects a blob loader and a blob
        parser, not a path and a ``parser=`` keyword, so the loader is built
        with ``from_filesystem``, which accepts both.
        """
        loader = GenericLoader.from_filesystem(
            audio_file,
            parser=OpenAIWhisperParser()
        )
        transcript = loader.load()
        return self.text_splitter.split_documents(transcript)
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ langchain>=0.1.0
3
+ langchain-openai>=0.0.2
4
+ openai>=1.12.0
5
+ python-dotenv>=1.0.0
6
+ langchain-community>=0.0.1
7
+
8
+
9
+ # Document processing
10
+ PyPDF2>=3.0.0
11
+ unstructured>=0.10.0
12
+ python-docx>=0.8.11
13
+
14
+ # YouTube processing
15
+ youtube-transcript-api>=0.6.1
16
+ pytube>=15.0.0
17
+
18
+ # Text processing
19
+ tiktoken>=0.5.1
20
+
21
+ # Audio processing (optional, for future audio features)
22
+ openai-whisper>=20231117
23
+
24
+ # Development tools
25
+ uvicorn>=0.27.0
26
+ python-multipart>=0.0.9
27
+
28
+ # Groq dependencies
29
+ groq>=0.4.0