"""Load PDFs, Word documents, YouTube transcripts and audio as text chunks."""

import logging
import os
import re

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    YoutubeLoader
)
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    NoTranscriptAvailable
)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ContentProcessor:
    """Turn several kinds of source material into split LangChain documents.

    Every ``process_*`` method returns the chunks produced by the shared
    ``RecursiveCharacterTextSplitter`` (1000-character chunks, 200 overlap).
    """

    # Compiled once; tried in order by ``_extract_video_id``.  The captured ID
    # stops at the first query/fragment/path separator or whitespace.
    _VIDEO_ID_PATTERNS = (
        re.compile(
            r'(?:youtube\.com/watch\?(?:[^#\s]*?&)?v='
            r'|youtu\.be/'
            r'|youtube\.com/embed/)'
            r'([^&?#/\s]+)'
        ),
        re.compile(r'youtube\.com/(?:shorts|live)/([^&?#/\s]+)'),
    )

    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

    def process_pdf(self, file_path):
        """Load the PDF at ``file_path`` and return it split into chunks."""
        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split(self.text_splitter)
        return pages

    def process_docx(self, file_path):
        """Load the Word document at ``file_path`` and return it split into chunks."""
        loader = UnstructuredWordDocumentLoader(file_path)
        pages = loader.load_and_split(self.text_splitter)
        return pages

    def process_youtube(self, video_url):
        """Fetch a YouTube video's transcript and return it split into chunks.

        Args:
            video_url: URL of the video (watch, youtu.be, embed, shorts or
                live form).

        Returns:
            The chunks produced by splitting a single ``Document`` whose
            content is the joined transcript text and whose metadata is
            ``{"source": video_url}``.

        Raises:
            ValueError: If no video ID can be extracted from ``video_url``.
            Exception: If the transcript cannot be fetched or split; the
                underlying error is attached as ``__cause__``.
        """
        try:
            # Log the incoming URL
            logger.info("Processing YouTube URL: %s", video_url)

            video_id = self._extract_video_id(video_url)
            if not video_id:
                logger.error("Invalid YouTube URL: %s", video_url)
                raise ValueError(
                    "This appears to be an invalid YouTube URL. \n"
                    "Please check the URL and try again."
                )

            # Log the extracted video ID
            logger.info("Extracted video ID: %s", video_id)

            # Best-effort diagnostic only: a failure to list transcripts is
            # logged and must not stop the actual fetch below.
            try:
                available = YouTubeTranscriptApi.list_transcripts(video_id)
                logger.info("Available transcripts: %s", available)
            except Exception as e:
                logger.error("Error listing transcripts: %s", e)

            # Try to get the transcript
            try:
                entries = YouTubeTranscriptApi.get_transcript(video_id)
                full_transcript = " ".join(entry['text'] for entry in entries)

                # Create a document-like structure
                doc = Document(
                    page_content=full_transcript,
                    metadata={"source": video_url}
                )

                return self.text_splitter.split_documents([doc])
            except Exception as e:
                logger.error("Error getting transcript: %s", e)
                # Chain the original error so the root cause stays visible.
                raise Exception(
                    f"Unable to access video transcript. Error: {str(e)}\n"
                    "Please try a video with available captions."
                ) from e

        except Exception as e:
            logger.error("Process failed: %s", e)
            raise

    def _extract_video_id(self, url):
        """Return the video ID embedded in a YouTube URL, or ``None``.

        Recognises ``youtube.com/watch?...v=ID`` (``v`` anywhere in the query
        string), ``youtu.be/ID``, ``youtube.com/embed/ID``,
        ``youtube.com/shorts/ID`` and ``youtube.com/live/ID``.  Anything that
        is not a string, or that contains no non-empty ID, yields ``None``.
        """
        if not isinstance(url, str):
            return None

        for pattern in self._VIDEO_ID_PATTERNS:
            match = pattern.search(url)
            if match:
                return match.group(1)
        return None

    def process_audio(self, audio_file):
        """Transcribe audio with OpenAI Whisper and return the split transcript.

        Args:
            audio_file: A filesystem path (``str`` or ``os.PathLike``), or an
                already-built blob loader object.

        Returns:
            The chunks produced by splitting the transcribed documents.
        """
        whisper = OpenAIWhisperParser()
        if isinstance(audio_file, (str, os.PathLike)):
            # ``GenericLoader.__init__`` takes (blob_loader, blob_parser) and
            # has no ``parser`` keyword; paths go through ``from_filesystem``.
            # NOTE(review): confirm the installed langchain_community accepts
            # a single file path here (older releases only globbed directories).
            loader = GenericLoader.from_filesystem(audio_file, parser=whisper)
        else:
            # Presumably a blob loader instance — verify against callers.
            loader = GenericLoader(audio_file, whisper)
        transcript = loader.load()
        return self.text_splitter.split_documents(transcript)