adriiita committed on
Commit
c17054a
1 Parent(s): 268ef51

Update processors/input_processor.py

Browse files
Files changed (1) hide show
  1. processors/input_processor.py +40 -18
processors/input_processor.py CHANGED
@@ -13,6 +13,11 @@ from youtube_transcript_api import (
13
  NoTranscriptAvailable
14
  )
15
  import re
 
 
 
 
 
16
 
17
  class ContentProcessor:
18
  def __init__(self):
@@ -32,29 +37,46 @@ class ContentProcessor:
32
  return pages
33
 
34
  def process_youtube(self, video_url):
35
- video_id = self._extract_video_id(video_url)
36
- if not video_id:
37
- raise ValueError("This appears to be an invalid YouTube URL. Please check the URL and try again.")
38
-
39
  try:
40
- transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
41
- full_transcript = " ".join([entry['text'] for entry in transcript_list])
 
 
 
 
 
 
 
 
42
 
43
- # Create a document-like structure
44
- from langchain.schema import Document
45
- doc = Document(
46
- page_content=full_transcript,
47
- metadata={"source": video_url}
48
- )
49
 
50
- return self.text_splitter.split_documents([doc])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- except TranscriptsDisabled:
53
- raise Exception("This video does not have subtitles/captions enabled. Please try a different video that has captions available.")
54
- except NoTranscriptFound:
55
- raise Exception("No transcript was found for this video. Please try a different video that has captions available.")
56
  except Exception as e:
57
- raise Exception(f"Unable to get transcript: {str(e)}. Please ensure the video has captions enabled.")
 
58
 
59
  def _extract_video_id(self, url):
60
  # Handle different YouTube URL formats
 
13
  NoTranscriptAvailable
14
  )
15
  import re
16
+ import logging
17
+
18
+ # Set up logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
 
22
  class ContentProcessor:
23
  def __init__(self):
 
37
  return pages
38
 
39
  def process_youtube(self, video_url):
 
 
 
 
40
  try:
41
+ # Log the incoming URL
42
+ logger.info(f"Processing YouTube URL: {video_url}")
43
+
44
+ video_id = self._extract_video_id(video_url)
45
+ if not video_id:
46
+ logger.error(f"Invalid YouTube URL: {video_url}")
47
+ raise ValueError("This appears to be an invalid YouTube URL. Please check the URL and try again.")
48
+
49
+ # Log the extracted video ID
50
+ logger.info(f"Extracted video ID: {video_id}")
51
 
52
+ # List available transcripts
53
+ try:
54
+ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
55
+ logger.info(f"Available transcripts: {transcript_list}")
56
+ except Exception as e:
57
+ logger.error(f"Error listing transcripts: {str(e)}")
58
 
59
+ # Try to get the transcript
60
+ try:
61
+ transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
62
+ full_transcript = " ".join([entry['text'] for entry in transcript_list])
63
+
64
+ # Create a document-like structure
65
+ from langchain.schema import Document
66
+ doc = Document(
67
+ page_content=full_transcript,
68
+ metadata={"source": video_url}
69
+ )
70
+
71
+ return self.text_splitter.split_documents([doc])
72
+
73
+ except Exception as e:
74
+ logger.error(f"Error getting transcript: {str(e)}")
75
+ raise Exception(f"Unable to access video transcript. Error: {str(e)}\nPlease try a video with available captions.")
76
 
 
 
 
 
77
  except Exception as e:
78
+ logger.error(f"Process failed: {str(e)}")
79
+ raise
80
 
81
  def _extract_video_id(self, url):
82
  # Handle different YouTube URL formats