Jeremy Watt committed on
Commit
beed08a
1 Parent(s): 3d27671

first push to hf

Browse files
Files changed (32) hide show
  1. .gitignore +174 -0
  2. requirements.txt +5 -0
  3. ytdatakit/__init__.py +0 -0
  4. ytdatakit/about.py +15 -0
  5. ytdatakit/about/__init__.py +0 -0
  6. ytdatakit/about/app.py +17 -0
  7. ytdatakit/app.py +27 -0
  8. ytdatakit/youtube_channel_downloader/__init__.py +0 -0
  9. ytdatakit/youtube_channel_downloader/app.py +86 -0
  10. ytdatakit/youtube_channel_downloader/callbacks.py +21 -0
  11. ytdatakit/youtube_channel_downloader/config.py +1 -0
  12. ytdatakit/youtube_channel_downloader/state.py +24 -0
  13. ytdatakit/youtube_channel_downloader/yt_channel_download.py +56 -0
  14. ytdatakit/youtube_downloader/__init__.py +4 -0
  15. ytdatakit/youtube_downloader/app.py +85 -0
  16. ytdatakit/youtube_downloader/callbacks.py +10 -0
  17. ytdatakit/youtube_downloader/config.py +4 -0
  18. ytdatakit/youtube_downloader/state.py +17 -0
  19. ytdatakit/youtube_downloader/yt_download.py +65 -0
  20. ytdatakit/youtube_thumbnail_downloader/__init__.py +0 -0
  21. ytdatakit/youtube_thumbnail_downloader/app.py +93 -0
  22. ytdatakit/youtube_thumbnail_downloader/callbacks.py +61 -0
  23. ytdatakit/youtube_thumbnail_downloader/config.py +13 -0
  24. ytdatakit/youtube_thumbnail_downloader/state.py +34 -0
  25. ytdatakit/youtube_thumbnail_downloader/yt_thumbnail_downloader.py +73 -0
  26. ytdatakit/youtube_thumbnail_downloader/zip.py +14 -0
  27. ytdatakit/youtube_transcript_downloader/__init__.py +0 -0
  28. ytdatakit/youtube_transcript_downloader/app.py +70 -0
  29. ytdatakit/youtube_transcript_downloader/callbacks.py +53 -0
  30. ytdatakit/youtube_transcript_downloader/config.py +0 -0
  31. ytdatakit/youtube_transcript_downloader/state.py +12 -0
  32. ytdatakit/youtube_transcript_downloader/yt_transcript_download.py +51 -0
.gitignore ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ scratch.ipynb
2
+ *.mp4a
3
+ *.mp3
4
+ *.mp4
5
+ venv*
6
+ *.db
7
+ *.faiss
8
+ .DS_Store
9
+ ._.DS_Store
10
+ **/.DS_Store
11
+ **/._.DS_Store
12
+ **/.env*
13
+ bug_reports/
14
+ .ruff_cache/
15
+ .vscode
16
+ notebook_tests/
17
+ scratch_notebooks/
18
+ demos/
19
+ release_notes/
20
+ site/
21
+ tests/test_files/text/test_preprocessed/*
22
+ !.ruff.toml
23
+ push_pypi.sh
24
+
25
+
26
+ # Byte-compiled / optimized / DLL files
27
+ __pycache__/
28
+ *.py[cod]
29
+ *$py.class
30
+
31
+ # C extensions
32
+ *.so
33
+
34
+ # Distribution / packaging
35
+ .Python
36
+ build/
37
+ develop-eggs/
38
+ dist/
39
+ downloads/
40
+ eggs/
41
+ .eggs/
42
+ lib/
43
+ lib64/
44
+ parts/
45
+ sdist/
46
+ var/
47
+ wheels/
48
+ share/python-wheels/
49
+ *.egg-info/
50
+ .installed.cfg
51
+ *.egg
52
+ MANIFEST
53
+
54
+ # PyInstaller
55
+ # Usually these files are written by a python script from a template
56
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
57
+ *.manifest
58
+ *.spec
59
+
60
+ # Installer logs
61
+ pip-log.txt
62
+ pip-delete-this-directory.txt
63
+
64
+ # Unit test / coverage reports
65
+ htmlcov/
66
+ .tox/
67
+ .nox/
68
+ .coverage
69
+ .coverage.*
70
+ .cache
71
+ nosetests.xml
72
+ coverage.xml
73
+ *.cover
74
+ *.py,cover
75
+ .hypothesis/
76
+ .pytest_cache/
77
+ cover/
78
+
79
+ # Translations
80
+ *.mo
81
+ *.pot
82
+
83
+ # Django stuff:
84
+ *.log
85
+ local_settings.py
86
+ db.sqlite3
87
+ db.sqlite3-journal
88
+
89
+ # Flask stuff:
90
+ instance/
91
+ .webassets-cache
92
+
93
+ # Scrapy stuff:
94
+ .scrapy
95
+
96
+ # Sphinx documentation
97
+ docs/_build/
98
+
99
+ # PyBuilder
100
+ .pybuilder/
101
+ target/
102
+
103
+ # Jupyter Notebook
104
+ .ipynb_checkpoints
105
+
106
+ # IPython
107
+ profile_default/
108
+ ipython_config.py
109
+
110
+ # pyenv
111
+ # For a library or package, you might want to ignore these files since the code is
112
+ # intended to run in multiple environments; otherwise, check them in:
113
+ # .python-version
114
+
115
+ # pipenv
116
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
117
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
118
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
119
+ # install all needed dependencies.
120
+ #Pipfile.lock
121
+
122
+ # poetry
123
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
124
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
125
+ # commonly ignored for libraries.
126
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
127
+ #poetry.lock
128
+
129
+ # pdm
130
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
131
+ #pdm.lock
132
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
133
+ # in version control.
134
+ # https://pdm.fming.dev/#use-with-ide
135
+ .pdm.toml
136
+
137
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
138
+ __pypackages__/
139
+
140
+ # Celery stuff
141
+ celerybeat-schedule
142
+ celerybeat.pid
143
+
144
+ # SageMath parsed files
145
+ *.sage.py
146
+
147
+ # Environments
148
+ .venv
149
+ venv/
150
+ venv.bak/
151
+
152
+ # Spyder project settings
153
+ .spyderproject
154
+ .spyproject
155
+
156
+ # Rope project settings
157
+ .ropeproject
158
+
159
+ # mkdocs documentation
160
+ /site
161
+
162
+ # mypy
163
+ .mypy_cache/
164
+ .dmypy.json
165
+ dmypy.json
166
+
167
+ # Pyre type checker
168
+ .pyre/
169
+
170
+ # pytype static type analyzer
171
+ .pytype/
172
+
173
+ # Cython debug symbols
174
+ cython_debug/
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ yt-dlp
2
+ scrapetube
3
+ youtube-transcript-api
4
+ pandas
5
+ streamlit
ytdatakit/__init__.py ADDED
File without changes
ytdatakit/about.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def about():
    """Render the "About" notes as Markdown and return Streamlit's result.

    Every numbered note ends with a newline so Markdown renders the notes as
    separate list items instead of gluing consecutive items together.
    """
    return st.markdown(
        (
            "### About \n"
            "Some notes on how this works: \n\n"
            "1. **youtube / google login**: you do **not** need to be logged into a google account to use the app, with one exception: age restricted videos \n"
            "2. **age restricted videos**: this app cannot fetch age restricted videos yet, which requires a user login to google / youtube - this feature is not yet available \n"
            "3. **video resolution**: not all videos have all possible resolutions, so you may not be able to fetch the resolution you want for some videos (as they don't exist) \n"
            "4. **recommended hardware**: this is a very light weight app, so minimum specs should work fine \n"
            "5. **proxies**: there is an option in the yt_download module to enter proxy server ips \n"
        )
    )
ytdatakit/about/__init__.py ADDED
File without changes
ytdatakit/about/app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def app():
    """Render the landing tab: a short description of each tab plus a demo gif."""
    sections = (
        "### YTDatakit \n\n",
        "**Download YouTube videos, transcripts, thumbnails, and channel data - all in one place.** \n\n",
        "One app per tab - as detailed below. \n\n",
        "**Tab 1:** 💡 About - you are here. \n\n",
        "**Tab 2:** 🎞️ Video downloader - enter a YouTube / Shorts url and download its mp4 file. \n\n",
        "**Tab 3:** 📜 Transcript downloader - download multiple YouTube / Shorts transcripts at once. \n\n",
        "**Tab 4:** 📌 Thumbnail downloader - download multiple YouTube / Shorts thumbnails at once. \n\n",
        "**Tab 5:** 📕 Channel downloader - download all YouTube video ids associated with a channel name. \n\n",
        "Each app is illustrated in the gif below. \n \n",
        "![Alt Text](https://github.com/neonwatty/readme_gifs/blob/main/ytdatakit.gif?raw=true)",
    )
    # Join the pieces into the single Markdown document handed to Streamlit.
    st.markdown("".join(sections))
ytdatakit/app.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from ytdatakit.about.app import app as about_page
3
+ from ytdatakit.youtube_downloader.app import app as video_downloader
4
+ from ytdatakit.youtube_transcript_downloader.app import app as transcript_downloader
5
+ from ytdatakit.youtube_thumbnail_downloader.app import app as thumbnail_downloader
6
+ from ytdatakit.youtube_channel_downloader.app import app as channel_downloader
7
+
8
# Entry point of the Streamlit app: configures the page, then renders one
# sub-app per tab.
app_name = "ytdatakit"
# The page configuration is applied before any other Streamlit element is drawn.
st.set_page_config(page_title=app_name)
st.title(app_name)
st.markdown("###### Run this app locally by pulling [the official repo](https://github.com/neonwatty/ytdatakit)")


# One tab per sub-app; the labels are listed in the same order as the
# tab1..tab5 variables they are unpacked into.
tab1, tab2, tab3, tab4, tab5 = st.tabs(
    ["💡 About", "🎞️ Video downloader", "📜 Transcript downloader", "📌 Thumbnail downloader", "📕 Channel downloader"]
)

# Each sub-app draws its widgets inside its own tab container.
with tab1:
    about_page()
with tab2:
    video_downloader()
with tab3:
    transcript_downloader()
with tab4:
    thumbnail_downloader()
with tab5:
    channel_downloader()
ytdatakit/youtube_channel_downloader/__init__.py ADDED
File without changes
ytdatakit/youtube_channel_downloader/app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ytdatakit.youtube_channel_downloader.callbacks import fetch_channel_videos
2
+ from ytdatakit.youtube_channel_downloader.state import state_init, state_reset
3
+ import streamlit as st
4
+
5
+
6
def app():
    """Render the channel downloader tab.

    Shows a channel-name input, a "fetch" button that looks up the channel's
    video ids, a CSV download button, and a preview of the first ten rows.

    A failed lookup (``fetch_channel_videos`` returning ``(None, None)``) is
    reported with a warning and leaves the stored table untouched, so the
    preview below never receives ``None``. The fetched channel name is recorded
    in session state so that clicking "fetch" again for the same channel reuses
    the stored result instead of fetching again.
    """
    state_init()
    # Hide the helper <style>/<span> elements and colour the button that
    # follows each marker span.
    st.markdown(
        """
        <style>
        .element-container:has(style){
            display: none;
        }
        #button-download {
            display: none;
        }
        .element-container:has(#button-download) {
            display: none;
        }
        .element-container:has(#button-download) + div button {
            background-color: green;
            border-color: green;
        }
        #button-fetch {
            display: none;
        }
        .element-container:has(#button-fetch) {
            display: none;
        }
        .element-container:has(#button-fetch) + div button {
            background-color: blue;
            border-color: blue;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    # Transparent, tiny text used as a vertical spacer above the buttons.
    st.markdown(
        """
        <style>
        .custom-font {
            font-size:7.5px !important;
            color: transparent;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    video_channel_col_a, video_channel_col_b, video_channel_col_c, video_channel_col_empty = st.columns([4, 3, 2, 2])
    with video_channel_col_a:
        channel_name = st.text_input(
            value=st.session_state.channel_name,
            label="🔗 paste YouTube channel name here",
            placeholder="e.g., littletfitness",
        )
    with video_channel_col_b:
        st.markdown('<p class="custom-font">fetch</p>', unsafe_allow_html=True)
        st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
        fetch_btn = st.button(
            "fetch channel video ids",
            type="primary",
        )

    if fetch_btn:
        if channel_name != st.session_state.channel_name:
            # A different channel invalidates whatever was fetched before.
            state_reset()
            st.session_state.channel_name = channel_name
        if st.session_state.channel_fetch_count == 0:
            df_table, df_download = fetch_channel_videos(channel_name)
            if df_table is None or df_download is None:
                # Keep the previous table; the fetch count stays at 0 so the
                # user can retry and the download button stays disabled.
                st.warning(f"could not fetch video ids for channel '{channel_name}'", icon="⚠️")
            else:
                st.session_state.channel_data_table = df_table
                st.session_state.channel_data_download = df_download
                st.session_state.channel_fetch_count += 1

    with video_channel_col_c:
        st.markdown('<p class="custom-font">fetch</p>', unsafe_allow_html=True)
        st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
        st.download_button(
            label="download",
            data=st.session_state.channel_data_download,
            file_name="channel_data.csv",
            mime="text/csv",
            # Nothing to download until at least one fetch has succeeded.
            disabled=st.session_state.channel_fetch_count == 0,
            type="primary",
        )
    with st.container(border=True):
        st.table(st.session_state.channel_data_table.head(10))
ytdatakit/youtube_channel_downloader/callbacks.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ytdatakit.youtube_channel_downloader.yt_channel_download import get_channel_videos
2
+ import pandas as pd
3
+ import streamlit as st
4
+
5
+
6
@st.cache_data
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialise *df* to CSV and return it as UTF-8 encoded bytes.

    The result is cached with ``st.cache_data`` so the conversion is not
    recomputed on every Streamlit rerun.
    """
    return df.to_csv().encode("utf-8")
10
+
11
+
12
def fetch_channel_videos(channel_name: str):
    """Fetch a channel's videos and return ``(table, csv_bytes)``.

    Returns ``(None, None)`` when ``get_channel_videos`` yields no ids or no
    urls for *channel_name*.
    """
    video_ids, video_urls = get_channel_videos(channel_name)
    if video_ids is None or video_urls is None:
        return None, None

    # Column order matches what the table preview and the CSV export show.
    frame = pd.DataFrame(columns=["youtube_url", "video_id"])
    frame["youtube_url"] = video_urls
    frame["video_id"] = video_ids
    return frame, convert_df(frame)
ytdatakit/youtube_channel_downloader/config.py ADDED
@@ -0,0 +1 @@
 
 
1
# Channel name used to pre-fill the channel input before the user types one.
default_channel_name = "Monkhaus"
ytdatakit/youtube_channel_downloader/state.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from ytdatakit.youtube_channel_downloader.config import default_channel_name
4
+
5
+
6
def state_init():
    """Seed the channel downloader's session-state keys that are not set yet."""
    empty_table = pd.DataFrame(columns=["youtube_url", "video_id"])
    defaults = {
        "channel_data_table": empty_table,
        "channel_data_download": empty_table.to_csv().encode("utf-8"),
        "channel_name": default_channel_name,
        "channel_fetch_count": 0,
    }
    # Existing values are left untouched so reruns keep the user's data.
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
16
+
17
+
18
def state_reset():
    """Reset the channel table, its CSV export and the fetch counter.

    The values are overwritten unconditionally: a reset guarded by
    ``"key" not in st.session_state`` does nothing once ``state_init`` has
    run, which would leave the previous channel's data in place.
    """
    df = pd.DataFrame(columns=["youtube_url", "video_id"])
    st.session_state.channel_data_table = df
    st.session_state.channel_data_download = df.to_csv().encode("utf-8")
    st.session_state.channel_fetch_count = 0
ytdatakit/youtube_channel_downloader/yt_channel_download.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yt_dlp
2
+ import scrapetube
3
+ from typing import Tuple
4
+
5
+
6
def get_channel_id_from_name(channel_name: str) -> str | None:
    """Return the channel id of the first search hit for *channel_name*.

    Runs a one-result ``ytsearch1:`` query through yt-dlp without downloading
    anything. Returns ``None`` (after printing the failure) when the lookup
    raises.
    """
    search_opts = dict(
        quiet=True,
        skip_download=True,
        extract_flat=True,
        force_generic_extractor=True,
    )

    with yt_dlp.YoutubeDL(search_opts) as ydl:
        try:
            search_result = ydl.extract_info(f"ytsearch1:{channel_name}", download=False)
            first_hit = search_result["entries"][0]
            return first_hit["channel_id"]
        except Exception as e:
            print(f"FAILURE: get_channel_id_from_name failed with exception {e}")
            return None
21
+
22
+
23
def get_videourl_from_channel_id(channel_id: str) -> Tuple[list, list] | Tuple[None, None]:
    """Return ``(video_ids, video_urls)`` for every video scrapetube lists.

    Returns ``(None, None)`` (after printing the failure) when listing the
    channel raises.
    """
    try:
        video_ids = [video["videoId"] for video in scrapetube.get_channel(channel_id)]
        video_urls = ["https://www.youtube.com/watch?v=" + vid for vid in video_ids]
        return video_ids, video_urls
    except Exception as e:
        print(f"FAILURE: get_videourls_from_channel_id failed with exception {e}")
        return None, None
37
+
38
+
39
def get_channel_videos(channel_name: str) -> Tuple[list, list] | Tuple[None, None]:
    """Resolve *channel_name* to a channel id and list its videos.

    Returns ``(video_ids, video_urls)``, or ``(None, None)`` when either the
    channel lookup or the video listing yields nothing or raises.
    """
    try:
        print("INFO: starting channel video id puller...")
        channel_id = get_channel_id_from_name(channel_name)
        if channel_id is None:
            print("...done!")
            return None, None

        video_ids, video_urls = get_videourl_from_channel_id(channel_id)
        print("...done!")
        if video_ids is None or video_urls is None:
            return None, None
        return video_ids, video_urls
    except Exception as e:
        print(f"FAILURE: get_channel_videos failed with exception {e}")
        return None, None
ytdatakit/youtube_downloader/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import os
2
+
3
# Absolute path of the directory that contains this file.
base_dir = os.path.dirname(os.path.abspath(__file__))
# Parent directory of base_dir.
main_dir = os.path.dirname(base_dir)
ytdatakit/youtube_downloader/app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ytdatakit.youtube_downloader.config import video_choices
2
+ from ytdatakit.youtube_downloader.callbacks import callback_download_video
3
+ from ytdatakit.youtube_downloader.state import state_init
4
+ import streamlit as st
5
+
6
+
7
def app():
    """Render the video downloader tab.

    Shows a URL input, a resolution selector and a "fetch video" button whose
    callback downloads the video. The download button and video preview are
    rendered only when the file stored in
    ``st.session_state.youtube_download_location`` exists; otherwise an
    informational message is shown instead of raising ``FileNotFoundError``.
    """
    import os  # local import: this module does not import os at file level

    state_init()
    # Hide the helper <style>/<span> elements and colour the button that
    # follows each marker span.
    st.markdown(
        """
        <style>
        .element-container:has(style){
            display: none;
        }
        #button-download {
            display: none;
        }
        .element-container:has(#button-download) {
            display: none;
        }
        .element-container:has(#button-download) + div button {
            background-color: green;
            border-color: green;
        }
        #button-fetch {
            display: none;
        }
        .element-container:has(#button-fetch) {
            display: none;
        }
        .element-container:has(#button-fetch) + div button {
            background-color: blue;
            border-color: blue;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    # Transparent, tiny text used as a vertical spacer above the button.
    st.markdown(
        """
        <style>
        .custom-font {
            font-size:7.5px !important;
            color: transparent;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    video_download_col_a, video_download_col_b, video_download_col_c = st.columns([4, 3, 2])
    with video_download_col_a:
        url_input = st.text_input(
            value="https://www.youtube.com/watch?v=qQgyoHsknIk",
            label="🔗 Paste YouTube / Shorts URL here",
            placeholder="e.g., https://www.youtube.com/watch?v=.",
            key="youtube_download_text_input",
        )
    with video_download_col_b:
        resolution_dropdown = st.selectbox(options=video_choices, index=st.session_state.youtube_download_resolution_index, label="video resolution")
    with video_download_col_c:
        st.markdown('<p class="custom-font">fetch</p>', unsafe_allow_html=True)
        st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
        st.button(
            "fetch video",
            type="primary",
            on_click=callback_download_video,
            args=(
                url_input,
                resolution_dropdown,
            ),
            key="youtube_download_fetch_button",
        )
    with st.container(border=True):
        video_path = st.session_state.youtube_download_location
        if os.path.isfile(video_path):
            with open(video_path, "rb") as file:
                st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
                st.download_button(
                    label="download video",
                    data=file,
                    file_name=os.path.basename(video_path),
                    mime="video/mp4",
                    type="primary",
                )
            st.video(data=video_path, format="video/mp4")
        else:
            # The default placeholder path (or a failed download) may not
            # exist on disk; there is nothing to preview or download yet.
            st.info("no video available yet - fetch a video to preview and download it")
ytdatakit/youtube_downloader/callbacks.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from ytdatakit.youtube_downloader.yt_download import download_video
3
+ from ytdatakit.youtube_downloader.state import default_youtube_download_location
4
+ from ytdatakit.youtube_downloader.config import video_choices
5
+
6
+
7
def callback_download_video(url_input: str, resolution_dropdown: str) -> None:
    """Download the requested video and record where it was saved.

    Args:
        url_input: YouTube / Shorts url entered by the user.
        resolution_dropdown: resolution selected in the dropdown; must be one
            of ``video_choices``.

    The selected resolution is forwarded to ``download_video`` directly.
    ``st.session_state.resolution_dropdown`` is not used here because it is
    not bound to the selectbox and does not hold the user's selection.
    """
    temporary_video_location = download_video(url_input, default_youtube_download_location(), resolution_dropdown)
    st.session_state.youtube_download_location = temporary_video_location
    # Remember the selection so the dropdown shows it again on the next rerun.
    st.session_state.youtube_download_resolution_index = video_choices.index(resolution_dropdown)
ytdatakit/youtube_downloader/config.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
app_name = "ytdatakit"
# Resolution options offered in the dropdown; the first entry is the default.
video_choices = ["best", "1080", "720", "360"]
# Placeholder media paths, relative to the process working directory.
default_clip_video_path = "./data/input/blank.mp4"
default_clip_gif_path = "./data/input/blank.jpg"
ytdatakit/youtube_downloader/state.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ytdatakit.youtube_downloader.config import video_choices, default_clip_video_path
2
+ import streamlit as st
3
+ import tempfile
4
+
5
+
6
def default_youtube_download_location() -> str:
    """Create a fresh temporary directory and return its path.

    ``tempfile.mkdtemp`` is used instead of ``tempfile.TemporaryDirectory``
    because the latter deletes the directory as soon as its ``with`` block
    exits, i.e. before the caller can use the returned path. The caller is
    responsible for removing the directory when it is no longer needed.
    """
    return tempfile.mkdtemp()
9
+
10
+
11
def state_init():
    """Seed the video downloader's session-state keys that are not set yet.

    ``resolution_dropdown`` is seeded with a single choice (the first entry of
    ``video_choices``, matching the default index 0) rather than the whole
    list, so code reading it gets a value comparable to one resolution.
    """
    if "resolution_dropdown" not in st.session_state:
        st.session_state.resolution_dropdown = video_choices[0]
    if "youtube_download_location" not in st.session_state:
        st.session_state.youtube_download_location = default_clip_video_path
    if "youtube_download_resolution_index" not in st.session_state:
        st.session_state.youtube_download_resolution_index = 0
ytdatakit/youtube_downloader/yt_download.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yt_dlp
2
+ from yt_dlp import YoutubeDL
3
+ import re
4
+
5
+
6
def is_valid_youtube_url(url: str) -> bool:
    """Return True if *url* is a canonical YouTube watch or Shorts url.

    Accepted forms are ``https://www.youtube.com/watch?v=<id>`` and
    ``https://www.youtube.com/shorts/<id>``, where ``<id>`` is exactly 11
    characters from ``[A-Za-z0-9_-]``. Non-string input returns False.
    """
    if not isinstance(url, str):
        return False
    # One pattern covers both forms, so a watch url whose id happens to contain
    # "shorts" is still accepted. fullmatch (rather than match with "$")
    # rejects a trailing newline.
    pattern = r"https://www\.youtube\.com/(?:watch\?v=|shorts/)[A-Za-z0-9_-]{11}"
    return re.fullmatch(pattern, url) is not None
13
+
14
+
15
def download_video(url: str, savedir: str, resolution_dropdown: str, my_proxies: dict | None = None) -> str:
    """Download the video at *url* into *savedir* as an mp4 and return its path.

    Args:
        url: canonical YouTube watch / Shorts url.
        savedir: directory the mp4 is written to.
        resolution_dropdown: "1080", "720" or "360" to cap the video height;
            any other value selects the best available quality.
        my_proxies: accepted for interface compatibility; not used by this
            function.

    Returns:
        The output path: ``<savedir>/<title>.mp4``, where non-alphanumeric
        characters of the title are replaced by spaces, or
        ``<savedir>/<video id>.mp4`` when the video has no title.

    Raises:
        ValueError: if *url* is not a valid YouTube url or the download fails;
            the underlying exception is chained as ``__cause__``.
    """
    try:
        print("Downloading video from youtube...")
        if not is_valid_youtube_url(url):
            raise ValueError(f"invalid input url: {url}")

        # Metadata-only pass to learn the id and title used for the file name.
        with YoutubeDL() as ydl:
            info_dict = ydl.extract_info(url, download=False)
        video_id = info_dict.get("id", None)
        video_title = info_dict.get("title", None)

        # Sanitise the title only when there is one; fall back to the id.
        if video_title is None:
            savepath = savedir + "/" + video_id + ".mp4"
        else:
            savepath = savedir + "/" + re.sub(r"[^a-zA-Z0-9]", " ", video_title) + ".mp4"

        if resolution_dropdown in ("1080", "720", "360"):
            video_format = f"bestvideo[height<={resolution_dropdown}]+bestaudio/best"
        else:
            video_format = "bestvideo+bestaudio/best"

        ydl_opts = {
            "format": video_format,
            "merge_output_format": "mp4",
            "outtmpl": savepath,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        print("...done!")
        return savepath
    except Exception as e:
        raise ValueError(f"yt_download failed with exception {e}") from e
ytdatakit/youtube_thumbnail_downloader/__init__.py ADDED
File without changes
ytdatakit/youtube_thumbnail_downloader/app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from ytdatakit.youtube_thumbnail_downloader.state import state_init
3
+ from ytdatakit.youtube_thumbnail_downloader.callbacks import fetch_thumbnails
4
+
5
+
6
def app():
    """Render the thumbnail downloader tab.

    Shows a text area and a file uploader for YouTube urls, a "fetch
    thumbnails" button, a zip download button, and one card per fetched
    thumbnail with its own download button.

    Per-thumbnail download buttons use a key that includes the position of the
    thumbnail, so two videos with the same title do not produce duplicate
    widget keys. The zip download is offered only when the zip file recorded in
    session state exists on disk.
    """
    import os  # local import: this module does not import os at file level

    state_init()
    # Hide the helper <style>/<span> elements and colour the button that
    # follows each marker span.
    st.markdown(
        """
        <style>
        .element-container:has(style){
            display: none;
        }
        #button-download {
            display: none;
        }
        .element-container:has(#button-download) {
            display: none;
        }
        .element-container:has(#button-download) + div button {
            background-color: green;
            border-color: green;
        }
        #button-fetch {
            display: none;
        }
        .element-container:has(#button-fetch) {
            display: none;
        }
        .element-container:has(#button-fetch) + div button {
            background-color: blue;
            border-color: blue;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    base = st.container(border=True)
    with base:
        text_urls = st.text_area(
            "youtube urls separated by commas",
            value=st.session_state.thumbnail_text_input_urls if "thumbnail_text_input_urls" in st.session_state else "",
            placeholder="https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4, ....",
            key="thumbnail_urls_input",
        )
        # NOTE(review): this sets an attribute on the streamlit module, not on
        # st.session_state, so the text area's `value=` above never sees it.
        # Presumably st.session_state.thumbnail_text_input_urls was intended;
        # left unchanged because feeding the widget its own return value
        # changes how it behaves across reruns - confirm before changing.
        st.thumbnail_text_input_urls = text_urls
        uploaded_file = st.file_uploader("Choose a File", type=["txt"], key="thumbanils_file_uploader")
        thumbnail_col_1, thumbnail_col_2, thumbnail_col_3 = st.columns([5, 8, 8])
        with thumbnail_col_1:
            st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
            st.button(label="fetch thumbnails", type="primary", on_click=fetch_thumbnails, args=(uploaded_file, text_urls))

        with thumbnail_col_2:
            st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
            zip_path = st.session_state.thumbnails_zip_path if "thumbnails_zip_path" in st.session_state else None
            if zip_path is not None and os.path.isfile(zip_path):
                with open(zip_path, "rb") as file:
                    st.download_button(
                        label="download thumbnails",
                        data=file,
                        file_name="thumbnails.zip",
                        mime="application/zip",
                        type="primary",
                        disabled=st.session_state.thumbnail_fetch_count == 0,
                    )
            else:
                # No zip available yet: render a disabled placeholder button.
                st.download_button(
                    label="download thumbnails",
                    data="./data/input/blank.zip",
                    file_name="thumbnails.zip",
                    mime="application/zip",
                    type="primary",
                    disabled=True,
                )

    with st.container(border=True):
        data_entries = st.session_state.thumbnail_data_entries
        for ind, thumbnail_savepath in enumerate(st.session_state.thumbnail_savepaths):
            title = data_entries[ind]["video_title"]
            with st.container(border=True):
                a, b, c = st.columns([1, 3, 1])
                with b:
                    st.subheader(title)
                    st.image(thumbnail_savepath)
                    with open(thumbnail_savepath, "rb") as file:
                        st.download_button(
                            label="download thumbnail",
                            data=file,
                            file_name=title + ".jpg",
                            mime="image/jpg",
                            # The index keeps keys unique when titles repeat.
                            key=f"{title} download {ind}",
                            type="primary",
                        )
ytdatakit/youtube_thumbnail_downloader/callbacks.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ytdatakit.youtube_thumbnail_downloader.yt_thumbnail_downloader import get_batch_thumbnails
2
+ from ytdatakit.youtube_thumbnail_downloader.zip import zip_images
3
+ from ytdatakit.youtube_thumbnail_downloader.state import reset_state
4
+ from ytdatakit.youtube_thumbnail_downloader.config import default_thumbnail_location
5
+ import streamlit as st
6
+ from io import StringIO
7
+ import tempfile
8
+
9
+
10
def default_temp_savdir() -> str:
    """Create a fresh temporary directory and return its path.

    ``tempfile.mkdtemp`` is used instead of ``tempfile.TemporaryDirectory``
    because the latter deletes the directory as soon as its ``with`` block
    exits, i.e. before the caller can use the returned path. The caller is
    responsible for removing the directory when it is no longer needed.
    """
    return tempfile.mkdtemp()
13
+
14
+
15
def urls_normalizer(uploaded_file: "st.uploaded", text_urls: str) -> list:
    """Return the list of urls supplied either as text or as an uploaded file.

    Args:
        uploaded_file: uploaded file object or ``None``. Only files whose
            ``type`` is ``"text/plain"`` are read, one url per line.
        text_urls: comma-separated urls typed by the user, or ``None``.

    Returns:
        The urls with surrounding whitespace stripped. Empty entries (blank
        lines, doubled or trailing commas) are dropped. Returns an empty list
        when no usable input is given.

    If both a file and non-blank text are supplied, a warning is shown and
    ``st.stop()`` is called.
    """
    has_text = text_urls is not None and len(text_urls.strip()) > 0

    if uploaded_file is not None and has_text:
        st.warning("you can enter urls manually or from file but not both", icon="⚠️")
        st.stop()

    if has_text:
        return [url.strip() for url in text_urls.split(",") if url.strip()]

    if uploaded_file is not None and uploaded_file.type == "text/plain":
        stringio = StringIO(uploaded_file.read().decode("utf-8"))
        return [line.strip() for line in stringio if line.strip()]

    return []
40
+
41
+
42
def fetch_logic(youtube_urls: list) -> None:
    """Fetch thumbnails for *youtube_urls* and store the results in session state.

    A url list that differs from the one stored in
    ``st.session_state.thumbnail_raw_urls`` replaces it and resets the
    thumbnail state. Thumbnails are fetched only while
    ``st.session_state.thumbnail_fetch_count`` is 0.
    """
    if youtube_urls != st.session_state.thumbnail_raw_urls:
        st.session_state.thumbnail_raw_urls = youtube_urls
        reset_state()
    if st.session_state.thumbnail_fetch_count == 0:
        st.session_state.local_thumbnail_location = default_thumbnail_location()
        # Drops the last two path components (the file name and the directory
        # holding it), leaving the directory above as the save location.
        savedir = "/".join(st.session_state.local_thumbnail_location.split("/")[:-2])
        thumbnail_savepaths, thumbnail_data_entries = get_batch_thumbnails(youtube_urls, savedir)
        st.session_state.thumbnail_savepaths = thumbnail_savepaths
        st.session_state.thumbnail_data_entries = thumbnail_data_entries
        st.session_state.thumbnail_fetch_count += 1

        # NOTE(review): savedir and thumbnail_savepaths are only bound inside
        # this branch, so these statements are kept inside it - confirm against
        # the original file's indentation.
        st.session_state.thumbnails_zip_path = savedir + "/" + "thumbnails.zip"
        zip_images(thumbnail_savepaths)
56
+
57
+
58
def fetch_thumbnails(uploaded_file, text_urls) -> None:
    """Button callback: normalise the url inputs, then fetch their thumbnails."""
    # with st.spinner(text="thumbnail pull in progress..."):
    youtube_urls = urls_normalizer(uploaded_file, text_urls)
    fetch_logic(youtube_urls)
ytdatakit/youtube_thumbnail_downloader/config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import uuid
3
+
4
# Default values for the thumbnail downloader's session state.
default_thumbnail_raw_urls = ""
# The two lists below are module-level objects: code that assigns them directly
# shares the same list object everywhere they are assigned.
default_thumbnail_savepaths = []
default_thumbnail_data_entries = []
default_thumbnail_text_input_urls = ""
# Placeholder zip path, relative to the process working directory.
default_thumbnails_zip_path = "./data/input/blank.zip"
9
+
10
+
11
def default_thumbnail_location() -> str:
    """Return a unique ``.jpg`` path string located under a temporary directory name.

    The temporary directory is removed when the ``with`` block exits, so the
    returned path points into a directory that no longer exists; the string
    only serves as a unique path.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        # The uuid makes the file name unique for every call.
        return tmpdirname + "/temp_" + str(uuid.uuid4()) + ".jpg"
ytdatakit/youtube_thumbnail_downloader/state.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from ytdatakit.youtube_thumbnail_downloader.config import (
3
+ default_thumbnail_raw_urls,
4
+ default_thumbnail_savepaths,
5
+ default_thumbnail_location,
6
+ default_thumbnail_data_entries,
7
+ default_thumbnail_text_input_urls,
8
+ default_thumbnails_zip_path,
9
+ )
10
+
11
+
12
def state_init():
    """Seed the thumbnail downloader's session-state keys that are not set yet.

    Each guard checks the same key it assigns, so existing values (including
    ``local_thumbnail_location``) survive reruns. List defaults are copied so
    the session never shares the module-level default list objects.
    """
    if "thumbnail_raw_urls" not in st.session_state:
        st.session_state.thumbnail_raw_urls = default_thumbnail_raw_urls
    if "thumbnail_savepaths" not in st.session_state:
        st.session_state.thumbnail_savepaths = list(default_thumbnail_savepaths)
    if "thumbnail_data_entries" not in st.session_state:
        st.session_state.thumbnail_data_entries = list(default_thumbnail_data_entries)
    if "thumbnail_fetch_count" not in st.session_state:
        st.session_state.thumbnail_fetch_count = 0
    if "local_thumbnail_location" not in st.session_state:
        st.session_state.local_thumbnail_location = default_thumbnail_location()
    if "youtube_thumbnails_expander" not in st.session_state:
        st.session_state.youtube_thumbnails_expander = False
25
+
26
+
27
def reset_state():
    """Reset the thumbnail results, zip path, text input and fetch counter.

    ``thumbnail_raw_urls`` is deliberately left alone: the caller stores the
    new url list before calling this function. The save paths and the data
    entries are reset together, so the list the UI iterates over and the list
    it indexes into always have matching lengths. List defaults are copied so
    the session never shares the module-level default list objects.
    """
    st.session_state.thumbnail_savepaths = list(default_thumbnail_savepaths)
    st.session_state.thumbnail_data_entries = list(default_thumbnail_data_entries)
    st.session_state.thumbnail_text_input_urls = default_thumbnail_text_input_urls
    st.session_state.thumbnails_zip_path = default_thumbnails_zip_path

    st.session_state.thumbnail_fetch_count = 0
    st.session_state.youtube_thumbnails_expander = False
ytdatakit/youtube_thumbnail_downloader/yt_thumbnail_downloader.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ from yt_dlp import YoutubeDL
4
+
5
+
6
def is_valid_youtube_url(url: str) -> bool:
    """Return True if *url* is a canonical YouTube watch or shorts URL.

    Accepted forms (and nothing else, not even a trailing newline):

    * ``https://www.youtube.com/watch?v=<id>``
    * ``https://www.youtube.com/shorts/<id>``

    where ``<id>`` is exactly 11 characters from ``[A-Za-z0-9_-]``.

    Args:
        url: candidate URL; non-string values are rejected.

    Returns:
        bool: whether the whole string matches one of the accepted forms.
    """
    if not isinstance(url, str):
        return False
    # YouTube video ids are always 11 chars long. The URL kind is decided by
    # the path itself rather than by a substring search, so a watch URL whose
    # id happens to contain "shorts" is still accepted.
    pattern = r"https://www\.youtube\.com/(?:watch\?v=|shorts/)[A-Za-z0-9_-]{11}"
    # fullmatch (unlike match + "$") does not tolerate a trailing newline.
    return re.fullmatch(pattern, url) is not None
13
+
14
+
15
def download_thumbnail(yt_thumbnail_url: str, savepath: str, timeout: float = 10.0) -> None:
    """Download the image at *yt_thumbnail_url* and write it to *savepath*.

    Args:
        yt_thumbnail_url: URL of the thumbnail image.
        savepath: destination file path; overwritten if it exists.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    response = requests.get(yt_thumbnail_url, timeout=timeout)
    # Fail before touching the disk so an HTTP error body is never saved
    # as if it were an image.
    response.raise_for_status()
    with open(savepath, "wb") as handler:
        handler.write(response.content)
19
+
20
+
21
def get_youtube_thumbnail_url(video_id: str) -> dict:
    """Build the ``img.youtube.com`` thumbnail URLs for *video_id*.

    Args:
        video_id: YouTube video id.

    Returns:
        dict: maps each variant name (``default``, ``mqdefault``,
        ``hqdefault``, ``sddefault``, ``maxresdefault``) to its ``.jpg``
        URL. ``None`` is returned when *video_id* is empty or ``None``.
    """
    if not video_id:
        return None
    variants = ("default", "mqdefault", "hqdefault", "sddefault", "maxresdefault")
    return {
        variant: f"https://img.youtube.com/vi/{video_id}/{variant}.jpg"
        for variant in variants
    }
30
+
31
+
32
def pull_yt_data(url: str, savedir: str, my_proxies: dict = None) -> tuple:
    """Fetch metadata for one video and save its ``hqdefault`` thumbnail.

    Args:
        url: YouTube watch/shorts URL; must pass ``is_valid_youtube_url``.
        savedir: directory the thumbnail is written into.
        my_proxies: accepted for interface compatibility; currently unused.

    Returns:
        tuple: ``(savepath, entry)`` where ``savepath`` is the written file
        path and ``entry`` is a dict with ``video_url``, ``video_id`` and
        ``video_title`` (the original, unsanitised title).

    Raises:
        ValueError: for an invalid URL, a missing video id, or any failure
            during metadata extraction or download (original error chained).
    """
    try:
        if not is_valid_youtube_url(url):
            raise ValueError(f"invalid input url: {url}")

        with YoutubeDL() as ydl:
            info_dict = ydl.extract_info(url, download=False)
        video_id = info_dict.get("id", None)
        video_title = info_dict.get("title", None)
        if not video_id:
            # Without an id there is no thumbnail URL to build; fail loudly
            # instead of silently returning None.
            raise ValueError(f"no video id found for url: {url}")

        entry = {
            "video_url": url,
            "video_id": video_id,
            "video_title": video_title,
        }

        # Sanitise only when a title exists. Fall back to the id when the
        # title is missing or contains no ASCII alphanumerics at all, which
        # would otherwise produce a bare ".jpg" filename.
        safe_title = re.sub(r"[^a-zA-Z0-9]", "", video_title) if video_title else ""
        savepath = savedir + "/" + (safe_title or video_id) + ".jpg"

        thumbnail_url = get_youtube_thumbnail_url(video_id)["hqdefault"]
        download_thumbnail(thumbnail_url, savepath)
        print("...done!")
        return savepath, entry
    except Exception as e:
        raise ValueError(f"yt_download failed with exception {e}") from e
60
+
61
+
62
def get_batch_thumbnails(yt_urls: list, savedir: str, my_proxies: dict = {}):
    """Download thumbnails for every URL in *yt_urls*, skipping failures.

    Each URL is handed to ``pull_yt_data``. A URL that raises is reported
    on stdout and skipped; it does not abort the batch.

    Args:
        yt_urls: YouTube URLs to process.
        savedir: directory the thumbnails are written into.
        my_proxies: forwarded unchanged to ``pull_yt_data``.

    Returns:
        tuple: ``(savepaths, entries)`` - two parallel lists holding the
        results of the URLs that succeeded, in input order.
    """
    collected_paths, collected_entries = [], []
    for yt_url in yt_urls:
        try:
            path, entry = pull_yt_data(yt_url, savedir, my_proxies)
        except Exception as err:
            print(f"url {yt_url} failed with exception {err}")
            continue
        collected_paths.append(path)
        collected_entries.append(entry)
    return collected_paths, collected_entries
ytdatakit/youtube_thumbnail_downloader/zip.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import zipfile
2
+ import os
3
+ import streamlit as st
4
+
5
+
6
def zip_images(image_paths: list):
    """Write the given image files into the session's thumbnails zip archive.

    The archive path is read from ``st.session_state.thumbnails_zip_path``
    and opened in ``"w"`` mode, so any existing archive there is replaced.
    Each image is stored under its bare file name (no directories).

    Args:
        image_paths: paths of the image files to add.
    """
    print("INFO: zipping images...")
    archive_path = st.session_state.thumbnails_zip_path
    with zipfile.ZipFile(archive_path, "w") as archive:
        for path in image_paths:
            member_name = os.path.basename(path)
            archive.write(path, arcname=member_name)
            print(f"Added {member_name} to the zip file.")
    print(f"...done! images have been zipped into {archive_path}")
ytdatakit/youtube_transcript_downloader/__init__.py ADDED
File without changes
ytdatakit/youtube_transcript_downloader/app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ytdatakit.youtube_transcript_downloader.callbacks import fetch_transcripts
2
+ from ytdatakit.youtube_transcript_downloader.state import state_init
3
+ import streamlit as st
4
+
5
+
6
def app():
    """Render the YouTube transcript downloader page.

    Initialises the session state, injects the page CSS, then draws a
    bordered container holding the URL text area, a ``.txt`` file uploader,
    a "fetch transcripts" button, a "download transcripts" button and a
    table showing ``st.session_state.transcript_data_table``.
    """
    state_init()
    # Page CSS. The hidden <span id="button-fetch"> / <span id="button-download">
    # markers emitted further down serve as anchors: the sibling selectors
    # below colour the button that follows each marker.
    st.markdown(
        """
        <style>
        .element-container:has(style){
            display: none;
        }
        #button-download {
            display: none;
        }
        .element-container:has(#button-download) {
            display: none;
        }
        .element-container:has(#button-download) + div button {
            background-color: green;
            border-color: green;
        }
        #button-fetch {
            display: none;
        }
        .element-container:has(#button-fetch) {
            display: none;
        }
        .element-container:has(#button-fetch) + div button {
            background-color: blue;
            border-color: blue;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    base = st.container(border=True)
    with base:
        # URLs can be typed here (comma separated) or supplied via the uploader.
        text_urls = st.text_area(
            "youtube urls separated by commas",
            value=st.session_state.transcript_raw_urls,
            placeholder="https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4, ....",
            key="transcript_urls_input",
        )
        uploaded_file = st.file_uploader("Choose a File", type=["txt"], key="transcripts_file_uploader")
        # transcript_col_3 is not referenced again in this function.
        transcript_col_1, transcript_col_2, transcript_col_3 = st.columns([3, 4, 6])
        with transcript_col_1:
            # Hidden marker targeted by the "#button-fetch" CSS rules above.
            st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
            fetch_btn = st.button(
                label="fetch transcripts",
                type="primary",
            )
            if fetch_btn:
                # Store both results in the session so the table and the
                # download button below read them from st.session_state.
                df_table, df_download = fetch_transcripts(uploaded_file, text_urls)
                st.session_state.transcript_data_table = df_table
                st.session_state.transcript_data_download = df_download
        with transcript_col_2:
            # Hidden marker targeted by the "#button-download" CSS rules above.
            st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
            st.download_button(
                label="download transcripts",
                data=st.session_state.transcript_data_download,
                file_name="transcripts.csv",
                mime="text/csv",
                disabled=False,
                type="primary",
            )
        with st.container(border=True):
            st.table(st.session_state.transcript_data_table)
ytdatakit/youtube_transcript_downloader/callbacks.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ytdatakit.youtube_transcript_downloader.yt_transcript_download import get_batch_transcripts
2
+ from io import StringIO
3
+ import pandas as pd
4
+ import streamlit as st
5
+ import copy
6
+
7
+
8
@st.cache_data
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialise *df* to CSV and return it as UTF-8 encoded bytes.

    Args:
        df: the DataFrame to serialise; ``DataFrame.to_csv()`` is called
            with its default arguments.

    Returns:
        bytes: the UTF-8 encoded CSV text.
    """
    # IMPORTANT: Cache the conversion to prevent computation on every rerun
    return df.to_csv().encode("utf-8")
12
+
13
+
14
def fetch_transcripts(uploaded_file, text_urls):
    """Collect URLs from the UI inputs and fetch their transcripts.

    URLs come either from an uploaded plain-text file (one URL per line)
    or from the comma-separated text area, never both. If both are
    supplied, a warning is shown and the Streamlit script run is stopped.

    Args:
        uploaded_file: the object returned by ``st.file_uploader`` or
            ``None``; only files whose ``type`` is ``"text/plain"`` are read.
        text_urls: contents of the text area (may be ``None`` or blank).

    Returns:
        tuple: ``(df_table, df_download)`` where ``df_table`` is a string
        DataFrame for display with the ``transcript`` column truncated to
        100 characters plus ``"..."``, and ``df_download`` is the full CSV
        as UTF-8 bytes.
    """
    has_text = text_urls is not None and len(text_urls.strip()) > 0
    if uploaded_file is not None and has_text:
        st.warning("you can enter urls manually or from file but not both", icon="⚠️")
        st.stop()

    # Blank lines and empty comma-separated fields are dropped so they are
    # not passed on as URLs.
    youtube_urls = []
    if uploaded_file is not None:
        if uploaded_file.type == "text/plain":
            stringio = StringIO(uploaded_file.read().decode("utf-8"))
            youtube_urls = [line.strip() for line in stringio if line.strip()]
    elif has_text:
        youtube_urls = [v.strip() for v in text_urls.split(",") if v.strip()]

    batch_transcripts = get_batch_transcripts(youtube_urls)
    # Pin the columns so an empty result still has a "transcript" column;
    # otherwise the truncation below raises KeyError when nothing was fetched.
    df = pd.DataFrame(batch_transcripts, columns=["youtube_url", "video_id", "transcript"])
    df_download = convert_df(df)

    def truncate_and_append(text, length, suffix):
        if len(text) > length:
            return text[:length] + suffix
        return text

    max_length = 100
    suffix = "..."
    # astype returns a new frame, so the download data in df is not affected.
    df_table = df.astype(str)
    df_table["transcript"] = df_table["transcript"].apply(lambda x: truncate_and_append(x, max_length, suffix))
    return df_table, df_download
ytdatakit/youtube_transcript_downloader/config.py ADDED
File without changes
ytdatakit/youtube_transcript_downloader/state.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+
5
def state_init():
    """Seed ``st.session_state`` with the transcript-downloader defaults.

    Keys already present in the session are left untouched. The table
    default is an empty DataFrame with the columns ``youtube_url``,
    ``video_id`` and ``transcript``; the download default is that same
    empty frame rendered to UTF-8 encoded CSV bytes.
    """
    empty_frame = pd.DataFrame(columns=["youtube_url", "video_id", "transcript"])
    # Factories keep the CSV conversion lazy: it only runs when its key is
    # actually missing.
    default_factories = (
        ("transcript_raw_urls", lambda: ""),
        ("transcript_data_table", lambda: empty_frame),
        ("transcript_data_download", lambda: empty_frame.to_csv().encode("utf-8")),
    )
    for key, make_default in default_factories:
        if key not in st.session_state:
            st.session_state[key] = make_default()
ytdatakit/youtube_transcript_downloader/yt_transcript_download.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Dict
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+
5
+
6
def is_valid_youtube_url(url: str) -> bool:
    """Return True if *url* is a canonical YouTube watch or shorts URL.

    Accepted forms (and nothing else, not even a trailing newline):

    * ``https://www.youtube.com/watch?v=<id>``
    * ``https://www.youtube.com/shorts/<id>``

    where ``<id>`` is exactly 11 characters from ``[A-Za-z0-9_-]``.

    Args:
        url: candidate URL; non-string values are rejected.

    Returns:
        bool: whether the whole string matches one of the accepted forms.
    """
    if not isinstance(url, str):
        return False
    # YouTube video ids are always 11 chars long. The URL kind is decided by
    # the path itself rather than by a substring search, so a watch URL whose
    # id happens to contain "shorts" is still accepted.
    pattern = r"https://www\.youtube\.com/(?:watch\?v=|shorts/)[A-Za-z0-9_-]{11}"
    # fullmatch (unlike match + "$") does not tolerate a trailing newline.
    return re.fullmatch(pattern, url) is not None
13
+
14
+
15
def get_single_transcript(youtube_url: str) -> dict:
    """Fetch the transcript for a single YouTube watch/shorts URL.

    Args:
        youtube_url: URL to fetch; must pass ``is_valid_youtube_url``.

    Returns:
        dict: an entry with keys ``youtube_url``, ``video_id`` and
        ``transcript``. ``transcript`` is whatever
        ``YouTubeTranscriptApi.get_transcript`` returns, or the string
        ``"Subtitles are disabled for this video"`` when the API reports
        exactly that. ``None`` is returned (after printing the reason) for
        an invalid URL or any other API failure.
    """
    if not is_valid_youtube_url(youtube_url):
        print(f"FAILURE: youtube_url is not valid - {youtube_url}")
        return None

    # Decide by the "/shorts/" path segment, not the bare substring, so a
    # watch URL whose id contains "shorts" is still split on "=".
    separator = "/" if "/shorts/" in youtube_url else "="
    video_id = youtube_url.split(separator)[-1].strip()

    try:
        video_transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        if "Subtitles are disabled for this video" not in str(e):
            print(e)
            return None
        # Disabled subtitles are reported as an entry rather than dropped.
        video_transcript = "Subtitles are disabled for this video"

    return {
        "youtube_url": youtube_url,
        "video_id": video_id,
        "transcript": video_transcript,
    }
39
+
40
+
41
def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
    """Fetch transcripts for every URL, keeping only the successful ones.

    Each URL is passed to ``get_single_transcript``; ``None`` results are
    discarded.

    Args:
        youtube_urls: URLs to process, in order.

    Returns:
        List[Dict]: the non-``None`` entries in input order, or an empty
        list if any exception escapes while processing the batch.
    """
    try:
        fetched = (get_single_transcript(candidate) for candidate in youtube_urls)
        return [item for item in fetched if item is not None]
    except Exception as e:
        print(f"FAILURE: get_batch_transcripts function failed with exception {e}")
        return []