Spaces:

neonwatty
/

ytdatakit

Running

App Files Files Community

Jeremy Watt committed on Aug 4

Commit

beed08a

•

1 Parent(s): 3d27671

first push to hf

Browse files

Files changed (32) hide show

.gitignore +174 -0
requirements.txt +5 -0
ytdatakit/__init__.py +0 -0
ytdatakit/about.py +15 -0
ytdatakit/about/__init__.py +0 -0
ytdatakit/about/app.py +17 -0
ytdatakit/app.py +27 -0
ytdatakit/youtube_channel_downloader/__init__.py +0 -0
ytdatakit/youtube_channel_downloader/app.py +86 -0
ytdatakit/youtube_channel_downloader/callbacks.py +21 -0
ytdatakit/youtube_channel_downloader/config.py +1 -0
ytdatakit/youtube_channel_downloader/state.py +24 -0
ytdatakit/youtube_channel_downloader/yt_channel_download.py +56 -0
ytdatakit/youtube_downloader/__init__.py +4 -0
ytdatakit/youtube_downloader/app.py +85 -0
ytdatakit/youtube_downloader/callbacks.py +10 -0
ytdatakit/youtube_downloader/config.py +4 -0
ytdatakit/youtube_downloader/state.py +17 -0
ytdatakit/youtube_downloader/yt_download.py +65 -0
ytdatakit/youtube_thumbnail_downloader/__init__.py +0 -0
ytdatakit/youtube_thumbnail_downloader/app.py +93 -0
ytdatakit/youtube_thumbnail_downloader/callbacks.py +61 -0
ytdatakit/youtube_thumbnail_downloader/config.py +13 -0
ytdatakit/youtube_thumbnail_downloader/state.py +34 -0
ytdatakit/youtube_thumbnail_downloader/yt_thumbnail_downloader.py +73 -0
ytdatakit/youtube_thumbnail_downloader/zip.py +14 -0
ytdatakit/youtube_transcript_downloader/__init__.py +0 -0
ytdatakit/youtube_transcript_downloader/app.py +70 -0
ytdatakit/youtube_transcript_downloader/callbacks.py +53 -0
ytdatakit/youtube_transcript_downloader/config.py +0 -0
ytdatakit/youtube_transcript_downloader/state.py +12 -0
ytdatakit/youtube_transcript_downloader/yt_transcript_download.py +51 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,174 @@

+scratch.ipynb
+*.mp4a
+*.mp3
+*.mp4
+venv*
+*.db
+*.faiss
+.DS_Store
+._.DS_Store
+**/.DS_Store
+**/._.DS_Store
+**/.env*
+bug_reports/
+.ruff_cache/
+.vscode
+notebook_tests/
+scratch_notebooks/
+demos/
+release_notes/
+site/
+tests/test_files/text/test_preprocessed/*
+!.ruff.toml
+push_pypi.sh
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.venv
+venv/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+yt-dlp
+scrapetube
+youtube-transcript-api
+pandas
+streamlit

ytdatakit/__init__.py ADDED Viewed

File without changes

ytdatakit/about.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import streamlit as st
+def about():
+    return st.markdown(
+        (
+            "### About \n"
+            "Some notes on how this works: \n\n"
+            "1.  **youtube / google login**: you do **not** need to be logged into a google account to use the app, with one exception: age restricted videos"
+            "2.  **age restricted videos**: this app cannot fetch age restricted videos yet, which requires a user login to google / youtube - this feature is not yet available"
+            "3.  **video resolution**: not all videos have all possible resolutions, so you may not be able to fetch the resolution you want for some videos (as they don't exist) \n"
+            "4.  **recommended hardware**: this is a very light weight app, so minimum specs should work fine"
+            "5.  **proxies**: there is an option in the yt_download module to enter proxy server ips"
+        )
+    )

ytdatakit/about/__init__.py ADDED Viewed

File without changes

ytdatakit/about/app.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import streamlit as st
+def app():
+    st.markdown(
+        "### YTDatakit \n\n"
+        "**Download YouTube videos, transcripts, thumbnails, and channel data - all in one place.** \n\n"
+        "One app per tab - as detailed below. \n\n"
+        "**Tab 1:**  💡 About - you are here. \n\n"
+        "**Tab 2:**  🎞️ Video downloader - enter a YouTube / Shorts url and download its mp4 file. \n\n"
+        "**Tab 3:**  📜 Transcript downloader - download multiple YouTube / Shorts transcripts at once. \n\n"
+        "**Tab 4:**  📌 Thumbnail downloader - download multiple YouTube / Shorts thumbnails at once. \n\n"
+        "**Tab 5:**  📕 Channel downloader - download all YouTube video ids associated with a channel name. \n\n"
+        ""
+        "Each app is illustrated in the gif below. \n \n"
+        "![Alt Text](https://github.com/neonwatty/readme_gifs/blob/main/ytdatakit.gif?raw=true)"
+    )

ytdatakit/app.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import streamlit as st
+from ytdatakit.about.app import app as about_page
+from ytdatakit.youtube_downloader.app import app as video_downloader
+from ytdatakit.youtube_transcript_downloader.app import app as transcript_downloader
+from ytdatakit.youtube_thumbnail_downloader.app import app as thumbnail_downloader
+from ytdatakit.youtube_channel_downloader.app import app as channel_downloader
+# Entry-point script: configures the page, then renders one sub-app per tab.
+app_name = "ytdatakit"
+st.set_page_config(page_title=app_name)
+st.title(app_name)
+st.markdown("###### Run this app locally by pulling [the official repo](https://github.com/neonwatty/ytdatakit)")
+# The tab labels below are listed in the same order as the tab1..tab5 handles.
+tab1, tab2, tab3, tab4, tab5 = st.tabs(
+    ["💡 About", "🎞️ Video downloader", "📜 Transcript downloader", "📌 Thumbnail downloader", "📕 Channel downloader"]
+)
+# Each sub-app is the `app` callable imported from its package at the top of this file.
+with tab1:
+    about_page()
+with tab2:
+    video_downloader()
+with tab3:
+    transcript_downloader()
+with tab4:
+    thumbnail_downloader()
+with tab5:
+    channel_downloader()

ytdatakit/youtube_channel_downloader/__init__.py ADDED Viewed

File without changes

ytdatakit/youtube_channel_downloader/app.py ADDED Viewed

	@@ -0,0 +1,86 @@

+from ytdatakit.youtube_channel_downloader.callbacks import fetch_channel_videos
+from ytdatakit.youtube_channel_downloader.state import state_init, state_reset
+import streamlit as st
+def app():
+    state_init()
+    st.markdown(
+        """
+    <style>
+    .element-container:has(style){
+        display: none;
+    }
+    #button-download {
+        display: none;
+    }
+    .element-container:has(#button-download) {
+        display: none;
+    }
+    .element-container:has(#button-download) + div button {
+        background-color: green;
+        border-color: green;
+        }
+    #button-fetch {
+        display: none;
+    }
+    .element-container:has(#button-fetch) {
+        display: none;
+    }
+    .element-container:has(#button-fetch) + div button {
+        background-color: blue;
+        border-color: blue;
+        }
+    </style>
+    """,
+        unsafe_allow_html=True,
+    )
+    st.markdown(
+        """
+    <style>
+    .custom-font {
+        font-size:7.5px !important;
+        color: transparent;
+    }
+    </style>
+    """,
+        unsafe_allow_html=True,
+    )
+    video_channel_col_a, video_channel_col_b, video_channel_col_c, video_channel_col_empty = st.columns([4, 3, 2, 2])
+    with video_channel_col_a:
+        channel_name = st.text_input(
+            value=st.session_state.channel_name,
+            label="🔗 paste YouTube channel name here",
+            placeholder="e.g., littletfitness",
+        )
+    with video_channel_col_b:
+        st.markdown('<p class="custom-font">fetch</p>', unsafe_allow_html=True)
+        st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
+        fetch_btn = st.button(
+            "fetch channel video ids",
+            type="primary",
+        )
+        if fetch_btn:
+            if channel_name != st.session_state.channel_name:
+                state_reset()
+            if st.session_state.channel_fetch_count == 0:
+                df_table, df_download = fetch_channel_videos(channel_name)
+                st.session_state.channel_data_table = df_table
+                st.session_state.channel_data_download = df_download
+                st.session_state.channel_fetch_count += 1
+    with video_channel_col_c:
+        st.markdown('<p class="custom-font">fetch</p>', unsafe_allow_html=True)
+        st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
+        st.download_button(
+            label="download",
+            data=st.session_state.channel_data_download,
+            file_name="channel_data.csv",
+            mime="text/csv",
+            disabled=False if st.session_state.channel_fetch_count > 0 else True,
+            type="primary",
+        )
+    with st.container(border=True):
+        st.table(st.session_state.channel_data_table.head(10))

ytdatakit/youtube_channel_downloader/callbacks.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from ytdatakit.youtube_channel_downloader.yt_channel_download import get_channel_videos
+import pandas as pd
+import streamlit as st
+@st.cache_data
+def convert_df(df: pd.DataFrame) -> "csv":
+    # IMPORTANT: Cache the conversion to prevent computation on every rerun
+    return df.to_csv().encode("utf-8")
+def fetch_channel_videos(channel_name: str):
+    # with st.spinner(text="channel video ids pull in progress..."):
+    video_ids, video_urls = get_channel_videos(channel_name)
+    if video_ids is not None and video_urls is not None:
+        df_table = pd.DataFrame(columns=["youtube_url", "video_id"])
+        df_table["youtube_url"] = video_urls
+        df_table["video_id"] = video_ids
+        df_download = convert_df(df_table)
+        return df_table, df_download
+    return None, None

ytdatakit/youtube_channel_downloader/config.py ADDED Viewed

	@@ -0,0 +1 @@


+default_channel_name = "Monkhaus"

ytdatakit/youtube_channel_downloader/state.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import streamlit as st
+import pandas as pd
+from ytdatakit.youtube_channel_downloader.config import default_channel_name
+def state_init():
+    df = pd.DataFrame(columns=["youtube_url", "video_id"])
+    if "channel_data_table" not in st.session_state:
+        st.session_state.channel_data_table = df
+    if "channel_data_download" not in st.session_state:
+        st.session_state.channel_data_download = df.to_csv().encode("utf-8")
+    if "channel_name" not in st.session_state:
+        st.session_state.channel_name = default_channel_name
+    if "channel_fetch_count" not in st.session_state:
+        st.session_state.channel_fetch_count = 0
+def state_reset():
+    df = pd.DataFrame(columns=["youtube_url", "video_id"])
+    if "channel_data_table" not in st.session_state:
+        st.session_state.channel_data_table = df
+    if "channel_data_download" not in st.session_state:
+        st.session_state.channel_data_download = df.to_csv().encode("utf-8")
+    st.session_state.channel_fetch_count = 0

ytdatakit/youtube_channel_downloader/yt_channel_download.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import yt_dlp
+import scrapetube
+from typing import Tuple
+def get_channel_id_from_name(channel_name: str) -> str | None:
+    ydl_opts = {
+        "quiet": True,
+        "skip_download": True,
+        "extract_flat": True,
+        "force_generic_extractor": True,
+    }
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        try:
+            info = ydl.extract_info(f"ytsearch1:{channel_name}", download=False)
+            return info["entries"][0]["channel_id"]
+        except Exception as e:
+            print(f"FAILURE: get_channel_id_from_name failed with exception {e}")
+            return None
+def get_videourl_from_channel_id(channel_id: str) -> Tuple[list, list] | Tuple[None, None]:
+    try:
+        videos = scrapetube.get_channel(channel_id)
+        video_urls = []
+        video_ids = []
+        for video in videos:
+            vid = video["videoId"]
+            vurl = "https://www.youtube.com/watch?v=" + vid
+            video_ids.append(vid)
+            video_urls.append(vurl)
+        return video_ids, video_urls
+    except Exception as e:
+        print(f"FAILURE: get_videourls_from_channel_id failed with exception {e}")
+        return None, None
+def get_channel_videos(channel_name: str) -> Tuple[list, list] | Tuple[None, None]:
+    try:
+        print("INFO: starting channel video id puller...")
+        channel_id = get_channel_id_from_name(channel_name)
+        if channel_id is not None:
+            video_ids, video_urls = get_videourl_from_channel_id(channel_id)
+            if video_ids is not None and video_urls is not None:
+                print("...done!")
+                return video_ids, video_urls
+            else:
+                print("...done!")
+                return None, None
+        else:
+            print("...done!")
+            return None, None
+    except Exception as e:
+        print(f"FAILURE: get_channel_videos failed with exception {e}")
+        return None, None

ytdatakit/youtube_downloader/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import os
+# Absolute path of the directory that contains this file.
+base_dir = os.path.dirname(os.path.abspath(__file__))
+# Parent directory of base_dir.
+main_dir = os.path.dirname(base_dir)

ytdatakit/youtube_downloader/app.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from ytdatakit.youtube_downloader.config import video_choices
+from ytdatakit.youtube_downloader.callbacks import callback_download_video
+from ytdatakit.youtube_downloader.state import state_init
+import streamlit as st
+def app():
+    """Render the video downloader tab.
+
+    Lays out a URL input, a resolution selectbox and a fetch button wired to
+    ``callback_download_video``, then offers the file at
+    ``st.session_state.youtube_download_location`` for download and preview.
+    """
+    state_init()
+    # The hidden <span id="button-..."> markers emitted below let these CSS
+    # rules colour the button that follows each marker.
+    st.markdown(
+        """
+    <style>
+    .element-container:has(style){
+        display: none;
+    }
+    #button-download {
+        display: none;
+    }
+    .element-container:has(#button-download) {
+        display: none;
+    }
+    .element-container:has(#button-download) + div button {
+        background-color: green;
+        border-color: green;
+        }
+    #button-fetch {
+        display: none;
+    }
+    .element-container:has(#button-fetch) {
+        display: none;
+    }
+    .element-container:has(#button-fetch) + div button {
+        background-color: blue;
+        border-color: blue;
+        }
+    </style>
+    """,
+        unsafe_allow_html=True,
+    )
+    # Transparent tiny text, used below as a spacer above the fetch button.
+    st.markdown(
+        """
+    <style>
+    .custom-font {
+        font-size:7.5px !important;
+        color: transparent;
+    }
+    </style>
+    """,
+        unsafe_allow_html=True,
+    )
+    video_download_col_a, video_download_col_b, video_download_col_c = st.columns([4, 3, 2])
+    with video_download_col_a:
+        url_input = st.text_input(
+            value="https://www.youtube.com/watch?v=qQgyoHsknIk",
+            label="🔗 Paste YouTube / Shorts URL here",
+            placeholder="e.g., https://www.youtube.com/watch?v=.",
+            key="youtube_download_text_input",
+        )
+    with video_download_col_b:
+        # The selected index is restored from session state, which the fetch callback updates.
+        resolution_dropdown = st.selectbox(options=video_choices, index=st.session_state.youtube_download_resolution_index, label="video resolution")
+    with video_download_col_c:
+        st.markdown('<p class="custom-font">fetch</p>', unsafe_allow_html=True)
+        st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
+        st.button(
+            "fetch video",
+            type="primary",
+            on_click=callback_download_video,
+            args=(
+                url_input,
+                resolution_dropdown,
+            ),
+            key="youtube_download_fetch_button",
+        )
+    with st.container(border=True):
+        # NOTE(review): open() raises if the path in session state does not exist
+        # (state_init seeds it with the default clip path) - confirm that file ships with the app.
+        with open(st.session_state.youtube_download_location, "rb") as file:
+            st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
+            st.download_button(
+                label="download video",
+                data=file,
+                file_name=st.session_state.youtube_download_location.split("/")[-1],
+                mime="video/mp4",
+                type="primary",
+            )
+        st.video(data=st.session_state.youtube_download_location, format="video/mp4")

ytdatakit/youtube_downloader/callbacks.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import streamlit as st
+from ytdatakit.youtube_downloader.yt_download import download_video
+from ytdatakit.youtube_downloader.state import default_youtube_download_location
+from ytdatakit.youtube_downloader.config import video_choices
+def callback_download_video(url_input: str, resolution_dropdown: str) -> None:
+    temporary_video_location = download_video(url_input, default_youtube_download_location(), st.session_state.resolution_dropdown)
+    st.session_state.youtube_download_location = temporary_video_location
+    st.session_state.youtube_download_resolution_index = video_choices.index(resolution_dropdown)

ytdatakit/youtube_downloader/config.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Static configuration for the video downloader.
+app_name = "ytdatakit"
+# Resolution options; the first entry is the one selected by default (index 0 in state_init).
+video_choices = ["best", "1080", "720", "360"]
+# Placeholder media paths, relative to the working directory.
+default_clip_video_path = "./data/input/blank.mp4"
+# NOTE(review): named "gif" but points at a .jpg - confirm which is intended.
+default_clip_gif_path = "./data/input/blank.jpg"

ytdatakit/youtube_downloader/state.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from ytdatakit.youtube_downloader.config import video_choices, default_clip_video_path
+import streamlit as st
+import tempfile
+def default_youtube_download_location():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        return tmpdirname
+def state_init():
+    if "resolution_dropdown" not in st.session_state:
+        st.session_state.resolution_dropdown = video_choices
+    if "youtube_download_location" not in st.session_state:
+        st.session_state.youtube_download_location = default_clip_video_path
+    if "youtube_download_resolution_index" not in st.session_state:
+        st.session_state.youtube_download_resolution_index = 0

ytdatakit/youtube_downloader/yt_download.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import yt_dlp
+from yt_dlp import YoutubeDL
+import re
+def is_valid_youtube_url(url: str) -> bool:
+    if not isinstance(url, str):
+        return False
+    pattern = r"^https://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
+    if "shorts" in url:
+        pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
+    return re.match(pattern, url) is not None
+def download_video(url: str, savedir: str, resolution_dropdown: str, my_proxies: dict = {}) -> str:
+    try:
+        print("Downloading video from youtube...")
+        if is_valid_youtube_url(url):
+            with YoutubeDL() as ydl:
+                info_dict = ydl.extract_info(url, download=False)
+                video_url = info_dict.get("url", None)
+                video_id = info_dict.get("id", None)
+                video_title = info_dict.get("title", None)
+                video_title = re.sub(r"[^a-zA-Z0-9]", " ", video_title)
+                if video_title is None:
+                    savepath = savedir + "/" + video_id + ".mp4"
+                else:
+                    savepath = savedir + "/" + video_title + ".mp4"
+            ydl_opts = {
+                "format": "bestvideo+bestaudio/best",
+                "merge_output_format": "mp4",
+                "outtmpl": savepath,
+            }
+            if resolution_dropdown == "1080":
+                ydl_opts = {
+                    "format": "bestvideo[height<=1080]+bestaudio/best",
+                    "merge_output_format": "mp4",
+                    "outtmpl": savepath,
+                }
+            if resolution_dropdown == "720":
+                ydl_opts = {
+                    "format": "bestvideo[height<=720]+bestaudio/best",
+                    "merge_output_format": "mp4",
+                    "outtmpl": savepath,
+                }
+            if resolution_dropdown == "360":
+                ydl_opts = {
+                    "format": "bestvideo[height<=360]+bestaudio/best",
+                    "merge_output_format": "mp4",
+                    "outtmpl": savepath,
+                }
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+            print("...done!")
+            return savepath
+        else:
+            raise ValueError(f"invalid input url: {url}")
+    except Exception as e:
+        raise ValueError(f"yt_download failed with exception {e}")

ytdatakit/youtube_thumbnail_downloader/__init__.py ADDED Viewed

File without changes

ytdatakit/youtube_thumbnail_downloader/app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import streamlit as st
+from ytdatakit.youtube_thumbnail_downloader.state import state_init
+from ytdatakit.youtube_thumbnail_downloader.callbacks import fetch_thumbnails
+def app():
+    """Render the thumbnail downloader tab.
+
+    Lays out a text area and a file uploader for the urls, a fetch button
+    wired to ``fetch_thumbnails``, a zip download button, and one bordered
+    card (title, image, download button) per fetched thumbnail.
+    """
+    state_init()
+    # The hidden <span id="button-..."> markers emitted below let these CSS
+    # rules colour the button that follows each marker.
+    st.markdown(
+        """
+    <style>
+    .element-container:has(style){
+        display: none;
+    }
+    #button-download {
+        display: none;
+    }
+    .element-container:has(#button-download) {
+        display: none;
+    }
+    .element-container:has(#button-download) + div button {
+        background-color: green;
+        border-color: green;
+        }
+    #button-fetch {
+        display: none;
+    }
+    .element-container:has(#button-fetch) {
+        display: none;
+    }
+    .element-container:has(#button-fetch) + div button {
+        background-color: blue;
+        border-color: blue;
+        }
+    </style>
+    """,
+        unsafe_allow_html=True,
+    )
+    base = st.container(border=True)
+    with base:
+        text_urls = st.text_area(
+            "youtube urls separated by commas",
+            value=st.session_state.thumbnail_text_input_urls if "thumbnail_text_input_urls" in st.session_state else "",
+            placeholder="https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4, ....",
+            key="thumbnail_urls_input",
+        )
+        # NOTE(review): this sets an attribute on the streamlit module, not on
+        # st.session_state - presumably st.session_state was intended; confirm.
+        st.thumbnail_text_input_urls = text_urls
+        uploaded_file = st.file_uploader("Choose a File", type=["txt"], key="thumbanils_file_uploader")
+        thumbnail_col_1, thumbnail_col_2, thumbnail_col_3 = st.columns([5, 8, 8])
+        with thumbnail_col_1:
+            st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
+            st.button(label="fetch thumbnails", type="primary", on_click=fetch_thumbnails, args=(uploaded_file, text_urls))
+        with thumbnail_col_2:
+            st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
+            # With a zip path in session state, serve that file; otherwise show a disabled placeholder button.
+            if "thumbnails_zip_path" in st.session_state:
+                with open(st.session_state.thumbnails_zip_path, "rb") as file:
+                    st.download_button(
+                        label="download thumbnails",
+                        data=file,  # st.session_state.thumbnails_zip_path if "thumbnails_zip_path" in st.session_state else "./data/input/blank.zip",
+                        file_name="thumbnails.zip",
+                        mime="application/zip",
+                        type="primary",
+                        disabled=True if st.session_state.thumbnail_fetch_count == 0 else False,
+                    )
+            else:
+                # The data here is the literal path string, not file contents; the button is disabled.
+                st.download_button(
+                    label="download thumbnails",
+                    data="./data/input/blank.zip",
+                    file_name="thumbnails.zip",
+                    mime="application/zip",
+                    type="primary",
+                    disabled=True,
+                )
+        with st.container(border=True):
+            # thumbnail_savepaths and thumbnail_data_entries are indexed in parallel.
+            for ind, thumbnail_savepath in enumerate(st.session_state.thumbnail_savepaths):
+                title = st.session_state.thumbnail_data_entries[ind]["video_title"]
+                thumbnail_savepath = st.session_state.thumbnail_savepaths[ind]
+                with st.container(border=True):
+                    a, b, c = st.columns([1, 3, 1])
+                    with b:
+                        st.subheader(title)
+                        st.image(thumbnail_savepath)
+                        with open(thumbnail_savepath, "rb") as file:
+                            # NOTE(review): the widget key is derived from the title, so two
+                            # videos with the same title would share a key - confirm this is acceptable.
+                            st.download_button(
+                                label="download thumbnail",
+                                data=file,
+                                file_name=title + ".jpg",
+                                mime="image/jpg",
+                                key=f"{title} download",
+                                type="primary",
+                            )

ytdatakit/youtube_thumbnail_downloader/callbacks.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from ytdatakit.youtube_thumbnail_downloader.yt_thumbnail_downloader import get_batch_thumbnails
+from ytdatakit.youtube_thumbnail_downloader.zip import zip_images
+from ytdatakit.youtube_thumbnail_downloader.state import reset_state
+from ytdatakit.youtube_thumbnail_downloader.config import default_thumbnail_location
+import streamlit as st
+from io import StringIO
+import tempfile
+def default_temp_savdir():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        return tmpdirname
+def urls_normalizer(uploaded_file: "st.uploaded", text_urls: str) -> list:
+    youtube_urls = []
+    if uploaded_file is not None:
+        if text_urls is not None:
+            if len(text_urls.strip()) > 0:
+                st.warning("you can enter urls manually or from file but not both", icon="⚠️")
+                st.stop()
+        if uploaded_file.type == "text/plain":
+            stringio = StringIO(uploaded_file.read().decode("utf-8"))
+            for line in stringio:
+                youtube_urls.append(line.strip())
+    if text_urls is not None:
+        if len(text_urls.strip()) > 0:
+            if uploaded_file is not None:
+                st.warning("you can enter urls manually or from file but not both", icon="⚠️")
+                st.stop()
+            try:
+                text_urls_split = text_urls.split(",")
+                text_urls_split = [v.strip() for v in text_urls_split]
+                youtube_urls = text_urls_split
+            except:  # noqa E722
+                st.warning("please check your manually entered urls", icon="⚠️")
+                st.stop()
+    return youtube_urls
+def fetch_logic(youtube_urls: list) -> None:
+    """Fetch thumbnails for ``youtube_urls`` and store the results in session state.
+
+    When the urls differ from the stored ``thumbnail_raw_urls`` the state is
+    reset first; the fetch itself only runs while ``thumbnail_fetch_count`` is 0.
+    """
+    if youtube_urls != st.session_state.thumbnail_raw_urls:
+        st.session_state.thumbnail_raw_urls = youtube_urls
+        reset_state()
+    if st.session_state.thumbnail_fetch_count == 0:
+        st.session_state.local_thumbnail_location = default_thumbnail_location()
+        # Drops the last TWO path components (the file name and its directory),
+        # so savedir is the parent of the directory named in that location.
+        savedir = "/".join(st.session_state.local_thumbnail_location.split("/")[:-2])
+        thumbnail_savepaths, thumbnail_data_entries = get_batch_thumbnails(youtube_urls, savedir)
+        st.session_state.thumbnail_savepaths = thumbnail_savepaths
+        st.session_state.thumbnail_data_entries = thumbnail_data_entries
+        st.session_state.thumbnail_fetch_count += 1
+        st.session_state.thumbnails_zip_path = savedir + "/" + "thumbnails.zip"
+        # NOTE(review): presumably zip_images writes thumbnails.zip next to the
+        # images so it matches the path stored above - verify against zip.py.
+        zip_images(thumbnail_savepaths)
+def fetch_thumbnails(uploaded_file, text_urls):
+    # with st.spinner(text="thumbnail pull in progress..."):
+    youtube_urls = urls_normalizer(uploaded_file, text_urls)
+    fetch_logic(youtube_urls)

ytdatakit/youtube_thumbnail_downloader/config.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import tempfile
+import uuid
+# Default values for the thumbnail downloader's session-state keys.
+default_thumbnail_raw_urls = ""
+# NOTE(review): these two lists are module-level mutable objects; anything that
+# stores them without copying shares one list - confirm they are never mutated.
+default_thumbnail_savepaths = []
+default_thumbnail_data_entries = []
+default_thumbnail_text_input_urls = ""
+# Placeholder zip path, relative to the working directory.
+default_thumbnails_zip_path = "./data/input/blank.zip"
+def default_thumbnail_location():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        return tmpdirname + "/temp_" + str(uuid.uuid4()) + ".jpg"

ytdatakit/youtube_thumbnail_downloader/state.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import streamlit as st
+from ytdatakit.youtube_thumbnail_downloader.config import (
+    default_thumbnail_raw_urls,
+    default_thumbnail_savepaths,
+    default_thumbnail_location,
+    default_thumbnail_data_entries,
+    default_thumbnail_text_input_urls,
+    default_thumbnails_zip_path,
+)
+def state_init():
+    if "thumbnail_raw_urls" not in st.session_state:
+        st.session_state.thumbnail_raw_urls = default_thumbnail_raw_urls
+    if "thumbnail_savepaths" not in st.session_state:
+        st.session_state.thumbnail_savepaths = default_thumbnail_savepaths
+    if "thumbnail_data_entries" not in st.session_state:
+        st.session_state.thumbnail_data_entries = default_thumbnail_data_entries
+    if "thumbnail_fetch_count" not in st.session_state:
+        st.session_state.thumbnail_fetch_count = 0
+    if "default_thumbnail_location" not in st.session_state:
+        st.session_state.local_thumbnail_location = default_thumbnail_location()
+    if "youtube_thumbnails_expander" not in st.session_state:
+        st.session_state.youtube_thumbnails_expander = False
+def reset_state():
+    st.session_state.thumbnail_savepaths = default_thumbnail_savepaths
+    st.session_state.thumbnail_text_input_urls = default_thumbnail_text_input_urls
+    st.session_state.thumbnails_zip_path = default_thumbnails_zip_path
+    st.session_state.thumbnail_text_input_urls = ""
+    st.session_state.thumbnail_fetch_count = 0
+    st.session_state.youtube_thumbnails_expander = False

ytdatakit/youtube_thumbnail_downloader/yt_thumbnail_downloader.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import re
+import requests
+from yt_dlp import YoutubeDL
+def is_valid_youtube_url(url: str) -> bool:
+    if not isinstance(url, str):
+        return False
+    pattern = r"^https://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
+    if "shorts" in url:
+        pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
+    return re.match(pattern, url) is not None
+def download_thumbnail(yt_thumbnail_url: str, savepath: str) -> None:
+    img_data = requests.get(yt_thumbnail_url).content
+    with open(savepath, "wb") as handler:
+        handler.write(img_data)
+def get_youtube_thumbnail_url(video_id: str) -> dict:
+    if video_id:
+        return {
+            "default": f"https://img.youtube.com/vi/{video_id}/default.jpg",
+            "mqdefault": f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg",
+            "hqdefault": f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg",
+            "sddefault": f"https://img.youtube.com/vi/{video_id}/sddefault.jpg",
+            "maxresdefault": f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg",
+        }
+def pull_yt_data(url: str, savedir: str, my_proxies: dict = {}) -> tuple:
+    try:
+        if is_valid_youtube_url(url):
+            with YoutubeDL() as ydl:
+                info_dict = ydl.extract_info(url, download=False)
+                video_url = info_dict.get("url", None)
+                video_id = info_dict.get("id", None)
+                video_title = info_dict.get("title", None)
+                entry = {}
+                entry["video_url"] = url
+                entry["video_id"] = video_id
+                entry["video_title"] = video_title
+                video_title = re.sub(r"[^a-zA-Z0-9]", "", video_title)
+                if video_title is None:
+                    savepath = savedir + "/" + video_id + ".jpg"
+                else:
+                    savepath = savedir + "/" + video_title + ".jpg"
+                if video_id:
+                    thumbnail_url = get_youtube_thumbnail_url(video_id)["hqdefault"]
+                    download_thumbnail(thumbnail_url, savepath)
+            print("...done!")
+            return savepath, entry
+        else:
+            raise ValueError(f"invalid input url: {url}")
+    except Exception as e:
+        raise ValueError(f"yt_download failed with exception {e}")
+def get_batch_thumbnails(yt_urls: list, savedir: str, my_proxies: dict = {}):
+    thumbnail_savepaths = []
+    entries = []
+    for url in yt_urls:
+        try:
+            thumbnail_savepath, data_entry = pull_yt_data(url, savedir, my_proxies)
+            thumbnail_savepaths.append(thumbnail_savepath)
+            entries.append(data_entry)
+        except Exception as e:
+            print(f"url {url} failed with exception {e}")
+            pass
+    return thumbnail_savepaths, entries

ytdatakit/youtube_thumbnail_downloader/zip.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import zipfile
+import os
+import streamlit as st
+def zip_images(image_paths: list):
+    print("INFO: zipping images...")
+    zip_filename = st.session_state.thumbnails_zip_path
+    with zipfile.ZipFile(zip_filename, "w") as zipf:
+        for image_path in image_paths:
+            _, filename = os.path.split(image_path)
+            zipf.write(image_path, arcname=filename)
+            print(f"Added {filename} to the zip file.")
+    print(f"...done!  images have been zipped into {zip_filename}")

ytdatakit/youtube_transcript_downloader/__init__.py ADDED Viewed

File without changes

ytdatakit/youtube_transcript_downloader/app.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from ytdatakit.youtube_transcript_downloader.callbacks import fetch_transcripts
+from ytdatakit.youtube_transcript_downloader.state import state_init
+import streamlit as st
+def app():
+    state_init()
+    st.markdown(
+        """
+    <style>
+    .element-container:has(style){
+        display: none;
+    }
+    #button-download {
+        display: none;
+    }
+    .element-container:has(#button-download) {
+        display: none;
+    }
+    .element-container:has(#button-download) + div button {
+        background-color: green;
+        border-color: green;
+        }
+    #button-fetch {
+        display: none;
+    }
+    .element-container:has(#button-fetch) {
+        display: none;
+    }
+    .element-container:has(#button-fetch) + div button {
+        background-color: blue;
+        border-color: blue;
+        }
+    </style>
+    """,
+        unsafe_allow_html=True,
+    )
+    base = st.container(border=True)
+    with base:
+        text_urls = st.text_area(
+            "youtube urls separated by commas",
+            value=st.session_state.transcript_raw_urls,
+            placeholder="https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4, ....",
+            key="transcript_urls_input",
+        )
+        uploaded_file = st.file_uploader("Choose a File", type=["txt"], key="transcripts_file_uploader")
+        transcript_col_1, transcript_col_2, transcript_col_3 = st.columns([3, 4, 6])
+        with transcript_col_1:
+            st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
+            fetch_btn = st.button(
+                label="fetch transcripts",
+                type="primary",
+            )
+            if fetch_btn:
+                df_table, df_download = fetch_transcripts(uploaded_file, text_urls)
+                st.session_state.transcript_data_table = df_table
+                st.session_state.transcript_data_download = df_download
+        with transcript_col_2:
+            st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
+            st.download_button(
+                label="download transcripts",
+                data=st.session_state.transcript_data_download,
+                file_name="transcripts.csv",
+                mime="text/csv",
+                disabled=False,
+                type="primary",
+            )
+        with st.container(border=True):
+            st.table(st.session_state.transcript_data_table)

ytdatakit/youtube_transcript_downloader/callbacks.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from ytdatakit.youtube_transcript_downloader.yt_transcript_download import get_batch_transcripts
+from io import StringIO
+import pandas as pd
+import streamlit as st
+import copy
+@st.cache_data
+def convert_df(df: pd.DataFrame) -> "csv":
+    # IMPORTANT: Cache the conversion to prevent computation on every rerun
+    return df.to_csv().encode("utf-8")
+def fetch_transcripts(uploaded_file, text_urls):
+    # with st.spinner(text="transcript pull in progress..."):
+    youtube_urls = []
+    if uploaded_file is not None:
+        if text_urls is not None:
+            if len(text_urls.strip()) > 0:
+                st.warning("you can enter urls manually or from file but not both", icon="⚠️")
+                st.stop()
+        if uploaded_file.type == "text/plain":
+            stringio = StringIO(uploaded_file.read().decode("utf-8"))
+            for line in stringio:
+                youtube_urls.append(line.strip())
+    if text_urls is not None:
+        if len(text_urls.strip()) > 0:
+            if uploaded_file is not None:
+                st.warning("you can enter urls manually or from file but not both", icon="⚠️")
+                st.stop()
+            try:
+                text_urls_split = text_urls.split(",")
+                text_urls_split = [v.strip() for v in text_urls_split]
+                youtube_urls = text_urls_split
+            except:  # noqa E722
+                st.warning("please check your manually entered urls", icon="⚠️")
+                st.stop()
+        batch_transcripts = get_batch_transcripts(youtube_urls)
+        df = pd.DataFrame(batch_transcripts)
+        df_download = convert_df(df)
+        def truncate_and_append(text, length, suffix):
+            if len(text) > length:
+                return text[:length] + suffix
+            return text
+        max_length = 100
+        suffix = "..."
+        df_table = copy.deepcopy(df).astype(str)
+        df_table["transcript"] = df_table["transcript"].apply(lambda x: truncate_and_append(x, max_length, suffix))
+        return df_table, df_download

ytdatakit/youtube_transcript_downloader/config.py ADDED Viewed

File without changes

ytdatakit/youtube_transcript_downloader/state.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import streamlit as st
+import pandas as pd
+def state_init():
+    df = pd.DataFrame(columns=["youtube_url", "video_id", "transcript"])
+    if "transcript_raw_urls" not in st.session_state:
+        st.session_state.transcript_raw_urls = ""
+    if "transcript_data_table" not in st.session_state:
+        st.session_state.transcript_data_table = df
+    if "transcript_data_download" not in st.session_state:
+        st.session_state.transcript_data_download = df.to_csv().encode("utf-8")

ytdatakit/youtube_transcript_downloader/yt_transcript_download.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import re
+from typing import List, Dict
+from youtube_transcript_api import YouTubeTranscriptApi
+def is_valid_youtube_url(url: str) -> bool:
+    if not isinstance(url, str):
+        return False
+    pattern = r"^https://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
+    if "shorts" in url:
+        pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
+    return re.match(pattern, url) is not None
+def get_single_transcript(youtube_url: str) -> dict:
+    if is_valid_youtube_url(youtube_url):
+        if "shorts" in youtube_url:
+            video_id = youtube_url.split("/")[-1]
+        else:
+            video_id = youtube_url.split("=")[-1]
+        try:
+            video_transcript = YouTubeTranscriptApi.get_transcript(video_id)
+            entry = {}
+            entry["youtube_url"] = youtube_url
+            entry["video_id"] = video_id
+            entry["transcript"] = video_transcript
+            return entry
+        except Exception as e:
+            if "Subtitles are disabled for this video" in str(e):
+                entry = {}
+                entry["youtube_url"] = youtube_url
+                entry["video_id"] = video_id
+                entry["transcript"] = "Subtitles are disabled for this video"
+                return entry
+            else:
+                print(e)
+    else:
+        print(f"FAILURE: youtube_url is not valid - {youtube_url}")
+def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
+    try:
+        entries = []
+        for i, youtube_url in enumerate(youtube_urls):
+            entry = get_single_transcript(youtube_url)
+            if entry is not None:
+                entries.append(entry)
+        return entries
+    except Exception as e:
+        print(f"FAILURE: get_batch_transcripts function failed with exception {e}")
+        return []