{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "dvidSKA14fhf" }, "source": [ "##Установка необходимых библиотек" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YImwMQLjASiK", "outputId": "16d2a28d-714a-4d88-8b1a-dc117070bcf0" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.4/50.4 kB\u001b[0m \u001b[31m355.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.0/27.0 MB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.1/227.1 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m36.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m891.9/891.9 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.9/318.9 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m396.4/396.4 kB\u001b[0m 
\u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.4/290.4 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m141.9/141.9 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ], "source": [ "!pip install faiss-cpu sentence-transformers langchain langchain-community anthropic youtube-transcript-api -q\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Axccge0wAajT", "outputId": "2e28f056-293f-412e-adb8-104133755487" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: google-api-python-client in /usr/local/lib/python3.10/dist-packages (2.137.0)\n", "Collecting google-api-python-client\n", " Downloading google_api_python_client-2.144.0-py2.py3-none-any.whl.metadata (6.7 kB)\n", "Requirement already satisfied: httplib2<1.dev0,>=0.19.0 in /usr/local/lib/python3.10/dist-packages (from google-api-python-client) (0.22.0)\n", "Requirement already satisfied: google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from google-api-python-client) (2.27.0)\n", "Requirement already satisfied: google-auth-httplib2<1.0.0,>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from google-api-python-client) (0.2.0)\n", "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5 in 
/usr/local/lib/python3.10/dist-packages (from google-api-python-client) (2.19.2)\n", "Requirement already satisfied: uritemplate<5,>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from google-api-python-client) (4.1.1)\n", "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (1.65.0)\n", "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.20.3)\n", "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.3 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (1.24.0)\n", "Requirement already satisfied: requests<3.0.0.dev0,>=2.18.0 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2.32.3)\n", "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (5.5.0)\n", "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (0.4.0)\n", "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (4.9)\n", "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /usr/local/lib/python3.10/dist-packages (from httplib2<1.dev0,>=0.19.0->google-api-python-client) (3.1.4)\n", "Requirement already satisfied: 
pyasn1<0.7.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (0.6.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.8)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2024.8.30)\n", "Downloading google_api_python_client-2.144.0-py2.py3-none-any.whl (12.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m66.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: google-api-python-client\n", " Attempting uninstall: google-api-python-client\n", " Found existing installation: google-api-python-client 2.137.0\n", " Uninstalling google-api-python-client-2.137.0:\n", " Successfully uninstalled google-api-python-client-2.137.0\n", "Successfully installed google-api-python-client-2.144.0\n" ] } ], "source": [ "pip install --upgrade google-api-python-client" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nL9Vg7lgA8nt" }, "outputs": [], "source": [ "# from langchain_community.embeddings import 
HuggingFaceEmbeddings\n", "# from langchain_community.vectorstores import FAISS\n", "# from langchain import Anthropic, LLMChain\n", "# from langchain.chains.combine_documents import create_stuff_documents_chain\n", "# from langchain.chains import create_retrieval_chain" ] }, { "cell_type": "markdown", "metadata": { "id": "Iav_VBRN4xr2" }, "source": [ "##Создаем транскрипты 3х плейлистов используя ютуб апи\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "sFYWNeL-4xIr" }, "outputs": [], "source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"from youtube_transcript_api import YouTubeTranscriptApi\n",
"from googleapiclient.discovery import build\n",
"\n",
"# Never store API keys in the notebook: read the key from the environment and\n",
"# fall back to an interactive prompt. The key that used to be hard-coded here\n",
"# has been exposed and must be revoked in the Google Cloud console.\n",
"api_key = os.environ.get(\"YOUTUBE_API_KEY\") or getpass(\"YouTube Data API key: \")\n",
"\n",
"# playlistItems.list accepts maxResults in the range 0-50 (a request for 100\n",
"# was still served with 'resultsPerPage': 50).\n",
"MAX_RESULTS_PER_PAGE = 50\n",
"\n",
"\n",
"def get_playlist_video_ids(playlist_id, api_key):\n",
"    \"\"\"Return the IDs of all videos in a YouTube playlist.\n",
"\n",
"    Args:\n",
"        playlist_id: Playlist ID (the value of the ``list=`` URL parameter).\n",
"        api_key: YouTube Data API v3 key.\n",
"\n",
"    Returns:\n",
"        List of video ID strings, in the order the API returns them,\n",
"        collected over all result pages.\n",
"    \"\"\"\n",
"    youtube = build('youtube', 'v3', developerKey=api_key)\n",
"\n",
"    video_ids = []\n",
"    next_page_token = None\n",
"\n",
"    while True:\n",
"        # Fetch one page of playlist items\n",
"        request = youtube.playlistItems().list(\n",
"            part=\"contentDetails\",\n",
"            playlistId=playlist_id,\n",
"            maxResults=MAX_RESULTS_PER_PAGE,\n",
"            pageToken=next_page_token\n",
"        )\n",
"        response = request.execute()\n",
"\n",
"        # Collect the video IDs of this page\n",
"        video_ids.extend(item['contentDetails']['videoId'] for item in response.get('items', []))\n",
"\n",
"        # Follow pagination until the API stops returning a token\n",
"        next_page_token = response.get('nextPageToken')\n",
"        if not next_page_token:\n",
"            break\n",
"\n",
"    return video_ids\n"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bnMIT3apAajU", "outputId": "f5387bde-46e1-43c1-b57d-923e2b77cb6e" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'kind': 'youtube#playlistItemListResponse',\n", " 'etag': '0wxbScWJEx_DaocUEV-JDgNAMHA',\n", " 'items': [{'kind': 'youtube#playlistItem',\n", " 'etag': 'U_uuou2Zq_IzljATHIKBnpq5pF0',\n", " 'id': 
'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS41NkI0NEY2RDEwNTU3Q0M2',\n", " 'contentDetails': {'videoId': 'z9ccH9e5cAw',\n", " 'videoPublishedAt': '2024-06-24T09:00:05Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'NoecQqYGI39FM6InP5iIscg7lHE',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS4yODlGNEE0NkRGMEEzMEQy',\n", " 'contentDetails': {'videoId': 'ff-S_tjr1OI',\n", " 'videoPublishedAt': '2024-06-25T08:51:18Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'TfPjzS1U2WEiC-qB0ncl9JnJqiI',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS4wMTcyMDhGQUE4NTIzM0Y5',\n", " 'contentDetails': {'videoId': 'T_NW1nlq3ic',\n", " 'videoPublishedAt': '2024-06-26T08:58:11Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'MDquxrdeaV3UkCjADQYdR_8CRYE',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS4wOTA3OTZBNzVEMTUzOTMy',\n", " 'contentDetails': {'videoId': 'sTVWtYORqjU',\n", " 'videoPublishedAt': '2024-06-27T11:10:12Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'Sftasrd9qUNA6VmlBG_mk5Vh3KE',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS4xMkVGQjNCMUM1N0RFNEUx',\n", " 'contentDetails': {'videoId': '06rbC2eMXy0',\n", " 'videoPublishedAt': '2024-07-01T08:31:49Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'xxuE04WGmzx-zfjSMW6ZwvrO0qs',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS41MzJCQjBCNDIyRkJDN0VD',\n", " 'contentDetails': {'videoId': 'qeqzWqWxTog',\n", " 'videoPublishedAt': '2024-07-01T08:44:27Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'vHgQ6ae0zxARGd72v_-ex3CspUk',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS5DQUNERDQ2NkIzRUQxNTY1',\n", " 'contentDetails': {'videoId': 'DyL2uSTDumY',\n", " 'videoPublishedAt': '2024-07-02T08:42:57Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': '-l0vvdpMi4ZuLZ7K3781PewokHY',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS45NDk1REZENzhEMzU5MDQz',\n", " 'contentDetails': 
{'videoId': 'isiNNDXiRYY',\n", " 'videoPublishedAt': '2024-07-03T08:39:09Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': '9_B45Ia97bKTpwyzAXI_pjcr55M',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS5GNjNDRDREMDQxOThCMDQ2',\n", " 'contentDetails': {'videoId': 'AoUF4DtdV24',\n", " 'videoPublishedAt': '2024-07-04T08:15:49Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'kpIXTBdaOPE2S8_jCoZqQTDw934',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS5EMEEwRUY5M0RDRTU3NDJC',\n", " 'contentDetails': {'videoId': '5zORIoqJkF4',\n", " 'videoPublishedAt': '2024-07-04T15:32:43Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'b7eqdiletjAJV9GRuLjxXS4T3NA',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS45ODRDNTg0QjA4NkFBNkQy',\n", " 'contentDetails': {'videoId': 'JX8cGs4uC2Y',\n", " 'videoPublishedAt': '2024-07-05T08:10:57Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'Yie-XxEUmvDzc1Y71RtE3io-qCg',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS4zMDg5MkQ5MEVDMEM1NTg2',\n", " 'contentDetails': {'videoId': 'lExBtpri2oU',\n", " 'videoPublishedAt': '2024-07-08T10:31:02Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'd-axDR7ToQUxNX8mp_tbV4HUdTc',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS41Mzk2QTAxMTkzNDk4MDhF',\n", " 'contentDetails': {'videoId': 'ur5hgkStOCg',\n", " 'videoPublishedAt': '2024-07-08T10:31:49Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'Uci8akxtRTs55XtlswkBrdGmJF8',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS5EQUE1NTFDRjcwMDg0NEMz',\n", " 'contentDetails': {'videoId': 'vTVjtDgmY9M',\n", " 'videoPublishedAt': '2024-07-09T09:07:02Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'Y4Hqj4hcljCA6z4-acwwdrDMOe0',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS41QTY1Q0UxMTVCODczNThE',\n", " 'contentDetails': {'videoId': 'AbimRQHQY4A',\n", " 'videoPublishedAt': '2024-07-10T08:45:18Z'}},\n", " {'kind': 
'youtube#playlistItem',\n", " 'etag': 'ImpiFamIx3naHHTqbzpgHdrlsMM',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS4yMUQyQTQzMjRDNzMyQTMy',\n", " 'contentDetails': {'videoId': 'nopExGduRHc',\n", " 'videoPublishedAt': '2024-07-12T07:00:41Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': '4cagmU4UdNCuukYPZHQKFE0AV-k',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS45RTgxNDRBMzUwRjQ0MDhC',\n", " 'contentDetails': {'videoId': 'ag4zmHI7QQM',\n", " 'videoPublishedAt': '2024-07-15T08:14:16Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': '29kjP__km-l-aOW8zEJ_hsul3r8',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS5ENDU4Q0M4RDExNzM1Mjcy',\n", " 'contentDetails': {'videoId': 'nQl1KC0yNrw',\n", " 'videoPublishedAt': '2024-07-16T07:36:39Z'}},\n", " {'kind': 'youtube#playlistItem',\n", " 'etag': 'KjXXwlnAuUfvD-gK1k2jPFHtqJU',\n", " 'id': 'UExZU0h0TlBiQUlObmJxWGpJYk4tYzdEb3JqQ1Q2ZVlPUS4yMDhBMkNBNjRDMjQxQTg1',\n", " 'contentDetails': {'videoId': '0BHc_kJoDEY',\n", " 'videoPublishedAt': '2024-07-17T09:04:26Z'}}],\n", " 'pageInfo': {'totalResults': 19, 'resultsPerPage': 50}}" ] }, "metadata": {}, "execution_count": 6 } ], "source": [
"# Sanity check: inspect the raw API response for one playlist\n",
"youtube = build('youtube', 'v3', developerKey=api_key)\n",
"request = youtube.playlistItems().list(\n",
"    part=\"contentDetails\",\n",
"    playlistId='PLYSHtNPbAINnbqXjIbN-c7DorjCT6eYOQ',\n",
"    maxResults=50,  # the API serves at most 50 items per page\n",
"    # pageToken=response.get('nextPageToken')\n",
")\n",
"result1 = request.execute()\n",
"result_pagetoken = result1.get('nextPageToken')\n",
"result1\n",
"#result_pagetoken - nothing\n"
] }, { "cell_type": "markdown", "metadata": { "id": "5yDtqgsjAajV" }, "source": [ "### EN transcripts all" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "NfYgich3AajV" }, "outputs": [], "source": [
"\n",
"def get_transcript_en(video_id, language_code='en'):\n",
"    \"\"\"Fetch a video's transcript as a single space-joined string.\n",
"\n",
"    Args:\n",
"        video_id: YouTube video ID.\n",
"        language_code: Transcript language to request.\n",
"\n",
"    Returns:\n",
"        The transcript text, or None when the transcript cannot be fetched.\n",
"        (Returning the exception message instead would get that message\n",
"        indexed in the knowledge base as if it were transcript text.)\n",
"    \"\"\"\n",
"    try:\n",
"        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language_code])\n",
"    except Exception as e:\n",
"        # Best effort: one video without captions must not abort the whole playlist\n",
"        print(f\"No '{language_code}' transcript for {video_id}: {type(e).__name__}\")\n",
"        return None\n",
"    return \" \".join(entry['text'] for entry in transcript)\n",
"\n",
"\n",
"def get_video_details(video_id, api_key):\n",
"    \"\"\"Return the title of a video, or None if the API returns no item for it.\n",
"\n",
"    Args:\n",
"        video_id: YouTube video ID.\n",
"        api_key: YouTube Data API v3 key.\n",
"    \"\"\"\n",
"    youtube = build('youtube', 'v3', developerKey=api_key)\n",
"\n",
"    # Request the snippet part, which carries the title\n",
"    request = youtube.videos().list(\n",
"        part=\"snippet\",\n",
"        id=video_id\n",
"    )\n",
"    response = request.execute()\n",
"\n",
"    items = response.get('items') or []\n",
"    if items:\n",
"        return items[0]['snippet']['title']\n",
"    return None\n",
"\n",
"\n",
"def get_playlist_transcripts_en(playlist_url, api_key, language_code='en'):\n",
"    \"\"\"Collect the title and transcript of every video in a playlist.\n",
"\n",
"    Args:\n",
"        playlist_url: Playlist or watch URL containing a ``list=`` parameter.\n",
"        api_key: YouTube Data API v3 key.\n",
"        language_code: Transcript language to request.\n",
"\n",
"    Returns:\n",
"        List of ``{'title': ..., 'transcript': ...}`` dicts. Videos whose\n",
"        transcript cannot be fetched are skipped.\n",
"    \"\"\"\n",
"    # Keep only the value of \"list=\", dropping any parameters that follow it\n",
"    playlist_id = playlist_url.split(\"list=\")[-1].split(\"&\")[0]\n",
"\n",
"    video_ids = get_playlist_video_ids(playlist_id, api_key)\n",
"\n",
"    transcripts = []\n",
"    for video_id in video_ids:\n",
"        transcript = get_transcript_en(video_id, language_code)\n",
"        if transcript is None:\n",
"            continue\n",
"        # Fall back to the video ID so that every entry has a usable title\n",
"        video_title = get_video_details(video_id, api_key) or video_id\n",
"        transcripts.append({'title': video_title, 'transcript': transcript})\n",
"\n",
"    return transcripts\n"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "w_6MYrKTAajV" }, "outputs": [], "source": [ "# Sources:\n", "playlist_ml_en = \"https://www.youtube.com/watch?v=Gv9_4yMHFhI&list=PLblh5JKOoLUICTaGLRoHQDuF_7q2GfuJF\"\n", "playlist_logistic_en = \"https://www.youtube.com/watch?v=yIYKR4sgzI8&list=PLblh5JKOoLUKxzEP5HA2d-Li7IJkHfXSe\"\n", "playlist_nn_en = \"https://www.youtube.com/watch?v=zxagGtF9MeU&list=PLblh5JKOoLUIxGDQs4LFFD--41Vzf-ME1\"\n", "playlist_stat_en = \"https://www.youtube.com/watch?v=qBigTkBLU6g&list=PLblh5JKOoLUK0FLuzwntyYI10UQFUhsY9\"\n", "playlist_nn2_en = \"https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi\"\n", 
"playlist_linal2_en = \"https://www.youtube.com/playlist?list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab\"\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QYmaFtk_5O6C" }, "outputs": [], "source": [ "transcripts_ML_en = get_playlist_transcripts_en(playlist_ml_en, api_key, 'en')\n", "# 2min 8 sec" ] }, { "cell_type": "code", "source": [ "len(transcripts_ML_en)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Sko_ef_2BUP6", "outputId": "b47cfbf2-6196-466d-fc28-b6db8ef8a959" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "99" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Cd5jJYKdAajV" }, "outputs": [], "source": [ "# tier 2\n", "transcripts_logistic_en = get_playlist_transcripts_en(playlist_logistic_en, api_key, 'en')\n", "transcripts_NN_en = get_playlist_transcripts_en(playlist_nn_en, api_key, 'en')\n", "transcripts_stat_en = get_playlist_transcripts_en(playlist_stat_en, api_key, 'en')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1m-VG8K9AajV" }, "outputs": [], "source": [ "# tier 3\n", "transcripts_nn2_en = get_playlist_transcripts_en(playlist_nn2_en, api_key, 'en')\n", "transcripts_linal2_en = get_playlist_transcripts_en(playlist_linal2_en, api_key, 'en')" ] }, { "cell_type": "markdown", "metadata": { "id": "wpCh1vb2AajW" }, "source": [ "### RU transcripts all" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "b6WypAXMAajW" }, "outputs": [], "source": [ "\n", "def get_transcript_ru(video_id, language_code='ru'):\n", " try:\n", " transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language_code])\n", " transcript_text = \" \".join([entry['text'] for entry in transcript])\n", " return transcript_text\n", " except Exception as e:\n", " return str(e)\n", "\n", "def get_video_details(video_id, api_key):\n", " youtube = 
build('youtube', 'v3', developerKey=api_key)\n", "\n", " # Получаем информацию о видео\n", " request = youtube.videos().list(\n", " part=\"snippet\",\n", " id=video_id\n", " )\n", " response = request.execute()\n", "\n", " if 'items' in response and len(response['items']) > 0:\n", " return response['items'][0]['snippet']['title']\n", " else:\n", " return None\n", "\n", "def get_playlist_transcripts_ru(playlist_url, api_key, language_code='ru'):\n", " # Извлекаем playlist_id из URL\n", " playlist_id = playlist_url.split(\"list=\")[-1]\n", "\n", " # Получаем все video_id из плейлиста\n", " video_ids = get_playlist_video_ids(playlist_id, api_key)\n", "\n", " transcripts = []\n", "\n", " # Проходимся по всем видео и получаем транскрипты\n", " for video_id in video_ids:\n", " video_title = get_video_details(video_id, api_key)\n", " transcript = get_transcript_ru(video_id, language_code)\n", " transcripts.append({'title': video_title, 'transcript': transcript})\n", "\n", " return transcripts\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5990TRMnAajW" }, "outputs": [], "source": [ "# Sources Elbrus\n", "playlist_phase_1_url = \"https://www.youtube.com/playlist?list=PLYSHtNPbAINnbqXjIbN-c7DorjCT6eYOQ\"\n", "playlist_phase_2_url = 'https://www.youtube.com/playlist?list=PLYSHtNPbAINnNvDXtGNmC7-F1QRH7qTgb'\n", "playlist_phase_3_url = 'https://www.youtube.com/playlist?list=PLYSHtNPbAINlmyNNmTaqcn3BsaY8v1xgV'\n", "\n", "# Sources except Bootcamp:\n", "playlist_NN_ru = 'https://www.youtube.com/playlist?list=PL0Ks75aof3Tiru-UvOvYmXzD1tU0NrR8V'\n", "playlist_OOP_ru = 'https://www.youtube.com/watch?v=Z7AY41tE-3U&list=PLA0M1Bcd0w8zPwP7t-FgwONhZOHt9rz9E'\n", "playlist_linal_ru = 'https://youtube.com/playlist?list=PLAQWsvWQlb6cIRY6yJtYnXCbxLxPZv6-Z'\n", "playlist_docker_ru = 'https://www.youtube.com/watch?v=jVV8CVURmrE&list=PLqVeG_R3qMSwjnkMUns_Yc4zF_PtUZmB-'\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SojIu0NP5VNt" }, 
"outputs": [], "source": [ "# Elbrus\n", "transcripts_phase_1 = get_playlist_transcripts_ru(playlist_phase_1_url, api_key, 'ru')\n", "transcripts_phase_2 = get_playlist_transcripts_ru(playlist_phase_2_url, api_key, 'ru')\n", "transcripts_phase_3 = get_playlist_transcripts_ru(playlist_phase_3_url, api_key, 'ru')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "VEDGiNEHAajW" }, "outputs": [], "source": [ "# other Ru\n", "transcripts_NN_ru = get_playlist_transcripts_ru(playlist_NN_ru, api_key, 'ru')\n", "transcripts_OOP_ru = get_playlist_transcripts_ru(playlist_OOP_ru, api_key, 'ru')\n", "transcripts_linal_ru = get_playlist_transcripts_ru(playlist_linal_ru, api_key, 'ru')\n", "transcripts_docker_ru = get_playlist_transcripts_ru(playlist_docker_ru, api_key, 'ru')\n", "\n", "# 3m12s" ] }, { "cell_type": "markdown", "metadata": { "id": "KPB9GcSfAajW" }, "source": [ "### Aggregate all Knowledge Base" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fp1Jb8BiAajW" }, "outputs": [], "source": [ "transcripts_all = [transcripts_phase_1, transcripts_phase_2, transcripts_phase_3, transcripts_NN_ru, transcripts_OOP_ru, transcripts_linal_ru, transcripts_docker_ru, \\\n", " transcripts_ML_en, transcripts_logistic_en, transcripts_NN_en, transcripts_stat_en, transcripts_nn2_en, transcripts_linal2_en]\n" ] }, { "cell_type": "markdown", "metadata": { "id": "x08xCtkk5ocW" }, "source": [ "## Нарезаем все транскрипты на фрагменты с overlap(нахлест), преобразуем каждый фрагмент в вектор и все вектора записываем в векторное хранилище FAISS" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "gbbajjDK5niN", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "40059afc-575f-4d7a-d8c8-ff1cfafbc69e" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using 
`tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " from tqdm.autonotebook import tqdm, trange\n" ] } ], "source": [
"from langchain_core.documents import Document\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from sentence_transformers import SentenceTransformer\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"\n",
"# Convert data to Document objects: one Document per video, with the transcript\n",
"# as content and the video title as metadata. Each item is a dict of the form\n",
"# {'title': ..., 'transcript': ...}; unpacking item.items() would instead yield\n",
"# two bogus documents per video (the title as page_content under the metadata\n",
"# title \"title\", and the transcript under the metadata title \"transcript\").\n",
"docs = []\n",
"for playlist in transcripts_all:\n",
"    for item in playlist:\n",
"        if not item['transcript']:\n",
"            continue  # nothing to index for this video\n",
"        docs.append(\n",
"            Document(\n",
"                page_content=item['transcript'],\n",
"                metadata={\"title\": item['title']},\n",
"            )\n",
"        )\n",
"\n",
"# Split documents into overlapping chunks\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
"split_docs = text_splitter.split_documents(docs)\n"
] }, { "cell_type": "code", "source": [ "split_docs[0].page_content" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "id": "y04EIegH0EbH", "outputId": "4e1d67f6-f496-429c-e62b-fa260356e0f0" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'1. 
Probability'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 32 } ] }, { "cell_type": "code", "source": [ "#Setup the new embeddings model\n", "model_name = \"intfloat/multilingual-e5-base\"\n", "embeddings = HuggingFaceEmbeddings(model_name=model_name)\n", "\n", "# Create the FAISS vector store and save it locally\n", "vector_store = FAISS.from_documents(split_docs, embedding=embeddings)\n", "vector_store.save_local(\"faiss_index\")\n", "\n", "# Load the FAISS vector store from local storage\n", "vector_store = FAISS.load_local('faiss_index', embeddings=embeddings, allow_dangerous_deserialization=True)\n", "\n", "# Create the retriever for document retrieval\n", "embedding_retriever = vector_store.as_retriever(search_kwargs={\"k\": 15})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 503, "referenced_widgets": [ "c5841a4ce6bb4080bfd12fe45f9ef0ac", "b02022ee58da420e8eb3cfd9eb3f43d5", "3461edf56f2b474898430f891d99674c", "57e03ace47344b7d8768230e87f9a59b", "89f056a38392430eb597e1bae5be4659", "bc25be9987a14271a755ad0df670ca81", "959555740acc4a6497f85eaa745e3517", "84444adba9c64c49b18f6416775cb4c0", "37c442a4df204027b2f34bbd3e88ce92", "0f15db45f67d4136962279c7709d70fe", "9fc9fa9e63014a429b8718ac7452d7db", "48e65d69916a481e99144e0dd20f7252", "bd7bcc8db719412295587b2e0698119f", "0abbd878f96048c89525d33be6f6c2e8", "28b999acf27f4b818839120a705b6126", "605434ea75fc451795172f89008f8d95", "fdabcf226eb84b77986df5d69020eb15", "3935905da9af4dc099969908d6eedc2b", "3c570f314c9948aeab372e6f470566ee", "f365eb8705da44fea831803ef9c6cde0", "bfe275eb45824f859c573e66b916fbe6", "32b975e4c50c4e1396a70d4f81f59428", "2d6b6e723b394db1bda1628a67f29cd0", "5eb1e004b462473c91c5004119120481", "1f8e2782862e45c9979c10716ab5317a", "ad0a89dd4a6e4b16ba5a44717c459d82", "3a23ba98917a485a8b5a13564a309e83", "368b070adba9447ea065fe8c81f7ed91", "1b294c12e7d9493f9d13b771fb8a1302", 
"3119a5a291d84cbebaae86af46809302", "bc2e3f4cba83499ca60b11363e00b015", "da89f2ca38fb41b9b1728af45f1c6bd6", "f2ae90fd201441069307d0db9fcafcb5", "ddd607992f5a42598b5029a7d1c0735f", "faedbf86ad894465803b6b5c599d044c", "98c929fd22574c1fbc2b5d6b02d709e0", "d0c12c42aa8848d78a30953fe4c06043", "365e8d5a9fc041e69f591edee6341551", "dba5975516064d2683be2deb6686a0fa", "61692a8a2ce74fe598009b7e892bde1c", "ea1626bcb92c4362a7b8e21338d64ccc", "29f97006c32a4159bca0b40d559be2cf", "d97633070f21444f81f8c4e80d6b0efa", "ea8ea4ed7e1947af9ace049fffb0e5a1", "98fe33de61954aa2a0d6417c9ff12ee3", "273f53b7092c40d6a0d6a5b4d48da496", "4e6143a3e0d74384956df5317939852b", "77655507da874611be4e54a6953b7b39", "8d7df79192974dd3b176c133f7931543", "6331c3d717d34504ad1ffa88e94bdcd1", "552b4de06bbd43dcaef196a9ab9842d3", "4724d2b7db7e4c6592bfb32d5ae525fe", "38c9a29185b642489f8ffb257b72986d", "d77c0bf187a34cf3a66f8ab7f16c18e4", "06aadc98340f44a3b9170c53529bb621", "101f51b1dc994c58aea7e505678d5b60", "55f8475d0c6f403a99a75c8a689949e9", "b057272180204b0d9b2e8871a282695b", "fa6ddf942f4f489980e9daa0aba85896", "901ea41d6bef4187a2f868e64d828de9", "ef9449e7edcc4e96b0d6fcc32cc0d9cd", "8607bec554ba4150bfa3b73a7a7f9342", "b3bf5dba20cd442d936883006bf7014c", "e8e6d660042443daac145e62fc2b13a1", "b9009138ad534f049aab8123d089db99", "3f815eaccb1445bc87253fff0b8566fe", "26007c6d5f7b4e26995158272fd85376", "27b3d43982a64fe884cfd247da8a8d80", "02d333df54f446b49b4288e1e53026f2", "2a4f51c02020478d9ff6feef621f6521", "fa00438099664a2db8058ad1594b39cf", "c0e25fdc6ef04f8e9f2199c9bb2a535c", "f01bc5455cc8494791c16a2f4223bae6", "e9241a31ae2545e1ae5912310d955786", "66c807881a7a4b21b026849abc385fa0", "1e87056de38948f8a74c58163c747fde", "069d31eeb61f48f0bd254b4fc4d0245e", "eb22d97e7faa4129a75081c582ba6738", "3704511fadb648988d791f24b5f053f4", "d3e1937157734a74bc621058bdcd5807", "9bd8fc8ff5244929825e707161243edd", "c222d9d1e5514242a507a18a576dea34", "829597ddccab4023b1e98517578d3f1a", "48c828bf4d9e4ebcbdc58a1603a3ef50", 
"0bee9a8e7f49497da1bdbf50e94f3783", "67e3b43194f84b6bafa7b1f5af34c573", "337187f84dda4191974849353af3286c", "827842b86add4f5ebaab427098e016d2", "b9436002c7934f0f96a0da2f53fc3bc9", "71ea6d7c1a5c462bbc2c1b5e45e6980c", "60687dbde2104e29a5a42475d41802e5", "ba6d06200925454a912ae53d0916e70e", "46ee806e48eb48068f65c51e706d0a21", "4db6960655c84e3fbe6c36c1d9b001c7", "12b392354f60499eaf57ce129f68d51e", "3c3f34c148a94183826e4623fc0b64de", "ff11e37f068040699e0bf563e7889e5b", "6f467238d2344fd9b795158e839e221c", "171ef26577734650872d856f010f3a7e", "0abc174237c14c58881d5e47f4184d48", "b521bc45dc5440e9938269318dff6142", "fd522056a8604554ac34c46633ed6c19", "583e8464244d4ca8b1d471b1870f2f92", "238f168ef1ec401b8ba52678b261a1a9", "daff4f1f67e84c01a4e7c5f539b3d97b", "4c36ba21d43c465f85f545a5adc3575e", "9a11281b56c3417eb0c98578db6f2c2b", "0178d61509c340d2b96aa2ceb8adc4f6", "fc0defcca1a14e5aa4a870c92c8d177a", "7d0210ad5f954d269f16be72039328e6" ] }, "id": "nOtDOvDBC8St", "outputId": "96a533ef-e7f8-440d-8dcc-5dcb4a515825" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":3: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the langchain-huggingface package and should be used instead. 
To use it run `pip install -U langchain-huggingface` and import as `from langchain_huggingface import HuggingFaceEmbeddings`.\n", " embeddings = HuggingFaceEmbeddings(model_name=model_name)\n", "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "modules.json: 0%| | 0.00/387 [00:00