{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# from selenium import webdriver\n", "# from selenium.webdriver.common.by import By\n", "# from selenium.webdriver.common.keys import Keys\n", "# from bs4 import BeautifulSoup\n", "# import time\n", "# # !pip install tensorflow tensorflow-hub\n", "# import tensorflow as tf\n", "# import tensorflow_hub as hub\n", "# import numpy as np\n", "# # !pip install jellyfish\n", "# import jellyfish" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# !pip show selenium\n", "# !pip show beautifulsoup4\n", "# !pip show numpy\n", "# !pip show tensorflow\n", "# !pip show tensorflow-hub\n", "# !pip show jellyfish\n", "# !pip show streamlit" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# starting_topic = \"soulja boy\"\n", "# target_topic = \"fart\"\n", "\n", "# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Version 3" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-07-30 09:07:17.451238: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.common.keys import Keys\n", "from bs4 import BeautifulSoup\n", "import time\n", "# !pip install tensorflow tensorflow-hub\n", "import tensorflow as tf\n", "import tensorflow_hub as hub\n", "import numpy as np\n", "import requests\n", "import json\n", "\n", "# Load the pre-trained Universal Sentence Encoder\n", "embed = hub.load(\"https://tfhub.dev/google/universal-sentence-encoder/4\")" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "------------------------------------------------------------------------------------------------------------------------------------------------------\n", "\n", "Starting!\n", "\n", "------------------------------------------------------------------------------------------------------------------------------------------------------\n", "\n", "Page: 1\n", "Current topic: 'Soulja Boy'\n", "Current URL: 'https://en.wikipedia.org/wiki/Soulja_Boy'\n", "Current Topic Context: 'DeAndre Cortez Way (born July 28, 1990), known professionally as Soulja Boy (formerly Soulja Boy Tell 'Em), is an American rapper and record producer'\n", "Next topic: 'Peewee Longway'. Semantic similarity to 'Urine': 21.81%\n", "\n", "Page: 2\n", "Current topic: 'Peewee Longway'\n", "Current URL: 'https://en.wikipedia.org/wiki/Peewee_Longway'\n", "Current Topic Context: 'Quincy Lamont Williams (born August 17, 1984), known by his stage name Peewee Longway, is an American rapper best known for his mixtape The Blue M&M and his collaboration with Young Thug, \"Loaded\"'\n", "Next topic: 'Hip Hop'. 
Semantic similarity to 'Urine': 12.0%\n", "\n", "Page: 3\n", "Current topic: 'Hip Hop'\n", "Current URL: 'https://en.wikipedia.org/wiki/Hip_hop_music'\n", "Current Topic Context: 'Hip hop or hip-hop, also known as rap and formerly known as disco rap,[5][6] is a genre of popular music that was originated in the Bronx[7][8][9][10] borough of New York City in the early 1970s by African Americans,[11][12][13] having existed for several years prior to mainstream discovery.[14] Hip hop originated as an anti-drug and anti-violence genre,[15] while consisting of stylized rhythmic music (usually built around drum beats) that commonly accompanies rapping, a rhythmic and rhyming speech that is chanted.[16] According to the professor Asante of African American studies at Temple University, \"hip hop is something that blacks can unequivocally claim as their own\".[17] It was developed as part of hip hop culture, a subculture defined by four key stylistic elements: MCing/rapping, DJing/scratching with turntables, break dancing, and graffiti art.[18][19][20] Other elements include sampling beats or bass lines from records (or synthesized beats and sounds), and rhythmic beatboxing'\n", "Next topic: 'Rufus Thomas'. Semantic similarity to 'Urine': 21.79%\n", "\n", "Page: 4\n", "Current topic: 'Rufus Thomas'\n", "Current URL: 'https://en.wikipedia.org/wiki/Rufus_Thomas'\n", "Current Topic Context: 'Rufus C'\n", "Next topic: 'Rabbit Foot Minstrels'. Semantic similarity to 'Urine': 19.28%\n", "\n", "Page: 5\n", "Current topic: 'Rabbit Foot Minstrels'\n", "Current URL: 'https://en.wikipedia.org/wiki/The_Rabbit%27s_Foot_Company'\n", "Current Topic Context: 'The Rabbit's Foot Company, also known as the Rabbit('s) Foot Minstrels and colloquially as \"The Foots\", was a long-running minstrel and variety troupe that toured as a tent show in the American South between 1900 and the late 1950s'\n", "Next topic: 'Jstor'. Semantic similarity to 'Urine': 11.85%\n", "\n", "Page: 6\n", "Current topic: 'Jstor'\n", "Current URL: 'https://en.wikipedia.org/wiki/JSTOR'\n", "Current Topic Context: 'JSTOR (/ˈdʒeɪstɔːr/; short for Journal Storage)[2] is a digital library founded in 1994'\n", "Next topic: 'Nieman Lab'. Semantic similarity to 'Urine': 12.14%\n", "\n", "Page: 7\n", "Current topic: 'Nieman Lab'\n", "Current URL: 'https://en.wikipedia.org/wiki/Nieman_Foundation_for_Journalism'\n", "Current Topic Context: 'The Nieman Foundation for Journalism at Harvard University is the primary journalism institution at Harvard.'\n", "Next topic: 'Men'S Soccer'. Semantic similarity to 'Urine': 14.43%\n", "\n", "Page: 8\n", "Current topic: 'Men'S Soccer'\n", "Current URL: 'https://en.wikipedia.org/wiki/Harvard_Crimson_men%27s_soccer'\n", "Current Topic Context: 'The Harvard Crimson men's soccer team is an intercollegiate varsity sports team of Harvard University'\n", "Next topic: 'California Golden Bears Men'S Soccer'. Semantic similarity to 'Urine': 17.31%\n", "\n", "Page: 9\n", "Current topic: 'California Golden Bears Men'S Soccer'\n", "Current URL: 'https://en.wikipedia.org/wiki/California_Golden_Bears_men%27s_soccer'\n", "Current Topic Context: 'The California Golden Bears men's soccer team is a varsity intercollegiate athletic team of University of California, Berkeley in Berkeley, California, United States.[1] The team is a member of the Pac-12 Conference, which is part of the National Collegiate Athletic Association's Division I'\n", "Next topic: 'California Drinking Song'. 
Semantic similarity to 'Urine': 15.78%\n", "\n", "Page: 10\n", "Current topic: 'California Drinking Song'\n", "Current URL: 'https://en.wikipedia.org/wiki/California_Drinking_Song'\n", "Current Topic Context: '\"California Drinking Song\" is a spirit song from the University of California, Berkeley'\n", "Next topic: 'Uc Men'S Octet'. Semantic similarity to 'Urine': 15.63%\n", "\n", "Page: 11\n", "Current topic: 'Uc Men'S Octet'\n", "Current URL: 'https://en.wikipedia.org/wiki/University_of_California_Men%27s_Octet'\n", "Current Topic Context: 'The UC Men's Octet, sometimes termed the Cal Men’s Octet or the UC Berkeley Men’s Octet, is an eight-member male a cappella group at the University of California, Berkeley'\n", "Next topic: 'Laboratories'. Semantic similarity to 'Urine': 15.45%\n", "\n", "Page: 12\n", "Current topic: 'Laboratories'\n", "Current URL: 'https://en.wikipedia.org/wiki/Research_centers_and_laboratories_at_the_University_of_California,_Berkeley'\n", "Current Topic Context: 'The University of California, Berkeley, contains many research centers and laboratories.'\n", "Next topic: 'Uc Irvine Medical Center'. Semantic similarity to 'Urine': 18.16%\n", "\n", "Page: 13\n", "Current topic: 'Uc Irvine Medical Center'\n", "Current URL: 'https://en.wikipedia.org/wiki/University_of_California,_Irvine_Medical_Center'\n", "Current Topic Context: 'The University of California, Irvine Medical Center (UCIMC or UCI Medical Center) is a major research hospital located in Orange, California'\n", "Next topic: 'Sepsis'. Semantic similarity to 'Urine': 19.29%\n", "\n", "Page: 14\n", "Current topic: 'Sepsis'\n", "Current URL: 'https://en.wikipedia.org/wiki/Sepsis'\n", "Current Topic Context: 'Sepsis (septicaemia in British English), or blood poisoning,[8][9] is a life-threatening condition that arises when the body's response to infection causes injury to its own tissues and organs.[4][8]'\n", "Next topic: 'Urinary Tract'. Semantic similarity to 'Urine': 51.26%\n", "\n", "Page: 15\n", "Current topic: 'Urinary Tract'\n", "Current URL: 'https://en.wikipedia.org/wiki/Urinary_system'\n", "Current Topic Context: 'The urinary system, also known as the urinary tract or renal system, consists of the kidneys, ureters, bladder, and the urethra'\n", "Next topic: 'Urinary Bladder'. Semantic similarity to 'Urine': 61.01%\n", "\n", "Page: 16\n", "Current topic: 'Urinary Bladder'\n", "Current URL: 'https://en.wikipedia.org/wiki/Bladder'\n", "Current Topic Context: 'The bladder is a hollow organ in humans and other vertebrates that stores urine from the kidneys before disposal by urination'\n", "Next topic: 'Urination § Anatomy Of The Bladder And Outlet'. Semantic similarity to 'Urine': 57.69%\n", "\n", "Page: 17\n", "Current topic: 'Urination § Anatomy Of The Bladder And Outlet'\n", "Current URL: 'https://en.wikipedia.org/wiki/Urination#Anatomy_of_the_bladder_and_outlet'\n", "Current Topic Context: 'Urination is the release of urine from the urinary bladder through the urethra to the outside of the body'\n", "Next topic: 'Urine'. 
Semantic similarity to 'Urine': 57.28%\n", "\n", "Page: 18\n", "Current topic: 'Urine'\n", "Current URL: 'https://en.wikipedia.org/wiki/Urine'\n", "Current Topic Context: 'Urine is a liquid by-product of metabolism in humans and in many other animals'\n", "\n", "------------------------------------------------------------------------------------------------------------------------------------------------------\n", "\n", "From 'Soulja Boy', to 'Urine' in 18 pages, 8.54 seconds!\n", "Starting topic: 'Soulja Boy': 'https://en.wikipedia.org/wiki/Soulja_Boy'\n", "Target topic: 'Urine': 'https://en.wikipedia.org/wiki/Urine'\n", "\n", "------------------------------------------------------------------------------------------------------------------------------------------------------\n" ] } ], "source": [ "def most_similar_sentence(target_topic, labels_list):\n", " # Encode the context sentence and all sentences in the list\n", " context_embedding = embed([target_topic])[0]\n", " sentence_embeddings = embed(labels_list)\n", " \n", " # Calculate cosine similarities between the context sentence and each sentence in the list\n", " similarities = np.inner(context_embedding, sentence_embeddings)\n", " \n", " # Find the index of the most similar sentence\n", " most_similar_index = np.argmax(similarities)\n", " \n", " return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index\n", "\n", "def search_wikipedia(search_term):\n", " # Define the endpoint\n", " endpoint = \"https://en.wikipedia.org/w/api.php\"\n", "\n", " # Define the search parameters\n", " params = {\n", " \"action\": \"query\",\n", " \"format\": \"json\",\n", " \"list\": \"search\",\n", " \"srsearch\": search_term\n", " }\n", "\n", " # Send a GET request to the endpoint with your parameters\n", " response = requests.get(url = endpoint, params = params)\n", "\n", " # Parse the results as JSON\n", " data = json.loads(response.text)\n", "\n", " # Get the title of the first result (this will be used as the page title in the next step)\n", " page_title = data[\"query\"][\"search\"][0][\"title\"]\n", "\n", " # if \"may refer to\" in data[\"query\"][\"search\"][0][\"snippet\"].lower():\n", " # page_title = data[\"query\"][\"search\"][1][\"title\"]\n", "\n", " # Construct the URL of the Wikipedia page\n", " page_url = \"https://en.wikipedia.org/wiki/{}\".format(page_title.replace(\" \", \"_\"))\n", "\n", " return page_url, page_title\n", "\n", "def get_topic_context(driver, more = False):\n", " # Find the first paragraph of the main article\n", " first_paragraph = driver.find_element(By.CSS_SELECTOR, \"div.mw-parser-output > p:not(.mw-empty-elt)\").text\n", "\n", " if more:\n", " context_sentence = \". \".join(first_paragraph.split(\". \")[:5])\n", " else:\n", " context_sentence = first_paragraph.split(\". 
\")[0]\n", "\n", " return context_sentence\n", "\n", "# bad_words = [word for word in open(\"censored.txt\", \"r\").readlines()]\n", "bad_words = [word.strip() for word in open(\"censored.txt\", \"r\").readlines()]\n", "\n", "def refine_links(topic, links, current_url_suffix, used_links, used_topics, censor = False):\n", "\n", " links_texts = []\n", "\n", " # Iterate through the links and extract their URLs\n", " for link in links:\n", " link_url = link.get('href')\n", " if link_url and link_url.startswith(\"/wiki/\"):\n", " link_url = \"https://en.wikipedia.org\" + link_url\n", " link_text = link.text.strip() # Get the text and remove leading/trailing spaces\n", "\n", " # make sure they are both not None\n", " if link_text and current_url_suffix not in link_url:\n", "\n", " if link_url not in used_links and link_text.lower() not in [topic.lower() for topic in used_topics]:\n", "\n", " # eliminates topic duplicates, non-wiki links, and wiki-help pages (non-content pages)\n", " if topic.lower() not in link_url.lower() and \"en.wikipedia.org/wiki/\" in link_url and \":\" not in \"\".join(link_url.split(\"/\")[1:]) and \"Main_Page\" != str(link_url.split(\"/\")[-1]):\n", "\n", " # censoring if needed\n", " if censor:\n", " if not any(word1.lower() in bad_words for word1 in [word.lower() for word in link_text.split()]):\n", " links_texts.append((link_url, link_text))\n", " else:\n", " links_texts.append((link_url, link_text))\n", "\n", " return links_texts\n", "\n", "def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):\n", "\n", " ##### Setup Chrome options\n", " chrome_options = webdriver.ChromeOptions()\n", " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n", " chrome_options.add_argument(\"--no-sandbox\")\n", " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n", " driver = webdriver.Chrome(options = chrome_options)\n", "\n", " #### Getting target url, topic, and context\n", " driver_target = webdriver.Chrome(options = chrome_options)\n", " target_url, target_topic = search_wikipedia(search_term = target_topic)\n", " driver_target.get(target_url)\n", " target_context = get_topic_context(driver_target, more = True)\n", " # print(target_context)\n", " driver_target.quit()\n", "\n", " topic = starting_topic\n", " num_pages = 0\n", " used_topics = []\n", " used_links = []\n", "\n", " start_time = time.time()\n", "\n", " ### BEGIN ###\n", "\n", " print(\"-\" * 150)\n", " print(f\"\\nStarting!\\n\")\n", " print(\"-\" * 150)\n", "\n", " url, topic = search_wikipedia(search_term = starting_topic)\n", " driver.get(url)\n", " used_topics.append(topic)\n", " used_links.append(driver.current_url)\n", "\n", " while True:\n", " # increment the page tracking by 1 for each new page\n", " num_pages += 1\n", "\n", " # if not the first page, navigate to the new page\n", " if num_pages > 1:\n", " driver.get(next_link)\n", "\n", " try:\n", " context_sentence = get_topic_context(driver)\n", " except Exception as e:\n", " context_sentence = \"Context could not be found from webpage\"\n", "\n", " current_url = driver.current_url\n", " current_url_suffix = str(current_url).split(\"/\")[-1]\n", "\n", " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n", " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n", "\n", " soup = BeautifulSoup(current_page, 'html.parser')\n", "\n", " links = soup.find_all('a')\n", "\n", " # get rid of any bloat in the links from the page\n", " links_texts = refine_links(topic, links, 
current_url_suffix, used_links, used_topics)\n", "\n", " # best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])\n", " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context.lower(), labels_list = [text.lower() for link, text in links_texts])\n", "\n", " print(f\"\\nPage: {num_pages}\")\n", " print(f\"Current topic: '{topic.title()}'\")\n", " print(f\"Current URL: '{current_url}'\")\n", " print(f\"Current Topic Context: '{context_sentence}'\")\n", " if current_url != target_url:\n", " print(f\"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%\")\n", "\n", " next_link, topic = links_texts[loc_idx]\n", "\n", " used_links.append(next_link)\n", " used_topics.append(topic)\n", "\n", " if current_url == target_url: # because the target_url is now found through the API\n", " print(\"\\n\" + \"-\" * 150)\n", " print(f\"\\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\")\n", " print(f\"Starting topic: '{starting_topic.title()}': '{used_links[0]}'\")\n", " print(f\"Target topic: '{target_topic.title()}': '{target_url}'\\n\")\n", " print(\"-\" * 150)\n", " driver.quit()\n", " break\n", "\n", " if num_pages == limit:\n", " print(\"\\n\" + \"-\" * 150)\n", " print(f\"\\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\")\n", " print(f\"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'\")\n", " print(f\"\\nTry a different combination to see if it can do it!\\n\")\n", " print(\"-\" * 150)\n", " driver.quit()\n", " break\n", "\n", "###### Example\n", "\n", "starting_topic = 'soulja boy'\n", "target_topic = 'urine'\n", "\n", "play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# starting_topic = 'soulja boy'\n", "# target_topic = 'fart'\n", "\n", "# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Tracking Stats" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "def play_wiki_game_stats(starting_topic: str, target_topic: str, limit: int = 200):\n", "\n", " stats_dict = {}\n", "\n", " ##### Setup Chrome options\n", " chrome_options = webdriver.ChromeOptions()\n", " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n", " chrome_options.add_argument(\"--no-sandbox\")\n", " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n", " driver = webdriver.Chrome(options = chrome_options)\n", "\n", " #### Getting target url, topic, and context\n", " driver_target = webdriver.Chrome(options = chrome_options)\n", " target_url, target_topic = search_wikipedia(search_term = target_topic)\n", " driver_target.get(target_url)\n", " target_context = get_topic_context(driver_target)\n", " print(target_context)\n", " print()\n", " driver_target.quit()\n", " \n", " topic = starting_topic\n", " num_pages = 0\n", " used_topics = []\n", " used_links = []\n", " contexts = []\n", " sim_to_target_scores = []\n", "\n", " start_time = time.time()\n", "\n", " ### BEGIN ###\n", "\n", " print(\"-\" * 
150)\n", " print(f\"\\nStarting!\\n\")\n", " print(\"-\" * 150)\n", "\n", " url, topic = search_wikipedia(search_term = starting_topic)\n", " driver.get(url)\n", " used_topics.append(topic)\n", " used_links.append(driver.current_url)\n", " sim_to_target_scores.append(most_similar_sentence(target_topic = target_context, labels_list = [topic])[1])\n", "\n", " while True:\n", " # increment the page tracking by 1 for each new page\n", " num_pages += 1\n", "\n", " # if not the first page, navigate to the new page\n", " if num_pages > 1:\n", " driver.get(next_link)\n", "\n", " context_sentence = get_topic_context(driver)\n", " contexts.append(context_sentence)\n", "\n", " current_url = driver.current_url\n", " current_url_suffix = str(current_url).split(\"/\")[-1]\n", "\n", " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n", " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n", "\n", " soup = BeautifulSoup(current_page, 'html.parser')\n", "\n", " links = soup.find_all('a')\n", "\n", " # get rid of any bloat in the links from the page\n", " links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)\n", "\n", " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context, labels_list = [text for link, text in links_texts])\n", "\n", " print(f\"\\nPage: {num_pages}\")\n", " print(f\"Current topic: '{topic.title()}'\")\n", " print(f\"Current URL: '{current_url}'\")\n", " print(f\"Current Topic Context: '{context_sentence}'\")\n", " if current_url != target_url:\n", " print(f\"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%\")\n", " \n", " next_link, topic = links_texts[loc_idx]\n", "\n", " # contexts.append(context_sentence)\n", "\n", " if current_url == target_url: # because the target_url is now found through the API\n", " print(\"\\n\" + \"-\" * 150)\n", " print(f\"\\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\")\n", " print(f\"Starting topic: '{starting_topic.title()}': '{used_links[0]}'\")\n", " print(f\"Target topic: '{target_topic.title()}': '{used_links[-1]}'\\n\")\n", " print(\"-\" * 150)\n", "\n", " stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(num_pages)]\n", " stats_dict['topic'] = used_topics\n", " stats_dict['context'] = contexts\n", " stats_dict['sim_to_target'] = sim_to_target_scores\n", " stats_dict['url'] = used_links\n", " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n", " driver.quit()\n", " return stats_dict\n", " break\n", "\n", " ##### ADD DRAMATIC DELAY HERE #####\n", " # time.sleep(0.5)\n", " # time.sleep(10)\n", "\n", " if num_pages == limit:\n", " print(\"\\n\" + \"-\" * 150)\n", " print(f\"\\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\")\n", " print(f\"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'\")\n", " print(f\"\\nTry a different combination to see if it can do it!\\n\")\n", " print(\"-\" * 150)\n", "\n", " stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(num_pages)]\n", " stats_dict['topic'] = used_topics\n", " stats_dict['context'] = contexts\n", " stats_dict['sim_to_target'] = sim_to_target_scores\n", " stats_dict['url'] = 
used_links\n", " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n", " driver.quit()\n", " return stats_dict\n", " break\n", "\n", " used_links.append(next_link)\n", " used_topics.append(topic)\n", " sim_to_target_scores.append(best_score)\n", "\n", "# starting_topic = 'john mayer'\n", "# target_topic = 'fart'\n", "\n", "# stats_dict = play_wiki_game_stats(starting_topic = starting_topic, target_topic = target_topic, limit = 200)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['start_end', 'topic', 'context', 'sim_to_target', 'url', 'page_num'])\n", "[6, 6, 6, 6, 6, 6]\n" ] } ], "source": [ "# stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(7)]\n", "print(stats_dict.keys())\n", "print([len(stats_dict[key]) for key in stats_dict.keys()])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0.027460583, 0.20852715, 0.2775123, 0.31147623, 0.4413054, 0.6199604]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats_dict['sim_to_target']" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " start_end topic \\\n", "0 john mayer_Flatulence John Mayer \n", "1 john mayer_Flatulence cardiac dysrhythmia \n", "2 john mayer_Flatulence prolapse \n", "3 john mayer_Flatulence gastrointestinal disturbances \n", "4 john mayer_Flatulence gastrointestinal tract \n", "5 john mayer_Flatulence flatulence \n", "\n", " context sim_to_target \\\n", "0 John Clayton Mayer[1] (/ˈmeɪ.ər/ MAY-ər; born ... 0.027461 \n", "1 Arrhythmias, also known as cardiac arrhythmias... 0.208527 \n", "2 Mitral valve prolapse (MVP) is a valvular hear... 0.277512 \n", "3 Gastrointestinal diseases (abbrev 0.311476 \n", "4 The gastrointestinal tract (GI tract, digestiv... 0.441305 \n", "5 Flatulence, in humans, is the expulsion of gas... 0.619960 \n", "\n", " url page_num \n", "0 https://en.wikipedia.org/wiki/John_Mayer 1 \n", "1 https://en.wikipedia.org/wiki/Cardiac_dysrhythmia 2 \n", "2 https://en.wikipedia.org/wiki/Mitral_valve_pro... 3 \n", "3 https://en.wikipedia.org/wiki/Gastrointestinal... 4 \n", "4 https://en.wikipedia.org/wiki/Human_gastrointe... 5 \n", "5 https://en.wikipedia.org/wiki/Flatulence 6 " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "stats_df = pd.DataFrame(stats_dict)\n", "stats_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Simluations" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "110\n" ] }, { "data": { "text/plain": [ "[('Sushi', 'Mars'),\n", " ('Sushi', 'Beethoven'),\n", " ('Sushi', 'Mount Everest'),\n", " ('Sushi', 'Humpback Whale'),\n", " ('Sushi', 'The Great Wall of China')]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import itertools\n", "\n", "unrelated_list = [\n", " \"Sushi\",\n", " \"Mars\",\n", " \"Beethoven\",\n", " \"Mount Everest\",\n", " \"Humpback Whale\",\n", " \"The Great Wall of China\",\n", " \"Photography\",\n", " \"Pyramids of Egypt\",\n", " \"Albert Einstein\",\n", " \"Rainforests\",\n", " 'buggy'\n", "]\n", "\n", "# Generate all permutations of pairs\n", "pair_permutations = list(itertools.permutations(unrelated_list, 2))\n", "\n", "print(len(pair_permutations)) # no pairs with self\n", "pair_permutations[:5]" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fruits': ['apple', 'banana', 'orange', 'grapes', 'kiwi'], 'animals': ['cat', 'dog', 'elephant', 'tiger', 'lion'], 'cities': ['New York', 'London'], 'colors': ['red', 'blue']}\n" ] } ], "source": [ "# Initial dictionary\n", "main_dict = {\n", " 'fruits': ['apple', 'banana', 'orange'],\n", " 'animals': ['cat', 'dog', 'elephant'],\n", "}\n", "\n", "# Function to add a new dictionary to the main_dict\n", "def add_to_main_dict(main_dict, new_dict):\n", " for key, value in new_dict.items():\n", " if key in main_dict:\n", " main_dict[key].extend(value)\n", " else:\n", " main_dict[key] = value\n", "\n", "# New dictionary to add to main_dict\n", "new_dict1 = {\n", " 'fruits': ['grapes', 'kiwi'],\n", " 'cities': ['New York', 'London'],\n", "}\n", "\n", "# Add new_dict1 to main_dict\n", "add_to_main_dict(main_dict, new_dict1)\n", "\n", "# New dictionary to add to main_dict\n", "new_dict2 = {\n", " 'animals': ['tiger', 'lion'],\n", " 'colors': ['red', 'blue'],\n", "}\n", "\n", "# Add new_dict2 to main_dict\n", "add_to_main_dict(main_dict, new_dict2)\n", "\n", "# Print the updated 
main_dict\n", "print(main_dict)\n" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Start: 'Sushi'. End: 'Mars'. Page: 8\n", "Start: 'Sushi'. End: 'Ludwig van Beethoven'. Page: 9\n", "Start: 'Sushi'. End: 'Mount Everest'. Page: 4\n", "Start: 'Sushi'. End: 'Humpback whale'. Page: 3\n", "Start: 'Sushi'. End: 'Great Wall of China'. Page: 7\n", "Start: 'Sushi'. End: 'Photography'. Page: 29\n", "Start: 'Sushi'. End: 'Egyptian pyramids'. Page: 23\n", "Start: 'Sushi'. End: 'Albert Einstein'. Page: 12\n", "Start: 'Sushi'. End: 'Rainforest'. Page: 7\n", "Start: 'Sushi'. End: 'Buggy'. Page: 200\n", "Start: 'Mars'. End: 'Sushi'. Page: 19\n", "Start: 'Mars'. End: 'Ludwig van Beethoven'. Page: 4\n", "Start: 'Mars'. End: 'Mount Everest'. Page: 2\n", "Start: 'Mars'. End: 'Humpback whale'. Page: 4\n", "Start: 'Mars'. End: 'Great Wall of China'. Page: 13\n", "Start: 'Mars'. End: 'Photography'. Page: 32\n", "Start: 'Mars'. End: 'Egyptian pyramids'. Page: 3\n", "Start: 'Mars'. End: 'Albert Einstein'. Page: 6\n", "Start: 'Mars'. End: 'Rainforest'. Page: 7\n", "Start: 'Mars'. End: 'Buggy'. Page: 200\n", "Start: 'Beethoven'. End: 'Sushi'. Page: 17\n", "Start: 'Beethoven'. End: 'Mars'. Page: 3\n", "Start: 'Beethoven'. End: 'Mount Everest'. Page: 6\n", "Start: 'Beethoven'. End: 'Humpback whale'. Page: 4\n", "Start: 'Beethoven'. End: 'Great Wall of China'. Page: 14\n", "Start: 'Beethoven'. End: 'Photography'. Page: 31\n", "Start: 'Beethoven'. End: 'Egyptian pyramids'. Page: 8\n", "Start: 'Beethoven'. End: 'Albert Einstein'. Page: 3\n", "Start: 'Beethoven'. End: 'Rainforest'. Page: 15\n", "Start: 'Beethoven'. End: 'Buggy'. Page: 200\n", "Start: 'Mount Everest'. End: 'Sushi'. Page: 14\n", "Start: 'Mount Everest'. End: 'Mars'. Page: 2\n", "Start: 'Mount Everest'. End: 'Ludwig van Beethoven'. Page: 23\n", "Start: 'Mount Everest'. End: 'Humpback whale'. Page: 7\n", "Start: 'Mount Everest'. End: 'Great Wall of China'. Page: 6\n", "Start: 'Mount Everest'. End: 'Photography'. Page: 29\n", "Start: 'Mount Everest'. End: 'Egyptian pyramids'. Page: 8\n", "Start: 'Mount Everest'. End: 'Albert Einstein'. Page: 5\n", "Start: 'Mount Everest'. End: 'Rainforest'. Page: 7\n", "Start: 'Mount Everest'. End: 'Buggy'. Page: 200\n", "Start: 'Humpback Whale'. End: 'Sushi'. Page: 9\n", "Start: 'Humpback Whale'. End: 'Mars'. Page: 19\n", "Start: 'Humpback Whale'. End: 'Ludwig van Beethoven'. Page: 29\n", "Start: 'Humpback Whale'. End: 'Mount Everest'. Page: 5\n", "Start: 'Humpback Whale'. End: 'Great Wall of China'. Page: 12\n", "Start: 'Humpback Whale'. End: 'Photography'. Page: 5\n", "Start: 'Humpback Whale'. End: 'Egyptian pyramids'. Page: 5\n", "Start: 'Humpback Whale'. End: 'Albert Einstein'. Page: 8\n", "Start: 'Humpback Whale'. End: 'Rainforest'. Page: 3\n", "Start: 'Humpback Whale'. End: 'Buggy'. Page: 200\n", "Start: 'The Great Wall of China'. End: 'Sushi'. Page: 7\n", "Start: 'The Great Wall of China'. End: 'Mars'. Page: 13\n", "Start: 'The Great Wall of China'. End: 'Ludwig van Beethoven'. Page: 10\n", "Start: 'The Great Wall of China'. End: 'Mount Everest'. Page: 3\n", "Start: 'The Great Wall of China'. End: 'Humpback whale'. Page: 11\n", "Start: 'The Great Wall of China'. End: 'Photography'. Page: 48\n", "Start: 'The Great Wall of China'. End: 'Egyptian pyramids'. Page: 5\n", "Start: 'The Great Wall of China'. End: 'Albert Einstein'. Page: 7\n", "Start: 'The Great Wall of China'. End: 'Rainforest'. 
Page: 4\n", "Start: 'The Great Wall of China'. End: 'Buggy'. Page: 200\n", "Start: 'Photography'. End: 'Sushi'. Page: 15\n", "Start: 'Photography'. End: 'Mars'. Page: 13\n", "Start: 'Photography'. End: 'Ludwig van Beethoven'. Page: 26\n", "Start: 'Photography'. End: 'Mount Everest'. Page: 8\n", "Start: 'Photography'. End: 'Humpback whale'. Page: 10\n", "Start: 'Photography'. End: 'Great Wall of China'. Page: 3\n", "Start: 'Photography'. End: 'Egyptian pyramids'. Page: 6\n", "Start: 'Photography'. End: 'Albert Einstein'. Page: 21\n", "Start: 'Photography'. End: 'Rainforest'. Page: 8\n", "Start: 'Photography'. End: 'Buggy'. Page: 200\n", "Start: 'Pyramids of Egypt'. End: 'Sushi'. Page: 7\n", "Start: 'Pyramids of Egypt'. End: 'Mars'. Page: 7\n", "Start: 'Pyramids of Egypt'. End: 'Ludwig van Beethoven'. Page: 62\n", "Start: 'Pyramids of Egypt'. End: 'Mount Everest'. Page: 8\n", "Start: 'Pyramids of Egypt'. End: 'Humpback whale'. Page: 10\n", "Start: 'Pyramids of Egypt'. End: 'Great Wall of China'. Page: 8\n", "Start: 'Pyramids of Egypt'. End: 'Photography'. Page: 31\n", "Start: 'Pyramids of Egypt'. End: 'Albert Einstein'. Page: 3\n", "Start: 'Pyramids of Egypt'. End: 'Rainforest'. Page: 10\n", "Start: 'Pyramids of Egypt'. End: 'Buggy'. Page: 200\n", "Start: 'Albert Einstein'. End: 'Sushi'. Page: 10\n", "Start: 'Albert Einstein'. End: 'Mars'. Page: 3\n", "Start: 'Albert Einstein'. End: 'Ludwig van Beethoven'. Page: 2\n", "Start: 'Albert Einstein'. End: 'Mount Everest'. Page: 5\n", "Start: 'Albert Einstein'. End: 'Humpback whale'. Page: 18\n", "Start: 'Albert Einstein'. End: 'Great Wall of China'. Page: 8\n", "Start: 'Albert Einstein'. End: 'Photography'. Page: 42\n", "Start: 'Albert Einstein'. End: 'Egyptian pyramids'. Page: 7\n", "Start: 'Albert Einstein'. End: 'Rainforest'. Page: 6\n", "Start: 'Albert Einstein'. End: 'Buggy'. Page: 200\n", "Start: 'Rainforests'. End: 'Sushi'. Page: 3\n", "Start: 'Rainforests'. End: 'Mars'. Page: 7\n", "Start: 'Rainforests'. End: 'Ludwig van Beethoven'. Page: 18\n", "Start: 'Rainforests'. End: 'Mount Everest'. Page: 7\n", "Start: 'Rainforests'. End: 'Humpback whale'. Page: 4\n", "Start: 'Rainforests'. End: 'Great Wall of China'. Page: 4\n", "Start: 'Rainforests'. End: 'Photography'. Page: 38\n", "Start: 'Rainforests'. End: 'Egyptian pyramids'. Page: 7\n", "Start: 'Rainforests'. End: 'Albert Einstein'. Page: 8\n", "Start: 'Rainforests'. End: 'Buggy'. Page: 200\n", "Start: 'buggy'. End: 'Sushi'. Page: 6\n", "Start: 'buggy'. End: 'Mars'. Page: 8\n", "Start: 'buggy'. End: 'Ludwig van Beethoven'. Page: 28\n", "Start: 'buggy'. End: 'Mount Everest'. Page: 8\n", "Start: 'buggy'. End: 'Humpback whale'. Page: 19\n", "Start: 'buggy'. End: 'Great Wall of China'. Page: 12\n", "Start: 'buggy'. End: 'Photography'. Page: 54\n", "Start: 'buggy'. End: 'Egyptian pyramids'. Page: 9\n", "Start: 'buggy'. End: 'Albert Einstein'. Page: 35\n", "Start: 'buggy'. End: 'Rainforest'. 
Page: 9\n" ] } ], "source": [ "def play_wiki_game_stats(starting_topic: str, target_topic: str, limit: int = 200):\n", "\n", " stats_dict = {}\n", "\n", " ##### Setup Chrome options\n", " chrome_options = webdriver.ChromeOptions()\n", " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n", " chrome_options.add_argument(\"--no-sandbox\")\n", " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n", " driver = webdriver.Chrome(options = chrome_options)\n", "\n", " #### Getting target url, topic, and context\n", " driver_target = webdriver.Chrome(options = chrome_options)\n", " target_url, target_topic = search_wikipedia(search_term = target_topic)\n", " driver_target.get(target_url)\n", " target_context = get_topic_context(driver_target)\n", " driver_target.quit()\n", " \n", " topic = starting_topic\n", " num_pages = 0\n", " used_topics = []\n", " used_links = []\n", " contexts = []\n", " sim_to_target_scores = []\n", "\n", " start_time = time.time()\n", "\n", " url, topic = search_wikipedia(search_term = starting_topic)\n", " driver.get(url)\n", " used_topics.append(topic)\n", " used_links.append(driver.current_url)\n", " sim_to_target_scores.append(most_similar_sentence(target_topic = target_context, labels_list = [topic])[1])\n", "\n", " while True:\n", " # increment the page tracking by 1 for each new page\n", " num_pages += 1\n", "\n", " # if not the first page, navigate to the new page\n", " if num_pages > 1:\n", " driver.get(next_link)\n", "\n", " context_sentence = get_topic_context(driver)\n", " contexts.append(context_sentence)\n", "\n", " current_url = driver.current_url\n", " current_url_suffix = str(current_url).split(\"/\")[-1]\n", "\n", " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n", " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n", "\n", " soup = BeautifulSoup(current_page, 'html.parser')\n", "\n", " links = soup.find_all('a')\n", "\n", " # get rid of any bloat in the links from the page\n", " links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)\n", "\n", " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context, labels_list = [text for link, text in links_texts])\n", "\n", " print(f\"Start: '{starting_topic}'. End: '{target_topic}'. 
Page: {num_pages}\", end = '\\r')\n", "\n", " next_link, topic = links_texts[loc_idx]\n", "\n", " if current_url == target_url: # because the target_url is now found through the API\n", " print()\n", " stats_dict['start'] = [starting_topic for i in range(num_pages)]\n", " stats_dict['target'] = [target_topic for i in range(num_pages)]\n", " stats_dict['topic'] = used_topics\n", " stats_dict['context'] = contexts\n", " stats_dict['sim_to_target'] = sim_to_target_scores\n", " # stats_dict['time_seconds'] = times\n", " stats_dict['url'] = used_links\n", " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n", " add_to_main_dict(master_dict, stats_dict)\n", " driver.quit()\n", " break\n", "\n", " if num_pages == limit:\n", " print()\n", " stats_dict['start'] = [starting_topic for i in range(num_pages)]\n", " stats_dict['target'] = [target_topic for i in range(num_pages)]\n", " stats_dict['topic'] = used_topics\n", " stats_dict['context'] = contexts\n", " stats_dict['sim_to_target'] = sim_to_target_scores\n", " stats_dict['url'] = used_links\n", " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n", " driver.quit()\n", " add_to_main_dict(master_dict, stats_dict)\n", " break\n", "\n", " used_links.append(next_link)\n", " used_topics.append(topic)\n", " sim_to_target_scores.append(best_score)\n", "\n", "master_dict = {}\n", "master_dict['start'] = []\n", "master_dict['target'] = []\n", "master_dict['topic'] = []\n", "master_dict['context'] = []\n", "master_dict['sim_to_target'] = []\n", "master_dict['url'] = []\n", "master_dict['page_num'] = []\n", "\n", "# starting_topic = 'john mayer'\n", "# target_topic = 'fart'\n", "\n", "for starting_topic, target_topic in pair_permutations:\n", " play_wiki_game_stats(starting_topic = starting_topic, target_topic = target_topic, limit = 200)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['start', 'target', 'topic', 'context', 'sim_to_target', 'url', 'page_num'])\n", "[3238, 3238, 3238, 3238, 3238, 3238, 3238]\n" ] } ], "source": [ "print(master_dict.keys())\n", "print([len(master_dict[key]) for key in master_dict.keys()])" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " start target topic \\\n", "0 Sushi Mars Sushi \n", "1 Sushi Mars Planet Money \n", "2 Sushi Mars Pacifica Foundation \n", "3 Sushi Mars Mars Hill \n", "4 Sushi Mars Equinox Mountain \n", "... ... ... ... \n", "3233 buggy Rainforest Forests of the United States \n", "3234 buggy Rainforest boreal forests \n", "3235 buggy Rainforest Deciduous forests \n", "3236 buggy Rainforest Tropical deciduous forest \n", "3237 buggy Rainforest rainforests \n", "\n", " context sim_to_target \\\n", "0 Sushi (すし, 寿司, 鮨, 鮓, pronounced [sɯɕiꜜ] or [sɯ... 0.046150 \n", "1 Planet Money is an American podcast and blog p... 0.494693 \n", "2 Pacifica Foundation is an American non-profit ... 0.186643 \n", "3 The Mars Hill Network is a network of Christia... 0.466525 \n", "4 Equinox Mountain is the highest peak of the Ta... 0.196999 \n", "... ... ... \n", "3233 It has been estimated that before European set... 0.437653 \n", "3234 Taiga (/ˈtaɪɡə/; Russian: тайга́; relates to M... 0.474700 \n", "3235 Temperate deciduous or temperate broad-leaf fo... 0.501480 \n", "3236 The tropical and subtropical dry broadleaf for... 0.480779 \n", "3237 Rainforests are forests characterized by a clo... 0.482825 \n", "\n", " url page_num \n", "0 https://en.wikipedia.org/wiki/Sushi 1 \n", "1 https://en.wikipedia.org/wiki/Planet_Money 2 \n", "2 https://en.wikipedia.org/wiki/Pacifica_Foundation 3 \n", "3 https://en.wikipedia.org/wiki/Mars_Hill_Network 4 \n", "4 https://en.wikipedia.org/wiki/Equinox_Mountain 5 \n", "... ... ... \n", "3233 https://en.wikipedia.org/wiki/Forests_of_the_U... 5 \n", "3234 https://en.wikipedia.org/wiki/Boreal_forest 6 \n", "3235 https://en.wikipedia.org/wiki/Temperate_decidu... 7 \n", "3236 https://en.wikipedia.org/wiki/Tropical_deciduo... 8 \n", "3237 https://en.wikipedia.org/wiki/Rainforest 9 \n", "\n", "[3238 rows x 7 columns]" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "master_df = pd.DataFrame(master_dict)\n", "master_df.to_csv(\"data/3238x7.csv\", index = False)\n", "master_df" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.2" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }