{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# from selenium import webdriver\n", "# from selenium.webdriver.common.by import By\n", "# from selenium.webdriver.common.keys import Keys\n", "# from bs4 import BeautifulSoup\n", "# import time\n", "# # !pip install tensorflow tensorflow-hub\n", "# import tensorflow as tf\n", "# import tensorflow_hub as hub\n", "# import numpy as np\n", "# # !pip install jellyfish\n", "# import jellyfish" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# !pip show selenium\n", "# !pip show beautifulsoup4\n", "# !pip show numpy\n", "# !pip show tensorflow\n", "# !pip show tensorflow-hub\n", "# !pip show jellyfish\n", "# !pip show streamlit" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# starting_topic = \"soulja boy\"\n", "# target_topic = \"fart\"\n", "\n", "# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Version 3" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-07-30 09:07:17.451238: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.common.keys import Keys\n", "from bs4 import BeautifulSoup\n", "import time\n", "# !pip install tensorflow tensorflow-hub\n", "import tensorflow as tf\n", "import tensorflow_hub as hub\n", "import numpy as np\n", "import requests\n", "import json\n", "\n", "# Load the pre-trained Universal Sentence Encoder\n", "embed = hub.load(\"https://tfhub.dev/google/universal-sentence-encoder/4\")" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "------------------------------------------------------------------------------------------------------------------------------------------------------\n", "\n", "Starting!\n", "\n", "------------------------------------------------------------------------------------------------------------------------------------------------------\n", "\n", "Page: 1\n", "Current topic: 'Soulja Boy'\n", "Current URL: 'https://en.wikipedia.org/wiki/Soulja_Boy'\n", "Current Topic Context: 'DeAndre Cortez Way (born July 28, 1990), known professionally as Soulja Boy (formerly Soulja Boy Tell 'Em), is an American rapper and record producer'\n", "Next topic: 'Peewee Longway'. Semantic similarity to 'Urine': 21.81%\n", "\n", "Page: 2\n", "Current topic: 'Peewee Longway'\n", "Current URL: 'https://en.wikipedia.org/wiki/Peewee_Longway'\n", "Current Topic Context: 'Quincy Lamont Williams (born August 17, 1984), known by his stage name Peewee Longway, is an American rapper best known for his mixtape The Blue M&M and his collaboration with Young Thug, \"Loaded\"'\n", "Next topic: 'Hip Hop'. 
Semantic similarity to 'Urine': 12.0%\n", "\n", "Page: 3\n", "Current topic: 'Hip Hop'\n", "Current URL: 'https://en.wikipedia.org/wiki/Hip_hop_music'\n", "Current Topic Context: 'Hip hop or hip-hop, also known as rap and formerly known as disco rap,[5][6] is a genre of popular music that was originated in the Bronx[7][8][9][10] borough of New York City in the early 1970s by African Americans,[11][12][13] having existed for several years prior to mainstream discovery.[14] Hip hop originated as an anti-drug and anti-violence genre,[15] while consisting of stylized rhythmic music (usually built around drum beats) that commonly accompanies rapping, a rhythmic and rhyming speech that is chanted.[16] According to the professor Asante of African American studies at Temple University, \"hip hop is something that blacks can unequivocally claim as their own\".[17] It was developed as part of hip hop culture, a subculture defined by four key stylistic elements: MCing/rapping, DJing/scratching with turntables, break dancing, and graffiti art.[18][19][20] Other elements include sampling beats or bass lines from records (or synthesized beats and sounds), and rhythmic beatboxing'\n", "Next topic: 'Rufus Thomas'. Semantic similarity to 'Urine': 21.79%\n", "\n", "Page: 4\n", "Current topic: 'Rufus Thomas'\n", "Current URL: 'https://en.wikipedia.org/wiki/Rufus_Thomas'\n", "Current Topic Context: 'Rufus C'\n", "Next topic: 'Rabbit Foot Minstrels'. Semantic similarity to 'Urine': 19.28%\n", "\n", "Page: 5\n", "Current topic: 'Rabbit Foot Minstrels'\n", "Current URL: 'https://en.wikipedia.org/wiki/The_Rabbit%27s_Foot_Company'\n", "Current Topic Context: 'The Rabbit's Foot Company, also known as the Rabbit('s) Foot Minstrels and colloquially as \"The Foots\", was a long-running minstrel and variety troupe that toured as a tent show in the American South between 1900 and the late 1950s'\n", "Next topic: 'Jstor'. Semantic similarity to 'Urine': 11.85%\n", "\n", "Page: 6\n", "Current topic: 'Jstor'\n", "Current URL: 'https://en.wikipedia.org/wiki/JSTOR'\n", "Current Topic Context: 'JSTOR (/ˈdʒeɪstɔːr/; short for Journal Storage)[2] is a digital library founded in 1994'\n", "Next topic: 'Nieman Lab'. Semantic similarity to 'Urine': 12.14%\n", "\n", "Page: 7\n", "Current topic: 'Nieman Lab'\n", "Current URL: 'https://en.wikipedia.org/wiki/Nieman_Foundation_for_Journalism'\n", "Current Topic Context: 'The Nieman Foundation for Journalism at Harvard University is the primary journalism institution at Harvard.'\n", "Next topic: 'Men'S Soccer'. Semantic similarity to 'Urine': 14.43%\n", "\n", "Page: 8\n", "Current topic: 'Men'S Soccer'\n", "Current URL: 'https://en.wikipedia.org/wiki/Harvard_Crimson_men%27s_soccer'\n", "Current Topic Context: 'The Harvard Crimson men's soccer team is an intercollegiate varsity sports team of Harvard University'\n", "Next topic: 'California Golden Bears Men'S Soccer'. Semantic similarity to 'Urine': 17.31%\n", "\n", "Page: 9\n", "Current topic: 'California Golden Bears Men'S Soccer'\n", "Current URL: 'https://en.wikipedia.org/wiki/California_Golden_Bears_men%27s_soccer'\n", "Current Topic Context: 'The California Golden Bears men's soccer team is a varsity intercollegiate athletic team of University of California, Berkeley in Berkeley, California, United States.[1] The team is a member of the Pac-12 Conference, which is part of the National Collegiate Athletic Association's Division I'\n", "Next topic: 'California Drinking Song'. 
Semantic similarity to 'Urine': 15.78%\n", "\n", "Page: 10\n", "Current topic: 'California Drinking Song'\n", "Current URL: 'https://en.wikipedia.org/wiki/California_Drinking_Song'\n", "Current Topic Context: '\"California Drinking Song\" is a spirit song from the University of California, Berkeley'\n", "Next topic: 'Uc Men'S Octet'. Semantic similarity to 'Urine': 15.63%\n", "\n", "Page: 11\n", "Current topic: 'Uc Men'S Octet'\n", "Current URL: 'https://en.wikipedia.org/wiki/University_of_California_Men%27s_Octet'\n", "Current Topic Context: 'The UC Men's Octet, sometimes termed the Cal Men’s Octet or the UC Berkeley Men’s Octet, is an eight-member male a cappella group at the University of California, Berkeley'\n", "Next topic: 'Laboratories'. Semantic similarity to 'Urine': 15.45%\n", "\n", "Page: 12\n", "Current topic: 'Laboratories'\n", "Current URL: 'https://en.wikipedia.org/wiki/Research_centers_and_laboratories_at_the_University_of_California,_Berkeley'\n", "Current Topic Context: 'The University of California, Berkeley, contains many research centers and laboratories.'\n", "Next topic: 'Uc Irvine Medical Center'. Semantic similarity to 'Urine': 18.16%\n", "\n", "Page: 13\n", "Current topic: 'Uc Irvine Medical Center'\n", "Current URL: 'https://en.wikipedia.org/wiki/University_of_California,_Irvine_Medical_Center'\n", "Current Topic Context: 'The University of California, Irvine Medical Center (UCIMC or UCI Medical Center) is a major research hospital located in Orange, California'\n", "Next topic: 'Sepsis'. Semantic similarity to 'Urine': 19.29%\n", "\n", "Page: 14\n", "Current topic: 'Sepsis'\n", "Current URL: 'https://en.wikipedia.org/wiki/Sepsis'\n", "Current Topic Context: 'Sepsis (septicaemia in British English), or blood poisoning,[8][9] is a life-threatening condition that arises when the body's response to infection causes injury to its own tissues and organs.[4][8]'\n", "Next topic: 'Urinary Tract'. Semantic similarity to 'Urine': 51.26%\n", "\n", "Page: 15\n", "Current topic: 'Urinary Tract'\n", "Current URL: 'https://en.wikipedia.org/wiki/Urinary_system'\n", "Current Topic Context: 'The urinary system, also known as the urinary tract or renal system, consists of the kidneys, ureters, bladder, and the urethra'\n", "Next topic: 'Urinary Bladder'. Semantic similarity to 'Urine': 61.01%\n", "\n", "Page: 16\n", "Current topic: 'Urinary Bladder'\n", "Current URL: 'https://en.wikipedia.org/wiki/Bladder'\n", "Current Topic Context: 'The bladder is a hollow organ in humans and other vertebrates that stores urine from the kidneys before disposal by urination'\n", "Next topic: 'Urination § Anatomy Of The Bladder And Outlet'. Semantic similarity to 'Urine': 57.69%\n", "\n", "Page: 17\n", "Current topic: 'Urination § Anatomy Of The Bladder And Outlet'\n", "Current URL: 'https://en.wikipedia.org/wiki/Urination#Anatomy_of_the_bladder_and_outlet'\n", "Current Topic Context: 'Urination is the release of urine from the urinary bladder through the urethra to the outside of the body'\n", "Next topic: 'Urine'. 
Semantic similarity to 'Urine': 57.28%\n", "\n", "Page: 18\n", "Current topic: 'Urine'\n", "Current URL: 'https://en.wikipedia.org/wiki/Urine'\n", "Current Topic Context: 'Urine is a liquid by-product of metabolism in humans and in many other animals'\n", "\n", "------------------------------------------------------------------------------------------------------------------------------------------------------\n", "\n", "From 'Soulja Boy', to 'Urine' in 18 pages, 8.54 seconds!\n", "Starting topic: 'Soulja Boy': 'https://en.wikipedia.org/wiki/Soulja_Boy'\n", "Target topic: 'Urine': 'https://en.wikipedia.org/wiki/Urine'\n", "\n", "------------------------------------------------------------------------------------------------------------------------------------------------------\n" ] } ], "source": [ "def most_similar_sentence(target_topic, labels_list):\n", " # Encode the context sentence and all sentences in the list\n", " context_embedding = embed([target_topic])[0]\n", " sentence_embeddings = embed(labels_list)\n", " \n", " # Calculate cosine similarities between the context sentence and each sentence in the list\n", " similarities = np.inner(context_embedding, sentence_embeddings)\n", " \n", " # Find the index of the most similar sentence\n", " most_similar_index = np.argmax(similarities)\n", " \n", " return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index\n", "\n", "def search_wikipedia(search_term):\n", " # Define the endpoint\n", " endpoint = \"https://en.wikipedia.org/w/api.php\"\n", "\n", " # Define the search parameters\n", " params = {\n", " \"action\": \"query\",\n", " \"format\": \"json\",\n", " \"list\": \"search\",\n", " \"srsearch\": search_term\n", " }\n", "\n", " # Send a GET request to the endpoint with your parameters\n", " response = requests.get(url = endpoint, params = params)\n", "\n", " # Parse the results as JSON\n", " data = json.loads(response.text)\n", "\n", " # Get the title of the first result (this will be used as the page title in the next step)\n", " page_title = data[\"query\"][\"search\"][0][\"title\"]\n", "\n", " # if \"may refer to\" in data[\"query\"][\"search\"][0][\"snippet\"].lower():\n", " # page_title = data[\"query\"][\"search\"][1][\"title\"]\n", "\n", " # Construct the URL of the Wikipedia page\n", " page_url = \"https://en.wikipedia.org/wiki/{}\".format(page_title.replace(\" \", \"_\"))\n", "\n", " return page_url, page_title\n", "\n", "def get_topic_context(driver, more = False):\n", " # Find the first paragraph of the main article\n", " first_paragraph = driver.find_element(By.CSS_SELECTOR, \"div.mw-parser-output > p:not(.mw-empty-elt)\").text\n", "\n", " if more:\n", " context_sentence = \". \".join(first_paragraph.split(\". \")[:5])\n", " else:\n", " context_sentence = first_paragraph.split(\". 
\")[0]\n", "\n", " return context_sentence\n", "\n", "# bad_words = [word for word in open(\"censored.txt\", \"r\").readlines()]\n", "bad_words = [word.strip() for word in open(\"censored.txt\", \"r\").readlines()]\n", "\n", "def refine_links(topic, links, current_url_suffix, used_links, used_topics, censor = False):\n", "\n", " links_texts = []\n", "\n", " # Iterate through the links and extract their URLs\n", " for link in links:\n", " link_url = link.get('href')\n", " if link_url and link_url.startswith(\"/wiki/\"):\n", " link_url = \"https://en.wikipedia.org\" + link_url\n", " link_text = link.text.strip() # Get the text and remove leading/trailing spaces\n", "\n", " # make sure they are both not None\n", " if link_text and current_url_suffix not in link_url:\n", "\n", " if link_url not in used_links and link_text.lower() not in [topic.lower() for topic in used_topics]:\n", "\n", " # eliminates topic duplicates, non-wiki links, and wiki-help pages (non-content pages)\n", " if topic.lower() not in link_url.lower() and \"en.wikipedia.org/wiki/\" in link_url and \":\" not in \"\".join(link_url.split(\"/\")[1:]) and \"Main_Page\" != str(link_url.split(\"/\")[-1]):\n", "\n", " # censoring if needed\n", " if censor:\n", " if not any(word1.lower() in bad_words for word1 in [word.lower() for word in link_text.split()]):\n", " links_texts.append((link_url, link_text))\n", " else:\n", " links_texts.append((link_url, link_text))\n", "\n", " return links_texts\n", "\n", "def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):\n", "\n", " ##### Setup Chrome options\n", " chrome_options = webdriver.ChromeOptions()\n", " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n", " chrome_options.add_argument(\"--no-sandbox\")\n", " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n", " driver = webdriver.Chrome(options = chrome_options)\n", "\n", " #### Getting target url, topic, and context\n", " driver_target = webdriver.Chrome(options = chrome_options)\n", " target_url, target_topic = search_wikipedia(search_term = target_topic)\n", " driver_target.get(target_url)\n", " target_context = get_topic_context(driver_target, more = True)\n", " # print(target_context)\n", " driver_target.quit()\n", "\n", " topic = starting_topic\n", " num_pages = 0\n", " used_topics = []\n", " used_links = []\n", "\n", " start_time = time.time()\n", "\n", " ### BEGIN ###\n", "\n", " print(\"-\" * 150)\n", " print(f\"\\nStarting!\\n\")\n", " print(\"-\" * 150)\n", "\n", " url, topic = search_wikipedia(search_term = starting_topic)\n", " driver.get(url)\n", " used_topics.append(topic)\n", " used_links.append(driver.current_url)\n", "\n", " while True:\n", " # increment the page tracking by 1 for each new page\n", " num_pages += 1\n", "\n", " # if not the first page, navigate to the new page\n", " if num_pages > 1:\n", " driver.get(next_link)\n", "\n", " try:\n", " context_sentence = get_topic_context(driver)\n", " except Exception as e:\n", " context_sentence = \"Context could not be found from webpage\"\n", "\n", " current_url = driver.current_url\n", " current_url_suffix = str(current_url).split(\"/\")[-1]\n", "\n", " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n", " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n", "\n", " soup = BeautifulSoup(current_page, 'html.parser')\n", "\n", " links = soup.find_all('a')\n", "\n", " # get rid of any bloat in the links from the page\n", " links_texts = refine_links(topic, links, 
current_url_suffix, used_links, used_topics)\n", "\n", " # best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])\n", " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context.lower(), labels_list = [text.lower() for link, text in links_texts])\n", "\n", " print(f\"\\nPage: {num_pages}\")\n", " print(f\"Current topic: '{topic.title()}'\")\n", " print(f\"Current URL: '{current_url}'\")\n", " print(f\"Current Topic Context: '{context_sentence}'\")\n", " if current_url != target_url:\n", " print(f\"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%\")\n", "\n", " next_link, topic = links_texts[loc_idx]\n", "\n", " used_links.append(next_link)\n", " used_topics.append(topic)\n", "\n", " if current_url == target_url: # because the target_url is now found through the API\n", " print(\"\\n\" + \"-\" * 150)\n", " print(f\"\\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\")\n", " print(f\"Starting topic: '{starting_topic.title()}': '{used_links[0]}'\")\n", " print(f\"Target topic: '{target_topic.title()}': '{target_url}'\\n\")\n", " print(\"-\" * 150)\n", " driver.quit()\n", " break\n", "\n", " if num_pages == limit:\n", " print(\"\\n\" + \"-\" * 150)\n", " print(f\"\\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\")\n", " print(f\"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'\")\n", " print(f\"\\nTry a different combination to see if it can do it!\\n\")\n", " print(\"-\" * 150)\n", " driver.quit()\n", " break\n", "\n", "###### Example\n", "\n", "starting_topic = 'soulja boy'\n", "target_topic = 'urine'\n", "\n", "play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# starting_topic = 'soulja boy'\n", "# target_topic = 'fart'\n", "\n", "# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Tracking Stats" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "def play_wiki_game_stats(starting_topic: str, target_topic: str, limit: int = 200):\n", "\n", " stats_dict = {}\n", "\n", " ##### Setup Chrome options\n", " chrome_options = webdriver.ChromeOptions()\n", " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n", " chrome_options.add_argument(\"--no-sandbox\")\n", " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n", " driver = webdriver.Chrome(options = chrome_options)\n", "\n", " #### Getting target url, topic, and context\n", " driver_target = webdriver.Chrome(options = chrome_options)\n", " target_url, target_topic = search_wikipedia(search_term = target_topic)\n", " driver_target.get(target_url)\n", " target_context = get_topic_context(driver_target)\n", " print(target_context)\n", " print()\n", " driver_target.quit()\n", " \n", " topic = starting_topic\n", " num_pages = 0\n", " used_topics = []\n", " used_links = []\n", " contexts = []\n", " sim_to_target_scores = []\n", "\n", " start_time = time.time()\n", "\n", " ### BEGIN ###\n", "\n", " print(\"-\" * 
150)\n",
"    print(f\"\\nStarting!\\n\")\n",
"    print(\"-\" * 150)\n",
"\n",
"    url, topic = search_wikipedia(search_term = starting_topic)\n",
"    driver.get(url)\n",
"    used_topics.append(topic)\n",
"    used_links.append(driver.current_url)\n",
"    sim_to_target_scores.append(most_similar_sentence(target_topic = target_context, labels_list = [topic])[1])\n",
"\n",
"    while True:\n",
"        # increment the page tracking by 1 for each new page\n",
"        num_pages += 1\n",
"\n",
"        # if not the first page, navigate to the new page\n",
"        if num_pages > 1:\n",
"            driver.get(next_link)\n",
"\n",
"        context_sentence = get_topic_context(driver)\n",
"        contexts.append(context_sentence)\n",
"\n",
"        current_url = driver.current_url\n",
"        current_url_suffix = str(current_url).split(\"/\")[-1]\n",
"\n",
"        ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n",
"        current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n",
"\n",
"        soup = BeautifulSoup(current_page, 'html.parser')\n",
"\n",
"        links = soup.find_all('a')\n",
"\n",
"        # get rid of any bloat in the links from the page\n",
"        links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)\n",
"\n",
"        best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context, labels_list = [text for link, text in links_texts])\n",
"\n",
"        print(f\"\\nPage: {num_pages}\")\n",
"        print(f\"Current topic: '{topic.title()}'\")\n",
"        print(f\"Current URL: '{current_url}'\")\n",
"        print(f\"Current Topic Context: '{context_sentence}'\")\n",
"        if current_url != target_url:\n",
"            print(f\"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%\")\n",
"\n",
"        next_link, topic = links_texts[loc_idx]\n",
"\n",
"        if current_url == target_url: # because the target_url is now found through the API\n",
"            print(\"\\n\" + \"-\" * 150)\n",
"            print(f\"\\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\")\n",
"            print(f\"Starting topic: '{starting_topic.title()}': '{used_links[0]}'\")\n",
"            print(f\"Target topic: '{target_topic.title()}': '{used_links[-1]}'\\n\")\n",
"            print(\"-\" * 150)\n",
"\n",
"            stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(num_pages)]\n",
"            stats_dict['topic'] = used_topics\n",
"            stats_dict['context'] = contexts\n",
"            stats_dict['sim_to_target'] = sim_to_target_scores\n",
"            stats_dict['url'] = used_links\n",
"            stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
"            driver.quit()\n",
"            return stats_dict\n",
"\n",
"        ##### ADD DRAMATIC DELAY HERE #####\n",
"        # time.sleep(0.5)\n",
"        # time.sleep(10)\n",
"\n",
"        if num_pages == limit:\n",
"            print(\"\\n\" + \"-\" * 150)\n",
"            print(f\"\\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\")\n",
"            print(f\"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'\")\n",
"            print(f\"\\nTry a different combination to see if it can do it!\\n\")\n",
"            print(\"-\" * 150)\n",
"\n",
"            stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(num_pages)]\n",
"            stats_dict['topic'] = used_topics\n",
"            stats_dict['context'] = contexts\n",
"            stats_dict['sim_to_target'] = sim_to_target_scores\n",
"            stats_dict['url'] = 
used_links\n",
"            stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
"            driver.quit()\n",
"            return stats_dict\n",
"\n",
"        used_links.append(next_link)\n",
"        used_topics.append(topic)\n",
"        sim_to_target_scores.append(best_score)\n",
"\n",
"# starting_topic = 'john mayer'\n",
"# target_topic = 'fart'\n",
"\n",
"# stats_dict = play_wiki_game_stats(starting_topic = starting_topic, target_topic = target_topic, limit = 200)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['start_end', 'topic', 'context', 'sim_to_target', 'url', 'page_num'])\n", "[6, 6, 6, 6, 6, 6]\n" ] } ], "source": [ "# stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(7)]\n", "print(stats_dict.keys())\n", "print([len(stats_dict[key]) for key in stats_dict.keys()])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0.027460583, 0.20852715, 0.2775123, 0.31147623, 0.4413054, 0.6199604]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats_dict['sim_to_target']" ] },
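{ "cell_type": "markdown", "metadata": {}, "source": [ "The per-page stats come back as a dict of equal-length lists, which reads most easily as a table. A minimal sketch of the conversion (it assumes `pandas` is installed; `stats_df` is a name introduced here, not one from the run above):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# every key in stats_dict maps to one entry per visited page,\n", "# so the dict converts directly into a row-per-page DataFrame\n", "stats_df = pd.DataFrame(stats_dict)" ] },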
{ "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [
"               start_end                          topic                                            context  sim_to_target                                                url  page_num\n",
"0  john mayer_Flatulence                     John Mayer  John Clayton Mayer[1] (/ˈmeɪ.ər/ MAY-ər; born ...       0.027461           https://en.wikipedia.org/wiki/John_Mayer         1\n",
"1  john mayer_Flatulence            cardiac dysrhythmia  Arrhythmias, also known as cardiac arrhythmias...       0.208527  https://en.wikipedia.org/wiki/Cardiac_dysrhythmia         2\n",
"2  john mayer_Flatulence                       prolapse  Mitral valve prolapse (MVP) is a valvular hear...       0.277512  https://en.wikipedia.org/wiki/Mitral_valve_pro...         3\n",
"3  john mayer_Flatulence  gastrointestinal disturbances                  Gastrointestinal diseases (abbrev       0.311476  https://en.wikipedia.org/wiki/Gastrointestinal...         4\n",
"4  john mayer_Flatulence         gastrointestinal tract  The gastrointestinal tract (GI tract, digestiv...       0.441305  https://en.wikipedia.org/wiki/Human_gastrointe...         5\n",
"5  john mayer_Flatulence                     flatulence  Flatulence, in humans, is the expulsion of gas...       0.619960           https://en.wikipedia.org/wiki/Flatulence         6"
] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats_df" ] },
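{ "cell_type": "markdown", "metadata": {}, "source": [ "Looping the tracker over many start/target pairs builds a larger dataset, like the 3,238-row table below. A sketch of such a batch (the pairs and the name `all_runs_df` are illustrative, not the exact batch that produced the table):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pairs = [('sushi', 'mars'), ('buggy', 'rainforest')]  # illustrative; the real batch covered many more pairs\n", "\n", "frames = []\n", "for start, target in pairs:\n", "    run = play_wiki_game_stats(starting_topic = start, target_topic = target, limit = 200)\n", "    frame = pd.DataFrame(run)\n", "    # keep the start and target topics as their own columns\n", "    frame['start'], frame['target'] = start, target\n", "    frames.append(frame[['start', 'target', 'topic', 'context', 'sim_to_target', 'url', 'page_num']])\n", "\n", "all_runs_df = pd.concat(frames, ignore_index = True)" ] },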
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [
"      start      target                         topic                                            context  sim_to_target                                                url  page_num\n",
"   0  Sushi        Mars                         Sushi  Sushi (すし, 寿司, 鮨, 鮓, pronounced [sɯɕiꜜ] or [sɯ...       0.046150                https://en.wikipedia.org/wiki/Sushi         1\n",
"   1  Sushi        Mars                  Planet Money  Planet Money is an American podcast and blog p...       0.494693         https://en.wikipedia.org/wiki/Planet_Money         2\n",
"   2  Sushi        Mars           Pacifica Foundation  Pacifica Foundation is an American non-profit ...       0.186643  https://en.wikipedia.org/wiki/Pacifica_Foundation         3\n",
"   3  Sushi        Mars                     Mars Hill  The Mars Hill Network is a network of Christia...       0.466525    https://en.wikipedia.org/wiki/Mars_Hill_Network         4\n",
"   4  Sushi        Mars              Equinox Mountain  Equinox Mountain is the highest peak of the Ta...       0.196999     https://en.wikipedia.org/wiki/Equinox_Mountain         5\n",
" ...    ...         ...                           ...                                                ...            ...                                                ...       ...\n",
"3233  buggy  Rainforest  Forests of the United States  It has been estimated that before European set...       0.437653  https://en.wikipedia.org/wiki/Forests_of_the_U...         5\n",
"3234  buggy  Rainforest                boreal forests  Taiga (/ˈtaɪɡə/; Russian: тайга́; relates to M...       0.474700        https://en.wikipedia.org/wiki/Boreal_forest         6\n",
"3235  buggy  Rainforest             Deciduous forests  Temperate deciduous or temperate broad-leaf fo...       0.501480  https://en.wikipedia.org/wiki/Temperate_decidu...         7\n",
"3236  buggy  Rainforest     Tropical deciduous forest  The tropical and subtropical dry broadleaf for...       0.480779  https://en.wikipedia.org/wiki/Tropical_deciduo...         8\n",
"3237  buggy  Rainforest                   rainforests  Rainforests are forests characterized by a clo...       0.482825          https://en.wikipedia.org/wiki/Rainforest         9\n",
"\n",
"[3238 rows x 7 columns]"
] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_runs_df" ] },
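{ "cell_type": "markdown", "metadata": {}, "source": [ "Finally, the aggregated runs can be summarized, e.g. how many pages each game took (a sketch; `pages_per_game` is a name introduced here):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# pages needed per game: the highest page_num reached within each start/target pair\n", "pages_per_game = all_runs_df.groupby(['start', 'target'])['page_num'].max()\n", "pages_per_game.describe()" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 2 }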