In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import spacy

In [None]:
# Loaded spaCy pipelines keyed by model name, so each model is read from disk once
_NLP_MODELS = {}


def _get_nlp(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name``, loading it only on first use."""
    if model_name not in _NLP_MODELS:
        _NLP_MODELS[model_name] = spacy.load(model_name)
    return _NLP_MODELS[model_name]


def tokenize(text):
    """Split ``text`` into word tokens with the 'en_core_web_sm' spaCy pipeline.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The text of every token that is neither punctuation nor whitespace,
        in document order.
    """
    # Loading the model is expensive, so reuse the cached pipeline rather than
    # calling spacy.load() on every invocation.
    nlp = _get_nlp()
    tokens = [token.text for token in nlp(text) if not token.is_punct and not token.is_space]

    return tokens

In [None]:
def add_target_columns(data, category, target):
    """Label ``data`` with its question category and its answer source.

    The two columns are written onto ``data`` itself (in place), and the same
    object is returned for convenience.

    Parameters
    ----------
    data : mapping-like (e.g. a pandas DataFrame)
        Container that receives the two new columns.
    category
        Value stored under "Category" (category of the questions).
    target
        Value stored under "Human vs ChatGPT" (human answer or ChatGPT answer).

    Returns
    -------
    The ``data`` object that was passed in.
    """
    label_columns = (
        ("Category", category),
        ("Human vs ChatGPT", target),
    )
    for column_name, column_value in label_columns:
        data[column_name] = column_value

    return data

In [None]:
def store_excel(data, prev_data = None, output_path = "scraped_data.xlsx"):
    """Write ``data`` to an Excel file, optionally appended to earlier results.

    Parameters
    ----------
    data : pandas.DataFrame
        Newly scraped rows to store.
    prev_data : optional
        Location of an Excel file with previously stored rows, as accepted by
        ``pd.read_excel``. When given (truthy), its rows are read and placed
        before ``data`` in the output, with the index renumbered.
    output_path : optional
        Destination passed to ``DataFrame.to_excel``. Defaults to
        "scraped_data.xlsx", the file name this function always used before
        the parameter existed.

    Returns
    -------
    None
    """
    if prev_data:
        # Loading old data into dataframe
        old_data = pd.read_excel(prev_data)
        # Concatenating the two dataframes vertically
        data = pd.concat([old_data, data], ignore_index=True)

    # Single write path for both the combined and the stand-alone case
    data.to_excel(output_path, index=False)

In [None]:
def _scroll_to_load(driver, scroll_num, pause = 5):
    """Scroll to the bottom of the current page up to ``scroll_num`` times.

    After each scroll the function sleeps ``pause`` seconds so new content can
    load, and stops early once a scroll leaves ``driver.page_source`` unchanged.
    """
    previous_source = None
    for _ in range(scroll_num):
        # Scrolling to access the next page of questions
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait for the content to be loaded
        time.sleep(pause)
        # Identical source before and after scrolling means the end was reached
        current_source = driver.page_source
        if current_source == previous_source:
            break
        previous_source = current_source


def getData(page_urls, driver, min_ans_len = 15, limit = 10, scroll_num = 10):
    """Scrape unique question/answer pairs from a list of Quora pages.

    Each page is opened with ``driver``, scrolled to load more content, and
    every answer block found is read. Pairs are collected across pages until
    ``limit`` pairs have been gathered or the pages run out.

    Parameters
    ----------
    page_urls : sequence of str
        Pages to visit, in order.
    driver : selenium WebDriver
        Browser session used for navigation and element lookup.
    min_ans_len : int, optional
        Minimum number of tokens (as counted by ``tokenize``) an answer must
        have to be kept.
    limit : int, optional
        Number of question/answer pairs to collect before stopping.
    scroll_num : int, optional
        Maximum number of scroll-to-bottom steps per page.

    Returns
    -------
    pandas.DataFrame
        Columns "Questions" and "Answers", one row per collected pair.

    Notes
    -----
    Blocks are skipped when the question or answer text is empty, when the
    question was already collected, when the answer is shorter than
    ``min_ans_len`` tokens, or when the answer's "more" button exists but
    cannot be clicked (the full answer would be unavailable).
    """
    # Maximum number of scrape attempts per page
    max_tries = 10

    # Lists to store the scraped content
    questions = []
    answers = []
    # Mirror of `questions` for constant-time duplicate checks
    seen_questions = set()

    for page_num, page_url in enumerate(page_urls, start=1):
        print(f"Page {page_num} of {len(page_urls)}")
        # Navigating to the webpage
        driver.get(page_url)
        # Not a pause: sets how long element lookups poll before giving up
        driver.implicitly_wait(10)

        # Scrolling to get enough answers
        _scroll_to_load(driver, scroll_num)

        last_error = None
        for _ in range(max_tries):
            try:
                # Re-locate the blocks on every attempt: references that raised
                # StaleElementReferenceException stay stale, so retrying with
                # the old references could never succeed.
                data_elements = driver.find_elements(By.CSS_SELECTOR, "div.dom_annotate_multifeed_bundle_AnswersBundle")

                for block in data_elements:

                    ### --- Questions --- ###
                    ques = block.find_element(By.CSS_SELECTOR, "div.q-text.puppeteer_test_question_title span")

                    ### --- Answers --- ###
                    # Checking if "more" button is present for an answer
                    try:
                        read_more = block.find_element(By.CSS_SELECTOR, "div.q-absolute div.qt_read_more")
                    except Exception:
                        # No button: the answer is already fully displayed
                        read_more = None

                    if read_more is not None:
                        try:
                            # Expanding answer by clicking "more" button
                            read_more.click()
                        except Exception:
                            # Discarding data where complete answer cannot be obtained
                            continue

                    ans = block.find_element(By.CSS_SELECTOR, "div.q-box.spacing_log_answer_content.puppeteer_test_answer_content span.q-box")

                    ques_text = ques.text
                    # Skipping empty questions and questions that are already present
                    if not ques_text or ques_text in seen_questions:
                        continue

                    ans_text = ans.text
                    if not ans_text:
                        continue

                    # Skipping the questions where length of answers are less than a given threshold
                    if len(tokenize(ans_text)) < min_ans_len:
                        continue

                    questions.append(ques_text)
                    answers.append(ans_text)
                    seen_questions.add(ques_text)
                    # Report progress against the requested limit
                    print(f"{len(questions)} of {limit}")

                    # Collecting data until limit is reached
                    if len(questions) == limit:
                        break
                break
            except Exception as error:
                # `except Exception` (not a bare except) so Ctrl-C / kernel
                # interrupt still stops the scraper instead of counting as a retry.
                last_error = error
        else:
            # Every attempt failed; report it instead of failing silently
            print(f"Warning: gave up on {page_url} after {max_tries} attempts ({last_error!r})")

        if len(questions) == limit:
            break

    # Warning to give more urls if desired amount of data is not scraped
    if len(questions) < limit:
        print("Warning: Need to provide more webpages to get desired amount of data!")

    # Storing the scraped information in a dataframe
    return pd.DataFrame({"Questions": questions, "Answers": answers})

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Job posting page to fetch; read by the request cell below
url = "https://boards.greenhouse.io/enveritas/jobs/4001717008"

In [3]:
# Send a GET request to the URL. requests has no default timeout, so set one
# to keep the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Surface HTTP errors (4xx/5xx) here instead of parsing an error page below
response.raise_for_status()

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

In [27]:
# Locate the posting's fields; soup.find returns None for any that are missing
job_title = soup.find("h1", {"class": "app-title"})

company_name = soup.find("span",{"class": "company-name"})

job_location = soup.find("div", {"class": "location"})

job_description = soup.find("div", {"id": "content"})


def _field_text(element, default="Not found"):
    """Return the stripped text of a parsed element, or ``default`` if it is None."""
    return element.get_text().strip() if element is not None else default


# The company label appears to read "at <Company>". Drop only that leading
# prefix: splitting on every "at " would cut apart a company whose own name
# contains "at ", and would fail outright when the prefix is absent.
company_text = _field_text(company_name)
if company_text.startswith("at "):
    company_text = company_text[len("at "):]

# Print the job description
if job_description is not None:
    print(
        "Job Title: " + _field_text(job_title) + "\n"
        + "Company: " + company_text + "\n"
        + "Job Location: " + _field_text(job_location) + "\n"
        + "Job Description: \n" + _field_text(job_description)
    )
else:
    print("Job description not found.")


Job Title: Data Scientist
Company: Enveritas
Job Location: Global / Remote
Job Description: 
Data Scientist, Engineering & Data Group
Do you want to work for a mission-driven non-profit, analyzing data and writing software that will contribute to helping millions of coffee farmers out of poverty? Enveritas is a 501(c)3 non-profit and Y Combinator-backed startup looking to hire a Data Scientist for our Data Team. 
We are looking for a Data Scientist with extensive professional experience to join our Engineering and Data Group on a remote, full-time basis. This position is open globally, based on locations supported by our EOR partner, Deel. You can learn more about this role at https://www.enveritas.org/jobs/data-scientist.
Our Engineering and Data Group is a quirky, talented, and humble group of about twenty with diverse backgrounds ranging from journalism to academia to international industry.
About Our Data Team
The Data Team's mission is to leverage data analytics to drive Enveritas