Text_Summarzier / scraper.py
ksvmuralidhar's picture
Update scraper.py
aa42935 verified
raw
history blame
1.45 kB
from selenium import webdriver
from selenium.webdriver.common.by import By
import streamlit as st
from selenium.webdriver import FirefoxOptions
import re
import logging
# A line is kept only when it starts with a word character and ends with a
# character that is not a word character, ")" or whitespace (e.g. ".", "?").
_SENTENCE_RE = re.compile(r"^\w.+[^\w\)\s]$")


def _extract_sentences(text, n_words):
    """Return the stripped lines of *text* that pass the sentence filter.

    A line passes when it has at least *n_words* whitespace-separated tokens
    and matches ``_SENTENCE_RE``.
    """
    stripped_lines = (line.strip() for line in text.split("\n"))
    return [
        line
        for line in stripped_lines
        if len(line.split()) >= n_words and _SENTENCE_RE.search(line)
    ]


def scrape_text(url, n_words=15):
    """Load *url* in headless Firefox and return its sentence-like body text.

    The text of the page's ``<body>`` element is split on newlines; lines are
    filtered by ``_extract_sentences`` and the survivors are joined with
    ``" \\n\\n\\n"``.

    Args:
        url: Address of the page to load.
        n_words: Minimum number of whitespace-separated tokens a line must
            contain to be kept.

    Returns:
        The kept lines joined into a single string.

    Raises:
        Exception: If fewer than three lines survive the filter
            ("Found nothing to scrape.").
        Any exception raised by Selenium while starting the browser or
        loading the page is propagated unchanged.
    """
    driver = None
    try:
        logging.warning("Initiated Scraping")
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        opts = FirefoxOptions()
        opts.add_argument("--headless")
        # NOTE(review): Firefox may not honour a Chrome-style "user-agent="
        # command-line argument; the "general.useragent.override" preference
        # is the usual mechanism -- confirm before relying on this header.
        opts.add_argument(f"user-agent={user_agent}")
        driver = webdriver.Firefox(options=opts)
        driver.set_page_load_timeout(60)
        driver.get(url)
        body_text = driver.find_element(By.TAG_NAME, "body").text
    finally:
        # quit() (not close()) ends the whole browser session, and running it
        # exactly once here avoids touching an already-terminated driver,
        # which previously masked the real error with a secondary one.
        if driver is not None:
            driver.quit()
            logging.warning("Closed Webdriver")

    sentence_list = _extract_sentences(body_text, n_words)
    if len(sentence_list) < 3:
        raise Exception("Found nothing to scrape.")
    logging.warning("Successfully scraped text")
    return " \n\n\n".join(sentence_list)