Text_Summarzier / scraper.py
ksvmuralidhar's picture
Upload files
8ef7527 verified
raw
history blame
1.17 kB
from selenium import webdriver
from selenium.webdriver.common.by import By
import streamlit as st
from selenium.webdriver import FirefoxOptions
import re
import logging
def _filter_sentences(text, n_words):
    """Return the lines of *text* that look like full sentences.

    A line is kept when, after stripping surrounding whitespace, it has at
    least *n_words* whitespace-separated tokens, starts with a word
    character, and ends with a character that is not a word character, a
    closing parenthesis, or whitespace (typically terminal punctuation).
    """
    # Compiled once per call instead of being looked up on every line.
    sentence_pattern = re.compile(r"^\w.+[^\w\)\s]$")
    kept = []
    for line in text.split("\n"):
        line = line.strip()
        if len(line.split()) >= n_words and sentence_pattern.search(line):
            kept.append(line)
    return kept


def scrape_text(url: str, n_words: int = 15) -> str:
    """Scrape the visible body text of *url* and keep sentence-like lines.

    The page is loaded in a headless Firefox session with a 60 second page
    load timeout. The text of the ``<body>`` element is split on newlines
    and filtered by ``_filter_sentences``.

    Args:
        url: Address of the page to load.
        n_words: Minimum number of whitespace-separated words a line must
            contain to be kept.

    Returns:
        The kept lines joined with ``" \\n\\n\\n"``; an empty string when no
        line qualifies.

    Raises:
        Any exception raised while starting the browser, loading the page,
        or reading the body element is propagated to the caller after the
        browser has been shut down.
    """
    driver = None
    try:
        logging.warning("Initiated Scraping")
        opts = FirefoxOptions()
        opts.add_argument("--headless")
        driver = webdriver.Firefox(options=opts)
        driver.set_page_load_timeout(60)
        driver.get(url)
        body_text = driver.find_element(By.TAG_NAME, "body").text
    finally:
        # quit() ends the whole browser session; close() would only close
        # the current window and leave the driver process running, which is
        # what the previous error path did.
        if driver is not None:
            driver.quit()
            logging.warning("Closed Webdriver")

    sentence_list = _filter_sentences(body_text, n_words)
    logging.warning("Successfully scraped text")
    return " \n\n\n".join(sentence_list)