ebook-gen / search_client.py
pragneshbarik's picture
major refactoring, web crawl is now multithreaded
e51667a
raw
history blame
2.93 kB
import requests
from bs4 import BeautifulSoup
import re
import concurrent.futures
class SearchClient:
    """Run a web search through Google or Bing and crawl the resulting pages.

    ``search`` asks the configured vendor's search API for links and then
    downloads every linked page concurrently, returning the visible text of
    each page that could be fetched.

    Attributes:
        vendor: Vendor name passed to the constructor ("google" or "bing").
        endpoint: Search API URL, or ``None`` for an unsupported vendor.
        headers: HTTP headers sent with search requests (Bing only; empty
            otherwise).
        timeout: Per-request timeout in seconds for every HTTP call.
    """

    # Without a timeout ``requests`` waits forever on a stalled server, which
    # would block a worker thread (and therefore the whole search) indefinitely.
    DEFAULT_TIMEOUT = 10

    def __init__(self, vendor, engine_id=None, api_key=None, timeout=DEFAULT_TIMEOUT):
        """Configure the client for the given vendor.

        Args:
            vendor: "google" or "bing". Any other value is accepted here, but
                ``search`` will then return the string "Invalid vendor".
            engine_id: Google Programmable Search engine id (``cx``); unused
                for Bing.
            api_key: API key for the chosen vendor.
            timeout: Seconds to wait on each HTTP request.
        """
        self.vendor = vendor
        self.timeout = timeout
        # Always define these so attribute access never fails, whatever the
        # vendor is.
        self.endpoint = None
        self.headers = {}
        if vendor == "google":
            self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
        elif vendor == "bing":
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                "Ocp-Apim-Subscription-Key": api_key,
            }

    @staticmethod
    def _extract_text_from_link(link, timeout=DEFAULT_TIMEOUT):
        """Download ``link`` and return its text with whitespace collapsed.

        Args:
            link: URL of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The page text with every whitespace run replaced by one space and
            the ends trimmed, or ``None`` when the response status is not 200.

        Raises:
            requests.RequestException: On connection errors or timeouts.
        """
        page = requests.get(link, timeout=timeout)
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
        # Trimming makes a whitespace-only page come back as "" (falsy), so
        # the caller drops it instead of reporting a result with text " ".
        return re.sub(r"\s+", " ", soup.get_text()).strip()

    def _fetch_text_from_links(self, links):
        """Fetch every link concurrently and collect the page texts.

        Args:
            links: Iterable of URLs, in search-ranking order.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts in the same order
            as ``links``. Links that fail, return a non-200 status, or yield
            no text are skipped.
        """
        links = list(links)
        if not links:
            return []

        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # All downloads start immediately; results are then read back in
            # submission order so the output keeps the search ranking rather
            # than depending on which server answered first.
            pending = [
                (link, executor.submit(self._extract_text_from_link, link, self.timeout))
                for link in links
            ]
            for link, future in pending:
                try:
                    cleaned_text = future.result()
                except Exception as e:
                    # Best effort: one unreachable page must not abort the
                    # whole crawl.
                    print(f"Error fetching data from {link}: {e}")
                    continue
                if cleaned_text:
                    results.append({"text": cleaned_text, "link": link})
        return results

    def _google_search(self, query, n_crawl):
        """Search Google and crawl up to ``n_crawl`` of the returned links.

        Args:
            query: Search query string.
            n_crawl: Maximum number of result pages to crawl.

        Returns:
            List of ``{"text": ..., "link": ...}`` dicts.
        """
        response = requests.get(self.endpoint, params={"q": query}, timeout=self.timeout)
        search_results = response.json()
        # max(..., 0) keeps a negative n_crawl from turning into a
        # "drop the last k items" slice.
        items = search_results.get("items", [])[: max(n_crawl, 0)]
        links = [item["link"] for item in items if item.get("link")]
        return self._fetch_text_from_links(links)

    def _bing_search(self, query, n_crawl):
        """Search Bing and crawl up to ``n_crawl`` of the returned links.

        Args:
            query: Search query string.
            n_crawl: Maximum number of result pages to crawl.

        Returns:
            List of ``{"text": ..., "link": ...}`` dicts.
        """
        params = {
            "q": query,
            "count": n_crawl,  # You might need to adjust this based on Bing API requirements
            "mkt": "en-US",
        }
        response = requests.get(
            self.endpoint, headers=self.headers, params=params, timeout=self.timeout
        )
        search_results = response.json()
        # Enforce the limit locally as well, matching the Google code path.
        items = search_results.get("webPages", {}).get("value", [])[: max(n_crawl, 0)]
        links = [item["url"] for item in items if item.get("url")]
        return self._fetch_text_from_links(links)

    def search(self, query, n_crawl):
        """Search with the configured vendor and return crawled page texts.

        Args:
            query: Search query string.
            n_crawl: Maximum number of result pages to crawl.

        Returns:
            List of ``{"text": ..., "link": ...}`` dicts, or the string
            "Invalid vendor" when the client was created with an unsupported
            vendor.
        """
        if self.vendor == "google":
            return self._google_search(query, n_crawl)
        if self.vendor == "bing":
            return self._bing_search(query, n_crawl)
        return "Invalid vendor"