BotNews / search_engine.py
leandrocarneiro's picture
Upload search_engine.py
f5f34c2 verified
raw
history blame
2.63 kB
# Created by Leandro Carneiro at 19/01/2024
# Description:
# ------------------------------------------------
import os.path
import time
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
import constants
def google_search_api(search_term, api_key, cse_id, **kwargs):
    """Run one query against the Google "customsearch" v1 service.

    Args:
        search_term: Query string, sent as the ``q`` parameter.
        api_key: Developer key used to build the service client.
        cse_id: Search engine identifier, sent as the ``cx`` parameter.
        **kwargs: Extra parameters forwarded unchanged to ``cse().list``.

    Returns:
        The value stored under ``'items'`` in the response, or ``-1`` if
        anything at all goes wrong (client construction, the request itself,
        or a response that has no ``'items'`` key).
    """
    try:
        client = build("customsearch", "v1", developerKey=api_key)
        request = client.cse().list(q=search_term, cx=cse_id, **kwargs)
        response = request.execute()
        return response['items']
    except Exception:
        # Callers test for this sentinel instead of catching exceptions.
        return -1
def search_google(subject, sites):
    """Collect result links for ``subject``, querying each site separately.

    For every entry in ``sites`` a ``"<subject> site:<site>"`` query is sent
    through ``google_search_api`` using the ``GOOGLE_KEY`` and
    ``GOOGLE_SEARCH`` environment variables and ``constants.num_sites`` as
    the ``num`` argument.

    Args:
        subject: Search subject placed at the start of each query.
        sites: Iterable of site strings used in the ``site:`` filter.

    Returns:
        A list of links. When the search for a site fails (``-1`` sentinel),
        the site string itself is added instead. Links containing ``pdf``
        (case-insensitive) are reported on stdout and skipped. If any
        exception is raised, its message is printed and returned as a
        string instead of a list.
    """
    try:
        links = []
        for site in sites:
            print(' Buscando notícias no domínio: ' + site)
            query = f"{subject} site:{site}"
            # Looked up per iteration so an empty ``sites`` never needs them.
            api_key = os.environ['GOOGLE_KEY']
            engine_id = os.environ['GOOGLE_SEARCH']
            found = google_search_api(query, api_key, engine_id, num=constants.num_sites)
            if found == -1:
                # Search failed: fall back to the site string itself.
                links.append(site)
                continue
            for item in found:
                if 'pdf' in item['link'].lower():
                    print(' Arquivo PDF encontrado: ' + item['link'])
                    continue
                links.append(item['link'])
            #time.sleep(3)
        print(' Total de sites encontrados: ' + str(len(links)))
        return links
    except Exception as err:
        message = str(err)
        print(message)
        return message
def retrieve_text_from_site(sites, timeout=30):
    """Download each URL and return its text content, one entry per input.

    Args:
        sites: Iterable of URL strings to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Without it a server that never answers would block
            the whole run indefinitely. Defaults to 30.

    Returns:
        A list with one element per entry of ``sites``, in the same order.
        Each element is the text extracted by BeautifulSoup from the
        response body, or, if fetching or parsing raised, the string
        ``'Erro na recuperação do texto: '`` followed by the exception
        message.
    """
    result = []
    for site in sites:
        print(' Baixando texto do site: ' + site)
        try:
            # A timeout turns a hung connection into an ordinary per-site error.
            response = requests.get(site, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            result.append(soup.get_text())
        except Exception as e:
            # Best effort: record the failure in place so the output stays
            # aligned with ``sites``.
            result.append('Erro na recuperação do texto: ' + str(e))
    return result
def delete_base(local_base):
    """Remove every entry found directly inside ``local_base``.

    Args:
        local_base: Path of the directory to empty.

    Returns:
        ``0`` when every entry was removed, otherwise the message of the
        first exception raised. Entries removed before the failure stay
        removed.
    """
    try:
        entries = os.listdir(local_base)
        for entry in entries:
            os.remove(os.path.join(local_base, entry))
    except Exception as err:
        return str(err)
    return 0
def save_on_base(sites, texts, local_base):
    """Write each text to its own file and log the file-to-URL mapping.

    For position ``i`` the text ``texts[i]`` is written to ``news<i>.txt``
    inside ``local_base`` (overwriting any existing file), and the line
    ``news<i>.txt;<sites[i]>`` is appended to ``filename_url.csv`` in the
    same directory.

    Args:
        sites: Sequence of URL strings; its length drives the loop.
        texts: Sequence of texts indexed in parallel with ``sites``.
        local_base: Directory that receives the files.

    Returns:
        ``0`` on success, otherwise the message of the first exception
        raised. Files written before the failure are kept.
    """
    try:
        for idx, site in enumerate(sites):
            filename = f'news{idx}.txt'
            news_path = os.path.join(local_base, filename)
            index_path = os.path.join(local_base, 'filename_url.csv')
            with open(news_path, 'w', encoding='utf-8') as news_file:
                # Indexed lookup, not zip: a short ``texts`` must still fail.
                news_file.write(texts[idx])
            with open(index_path, 'a', encoding='utf-8') as index_file:
                index_file.write(filename + ';' + site + '\n')
    except Exception as err:
        return str(err)
    return 0