"""Scrape selected beer data from the Untappd website and index it.

The beers are collected into a list of dictionaries, written to
``beers.csv``, embedded with OpenAI embeddings and stored in a FAISS
vectorstore that is pickled to ``db.pkl``.
"""
import csv
import os
import pickle

import openai
import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import CSVLoader
from langchain.embeddings import CohereEmbeddings, HuggingFaceHubEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS, Annoy, DocArrayInMemorySearch

openai.api_key = os.environ['OPENAI_API_KEY']

_BASE_URL = 'https://untappd.com'
_VENUE_URL = 'https://untappd.com/v/gourmet-haus-staudt/15392'
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# Seconds to wait for the server before giving up; without a timeout
# ``requests`` waits indefinitely on a stalled connection.
_REQUEST_TIMEOUT = 30
_CSV_PATH = 'beers.csv'
_VECTORSTORE_PATH = "db.pkl"


def _class_text(parent, css_class, strip=True):
    """Return the text of the first element with *css_class* under *parent*.

    Returns an empty string when no such element exists, so a single
    missing field does not abort the whole scrape.
    """
    element = parent.find(class_=css_class)
    if element is None:
        return ""
    text = element.get_text()
    return text.strip() if strip else text


def _parse_beer(box_info):
    """Build the beer dictionary from a beer page's ``box b_info`` element."""
    heading = box_info.find('h1')
    name = heading.get_text() if heading is not None else ""

    # NOTE(review): the class name is kept exactly as the original selector
    # spelled it ("descrption"); presumably it mirrors the site's markup --
    # confirm before "correcting" it.
    description_tag = box_info.find(class_="beer-descrption-read-less")
    if description_tag is None:
        description = ""
    else:
        # Drop the first link embedded in the description (presumably the
        # expand/collapse toggle) so its label does not end up in the text.
        embedded_link = description_tag.find('a')
        if embedded_link is not None:
            embedded_link.extract()
        description = description_tag.get_text().strip()

    ratings = _class_text(box_info, "num", strip=False)
    ratings = ratings.replace("(", "").replace(")", "")

    return {
        "name": name,
        "brewery": _class_text(box_info, "brewery"),
        "style": _class_text(box_info, "style", strip=False),
        "description": description,
        "alcohol_by_volume": _class_text(box_info, "abv"),
        "ibu_bitterness_unit": _class_text(box_info, "ibu"),
        "ratings": ratings,
        "n_ratings": _class_text(box_info, "raters"),
    }


def _scrape_beers():
    """Fetch the venue menu and return one dictionary per beer page found."""
    response = requests.get(
        _VENUE_URL, headers=_HEADERS, timeout=_REQUEST_TIMEOUT
    )
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', attrs={'data-track': 'menu', 'data-href': ":"})

    beers = []
    for link in links:
        href = link.get('href')
        if not href:
            # An anchor without href cannot be followed; the original
            # concatenation would have raised TypeError here.
            continue
        page_response = requests.get(
            _BASE_URL + href, headers=_HEADERS, timeout=_REQUEST_TIMEOUT
        )
        page_soup = BeautifulSoup(page_response.text, 'html.parser')
        # Extract the "relevant" page content; pages without the info box
        # (including error pages) are skipped, as before.
        box_info = page_soup.find('div', class_="box b_info")
        if box_info is None:
            continue
        beers.append(_parse_beer(box_info))
    return beers


def ingest_data():
    """Scrape the venue's beers, write them to CSV and build the vectorstore.

    Side effects: performs HTTP requests against untappd.com, writes
    ``beers.csv`` and ``db.pkl`` in the current working directory, and
    calls the OpenAI embeddings API through ``OpenAIEmbeddings``.

    Raises:
        requests.HTTPError: if the venue page returns an error status.
        RuntimeError: if no beer could be scraped, so there is nothing to
            write or embed.
    """
    beer_list = _scrape_beers()
    if not beer_list:
        raise RuntimeError(
            f"No beers could be scraped from {_VENUE_URL}; "
            "the page layout may have changed or the request was blocked."
        )

    # Explicit UTF-8 on both the write and the read side so non-ASCII beer
    # names round-trip regardless of the platform's default encoding.
    with open(_CSV_PATH, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=list(beer_list[0]))
        dict_writer.writeheader()
        dict_writer.writerows(beer_list)

    loader = CSVLoader(file_path=_CSV_PATH, encoding='utf-8')
    data = loader.load()

    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(data, embeddings)

    # Save vectorstore.
    # NOTE: consumers must only unpickle this file from a trusted location;
    # unpickling untrusted data can execute arbitrary code.
    with open(_VECTORSTORE_PATH, "wb") as f:
        pickle.dump(db, f)


if __name__ == "__main__":
    ingest_data()