Spaces:
Runtime error
Runtime error
"""Scrape selected data from the 'untapped' website, put them in a list of dictionaries, | |
embed and store them into a vectorstore.""" | |
import requests | |
import os | |
import openai | |
from bs4 import BeautifulSoup | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.embeddings import CohereEmbeddings, HuggingFaceHubEmbeddings | |
from langchain.vectorstores import FAISS, Annoy, DocArrayInMemorySearch | |
from langchain.document_loaders import CSVLoader | |
import csv | |
import pickle | |
# Configure the openai module with the key taken from the environment.
# Indexing os.environ raises KeyError at import time if the variable is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]
def _field_text(box_info, css_class):
    """Return the text of the first element with *css_class*, or "" if absent."""
    node = box_info.find(class_=css_class)
    return node.get_text() if node is not None else ""


def _parse_beer(box_info):
    """Build one beer record from a beer page's "box b_info" element.

    Sub-elements that are missing from the page yield empty strings instead
    of raising, so a single incomplete page does not abort the whole scrape.
    """
    heading = box_info.find('h1')
    name = heading.get_text() if heading is not None else ""

    description = ""
    description_node = box_info.find(class_="beer-descrption-read-less")
    if description_node is not None:
        # Remove the first link nested in the description (when there is
        # one) so that its label is not stored as part of the text.
        nested_link = description_node.find('a')
        if nested_link is not None:
            nested_link.extract()
        description = description_node.get_text().strip()

    return {
        "name": name,
        "brewery": _field_text(box_info, "brewery").strip(),
        "style": _field_text(box_info, "style"),
        "description": description,
        "alcohol_by_volume": _field_text(box_info, "abv").strip(),
        "ibu_bitterness_unit": _field_text(box_info, "ibu").strip(),
        "ratings": _field_text(box_info, "num").replace("(", "").replace(")", ""),
        "n_ratings": _field_text(box_info, "raters").strip(),
    }


def ingest_data():
    """Scrape the venue's beer menu, save it as CSV and build a FAISS index.

    Steps:
      1. Fetch the venue page and collect the menu links.
      2. Fetch every linked beer page and extract one record per beer.
      3. Write the records to ``beers.csv``.
      4. Load the CSV as documents, embed them with OpenAI embeddings and
         pickle the resulting FAISS vectorstore to ``db.pkl``.

    Raises:
        requests.HTTPError: if the venue page answers with an error status.
        RuntimeError: if no beer record could be scraped, since there would
            be nothing to write or embed.
    """
    from urllib.parse import urljoin

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    base_url = 'https://untappd.com'
    url = 'https://untappd.com/v/gourmet-haus-staudt/15392'
    # Without a timeout a stalled connection would block the script forever.
    timeout = 30

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', attrs={'data-track': 'menu', 'data-href': ":"})

    beer_list = []
    for link in links:
        href = link.get('href')
        if not href:
            # An anchor without href cannot be followed.
            continue
        page_response = requests.get(
            urljoin(base_url, href), headers=headers, timeout=timeout
        )
        if not page_response.ok:
            # Skip beers whose page could not be retrieved.
            continue
        page_soup = BeautifulSoup(page_response.text, 'html.parser')
        # Extract the "relevant" page content
        box_info = page_soup.find('div', class_="box b_info")
        if box_info:
            beer_list.append(_parse_beer(box_info))

    if not beer_list:
        raise RuntimeError(
            "No beers could be scraped from " + url
            + "; nothing to write to beers.csv or to embed."
        )

    file = 'beers.csv'
    # Write and read the CSV with the same explicit encoding so non-ASCII
    # text survives regardless of the platform default.
    with open(file, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, list(beer_list[0]))
        dict_writer.writeheader()
        dict_writer.writerows(beer_list)

    loader = CSVLoader(file_path=file, encoding='utf-8')
    data = loader.load()
    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(data, embeddings)

    # Save vectorstore
    with open("db.pkl", "wb") as f:
        pickle.dump(db, f)
# Run the ingestion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    ingest_data()