beer-sommelier / ingest.py
sooolee's picture
Update ingest.py to generate db only
57ca393
raw
history blame contribute delete
No virus
2.9 kB
"""Scrape selected data from the Untappd website, collect it into a list of dictionaries,
then embed it and store it in a vectorstore."""
import requests
import os
import openai
from bs4 import BeautifulSoup
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import CohereEmbeddings, HuggingFaceHubEmbeddings
from langchain.vectorstores import FAISS, Annoy, DocArrayInMemorySearch
from langchain.document_loaders import CSVLoader
import csv
import pickle
# Configure the openai client from the environment at import time. A missing
# OPENAI_API_KEY raises KeyError here, before ingest_data() can be called.
openai.api_key = os.environ['OPENAI_API_KEY']
def _fetch_soup(url, headers, timeout=30):
    """Download *url* and return its body parsed with BeautifulSoup's html.parser."""
    # An explicit timeout keeps a stalled connection from hanging the scrape forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')


def _node_text(parent, class_name, strip=True):
    """Return the text of the first descendant with *class_name*, or '' if absent."""
    node = parent.find(class_=class_name)
    if node is None:
        return ""
    text = node.get_text()
    return text.strip() if strip else text


def _parse_beer(box_info):
    """Build one beer record (a dict of strings) from a page's ``box b_info`` element.

    Elements missing from the page yield empty strings instead of raising
    AttributeError, so one incomplete page does not abort the whole scrape.
    """
    heading = box_info.find('h1')
    name = heading.get_text() if heading is not None else ""

    description = ""
    # NOTE(review): the class name spelling ("descrption") is kept exactly as the
    # selector was originally written; presumably it mirrors the site's markup.
    description_node = box_info.find(class_="beer-descrption-read-less")
    if description_node is not None:
        # Remove the first link inside the description (presumably the
        # show-more/less toggle) so its label does not end up in the text.
        toggle = description_node.find('a')
        if toggle is not None:
            toggle.extract()
        description = description_node.get_text().strip()

    ratings = _node_text(box_info, "num", strip=False).replace("(", "").replace(")", "")

    return {
        "name": name,
        "brewery": _node_text(box_info, "brewery"),
        "style": _node_text(box_info, "style", strip=False),
        "description": description,
        "alcohol_by_volume": _node_text(box_info, "abv"),
        "ibu_bitterness_unit": _node_text(box_info, "ibu"),
        "ratings": ratings,
        "n_ratings": _node_text(box_info, "raters"),
    }


def ingest_data():
    """Scrape a venue's beer menu from Untappd and index it in a FAISS vectorstore.

    Steps:
      1. Fetch the venue page and collect the menu links.
      2. Fetch every linked beer page and extract name, brewery, style,
         description, ABV, IBU, rating and number of ratings.
      3. Write the records to ``beers.csv`` in the current working directory.
      4. Load that CSV with ``CSVLoader``, embed the rows with
         ``OpenAIEmbeddings`` and build a FAISS index from them.

    Returns:
        The FAISS vectorstore built from the scraped beers.

    Raises:
        IndexError: if no beer could be scraped (e.g. the venue page was
            blocked or its markup changed). The exception type matches what
            the unguarded ``beer_list[0]`` lookup used to raise, but now
            carries a diagnostic message.
        requests.exceptions.RequestException: network failures, including
            timeouts, propagate unchanged.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/39.0.2171.95 Safari/537.36'
        )
    }
    base_url = 'https://untappd.com'
    url = 'https://untappd.com/v/gourmet-haus-staudt/15392'
    csv_path = 'beers.csv'
    timeout = 30

    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', attrs={'data-track': 'menu', 'data-href': ":"})

    beer_list = []
    for link in links:
        href = link.get('href')
        if not href:
            # An anchor without href cannot be followed; concatenating None
            # to the base URL would raise TypeError.
            continue
        page_soup = _fetch_soup(base_url + href, headers, timeout=timeout)
        # Extract the "relevant" page content
        box_info = page_soup.find('div', class_="box b_info")
        if box_info is not None:
            beer_list.append(_parse_beer(box_info))

    if not beer_list:
        raise IndexError(
            f"No beers could be scraped from {url} "
            f"(HTTP status {response.status_code}, {len(links)} menu links found)"
        )

    # Every record comes from _parse_beer, so the first one defines the columns.
    fieldnames = list(beer_list[0])
    # Write and read the CSV with the same explicit encoding so non-ASCII beer
    # names/descriptions do not depend on the platform's default codec.
    with open(csv_path, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(beer_list)

    loader = CSVLoader(file_path=csv_path, encoding='utf-8')
    data = loader.load()

    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(data, embeddings)
    return db
# # Save vectorstore
# with open("db.pkl", "wb") as f:
# pickle.dump(db, f)
if __name__ == "__main__":
    # Script entry point: run the full scrape-and-index pipeline once.
    # The vectorstore returned by ingest_data() is not kept here.
    ingest_data()