File size: 2,896 Bytes
ba34941
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57ca393
ba34941
57ca393
 
 
 
ba34941
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""Scrape selected data from the 'untapped' website, put them in a list of dictionaries, 
embed and store them into a vectorstore."""

import requests
import os
import openai
from bs4 import BeautifulSoup
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import CohereEmbeddings, HuggingFaceHubEmbeddings
from langchain.vectorstores import FAISS, Annoy, DocArrayInMemorySearch
from langchain.document_loaders import CSVLoader
import csv
import pickle

# Hand the OpenAI key from the environment to the openai client at import time.
# A missing OPENAI_API_KEY raises KeyError right here, before any scraping starts.
openai.api_key  = os.environ['OPENAI_API_KEY']


def _node_text(node, strip=False):
    """Return the text of a parsed HTML node, or '' when the node is missing.

    Lookups with ``find`` return ``None`` when nothing matches; this helper
    turns that case into an empty string instead of an AttributeError.
    """
    if node is None:
        return ""
    text = node.get_text()
    return text.strip() if strip else text


def _parse_beer(box_info):
    """Build one beer record (a dict of strings) from a beer page's info box."""
    # NOTE: the class name below is spelled exactly as in the original selector
    # (presumably mirroring the site's own markup) -- do not "correct" it
    # without checking the live page.
    description = box_info.find(class_="beer-descrption-read-less")
    if description is not None:
        # Remove the first <a> inside the description (presumably the
        # read-more/less toggle) so its label does not end up in the text.
        toggle_link = description.find('a')
        if toggle_link is not None:
            toggle_link.extract()

    ratings = _node_text(box_info.find(class_="num"))
    ratings = ratings.replace("(", "").replace(")", "")

    return {
        "name": _node_text(box_info.find('h1')),
        "brewery": _node_text(box_info.find(class_="brewery"), strip=True),
        "style": _node_text(box_info.find(class_="style")),
        "description": _node_text(description, strip=True),
        "alcohol_by_volume": _node_text(box_info.find(class_="abv"), strip=True),
        "ibu_bitterness_unit": _node_text(box_info.find(class_="ibu"), strip=True),
        "ratings": ratings,
        "n_ratings": _node_text(box_info.find(class_="raters"), strip=True),
    }


def ingest_data(timeout=30):
    """Scrape the venue's beer menu, write it to ``beers.csv`` and embed it.

    The venue page is fetched, every menu link on it is followed, and the
    info box of each beer page is turned into a record. The records are
    written to ``beers.csv`` in the current working directory, loaded back
    with ``CSVLoader`` and indexed into a FAISS vectorstore using
    ``OpenAIEmbeddings``.

    Links without an ``href`` and beer pages without an info box are
    skipped; fields missing inside an info box become empty strings.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The FAISS vectorstore built from the scraped beers.

    Raises:
        IndexError: If no beer could be scraped (same exception type the
            previous implementation raised in that situation).
        requests.exceptions.RequestException: If an HTTP request fails or
            times out.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    url = 'https://untappd.com/v/gourmet-haus-staudt/15392'
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = soup.find_all('a', attrs={'data-track': 'menu', 'data-href': ":"})

    beer_list = []

    for link in links:
        href = link.get('href')
        if not href:
            # An anchor without href cannot be followed.
            continue

        page_url = 'https://untappd.com' + href
        page_response = requests.get(page_url, headers=headers, timeout=timeout)
        page_soup = BeautifulSoup(page_response.text, 'html.parser')

        # Extract the "relevant" page content
        box_info = page_soup.find('div', class_="box b_info")
        if box_info is None:
            # No info box on this page: skip it rather than re-appending the
            # previous beer's values (or failing on undefined names).
            continue

        beer_list.append(_parse_beer(box_info))

    if not beer_list:
        raise IndexError("no beers could be scraped from " + url)

    keys = beer_list[0].keys()

    with open('beers.csv', 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(beer_list)

    file = 'beers.csv'
    loader = CSVLoader(file_path=file)
    data = loader.load()

    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(data, embeddings)

    return db

    # # Save vectorstore

    # with open("db.pkl", "wb") as f:
    #     pickle.dump(db, f)

# Run the full scrape-and-embed pipeline only when executed as a script;
# the vectorstore returned by ingest_data() is discarded here.
if __name__ == "__main__":
    ingest_data()