import os
import re
from base64 import b64decode
from typing import Dict, Optional

import requests
from bs4 import BeautifulSoup

Z_KEY = os.environ.get('ZYTE_KEY')
PAGE_NOT_FOUND_STR = 'page not found'


def zyte_call(url: str) -> bytes:
    """Fetch a URL through the Zyte extraction API and return the decoded HTTP response body."""
    api_response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(Z_KEY, ""),
        json={
            "url": url,
            "httpResponseBody": True
        },
    )
    # Fail fast on a non-2xx API response before trying to decode the body
    api_response.raise_for_status()

    http_response_body: bytes = b64decode(
        api_response.json()["httpResponseBody"])
    return http_response_body


def get_asin_pdp(soup: BeautifulSoup) -> Optional[Dict[str, str]]:
    """Parse an Amazon product detail page. Returns None for a 404 page."""
    # Check if 404
    title_tag = soup.find('title')
    if title_tag and PAGE_NOT_FOUND_STR in title_tag.text.lower():
        return None

    # Get ASIN
    try:
        asin = soup.find('link', rel='canonical')['href'].split('/')[-1]
    except TypeError:
        asin = None

    # Get title
    search = soup.find('span', id="productTitle")
    title = search.text.strip() if search else None

    # Get feature-bullets
    search = soup.find('div', id="feature-bullets")
    if search:
        bullet_search = search.find_all('span', class_='a-list-item')
        feature_bullets = [h.text.strip() for h in bullet_search]
        # Remove unwanted bullets
        feature_bullets = [b for b in feature_bullets if b != 'Make sure this fits by entering your model number.']
    else:
        feature_bullets = None

    # Get KV, tech, A+ tables. Merge with override key hierarchy: A+ > tech > KV
    kv_res = parse_kv_table(soup)
    tech_res = parse_tech_table(soup)
    ap_data = parse_ap_table(soup)
    tech_data = {**kv_res, **tech_res, **ap_data}

    return {'asin': asin, 'title': title, 'feature_bullets': feature_bullets, 'tech_data': tech_data}


def parse_kv_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Parse the key-value product overview table, returning an empty dict if it is missing."""
    kv_res = {}
    try:
        search = soup.find('div', id='productOverview_feature_div')
        table = search.find('table')
        data = table.find_all('tr')
        for d in data:
            kv = d.find_all('td')
            k = kv[0].text.strip()
            v = kv[1].text.strip()
            kv_res[k] = v
    except AttributeError:
        pass
    return kv_res


def parse_tech_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Parse the technical-specification tables into a flat key-value dict."""
    tech_res = {}
    tables = soup.find_all('table', id=re.compile('productDetails_techSpec.*'))
    for tab in tables:
        data = tab.find_all('tr')
        for d in data:
            key = d.find('th').text.strip()
            value = d.find('td').text.replace('\u200e', '').strip()
            tech_res[key] = value
    return tech_res


def parse_ap_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Parse A+ content spec tables (divs with id 'tech') into a flat key-value dict."""
    ap_res = {}
    tech = soup.find_all('div', id='tech')
    for div in tech:
        tables = div.find_all('table')
        for tab in tables:
            data = tab.find_all('tr')
            for d in data:
                kv = d.find_all('td')
                if kv:
                    key = kv[0].text.replace('\u200e', '').strip()
                    value = kv[1].text.replace('\u200e', '').strip()
                    ap_res[key] = value
    return ap_res
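

if __name__ == '__main__':
    # Minimal usage sketch: fetch a product page through Zyte, parse it, and
    # extract the PDP fields. The URL below is a hypothetical placeholder, and
    # ZYTE_KEY must be set in the environment for the request to succeed.
    example_url = 'https://www.amazon.com/dp/B000000000'  # hypothetical ASIN
    html = zyte_call(example_url)
    soup = BeautifulSoup(html, 'html.parser')
    print(get_asin_pdp(soup))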