Spaces:
Sleeping
Sleeping
import re | |
import requests | |
from base64 import b64decode | |
from bs4 import BeautifulSoup | |
from typing import Dict | |
# Zyte API key; zyte_call sends it as the HTTP basic-auth username.
# NOTE(review): shipped empty here -- presumably it has to be filled in
# before any request can succeed; confirm how deployments supply it.
Z_KEY: str = ''
def zyte_call(url: str, timeout: "float | None" = None) -> bytes:
    """Fetch the raw HTTP response body of ``url`` through the Zyte API.

    Posts an extract request to ``https://api.zyte.com/v1/extract`` with
    ``httpResponseBody`` enabled, authenticating with ``Z_KEY`` as the
    basic-auth username, and base64-decodes the body found in the JSON reply.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the Zyte API, passed straight to
            ``requests.post``. ``None`` (the default) waits indefinitely,
            which is what this function did before the parameter existed.

    Returns:
        The decoded response body as bytes.

    Raises:
        requests.HTTPError: If the Zyte API answers with a 4xx/5xx status.
        KeyError: If a successful reply carries no ``httpResponseBody`` field.
    """
    api_response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(Z_KEY, ""),
        json={
            "url": url,
            "httpResponseBody": True,
        },
        timeout=timeout,
    )
    # Surface API failures (bad key, rate limit, ...) as an HTTP error rather
    # than as a confusing KeyError on the missing "httpResponseBody" field.
    api_response.raise_for_status()
    return b64decode(api_response.json()["httpResponseBody"])
def get_asin_pdp(soup: BeautifulSoup) -> Dict[str, object]:
    """Extract the ASIN, title, feature bullets and spec tables from a page.

    Args:
        soup: Parsed HTML of a product detail page.

    Returns:
        A dict with four keys:

        * ``asin`` -- last ``/``-separated segment of the canonical link's
          ``href``, or ``None`` when the link or its ``href`` is missing.
        * ``title`` -- stripped text of ``span#productTitle``, or ``None``.
        * ``feature_bullets`` -- list of stripped bullet texts found under
          ``div#feature-bullets`` (minus the "Make sure this fits" prompt),
          or ``None`` when that div is absent.
        * ``tech_data`` -- merged output of ``parse_kv_table``,
          ``parse_tech_table`` and ``parse_ap_table``.
    """
    # Get ASIN
    canonical = soup.find('link', rel='canonical')
    try:
        asin = canonical['href'].split('/')[-1]
    except (TypeError, KeyError):
        # TypeError: no canonical link at all (find returned None).
        # KeyError: the link tag exists but carries no href attribute.
        asin = None

    # Get title
    title_tag = soup.find('span', id="productTitle")
    title = title_tag.text.strip() if title_tag is not None else None

    # Get feature-bullets
    bullets_div = soup.find('div', id="feature-bullets")
    if bullets_div is not None:
        unwanted = 'Make sure this fits by entering your model number.'
        stripped = (
            item.text.strip()
            for item in bullets_div.find_all('span', class_='a-list-item')
        )
        # Remove unwanted bullets
        feature_bullets = [text for text in stripped if text != unwanted]
    else:
        feature_bullets = None

    # Get KV, tech, A+ tables. Merge with override key hierarchy: A+ > tech > KV
    # (later unpacked dicts win on duplicate keys).
    tech_data = {
        **parse_kv_table(soup),
        **parse_tech_table(soup),
        **parse_ap_table(soup),
    }

    return {
        'asin': asin,
        'title': title,
        'feature_bullets': feature_bullets,
        'tech_data': tech_data,
    }
def parse_kv_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read the key/value table inside ``div#productOverview_feature_div``.

    Each ``<tr>`` of the first ``<table>`` in that div contributes one entry:
    the stripped text of its first ``<td>`` is the key and the stripped text
    of its second ``<td>`` is the value.

    Args:
        soup: Parsed HTML of a product detail page.

    Returns:
        The collected mapping; empty when the div or its table is missing.
    """
    kv_res: Dict[str, str] = {}

    overview = soup.find('div', id='productOverview_feature_div')
    table = overview.find('table') if overview is not None else None
    if table is None:
        return kv_res

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Rows without a key cell and a value cell (e.g. header or spacer
        # rows) are skipped instead of raising IndexError.
        if len(cells) < 2:
            continue
        kv_res[cells[0].text.strip()] = cells[1].text.strip()

    return kv_res
def parse_tech_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read every table whose ``id`` matches ``productDetails_techSpec.*``.

    For each ``<tr>`` the stripped text of its first ``<th>`` is the key and
    the text of its first ``<td>``, with U+200E (left-to-right mark)
    characters removed and whitespace stripped, is the value. When several
    rows or tables share a key, the last one seen wins.

    Args:
        soup: Parsed HTML of a product detail page.

    Returns:
        The collected mapping; empty when no matching table exists.
    """
    tech_res: Dict[str, str] = {}

    spec_tables = soup.find_all('table', id=re.compile('productDetails_techSpec.*'))
    for table in spec_tables:
        for row in table.find_all('tr'):
            header = row.find('th')
            cell = row.find('td')
            # A row lacking either cell cannot form a key/value pair; skip it
            # rather than fail on ``None.text``.
            if header is None or cell is None:
                continue
            key = header.text.strip()
            value = cell.text.replace('\u200e', '').strip()
            tech_res[key] = value

    return tech_res
def parse_ap_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read two-column tables nested under every ``div#tech`` element.

    For each ``<tr>`` the first ``<td>`` supplies the key and the second the
    value; both have U+200E (left-to-right mark) characters removed and
    surrounding whitespace stripped. Later rows overwrite earlier ones that
    share a key.

    Args:
        soup: Parsed HTML of a product detail page.

    Returns:
        The collected mapping; empty when no such div or table exists.
    """
    ap_res: Dict[str, str] = {}

    for section in soup.find_all('div', id='tech'):
        for table in section.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                # Need both a key cell and a value cell; a single-cell row
                # previously raised IndexError on the value lookup.
                if len(cells) < 2:
                    continue
                key = cells[0].text.replace('\u200e', '').strip()
                value = cells[1].text.replace('\u200e', '').strip()
                ap_res[key] = value

    return ap_res