Spaces:

iarbel
/

amazon-feature-bullets-demo

Sleeping

App Files Files Community

amazon-feature-bullets-demo / src /scrape.py

iarbel

add src files

00f57d4 12 months ago

raw

history blame

3.12 kB

	import re
	import requests
	from base64 import b64decode
	from bs4 import BeautifulSoup
	from typing import Dict

	# Zyte API key. zyte_call() passes it as the HTTP basic-auth username
	# (with an empty password) on every request.
	# NOTE(review): left empty in this file -- presumably it has to be filled in
	# before zyte_call() can authenticate; confirm how the key is supplied.
	Z_KEY = ''


	def zyte_call(url: str) -> bytes:
	    """Fetch the raw HTTP response body of ``url`` via the Zyte extract API.

	    Sends a POST to ``https://api.zyte.com/v1/extract`` with ``Z_KEY`` as the
	    basic-auth username and an empty password, requesting the
	    ``httpResponseBody`` field. That field of the JSON reply is
	    base64-decoded and returned.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The base64-decoded bytes of the reply's ``httpResponseBody`` field.

	    Raises:
	        KeyError: If the JSON reply has no ``httpResponseBody`` key.
	    """
	    payload = {"url": url, "httpResponseBody": True}
	    reply = requests.post(
	        "https://api.zyte.com/v1/extract",
	        auth=(Z_KEY, ""),
	        json=payload,
	    )
	    encoded_body = reply.json()["httpResponseBody"]
	    return b64decode(encoded_body)


	def get_asin_pdp(soup: BeautifulSoup) -> Dict[str, str]:
	    """Extract the main product fields from a parsed product page.

	    Args:
	        soup: Parsed HTML of the page.

	    Returns:
	        A dict with four keys:

	        * ``'asin'``: last ``/``-separated segment of the
	          ``<link rel="canonical">`` href, or ``None`` when the link (or its
	          ``href`` attribute) is missing.
	        * ``'title'``: stripped text of ``span#productTitle``, or ``None``.
	        * ``'feature_bullets'``: list of stripped ``span.a-list-item`` texts
	          found inside ``div#feature-bullets``, without the "Make sure this
	          fits..." boilerplate bullet; ``None`` when that div is missing.
	        * ``'tech_data'``: dict merged from ``parse_kv_table``,
	          ``parse_tech_table`` and ``parse_ap_table``.

	        NOTE: the ``Dict[str, str]`` annotation is kept for compatibility,
	        but the values are not all strings (see above).
	    """
	    # Get ASIN. find() returns None when there is no canonical link, and
	    # Tag.get() returns None when the link carries no href, so neither case
	    # raises.
	    canonical = soup.find('link', rel='canonical')
	    href = canonical.get('href') if canonical is not None else None
	    asin = href.split('/')[-1] if href is not None else None

	    # Get title
	    title_tag = soup.find('span', id="productTitle")
	    title = title_tag.text.strip() if title_tag is not None else None

	    # Get feature-bullets
	    bullets_div = soup.find('div', id="feature-bullets")
	    if bullets_div is None:
	        feature_bullets = None
	    else:
	        bullet_texts = (
	            item.text.strip()
	            for item in bullets_div.find_all('span', class_='a-list-item')
	        )
	        # Remove unwanted bullets
	        feature_bullets = [
	            b for b in bullet_texts
	            if b != 'Make sure this fits by entering your model number.'
	        ]

	    # Get KV, tech, A+ tables. Merge with override key hierarchy: A+ > tech > KV
	    # (later unpackings overwrite earlier ones on duplicate keys).
	    kv_res = parse_kv_table(soup)
	    tech_res = parse_tech_table(soup)
	    ap_data = parse_ap_table(soup)
	    tech_data = {**kv_res, **tech_res, **ap_data}

	    return {
	        'asin': asin,
	        'title': title,
	        'feature_bullets': feature_bullets,
	        'tech_data': tech_data,
	    }


	def parse_kv_table(soup: BeautifulSoup) -> Dict[str, str]:
	    """Read the key/value table inside ``div#productOverview_feature_div``.

	    Each ``<tr>`` of the first ``<table>`` in that div contributes one
	    entry: the stripped text of its first ``<td>`` is the key and the
	    stripped text of its second ``<td>`` is the value.

	    Args:
	        soup: Parsed HTML of the page.

	    Returns:
	        The collected key/value pairs; empty when the div or its table is
	        missing.
	    """
	    kv_res: Dict[str, str] = {}

	    overview = soup.find('div', id='productOverview_feature_div')
	    table = overview.find('table') if overview is not None else None
	    if table is None:
	        return kv_res

	    for row in table.find_all('tr'):
	        cells = row.find_all('td')
	        # A row without both a key cell and a value cell (e.g. a header or
	        # spacer row) cannot yield a pair; skip it instead of raising
	        # IndexError.
	        if len(cells) < 2:
	            continue
	        kv_res[cells[0].text.strip()] = cells[1].text.strip()
	    return kv_res


	def parse_tech_table(soup: BeautifulSoup) -> Dict[str, str]:
	    """Read every ``<table>`` whose id matches ``productDetails_techSpec.*``.

	    For each ``<tr>``, the stripped text of its first ``<th>`` is the key and
	    the text of its first ``<td>`` -- with U+200E characters removed and
	    surrounding whitespace stripped -- is the value. If the same key occurs
	    more than once, the last occurrence wins.

	    Args:
	        soup: Parsed HTML of the page.

	    Returns:
	        The collected key/value pairs; empty when no matching table exists.
	    """
	    tech_res: Dict[str, str] = {}
	    tables = soup.find_all('table', id=re.compile('productDetails_techSpec.*'))
	    for table in tables:
	        for row in table.find_all('tr'):
	            header = row.find('th')
	            cell = row.find('td')
	            # find() returns None when the row lacks the element; such a row
	            # has no key/value pair, so skip it rather than fail on `.text`.
	            if header is None or cell is None:
	                continue
	            key = header.text.strip()
	            value = cell.text.replace('\u200e', '').strip()
	            tech_res[key] = value
	    return tech_res


	def parse_ap_table(soup: BeautifulSoup) -> Dict[str, str]:
	    """Read the tables nested in every ``div#tech`` element.

	    For each ``<tr>`` of each ``<table>`` inside those divs, the first
	    ``<td>`` is the key and the second ``<td>`` is the value; both have
	    U+200E characters removed and surrounding whitespace stripped. If the
	    same key occurs more than once, the last occurrence wins.

	    Args:
	        soup: Parsed HTML of the page.

	    Returns:
	        The collected key/value pairs; empty when no such div/table exists.
	    """
	    ap_res: Dict[str, str] = {}
	    for div in soup.find_all('div', id='tech'):
	        for table in div.find_all('table'):
	            for row in table.find_all('tr'):
	                cells = row.find_all('td')
	                # Both a key cell and a value cell are required; a row with
	                # zero or one <td> is skipped instead of raising IndexError.
	                if len(cells) < 2:
	                    continue
	                key = cells[0].text.replace('\u200e', '').strip()
	                value = cells[1].text.replace('\u200e', '').strip()
	                ap_res[key] = value
	    return ap_res