File size: 3,312 Bytes
00f57d4
ba41c0a
00f57d4
 
 
1896c1d
00f57d4
ba41c0a
1896c1d
00f57d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1896c1d
 
 
 
 
00f57d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import re
import os
import requests
from base64 import b64decode
from bs4 import BeautifulSoup
from typing import Dict, Optional

# Zyte API key read from the ZYTE_KEY environment variable; None when the variable is unset.
Z_KEY = os.environ.get('ZYTE_KEY')
# Lowercase marker searched for in a page's lowercased <title> to recognise a "not found" page.
PAGE_NOT_FOUND_STR = 'page not found'


def zyte_call(url: str, timeout: float = 300.0) -> bytes:
    """Fetch the raw HTTP response body of ``url`` through the Zyte extract API.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the Zyte API before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The bytes obtained by base64-decoding the ``httpResponseBody`` field
        of the API's JSON reply.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
        KeyError: If a successful reply carries no ``httpResponseBody`` field.
    """
    api_response = requests.post(
        "https://api.zyte.com/v1/extract",
        auth=(Z_KEY, ""),
        json={
            "url": url,
            "httpResponseBody": True,
        },
        timeout=timeout,
    )
    # Surface API failures as HTTP errors rather than as a confusing KeyError
    # when the error payload lacks "httpResponseBody".
    api_response.raise_for_status()
    http_response_body: bytes = b64decode(
        api_response.json()["httpResponseBody"])
    return http_response_body


def get_asin_pdp(soup: BeautifulSoup) -> Optional[Dict[str, object]]:
    """Extract the ASIN, title, feature bullets and spec tables from a product page.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        ``None`` when the page title contains ``PAGE_NOT_FOUND_STR``.
        Otherwise a dict with the keys:

        * ``'asin'``: last path segment of the canonical link's ``href``, or
          ``None`` when there is no canonical link or it has no ``href``.
        * ``'title'``: stripped text of ``span#productTitle``, or ``None``.
        * ``'feature_bullets'``: list of stripped bullet texts from
          ``div#feature-bullets``, or ``None`` when that div is absent.
        * ``'tech_data'``: merged key/value, tech-spec and A+ tables.
    """
    # Check if 404. A page without a <title> cannot be a "not found" page,
    # so it is parsed like any other instead of crashing.
    page_title = soup.find('title')
    if page_title is not None and PAGE_NOT_FOUND_STR in page_title.text.lower():
        return None

    # Get ASIN: a missing link and a link without href both yield None.
    canonical = soup.find('link', rel='canonical')
    href = canonical.get('href') if canonical is not None else None
    asin = href.split('/')[-1] if href is not None else None

    # Get title
    title_tag = soup.find('span', id="productTitle")
    title = title_tag.text.strip() if title_tag else None

    # Get feature-bullets, dropping the boilerplate "fits your model" bullet.
    bullets_div = soup.find('div', id="feature-bullets")
    if bullets_div:
        bullet_items = bullets_div.find_all('span', class_='a-list-item')
        feature_bullets = [
            text
            for text in (item.text.strip() for item in bullet_items)
            if text != 'Make sure this fits by entering your model number.'
        ]
    else:
        feature_bullets = None

    # Get KV, tech, A+ tables. Merge with override key hierarchy: A+ > tech > KV
    # (later mappings in the merge overwrite earlier ones on duplicate keys).
    kv_res = parse_kv_table(soup)
    tech_res = parse_tech_table(soup)
    ap_data = parse_ap_table(soup)
    tech_data = {**kv_res, **tech_res, **ap_data}

    return {
        'asin': asin,
        'title': title,
        'feature_bullets': feature_bullets,
        'tech_data': tech_data,
    }


def parse_kv_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read the key/value table inside ``div#productOverview_feature_div``.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        A dict mapping the stripped text of each row's first ``<td>`` to the
        stripped text of its second ``<td>``. Empty when the div or its table
        is missing. Rows with fewer than two ``<td>`` cells are skipped.
    """
    kv_res: Dict[str, str] = {}

    overview = soup.find('div', id='productOverview_feature_div')
    table = overview.find('table') if overview is not None else None
    if table is None:
        return kv_res

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Header-only or malformed rows have no key/value pair to read;
        # indexing them would raise IndexError.
        if len(cells) < 2:
            continue
        kv_res[cells[0].text.strip()] = cells[1].text.strip()
    return kv_res


def parse_tech_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read every table whose id matches ``productDetails_techSpec.*``.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        A dict mapping each row's stripped ``<th>`` text to its ``<td>`` text
        with U+200E (left-to-right mark) characters removed and surrounding
        whitespace stripped. Rows lacking a ``<th>`` or a ``<td>`` are
        skipped. Empty when no matching table exists.
    """
    tech_res: Dict[str, str] = {}
    tables = soup.find_all('table', id=re.compile('productDetails_techSpec.*'))
    for tab in tables:
        for row in tab.find_all('tr'):
            header = row.find('th')
            cell = row.find('td')
            # A row without both a header and a data cell carries no pair;
            # dereferencing the missing tag would raise AttributeError.
            if header is None or cell is None:
                continue
            key = header.text.strip()
            # Removing the marks first lets one strip() also drop the newlines
            # and spaces that surrounded them.
            value = cell.text.replace('\u200e', '').strip()
            tech_res[key] = value
    return tech_res


def parse_ap_table(soup: BeautifulSoup) -> Dict[str, str]:
    """Read the key/value tables found inside every ``div#tech``.

    Args:
        soup: Parsed HTML of the product page.

    Returns:
        A dict mapping the text of each row's first ``<td>`` to the text of
        its second ``<td>``, both with U+200E (left-to-right mark) characters
        removed and surrounding whitespace stripped. Rows with fewer than two
        ``<td>`` cells are skipped. Empty when no such div or table exists.
    """
    ap_res: Dict[str, str] = {}
    for div in soup.find_all('div', id='tech'):
        for tab in div.find_all('table'):
            for row in tab.find_all('tr'):
                cells = row.find_all('td')
                # Need both a key cell and a value cell; a single-cell row
                # would raise IndexError on cells[1].
                if len(cells) < 2:
                    continue
                key = cells[0].text.replace('\u200e', '').strip()
                value = cells[1].text.replace('\u200e', '').strip()
                ap_res[key] = value
    return ap_res