"""Scrape product listings and product images from the Louis Vuitton US site.

Pipeline (run via ``main()``):

1. Fetch the homepage and read every link inside the ``#header`` element.
2. Treat links whose last path segment starts with ``N-`` as category links;
   the text after ``N-`` is used as the category code.
3. Page through the product-listing API for each category code.
4. Flatten the results into a pandas DataFrame and download one image per
   product into ``images/<productId>.png``.

Required environment variables: ``LV_CLIENT_ID``, ``LV_CLIENT_SECRET``.
Optional: ``LV_API_COOKIE`` and ``LV_IMAGE_COOKIE`` override the captured
browser cookies that are kept below as defaults.

NOTE(review): the default cookies are values captured from a browser session
and are presumably short-lived -- prefer supplying fresh ones through the
environment variables rather than editing this file.

Third-party imports (``requests``, ``lxml``, ``pandas``) are done inside the
functions that perform I/O so the pure helpers stay importable without them.
"""
import os
import re

HOMEPAGE_URL = 'https://us.louisvuitton.com/eng-us/homepage'
API_URL_TEMPLATE = (
    'https://api.louisvuitton.com/eco-eu/search-merch-eapi/v1/eng-us/plp/products/'
    '{code}-ay1ygzo25?page='
)
IMAGE_DIR = 'images'
IMAGE_SIZE = 400
REQUEST_TIMEOUT = 30  # seconds; the original requests had no timeout at all
MAX_PAGES = 100  # upper bound used until the API reports "nbPages"

# Keys copied verbatim from every API hit into its product record.
RECORD_KEYS = ('productId', 'name', 'url', 'disambiguatingDescription')
FLAT_COLUMNS = (
    'productId',
    'category',
    'category_code',
    'name',
    'url',
    'disambiguatingDescription',
    'image',
)

# Cookie fragments shared by the API request and the image request.
_COOKIE_ABCK = (
    '_abck=94026E30A1FABEBFA844CBD6388965B8~0~YAAQnr8mF5/FGIeIAQAAjjKPkgpw8D5r3zC2HFECV0EQXRXPCoAQ+gYSqR/Sut2w/03toFhfVT9Yda45yFwqUGKStpIxbZAwPw0ooQplfv9eymgzFxyoYjEcqrD6rV/4OEgdCIov1wE0On3Z71z1v9UoZOisQgAlBEKsV0dYi02t6vutjUwi6f5T6N+h6SWX1l62T/QGwvxI13WHrWAOIsNJ1VJd/N3FiC/cKxVKskc3YPJf8tFZ25jOs2cUin5GUXWA7HuLb7dpffVxE0wp5vcOes47KXi9be6zRpbtfF+aHEjVkgPsOjznAFhN/X6FCHBTuJ5UvY2vHlWEIR8kW/pEW4zHm5bfQyaKKiBWQvHTXkeKRyhCnUSoRx9wZPB2dIcdY2igDmG06NT+NWYDPJtLoIU1I14uXhn+p8/w~-1~-1~-1; '
)
_COOKIE_BM_SZ = (
    'bm_sz=A8530E728A25BB2F984ED31AD5DF00A6~YAAQnr8mF6LFGIeIAQAAjjKPkhQ72plnxNz48wepUXjRYRETUBQ1oWkIiS8E0wS/+9NHmJzh9bagubSRjmSbTkSEEzyIbBsWTowVRBJBCiW6xO9lhJT/vmSLZtosV4g0eTOhRFoFwRrorwWUjGarPRTLozlk+KpZppYbt+EbdcyCdZNb95EUr6Za65FUi8FRSH02djUkhL5XlC3aebrVqtHfnG7uCJsWYBo+fnDJU1+mxQ8e1J+iTdq4ZdXsYuUuXZz3A0OJMy1JP5M/kKV4JKmIO89y3rgqZZB+xxNLbY+Su4yH7c/q2+s=~3556656~3686978; '
)
_COOKIE_AK_BMSC = (
    'ak_bmsc=589EDDF01ED59E65EA72F7393A895F31~000000000000000000000000000000~YAAQnr8mFxbGGIeIAQAATzaPkhR5K7vu8hiBPm/G/uXQB2vcY/Q4eGJWL+r7GNyViebDR/5XNvUSus0l/5Uhpgq+sHzuQR7L7aYzbFREYRNqvAZ0ngto1AvbYgwL8xA7OJM+D03m8i/BMKZHk++/aC3UFmk/RYNOKIv8p4HjlGM22auYotHx2mlWq8DTZPj1z77noO3WrzpYdEirg6xSV8havW9BHABzNmP3Cfnt3xxVq3y3bMNDcpiXy2+eAJ7eX5UNVa/Rj+P75LGxaPAv4AsXiphTgX6rMKt5T6OAvLMX6vdLLCpZYvDxvtctIZ1o7p1kgBdympPZFvVTPcmDtKrzh9sABV0v2Lls8NzLbbdf2k4B5uyUfy0fQL60zUUZgrlSOoFzgJgQviOHvoVQbqFdXCFyiH+fBjKltnlQXKJ9f0Y5JOjnBIkUHCJ9fUHINiwqt9h2cw5daORuAKdfQpsi3Z3cSVM7cIdb/Dfu0f2vu+K1GOxNv1+NH0KlwbAxMA==; '
)
_COOKIE_CONSENT = (
    'lv-dispatch=eng-us; '
    'consent_ecom_us=functional:1|analytics:0|customization:0|advertising:0; '
    'qb_dnt=2; '
)
_COOKIE_LOCALE = (
    'prevURL=; '
    'currentURL=https://www.louisvuitton.com/ajax/productsearch; '
    'storeLangCommerceHeader=eng-e1; '
    'geolocUserZone=eng-us; '
    'AKA_A2=A; '
)
_COOKIE_DISPATCH_URL = (
    'lv-dispatch-url=https://us.louisvuitton.com/eng-us/women/handbags/all-handbags/_/N-tfr7qdp; '
)

# Cookie sent with the homepage and product-listing API requests.
DEFAULT_API_COOKIE = (
    'ak_cc=US; '
    'OPTOUTMULTI=0:0%7Cc1:0%7Cc2:1%7Cc4:1%7Cc3:1; '
    'ATGID=anonymous; '
    'SGID=sb.springboot41-prd; '
    'SGID=.springboot41-prd; '
    + _COOKIE_LOCALE
    + _COOKIE_ABCK
    + _COOKIE_BM_SZ
    + _COOKIE_CONSENT
    + _COOKIE_AK_BMSC
    + 'ATG_SESSION_ID=GCUqx-HVSh3KVpIaIX+L30So.front41-prd; '
    '_dynSessConf=-86995900442046424; '
    'JSESSIONID=GCUqx-HVSh3KVpIaIX+L30So.front41-prd; '
    'PIM-SESSION-ID=HBqQRDXDzoWH4e2V; '
    'bm_sv=CF0A04BDB7486115978F40C364AB5E38~YAAQnr8mF1opHYeIAQAAT1G5khS7D+LGov3Y87+pRE+B0F86Y3tJjyDrwszLeFA06ZC1s/so8vDcDtmB0VrCDU1N+jlXvFNpGfEkhJTiyQrFAkZq2i57xih3Y4Oe7kDnWY6TqYj05c7rqHmBOUw8+XtLNLJAzSPx+0cKcLqujkgpHiI1xPuO1N27mPXK0SMc2DLsF9MJZa0EqlH8YtoHBgDTz/q2R9Q5g28FJHVuw3SqLHdFDaHO2dML6P/eVsRDeYR3XTd8OQ==~1; '
    + _COOKIE_DISPATCH_URL
    + 'utag_main=_sn:6$_se:25$_ss:0$_st:1686090605436$dc_visit:2$v_id:01884a527895001827510ebf5e3504075002b06d00a61$ses_id:1686086038612%3Bexp-session$_pn:10%3Bexp-session'
)

# Cookie sent with the image downloads (previously embedded in a curl command).
DEFAULT_IMAGE_COOKIE = (
    'ak_cc=US; '
    'OPTOUTMULTI=0:0%7Cc1:0%7Cc2:1%7Cc4:1%7Cc3:1; '
    'ATGID=anonymous; '
    'SGID=sb.springboot41-prd; '
    + _COOKIE_LOCALE
    + _COOKIE_ABCK
    + _COOKIE_BM_SZ
    + _COOKIE_CONSENT
    + _COOKIE_AK_BMSC
    + 'PIM-SESSION-ID=HBqQRDXDzoWH4e2V; '
    + _COOKIE_DISPATCH_URL
    + 'utag_main=_sn:6$_se:28$_ss:0$_st:1686090718084$dc_visit:2$v_id:01884a527895001827510ebf5e3504075002b06d00a61$ses_id:1686086038612%3Bexp-session$_pn:11%3Bexp-session; '
    'anonymous_session=true; '
    'ATG_SESSION_ID=B-8233ZEyNVjFobJCNa+2INd.front41-prd; '
    '_dynSessConf=2255588875954406228; '
    'JSESSIONID=B-8233ZEyNVjFobJCNa+2INd.front41-prd; '
    'bm_sv=CF0A04BDB7486115978F40C364AB5E38~YAAQnr8mF1xwHYeIAQAAlSm7khS9xE3iX2eAH+anrXfnlQ7v8oUsEA0z/MTkPXZnzBAvGACc64Rw7A1Y5WpGYwhqMubbtv5eueVsxuxlhE/aJNZtgpkk/epZage/P7W27HdoknmpCXGdYaFsYNTqrcCrNXOS/DUkUdvE6OjHGDg6c+05MfuLiHR+zMzGM/mZzxWzV3ruLRv6toIGPskD/LkRrOU0j8B7alLLaAhQKTvZVFPlcgbho2BHD8rHcNL0E7h2pvdGvA==~1'
)


def build_api_headers(client_id, client_secret, cookie=DEFAULT_API_COOKIE):
    """Return the header dict sent with the homepage and listing-API requests.

    Args:
        client_id: Value for the ``client_id`` header.
        client_secret: Value for the ``client_secret`` header.
        cookie: Value for the ``cookie`` header.
    """
    return {
        'Content-Type': 'application/json',
        'authority': 'api.louisvuitton.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-US,en;q=0.6',
        'client_id': client_id,
        'client_secret': client_secret,
        'cookie': cookie,
        'origin': 'https://us.louisvuitton.com',
        'referer': 'https://us.louisvuitton.com/',
        'sec-ch-ua': '"Brave";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0',
    }


def build_image_headers(cookie=DEFAULT_IMAGE_COOKIE):
    """Return the header dict sent with every image download.

    These are the ``-H`` values of the curl command this script used to shell
    out to; curl's ``--compressed`` flag needs no equivalent because
    ``requests`` negotiates and decodes compressed responses itself.
    """
    return {
        'authority': 'us.louisvuitton.com',
        'accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'cookie': cookie,
        'pragma': 'no-cache',
        'referer': 'https://us.louisvuitton.com/eng-us/products/twist-belt-chain-wallet-epi-nvprod1740047v/M68560',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Brave";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'image',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }


def extract_categories(hrefs):
    """Map category name -> ``{"code": ...}`` for every category-style link.

    A link counts as a category link when its last ``/``-separated segment
    starts with ``N-``; the category name is the third segment from the end
    and the code is the last segment without the ``N-`` prefix. Links with
    fewer than three segments are skipped (they used to raise ``IndexError``).
    A later link with the same category name overwrites an earlier one.
    """
    categories = {}
    for href in hrefs:
        parts = href.split("/")
        if len(parts) >= 3 and parts[-1].startswith("N-"):
            categories[parts[-3]] = {"code": parts[-1][2:]}
    return categories


def collect_hits(fetch_page, max_pages=MAX_PAGES):
    """Gather the ``hits`` of consecutive result pages, starting at page 0.

    Args:
        fetch_page: Callable taking a page number and returning the decoded
            JSON payload as a dict, or ``None`` when the request failed.
        max_pages: Page limit used until a payload carries an integer
            ``nbPages``, which then replaces it.

    Returns:
        All hits in page order. Collection stops early -- keeping what was
        gathered so far -- on a failed page or a page without hits, so an
        error payload neither crashes the run nor triggers ``max_pages``
        pointless requests.
    """
    hits = []
    nb_pages = max_pages
    page = 0
    while page < nb_pages:
        res = fetch_page(page)
        if not isinstance(res, dict):
            break
        if isinstance(res.get("nbPages"), int):
            nb_pages = res["nbPages"]
        page_hits = res.get("hits") or []
        if not page_hits:
            break
        hits.extend(page_hits)
        print(res.get("page", page), len(hits))
        page += 1
    return hits


def split_hits(hits):
    """Convert raw API hits into product records.

    Returns:
        ``(recs, misses)``. ``recs`` holds one dict per well-formed hit with
        the ``RECORD_KEYS`` plus ``image`` (the first image's ``contentUrl``).
        ``misses`` holds the partially filled record of every hit that lacked
        a field; such hits are also printed for inspection.
    """
    recs = []
    misses = []
    for hit in hits:
        rec = {}
        try:
            for key in RECORD_KEYS:
                rec[key] = hit[key]
            rec["image"] = hit["image"][0]["contentUrl"]
        except (KeyError, IndexError, TypeError):
            # Only malformed-hit errors are tolerated; anything else propagates.
            misses.append(rec)
            print(hit)
        else:
            recs.append(rec)
    return recs, misses


def flatten_categories(categories):
    """Return one flat row per product across all categories.

    Each row carries the product fields, its category name and code, and the
    ``image`` URL template. The image used to be dropped here, which left the
    download step resizing and fetching the product *page* URL instead.
    Categories without a ``urls`` list contribute no rows.
    """
    rows = []
    for cat, info in categories.items():
        for rec in info.get("urls", []):
            rows.append({
                "productId": rec["productId"],
                "category": cat,
                "category_code": info["code"],
                "name": rec["name"],
                "url": rec["url"],
                "disambiguatingDescription": rec["disambiguatingDescription"],
                "image": rec["image"],
            })
    return rows


def fill_image_size(template, size):
    """Substitute ``size`` for the ``{IMG_HEIGHT}``/``{IMG_WIDTH}`` placeholders.

    Plain string replacement is used, so the braces are never interpreted as
    regex or format syntax. A template without placeholders is returned as is.
    """
    size = str(size)
    return template.replace("{IMG_HEIGHT}", size).replace("{IMG_WIDTH}", size)


def safe_filename(product_id):
    """Return ``product_id`` with every character outside ``[A-Za-z0-9._-]``
    replaced by ``_``, so a remote-supplied id cannot escape the image folder.
    """
    return re.sub(r"[^A-Za-z0-9._-]", "_", str(product_id))


def _fetch_json(url, headers):
    """GET ``url`` and return the decoded JSON body, or ``None`` on any failure."""
    import requests

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as err:
        print(f"request failed for {url}: {err}")
        return None


def download_image(url, dest, headers):
    """Download ``url`` to the file ``dest``; return True on success.

    Failures are reported and skipped so one bad image does not abort the
    run. Unlike the former curl invocation, an HTTP error response is not
    written to disk as if it were an image, and no shell is involved.
    """
    import requests

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"image download failed for {url}: {err}")
        return False
    with open(dest, "wb") as out:
        out.write(response.content)
    return True


def main():
    """Run the full scrape and return ``(categories, dat)``.

    ``categories`` maps category name to ``{"code", "urls", "misses"}`` and
    ``dat`` is the flattened product DataFrame including ``image_fix``, the
    image URL with the size placeholders filled in.

    Raises:
        KeyError: If ``LV_CLIENT_ID`` or ``LV_CLIENT_SECRET`` is not set.
    """
    import pandas as pd
    import requests
    from lxml import html

    headers = build_api_headers(
        os.environ['LV_CLIENT_ID'],
        os.environ['LV_CLIENT_SECRET'],
        os.environ.get('LV_API_COOKIE', DEFAULT_API_COOKIE),
    )

    response = requests.get(HOMEPAGE_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    print(len(response.text))
    tree = html.fromstring(response.text)
    categories = extract_categories(tree.xpath('//*[@id="header"]//a/@href'))

    for cat_key, info in categories.items():
        print(cat_key)
        url = API_URL_TEMPLATE.format(code=info["code"])
        print(url)
        # Bind url as a default so the lambda does not depend on the loop variable.
        hits = collect_hits(lambda page, url=url: _fetch_json(url + str(page), headers))
        info["urls"], info["misses"] = split_hits(hits)

    # Explicit columns keep the frame well-formed even when nothing was found.
    dat = pd.DataFrame(flatten_categories(categories), columns=list(FLAT_COLUMNS))
    dat['image_fix'] = [fill_image_size(template, IMAGE_SIZE) for template in dat['image']]

    os.makedirs(IMAGE_DIR, exist_ok=True)
    image_headers = build_image_headers(
        os.environ.get('LV_IMAGE_COOKIE', DEFAULT_IMAGE_COOKIE)
    )
    for rec in dat.itertuples():
        dest = os.path.join(IMAGE_DIR, f"{safe_filename(rec.productId)}.png")
        download_image(rec.image_fix, dest, image_headers)

    return categories, dat


if __name__ == "__main__":
    categories, dat = main()