# hushh / crawl_lv.py
# Author: Justin Donaldson -- commit fe3bafe ("add code")
from lxml import html
import requests
import os
# API credentials come from the environment; a missing variable raises
# KeyError as soon as the script starts.
client_id = os.environ['LV_CLIENT_ID']
client_secret = os.environ['LV_CLIENT_SECRET']

# Request headers sent with every `requests.get` call in this script.
# NOTE(review): the cookie and sec-* values are hard-coded and look like
# they were captured from a browser session; they presumably expire --
# confirm and refresh before relying on them.
headers = {
    'Content-Type': 'application/json',
    'authority': 'api.louisvuitton.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.6',
    'client_id': client_id,
    'client_secret': client_secret,
    # One single header value, written as adjacent literals (one cookie per
    # line) so that no string literal is broken across source lines.
    'cookie': (
        'ak_cc=US; '
        'OPTOUTMULTI=0:0%7Cc1:0%7Cc2:1%7Cc4:1%7Cc3:1; '
        'ATGID=anonymous; '
        'SGID=sb.springboot41-prd; '
        'SGID=.springboot41-prd; '
        'prevURL=; '
        'currentURL=https://www.louisvuitton.com/ajax/productsearch; '
        'storeLangCommerceHeader=eng-e1; '
        'geolocUserZone=eng-us; '
        'AKA_A2=A; '
        '_abck=94026E30A1FABEBFA844CBD6388965B8~0~YAAQnr8mF5/FGIeIAQAAjjKPkgpw8D5r3zC2HFECV0EQXRXPCoAQ+gYSqR/Sut2w/03toFhfVT9Yda45yFwqUGKStpIxbZAwPw0ooQplfv9eymgzFxyoYjEcqrD6rV/4OEgdCIov1wE0On3Z71z1v9UoZOisQgAlBEKsV0dYi02t6vutjUwi6f5T6N+h6SWX1l62T/QGwvxI13WHrWAOIsNJ1VJd/N3FiC/cKxVKskc3YPJf8tFZ25jOs2cUin5GUXWA7HuLb7dpffVxE0wp5vcOes47KXi9be6zRpbtfF+aHEjVkgPsOjznAFhN/X6FCHBTuJ5UvY2vHlWEIR8kW/pEW4zHm5bfQyaKKiBWQvHTXkeKRyhCnUSoRx9wZPB2dIcdY2igDmG06NT+NWYDPJtLoIU1I14uXhn+p8/w~-1~-1~-1; '
        'bm_sz=A8530E728A25BB2F984ED31AD5DF00A6~YAAQnr8mF6LFGIeIAQAAjjKPkhQ72plnxNz48wepUXjRYRETUBQ1oWkIiS8E0wS/+9NHmJzh9bagubSRjmSbTkSEEzyIbBsWTowVRBJBCiW6xO9lhJT/vmSLZtosV4g0eTOhRFoFwRrorwWUjGarPRTLozlk+KpZppYbt+EbdcyCdZNb95EUr6Za65FUi8FRSH02djUkhL5XlC3aebrVqtHfnG7uCJsWYBo+fnDJU1+mxQ8e1J+iTdq4ZdXsYuUuXZz3A0OJMy1JP5M/kKV4JKmIO89y3rgqZZB+xxNLbY+Su4yH7c/q2+s=~3556656~3686978; '
        'lv-dispatch=eng-us; '
        'consent_ecom_us=functional:1|analytics:0|customization:0|advertising:0; '
        'qb_dnt=2; '
        'ak_bmsc=589EDDF01ED59E65EA72F7393A895F31~000000000000000000000000000000~YAAQnr8mFxbGGIeIAQAATzaPkhR5K7vu8hiBPm/G/uXQB2vcY/Q4eGJWL+r7GNyViebDR/5XNvUSus0l/5Uhpgq+sHzuQR7L7aYzbFREYRNqvAZ0ngto1AvbYgwL8xA7OJM+D03m8i/BMKZHk++/aC3UFmk/RYNOKIv8p4HjlGM22auYotHx2mlWq8DTZPj1z77noO3WrzpYdEirg6xSV8havW9BHABzNmP3Cfnt3xxVq3y3bMNDcpiXy2+eAJ7eX5UNVa/Rj+P75LGxaPAv4AsXiphTgX6rMKt5T6OAvLMX6vdLLCpZYvDxvtctIZ1o7p1kgBdympPZFvVTPcmDtKrzh9sABV0v2Lls8NzLbbdf2k4B5uyUfy0fQL60zUUZgrlSOoFzgJgQviOHvoVQbqFdXCFyiH+fBjKltnlQXKJ9f0Y5JOjnBIkUHCJ9fUHINiwqt9h2cw5daORuAKdfQpsi3Z3cSVM7cIdb/Dfu0f2vu+K1GOxNv1+NH0KlwbAxMA==; '
        'ATG_SESSION_ID=GCUqx-HVSh3KVpIaIX+L30So.front41-prd; '
        '_dynSessConf=-86995900442046424; '
        'JSESSIONID=GCUqx-HVSh3KVpIaIX+L30So.front41-prd; '
        'PIM-SESSION-ID=HBqQRDXDzoWH4e2V; '
        'bm_sv=CF0A04BDB7486115978F40C364AB5E38~YAAQnr8mF1opHYeIAQAAT1G5khS7D+LGov3Y87+pRE+B0F86Y3tJjyDrwszLeFA06ZC1s/so8vDcDtmB0VrCDU1N+jlXvFNpGfEkhJTiyQrFAkZq2i57xih3Y4Oe7kDnWY6TqYj05c7rqHmBOUw8+XtLNLJAzSPx+0cKcLqujkgpHiI1xPuO1N27mPXK0SMc2DLsF9MJZa0EqlH8YtoHBgDTz/q2R9Q5g28FJHVuw3SqLHdFDaHO2dML6P/eVsRDeYR3XTd8OQ==~1; '
        'lv-dispatch-url=https://us.louisvuitton.com/eng-us/women/handbags/all-handbags/_/N-tfr7qdp; '
        'utag_main=_sn:6$_se:25$_ss:0$_st:1686090605436$dc_visit:2$v_id:01884a527895001827510ebf5e3504075002b06d00a61$ses_id:1686086038612%3Bexp-session$_pn:10%3Bexp-session'
    ),
    'origin': 'https://us.louisvuitton.com',
    'referer': 'https://us.louisvuitton.com/',
    'sec-ch-ua': '"Brave";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'sec-gpc': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0',
}
# Download the US homepage and read the product categories out of the links
# found under the element with id "header".
url = 'https://us.louisvuitton.com/eng-us/homepage'
response = requests.get(url, headers=headers)
print(len(response.text))  # size of the downloaded page, as a quick sanity check

tree = html.fromstring(response.text)
sitemap_paths = tree.xpath('//*[@id="header"]//a/@href')

# A link whose last path segment looks like "N-<code>" contributes one entry:
# the key is the segment two places before the last one, the value holds the
# code with its "N-" prefix removed.  When two links share a key, the later
# one wins.
categories = {
    segments[-3]: {"code": segments[-1][2:]}
    for segments in (href.split("/") for href in sitemap_paths)
    if segments[-1].startswith("N-")
}
import requests

# Fields copied verbatim from each API hit into the per-product record.
_RECORD_KEYS = ("productId", "name", "url", "disambiguatingDescription")


def _fetch_category_hits(base_url, max_pages=100):
    """Return the combined ``hits`` of every result page behind *base_url*.

    *base_url* must end in ``page=``; the zero-based page number is appended
    to it.  *max_pages* is only an upper bound that applies until a response
    reports its own ``nbPages`` value.  Paging stops early when a response
    carries no hits, so an error payload does not trigger further requests.
    Uses the module-level ``headers`` for every request.
    """
    collected = []
    page_count = max_pages
    page = 0
    while page < page_count:
        res = requests.get(base_url + str(page), headers=headers).json()
        if "nbPages" in res:
            page_count = res["nbPages"]
        page_hits = res.get("hits") or []
        if not page_hits:
            # Nothing usable on this page: stop instead of requesting the
            # remaining pages up to the bound.
            break
        collected.extend(page_hits)
        print(res.get("page", page), len(collected))
        page += 1
    return collected


def _split_hits(hits):
    """Convert raw hits into records; return ``(records, misses)``.

    A record holds the ``_RECORD_KEYS`` fields plus ``image`` (the
    ``contentUrl`` of the hit's first image).  A hit that lacks any of these
    is printed, and the partially filled record is stored in ``misses``.
    """
    records = []
    incomplete = []
    for hit in hits:
        rec = {}
        try:
            for key in _RECORD_KEYS:
                rec[key] = hit[key]
            rec["image"] = hit["image"][0]["contentUrl"]
        except (KeyError, IndexError, TypeError):
            # Only malformed-hit errors are tolerated; anything else
            # (including Ctrl-C) propagates.
            incomplete.append(rec)
            print(hit)
        else:
            records.append(rec)
    return records, incomplete


# Crawl the product listing of every category and attach the results to the
# category entry under "urls" (complete records) and "misses" (incomplete).
for cat_key, cat_info in categories.items():
    print(cat_key)
    code = cat_info["code"]
    url = f'https://api.louisvuitton.com/eco-eu/search-merch-eapi/v1/eng-us/plp/products/{code}-ay1ygzo25?page='
    print(url)
    all_hits = _fetch_category_hits(url)
    recs, misses = _split_hits(all_hits)
    cat_info["urls"] = recs
    cat_info["misses"] = misses
# Flatten the per-category records into one row per product.
flatten = []
for cat, info in categories.items():
    for rec in info["urls"]:
        flatten.append({
            "productId": rec["productId"],
            "category": cat,
            "category_code": info["code"],
            "name": rec["name"],
            "url": rec["url"],
            "disambiguatingDescription": rec["disambiguatingDescription"],
            # Carry the image URL along: it is what gets downloaded later.
            "image": rec["image"],
        })

import pandas as pd

# Naming the columns keeps them present even when nothing was crawled, so
# the column lookups below do not fail on an empty table.
dat = pd.DataFrame(
    flatten,
    columns=[
        "productId",
        "category",
        "category_code",
        "name",
        "url",
        "disambiguatingDescription",
        "image",
    ],
)

# Requested image edge length in pixels, as text for URL substitution.
size = str(400)

# Fill the size placeholders of the image URL.  The source column is "image"
# (the hit's contentUrl), not the product page "url".
# NOTE(review): assumes the {IMG_WIDTH}/{IMG_HEIGHT} placeholders live in the
# image contentUrl rather than in the product url -- confirm against a live
# API response.
# regex=False makes the braces literal regardless of the pandas default.
dat['image_fix'] = (
    dat['image']
    .str.replace("{IMG_HEIGHT}", size, regex=False)
    .str.replace("{IMG_WIDTH}", size, regex=False)
)
import os
import subprocess

# Cookie header sent with every image request, kept as one header value and
# written as adjacent literals (one cookie per line).
# NOTE(review): hard-coded values that look like a captured browser session;
# they presumably expire -- confirm before relying on them.
_IMAGE_COOKIE = (
    'ak_cc=US; '
    'OPTOUTMULTI=0:0%7Cc1:0%7Cc2:1%7Cc4:1%7Cc3:1; '
    'ATGID=anonymous; '
    'SGID=sb.springboot41-prd; '
    'prevURL=; '
    'currentURL=https://www.louisvuitton.com/ajax/productsearch; '
    'storeLangCommerceHeader=eng-e1; '
    'geolocUserZone=eng-us; '
    'AKA_A2=A; '
    '_abck=94026E30A1FABEBFA844CBD6388965B8~0~YAAQnr8mF5/FGIeIAQAAjjKPkgpw8D5r3zC2HFECV0EQXRXPCoAQ+gYSqR/Sut2w/03toFhfVT9Yda45yFwqUGKStpIxbZAwPw0ooQplfv9eymgzFxyoYjEcqrD6rV/4OEgdCIov1wE0On3Z71z1v9UoZOisQgAlBEKsV0dYi02t6vutjUwi6f5T6N+h6SWX1l62T/QGwvxI13WHrWAOIsNJ1VJd/N3FiC/cKxVKskc3YPJf8tFZ25jOs2cUin5GUXWA7HuLb7dpffVxE0wp5vcOes47KXi9be6zRpbtfF+aHEjVkgPsOjznAFhN/X6FCHBTuJ5UvY2vHlWEIR8kW/pEW4zHm5bfQyaKKiBWQvHTXkeKRyhCnUSoRx9wZPB2dIcdY2igDmG06NT+NWYDPJtLoIU1I14uXhn+p8/w~-1~-1~-1; '
    'bm_sz=A8530E728A25BB2F984ED31AD5DF00A6~YAAQnr8mF6LFGIeIAQAAjjKPkhQ72plnxNz48wepUXjRYRETUBQ1oWkIiS8E0wS/+9NHmJzh9bagubSRjmSbTkSEEzyIbBsWTowVRBJBCiW6xO9lhJT/vmSLZtosV4g0eTOhRFoFwRrorwWUjGarPRTLozlk+KpZppYbt+EbdcyCdZNb95EUr6Za65FUi8FRSH02djUkhL5XlC3aebrVqtHfnG7uCJsWYBo+fnDJU1+mxQ8e1J+iTdq4ZdXsYuUuXZz3A0OJMy1JP5M/kKV4JKmIO89y3rgqZZB+xxNLbY+Su4yH7c/q2+s=~3556656~3686978; '
    'lv-dispatch=eng-us; '
    'consent_ecom_us=functional:1|analytics:0|customization:0|advertising:0; '
    'qb_dnt=2; '
    'ak_bmsc=589EDDF01ED59E65EA72F7393A895F31~000000000000000000000000000000~YAAQnr8mFxbGGIeIAQAATzaPkhR5K7vu8hiBPm/G/uXQB2vcY/Q4eGJWL+r7GNyViebDR/5XNvUSus0l/5Uhpgq+sHzuQR7L7aYzbFREYRNqvAZ0ngto1AvbYgwL8xA7OJM+D03m8i/BMKZHk++/aC3UFmk/RYNOKIv8p4HjlGM22auYotHx2mlWq8DTZPj1z77noO3WrzpYdEirg6xSV8havW9BHABzNmP3Cfnt3xxVq3y3bMNDcpiXy2+eAJ7eX5UNVa/Rj+P75LGxaPAv4AsXiphTgX6rMKt5T6OAvLMX6vdLLCpZYvDxvtctIZ1o7p1kgBdympPZFvVTPcmDtKrzh9sABV0v2Lls8NzLbbdf2k4B5uyUfy0fQL60zUUZgrlSOoFzgJgQviOHvoVQbqFdXCFyiH+fBjKltnlQXKJ9f0Y5JOjnBIkUHCJ9fUHINiwqt9h2cw5daORuAKdfQpsi3Z3cSVM7cIdb/Dfu0f2vu+K1GOxNv1+NH0KlwbAxMA==; '
    'PIM-SESSION-ID=HBqQRDXDzoWH4e2V; '
    'lv-dispatch-url=https://us.louisvuitton.com/eng-us/women/handbags/all-handbags/_/N-tfr7qdp; '
    'utag_main=_sn:6$_se:28$_ss:0$_st:1686090718084$dc_visit:2$v_id:01884a527895001827510ebf5e3504075002b06d00a61$ses_id:1686086038612%3Bexp-session$_pn:11%3Bexp-session; '
    'anonymous_session=true; '
    'ATG_SESSION_ID=B-8233ZEyNVjFobJCNa+2INd.front41-prd; '
    '_dynSessConf=2255588875954406228; '
    'JSESSIONID=B-8233ZEyNVjFobJCNa+2INd.front41-prd; '
    'bm_sv=CF0A04BDB7486115978F40C364AB5E38~YAAQnr8mF1xwHYeIAQAAlSm7khS9xE3iX2eAH+anrXfnlQ7v8oUsEA0z/MTkPXZnzBAvGACc64Rw7A1Y5WpGYwhqMubbtv5eueVsxuxlhE/aJNZtgpkk/epZage/P7W27HdoknmpCXGdYaFsYNTqrcCrNXOS/DUkUdvE6OjHGDg6c+05MfuLiHR+zMzGM/mZzxWzV3ruLRv6toIGPskD/LkRrOU0j8B7alLLaAhQKTvZVFPlcgbho2BHD8rHcNL0E7h2pvdGvA==~1'
)

# Headers passed to curl, in the order they are sent.
_IMAGE_HEADERS = [
    'authority: us.louisvuitton.com',
    'accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
    'accept-language: en-US,en;q=0.9',
    'cache-control: no-cache',
    'cookie: ' + _IMAGE_COOKIE,
    'pragma: no-cache',
    'referer: https://us.louisvuitton.com/eng-us/products/twist-belt-chain-wallet-epi-nvprod1740047v/M68560',
    'sec-ch-ua: "Not.A/Brand";v="8", "Chromium";v="114", "Brave";v="114"',
    'sec-ch-ua-mobile: ?0',
    'sec-ch-ua-platform: "macOS"',
    'sec-fetch-dest: image',
    'sec-fetch-mode: no-cors',
    'sec-fetch-site: same-origin',
    'sec-gpc: 1',
    'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
]

# Build the repeated "-H <header>" arguments once, outside the download loop.
_header_args = []
for _header in _IMAGE_HEADERS:
    _header_args.extend(["-H", _header])

# curl's --output does not create missing directories.
os.makedirs("images", exist_ok=True)

# Download one image per row of `dat` into images/<productId>.png.
for rec in dat.itertuples():
    # An argument list (no shell) keeps API-supplied URLs and ids from being
    # interpreted as shell syntax.
    command = [
        "curl",
        rec.image_fix,
        *_header_args,
        "--compressed",
        "--output",
        f"images/{rec.productId}.png",
    ]
    result = subprocess.run(command, check=False)
    if result.returncode != 0:
        # Keep going on failure, but make it visible.
        print(f"curl exited with {result.returncode} for {rec.productId}")