Spaces:
Runtime error
Runtime error
File size: 5,991 Bytes
4f8ad24 |
|
import os
from enum import Enum
from typing import Iterator, Union, List, Optional, Mapping, Tuple, Literal
from urllib.parse import quote_plus, urljoin
from hbutils.system import urlsplit
from .web import WebDataSource
from ..utils import get_requests_session, srequest
class Sort(str, Enum):
    """
    Sort orders for a zerochan search.

    The member value is what ``ZerochanSource`` sends as the ``s`` query parameter.
    """

    # Value 'id' for the ``s`` parameter.
    ID = 'id'
    # Value 'fav' for the ``s`` parameter; this is ``ZerochanSource``'s default sort.
    FAV = 'fav'
class Time(str, Enum):
    """
    Time-range filters for a zerochan search.

    The member value is what ``ZerochanSource`` sends as the ``t`` query parameter.
    """

    # No time restriction; this is ``ZerochanSource``'s default.
    ALL = '0'
    # NOTE(review): the names suggest a window over the most recent 7000 / 15000
    # entries - confirm the exact meaning against the zerochan API.
    LAST_7000 = '1'
    LAST_15000 = '2'
class Dimension(str, Enum):
    """
    Image-dimension filters for a zerochan search.

    The member value is what ``ZerochanSource`` sends as the ``d`` query parameter
    (the parameter is omitted entirely when no dimension is selected).
    """

    # Size-style filters.
    LARGE = 'large'
    HUGE = 'huge'

    # Shape-style filters.
    LANDSCAPE = 'landscape'
    PORTRAIT = 'portrait'
    SQUARE = 'square'
# Accepted values for the ``select`` argument of ``ZerochanSource``: which image
# variant to prefer ('medium' is the thumbnail URL reported by the listing).
SelectTyping = Literal[
    'medium',
    'large',
    'full',
]
class ZerochanSource(WebDataSource):
    """
    Data source that iterates over images listed by a zerochan.net tag search.

    The search is described by one or more tag words plus optional sort / time /
    dimension / color / strict filters. Listing pages are requested in JSON form
    (``json=1``) one page at a time, and every listed item is yielded by
    :meth:`_iter_data` as an ``(id, url, meta)`` tuple.
    """
    __SITE__ = 'https://www.zerochan.net'

    # Upper bound on redirects followed by hand for a single listing request, so a
    # redirect cycle ends in an error instead of looping forever.
    _MAX_LISTING_REDIRECTS = 30

    def __init__(self, word: Union[str, List[str]], sort: Sort = Sort.FAV, time: Time = Time.ALL,
                 dimension: Optional[Dimension] = None, color: Optional[str] = None, strict: bool = False,
                 select: SelectTyping = 'large', group_name: str = 'zerochan', download_silent: bool = True,
                 user_agent=None, username: Optional[str] = None, password: Optional[str] = None):
        """
        :param word: Tag to search for, or a list/tuple of tags (joined with commas in the URL).
        :param sort: Sort order, sent as the ``s`` query parameter. A plain string holding a
            valid :class:`Sort` value (e.g. ``'id'``) is accepted as well.
        :param time: Time filter, sent as the ``t`` query parameter. A plain string holding a
            valid :class:`Time` value is accepted as well.
        :param dimension: Optional dimension filter, sent as the ``d`` query parameter when
            given. A plain string holding a valid :class:`Dimension` value is accepted as well.
        :param color: Optional color filter, sent as the ``c`` query parameter when given.
        :param strict: When true, ``strict=1`` is added to the query.
        :param select: Preferred image variant: ``'full'`` tries the full-size URLs and then
            the large one, ``'large'`` tries the large URL, anything else uses the thumbnail.
        :param group_name: Passed to ``WebDataSource``; also used as the prefix of the
            generated filenames and group ids.
        :param download_silent: Passed through to ``WebDataSource``.
        :param user_agent: Optional ``User-Agent`` header value for the HTTP session.
        :param username: Optional account name; a login is attempted only when it is not ``None``.
        :param password: Password that goes with ``username``.
        :raises ValueError: If ``sort``, ``time`` or ``dimension`` is not a valid value of
            its enum.
        """
        headers = {'User-Agent': user_agent} if user_agent else {}
        # NOTE(review): ``self.session`` and ``self.group_name`` used below are presumably
        # set by ``WebDataSource.__init__`` - confirm against the base class.
        WebDataSource.__init__(self, group_name, get_requests_session(headers=headers), download_silent)

        self.word = word
        # Coerce to the enum types so that raw string values work too; ``_params`` relies
        # on ``.value`` being available.
        self.sort = Sort(sort)
        self.time = Time(time)
        self.dimension = Dimension(dimension) if dimension is not None else None
        self.color = color
        self.strict = strict
        self.select = select

        self.username = username
        self._password = password
        self._is_authed = False

    def _auth(self):
        """
        Log in to the site once, if a username was supplied.

        Does nothing when no username is configured or an earlier login already succeeded.

        :raises ConnectionError: If the login response status is anything other than 303.
        """
        if self._is_authed or self.username is None:
            return

        login_url = f'{self.__SITE__}/login'
        resp = self.session.post(
            login_url,
            data={
                'ref': '/',
                'name': self.username,
                'password': self._password,
                'login': 'Login'
            },
            headers={
                # The HTTP request header is spelled "Referer" (single "r" in the middle);
                # "Referrer" is not a header servers look at.
                'Referer': f'{login_url}?ref=%2F',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,'
                          'image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Accept-Encoding': 'gzip, deflate, br',
                'Content-Type': 'application/x-www-form-urlencoded',
            },
            # The redirect itself is the success signal, so it must not be followed.
            allow_redirects=False,
        )
        if resp.status_code != 303:
            raise ConnectionError('Username or password wrong, failed to login to zerochan.net.')

        self._is_authed = True

    @property
    def _base_url(self) -> str:
        """
        URL of the search listing for :attr:`word` (without query parameters).

        :raises TypeError: If :attr:`word` is neither a string nor a list/tuple.
        """
        if isinstance(self.word, str):
            return f'{self.__SITE__}/{quote_plus(self.word)}'
        elif isinstance(self.word, (list, tuple)):
            # Multiple tags are URL-quoted one by one and joined with commas.
            return f'{self.__SITE__}/{",".join(map(quote_plus, self.word))}'
        else:
            raise TypeError(f'Unknown type of word - {self.word!r}.')

    @property
    def _params(self) -> Mapping[str, str]:
        """Query parameters shared by every listing request (paging is added by the caller)."""
        params = {
            'json': '1',
            's': self.sort.value,
            't': self.time.value,
        }
        # Optional filters are only sent when they were actually requested.
        if self.dimension is not None:
            params['d'] = self.dimension.value
        if self.color is not None:
            params['c'] = self.color
        if self.strict:
            params['strict'] = '1'

        return params

    @classmethod
    def _get_urls(cls, data):
        """
        Build the candidate image URLs for one listing item.

        :param data: Item dict from the listing; its ``id``, ``thumbnail`` and ``tag``
            entries are read.
        :return: Dict with ``'medium'`` (the thumbnail URL, a single string), ``'large'``
            (list holding one ``.600.`` JPEG URL) and ``'full'`` (list of ``.full.`` URLs,
            one per candidate extension ``.jpg`` / ``.png``).
        """
        id_ = data['id']
        # Spaces in the tag become dots before URL-quoting.
        prefix = quote_plus(data['tag'].replace(' ', '.'))
        return {
            'medium': data['thumbnail'],
            'large': [f'https://s1.zerochan.net/{prefix}.600.{id_}.jpg'],
            'full': [
                f"https://static.zerochan.net/{prefix}.full.{id_}{ext}"
                for ext in ['.jpg', '.png']
            ],
        }

    def _get_url(self, data):
        """
        Pick the download URL for one listing item according to :attr:`select`.

        Candidate URLs are probed with HEAD requests in order of preference and the first
        one whose response is ``ok`` wins; when none is (or ``select`` asks for neither
        ``'full'`` nor ``'large'``) the thumbnail URL is returned.

        :param data: Item dict from the listing.
        :return: The chosen image URL.
        """
        urls = self._get_urls(data)
        if self.select == 'full':
            url_fallbacks = [*urls['full'], *urls['large']]
        elif self.select == 'large':
            url_fallbacks = urls['large']
        else:
            url_fallbacks = []

        for url in url_fallbacks:
            resp = srequest(self.session, 'HEAD', url, raise_for_status=False)
            if resp.ok:
                return url

        return urls['medium']

    def _fetch_listing_page(self, page: int):
        """
        Request one listing page, following redirects by hand.

        Redirects are followed manually (``allow_redirects=False``) so that each hop is
        re-requested with the same query parameters against the redirected URL.

        :param page: 1-based page number, sent as the ``p`` query parameter.
        :return: The final response, or ``None`` when the site answers 403 or 404.
        :raises ConnectionError: If more than ``_MAX_LISTING_REDIRECTS`` redirects are
            encountered for this page.
        """
        url = self._base_url
        # NOTE(review): 'l' is presumably the number of items per page - confirm against
        # the zerochan API.
        params = {**self._params, 'p': str(page), 'l': '200'}

        for _ in range(self._MAX_LISTING_REDIRECTS + 1):
            resp = srequest(self.session, 'GET', url, params=params,
                            allow_redirects=False, raise_for_status=False)
            if resp.status_code // 100 == 3:
                # The Location header may be relative, so resolve it against the current URL.
                url = urljoin(url, resp.headers['Location'])
            elif resp.status_code in {403, 404}:
                return None
            else:
                resp.raise_for_status()
                return resp

        raise ConnectionError(
            f'Too many redirects (more than {self._MAX_LISTING_REDIRECTS}) '
            f'when requesting {self._base_url!r}.'
        )

    def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]:
        """
        Iterate over all items of the search, page by page.

        Logs in first when credentials were supplied. Iteration ends at the first page
        that answers 403/404, has no ``items`` entry, or has an empty ``items`` list.

        :return: Iterator of ``(id, url, meta)`` tuples, where ``meta`` holds the raw item
            data plus the chosen URL under ``'zerochan'``, together with ``'group_id'``
            and ``'filename'``.
        """
        self._auth()
        page = 1
        while True:
            resp = self._fetch_listing_page(page)
            if resp is None:
                break

            json_ = resp.json()
            if 'items' not in json_:
                break
            items = json_['items']
            if not items:
                # An empty page means there is nothing further to fetch; without this
                # check the loop would keep requesting pages indefinitely.
                break

            for data in items:
                url = self._get_url(data)
                # Keep the extension of the chosen URL for the local filename.
                _, ext_name = os.path.splitext(urlsplit(url).filename)
                filename = f'{self.group_name}_{data["id"]}{ext_name}'
                meta = {
                    'zerochan': {
                        **data,
                        'url': url,
                    },
                    'group_id': f'{self.group_name}_{data["id"]}',
                    'filename': filename,
                }
                yield data["id"], url, meta

            page += 1
|