LittleApple-fp16's picture
Upload 88 files
4f8ad24
raw
history blame
2.11 kB
import os
import re
from typing import Iterator, Tuple, Union
from hbutils.system import urlsplit
from .web import WebDataSource
from ..utils import get_requests_session, srequest
def _extract_words(keyword):
return list(filter(bool, re.split(r'[\W_]+', keyword)))
class DuitangSource(WebDataSource):
    """
    Data source backed by Duitang's ``by_search`` blog-list endpoint.

    Posts are fetched page by page for a search keyword. In strict mode a
    post is only emitted when every word of the keyword also appears among
    the words of the post's ``msg`` text.
    """

    def __init__(self, keyword: str, strict: bool = True, page_size: int = 100,
                 group_name: str = 'duitang', download_silent: bool = True):
        """
        :param keyword: Search keyword sent to the endpoint as ``kw``.
        :param strict: When true, filter posts by keyword words (see
            :meth:`_check_title`); when false, accept every post.
        :param page_size: Value sent as ``limit`` and used as the offset step.
        :param group_name: Prefix used for the generated group ids and filenames.
        :param download_silent: Forwarded unchanged to ``WebDataSource``.
        """
        WebDataSource.__init__(self, group_name, get_requests_session(), download_silent)
        self.keyword = keyword
        # Pre-tokenized keyword, compared against each post title in strict mode.
        self.words = set(_extract_words(keyword))
        self.page_size: int = page_size
        self.strict = strict

    def _check_title(self, title):
        """
        Decide whether a post with the given title text should be kept.

        :param title: Text of the post (its ``msg`` field).
        :return: ``True`` if strict mode is off, otherwise whether all keyword
            words occur among the words of *title*.
        """
        if self.strict:
            # Every keyword word must be present in the title's word set.
            return self.words.issubset(_extract_words(title))
        return True

    def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]:
        """
        Walk through the search results and yield the accepted posts.

        Iteration ends when a response lacks ``data``/``object_list`` or when
        the returned post list is empty.

        :return: Iterator of ``(post id, url, meta)`` where *url* is the
            post's ``photo.path`` and *meta* holds the raw post under
            ``'duitang'`` plus the derived ``'group_id'`` and ``'filename'``.
        :raises: Whatever ``resp.raise_for_status()`` raises for a failed request.
        """
        endpoint = 'https://www.duitang.com/napi/blog/list/by_search/'
        start = 0
        while True:
            query = {
                'kw': self.keyword,
                'start': str(start),
                'limit': str(self.page_size),
            }
            resp = srequest(self.session, 'GET', endpoint, params=query)
            resp.raise_for_status()

            payload = resp.json()
            if not ('data' in payload and 'object_list' in payload['data']):
                return

            page = payload['data']['object_list']
            if not page:
                return

            for item in page:
                if self._check_title(item['msg']):
                    url = item['photo']['path']
                    # Keep the extension of the remote file for the local filename.
                    ext_name = os.path.splitext(urlsplit(url).filename)[1]
                    group_id = f'{self.group_name}_{item["id"]}'
                    meta = {
                        'duitang': item,
                        'group_id': group_id,
                        'filename': f'{group_id}{ext_name}',
                    }
                    yield item['id'], url, meta

            # Move on to the next page of results.
            start += self.page_size