Spaces:
Runtime error
Runtime error
File size: 2,111 Bytes
4f8ad24 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import os
import re
from typing import Iterator, Tuple, Union
from hbutils.system import urlsplit
from .web import WebDataSource
from ..utils import get_requests_session, srequest
def _extract_words(keyword):
return list(filter(bool, re.split(r'[\W_]+', keyword)))
class DuitangSource(WebDataSource):
    """Web data source that iterates image posts from Duitang's search API.

    Posts are fetched page by page from
    ``https://www.duitang.com/napi/blog/list/by_search/`` using the given
    keyword. In strict mode only posts whose message contains every word of
    the keyword (case-sensitive, as split by ``_extract_words``) are yielded.

    :param keyword: Search keyword sent to the API as ``kw``.
    :param strict: When true, filter posts by the word-containment rule above.
    :param page_size: Value sent as ``limit`` and used as the paging step.
    :param group_name: Prefix used for generated group ids and filenames.
    :param download_silent: Passed through to ``WebDataSource.__init__``.
    """

    def __init__(self, keyword: str, strict: bool = True, page_size: int = 100,
                 group_name: str = 'duitang', download_silent: bool = True):
        WebDataSource.__init__(self, group_name, get_requests_session(), download_silent)
        self.keyword = keyword
        # Set of keyword tokens, computed once for the strict title check.
        self.words = set(_extract_words(keyword))
        self.page_size: int = page_size
        self.strict = strict

    def _check_title(self, title) -> bool:
        """Return whether a post with message *title* passes the filter.

        Always true when ``strict`` is off; otherwise true only if every
        keyword word appears among the words of *title*.
        """
        if not self.strict:
            return True
        # A null/missing message has no words; ``or ''`` keeps the tokenizer
        # from being handed None.
        return self.words <= set(_extract_words(title or ''))

    def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]:
        """Yield ``(post_id, image_url, meta)`` for each accepted post.

        ``meta`` holds the raw post under ``'duitang'`` plus the generated
        ``'group_id'`` and ``'filename'``. Iteration stops at the first page
        that has no ``data.object_list`` or whose list is empty.

        :raises: whatever ``resp.raise_for_status()`` raises on an HTTP error.
        """
        # ``self.session`` and ``self.group_name`` are not assigned in this
        # class; presumably WebDataSource.__init__ sets them - confirm.
        offset = 0
        while True:
            resp = srequest(self.session, 'GET', 'https://www.duitang.com/napi/blog/list/by_search/', params={
                'kw': self.keyword,
                'start': str(offset),
                'limit': str(self.page_size),
            })
            resp.raise_for_status()
            raw = resp.json()

            # Tolerate a null or malformed ``data`` member instead of raising
            # on the membership test.
            data = raw.get('data') if isinstance(raw, dict) else None
            posts = data.get('object_list') if isinstance(data, dict) else None
            if not posts:
                break

            for post in posts:
                if not self._check_title(post.get('msg')):
                    continue

                # Skip posts without an image path rather than aborting the
                # whole crawl with a KeyError/TypeError.
                url = (post.get('photo') or {}).get('path')
                if not url:
                    continue

                post_id = post['id']
                _, ext_name = os.path.splitext(urlsplit(url).filename)
                group_id = f'{self.group_name}_{post_id}'
                meta = {
                    'duitang': post,
                    'group_id': group_id,
                    'filename': f'{group_id}{ext_name}',
                }
                yield post_id, url, meta

            # NOTE(review): this assumes the server honours ``limit``; if it
            # caps the page size below ``page_size``, results would be
            # skipped - confirm against the API's behaviour.
            offset += self.page_size
|