File size: 2,111 Bytes
4f8ad24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import re
from typing import Iterator, Tuple, Union

from hbutils.system import urlsplit

from .web import WebDataSource
from ..utils import get_requests_session, srequest


def _extract_words(keyword):
    return list(filter(bool, re.split(r'[\W_]+', keyword)))


class DuitangSource(WebDataSource):
    """
    Data source that walks the Duitang keyword-search endpoint and yields
    one entry per matching post.

    :param keyword: Search text sent to the endpoint as ``kw``; it is also
        split into words that are used for strict title matching.
    :param strict: When true, a post is kept only if every word of
        ``keyword`` appears among the words of the post's ``msg`` field.
    :param page_size: Value sent as ``limit``, and the amount by which the
        ``start`` offset advances after each request.
    :param group_name: Prefix of the generated group ids and filenames;
        also forwarded to ``WebDataSource``.
    :param download_silent: Forwarded unchanged to ``WebDataSource``
        (its meaning is defined there, not in this class).
    """

    def __init__(self, keyword: str, strict: bool = True, page_size: int = 100,
                 group_name: str = 'duitang', download_silent: bool = True):
        # The base class receives a freshly created requests session.
        WebDataSource.__init__(self, group_name, get_requests_session(), download_silent)
        self.keyword = keyword
        # Word set of the keyword, computed once for the strict title check.
        self.words = set(_extract_words(keyword))
        self.page_size: int = page_size
        self.strict = strict

    def _check_title(self, title):
        """
        Decide whether a post title is acceptable for this source.

        :param title: Title text of a post (the ``msg`` field).
        :return: ``True`` when strict mode is off, otherwise whether every
            keyword word occurs among the title's words.
        """
        if self.strict:
            title_words = set(_extract_words(title))
            # Subset test: all keyword words must be present in the title.
            return self.words <= title_words

        return True

    def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]:
        """
        Page through the search results and yield the accepted posts.

        Iteration ends when a response lacks ``data`` / ``object_list`` or
        when a page comes back empty. HTTP error statuses are raised via
        ``raise_for_status``.

        :return: Iterator of ``(post id, image url, metadata dict)`` tuples.
        """
        start = 0
        while True:
            query = {
                'kw': self.keyword,
                'start': str(start),
                'limit': str(self.page_size),
            }
            # self.session is expected to be set up by WebDataSource (not visible here).
            response = srequest(self.session, 'GET', 'https://www.duitang.com/napi/blog/list/by_search/',
                                params=query)
            response.raise_for_status()

            payload = response.json()
            # A response without the expected containers ends the iteration.
            if not ('data' in payload and 'object_list' in payload['data']):
                return

            items = payload['data']['object_list']
            if not items:
                # An empty page means there is nothing further to fetch.
                return

            for item in items:
                if self._check_title(item['msg']):
                    image_url = item['photo']['path']
                    # Reuse the extension of the remote file for the local name.
                    ext_name = os.path.splitext(urlsplit(image_url).filename)[1]
                    filename = f'{self.group_name}_{item["id"]}{ext_name}'
                    meta = {
                        'duitang': item,
                        'group_id': f'{self.group_name}_{item["id"]}',
                        'filename': filename,
                    }
                    yield item['id'], image_url, meta

            start += self.page_size