import re
import time

import requests as r
import tqdm
from bs4 import BeautifulSoup
from html2text import html2text

def process_url(url):
    """Fetch a question page and extract its title, body, and answers."""
    try:
        response = r.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        # Answer blocks carry ids like "answer_1", "answer_2", ... so one
        # regex find_all replaces probing the ids one by one.
        answers = soup.find_all('div', {'id': re.compile(r'answer_\d+')})
        # prettify() already returns a string; convert each answer body to text.
        answers = [html2text(detail.prettify())
                   for answer in answers
                   if (detail := answer.find('div', {'class': "answerDetail"}))]
        title = soup.find('div', {'class': 'endTitleSection'}).text.strip()
        questionDetails = soup.find('div', {'class': 'questionDetail'}).text.strip()
        # Drop the "์งˆ๋ฌธ" ("question") label Naver prepends to titles.
        title = title.replace("์งˆ๋ฌธ", '').strip()
        print("Answers extracted from:", url)
        print(len(answers))
        print('-' * 60)
        return {
            "title": title,
            "questionDetails": questionDetails,
            "url": url,
            "answers": answers
        }
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        # Append, not overwrite, so earlier failed URLs are not lost.
        with open('error_urls.txt', 'a') as f:
            f.write(url + '\n')
        return {"title": '', "questionDetails": '', "url": url, "answers": []}

def get_answers(results_a_elements):
    """Fetch answers for all the extracted result links."""
    if not results_a_elements:
        print("No results found.")
        return []

    print("Result links extracted:", len(results_a_elements))

    # Sequential fetch with a progress bar; a parallel variant is sketched
    # below in get_answers_threaded.
    results = []
    for url in tqdm.tqdm(results_a_elements):
        results.append(process_url(url))
    return results
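
# An earlier draft used multiprocessing.Pool here; since the work is I/O-bound
# (HTTP requests), a thread pool is the lighter-weight fit. A minimal sketch,
# assuming the same process_url above; the name get_answers_threaded and the
# max_workers=4 default are illustrative choices, not part of the original:
from concurrent.futures import ThreadPoolExecutor

def get_answers_threaded(results_a_elements, max_workers=4):
    """Threaded variant of get_answers (sketch): fetch URLs concurrently."""
    if not results_a_elements:
        print("No results found.")
        return []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() preserves the input order, like pool.map in the dropped draft.
        return list(pool.map(process_url, results_a_elements))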

def get_search_results(query, num_pages):
    """Fetch search result links for the given query from Naver Knowledge iN (์ง€์‹iN)."""
    results = []
    for page in range(1, num_pages + 1):
        # Pass the query via params so requests URL-encodes it (queries are
        # typically Korean and must be percent-encoded).
        params = {"query": query, "page": page}
        print(f"Fetching search results page {page} for: {query}")

        try:
            response = r.get("https://kin.naver.com/search/list.naver", params=params)
            soup = BeautifulSoup(response.text, "html.parser")
            anchors = soup.find("ul", {"class": "basic1"}).find_all("a", {"class": "_searchListTitleAnchor"})
            results += [a.get("href") for a in anchors if a.get("href")]
        except Exception as e:
            print(f"Error while fetching search results: {e}")
    return results

def extract_data(query, num_pages=150) -> list[dict[str, object]]:
    """Run the full pipeline: collect result links, then scrape each page."""
    results_a_elements = get_search_results(query, num_pages)
    answers = get_answers(results_a_elements)
    print("Total answers collected:", len(answers))
    return answers
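
# A small sketch for persisting what extract_data returns; the function name
# and the "kin_answers.json" path are illustrative assumptions. ensure_ascii=False
# keeps the Korean text readable in the output file:
import json

def save_results(records, path="kin_answers.json"):
    """Write the scraped question/answer records to a JSON file (sketch)."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)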

if __name__ == "__main__":
    start = time.time()
    query = "์žฅ๋ž˜ํฌ๋ง, ์ธ๊ณต์ง€๋Šฅ ๊ฐœ๋ฐœ์ž/์—ฐ๊ตฌ์›, ํŒŒ์ด์ฌ, ์ค‘ํ•™์ƒ ์ˆ˜์ค€, ํŒŒ์ด์ฌ ์„ค์น˜, ๋„์„œ ์ถ”์ฒœ"
    answers = extract_data(query)
    print("Total answers collected:", len(answers))
    print("Time taken:", time.time() - start)




# AJAX endpoint observed for paging through answers (example captured request;
# dirId/docId identify the question, page/count control pagination):
# https://kin.naver.com/ajax/detail/answerList.naver?
#     dirId=401030201&docId=292159869
#     &answerSortType=DEFAULT&answerViewType=DETAIL
#     &answerNo=&page=2&count=5&_=1736131792605
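
# A minimal sketch of calling that endpoint directly, assuming the query
# parameters behave as in the captured URL above. The response format has not
# been verified here, so this returns the raw body; fetch_answer_page is an
# illustrative name, not part of the original script:
def fetch_answer_page(dir_id, doc_id, page=1, count=5):
    """Fetch one page of answers via the AJAX endpoint (unverified sketch)."""
    params = {
        "dirId": dir_id,
        "docId": doc_id,
        "answerSortType": "DEFAULT",
        "answerViewType": "DETAIL",
        "page": page,
        "count": count,
    }
    resp = r.get("https://kin.naver.com/ajax/detail/answerList.naver", params=params)
    # Parse with BeautifulSoup or json once the payload format is confirmed.
    return resp.text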