from bs4 import BeautifulSoup
import re
import requests as r
from html2text import html2text
import tqdm
def process_url(url):
    """Process a single question URL: extract its title, question body, and answers."""
    try:
        response = r.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        # answers = []
        # for idx in range(1, 100):
        #     answer = soup.find('div', {'id': f'answer_{idx}'})
        #     if answer:
        #         answers.append(answer)
        #     else:
        #         break
        answers = soup.find_all('div', {'id': re.compile(r'answer_\d+')})
        answers = [html2text(str(answer.find('div', {'class': "answerDetail"}).prettify()))
                   for answer in answers if answer.find('div', {'class': "answerDetail"})]
        title = soup.find('div', {'class': 'endTitleSection'}).text.strip()
        questionDetails = soup.find('div', {'class': 'questionDetail'}).text.strip()
        # print("Question: ", questionDetails, '\n')
        title = title.replace("질문", '').strip()  # drop the "질문" ("question") label from the title
        print("Answers extracted from:\n", url)
        print(len(answers))
        print('-' * 60)
        return {
            "title": title,
            "questionDetails": questionDetails,
            "url": url,
            "answers": answers
        }
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        with open('error_urls.txt', 'a') as f:  # append, so earlier failed URLs are not overwritten
            f.write(url + '\n')
        return {"title": '', "questionDetails": '', "url": url, "answers": []}  # empty list keeps the type consistent
def get_answers(results_a_elements, query):
    """Fetch answers for all of the extracted result links."""
    if not results_a_elements:
        print("No results found.")
        return []
    print("Result links extracted: ", len(results_a_elements))
    # Limit the number of parallel processes for better resource management
    # max_processes = 4
    # with multiprocessing.Pool(processes=max_processes) as pool:
    #     results = pool.map(process_url, results_a_elements)
    results = []
    # answer_count = 0
    for url in tqdm.tqdm(results_a_elements):
        res = process_url(url)
        results.append(res)
        # answer_count += len(res['answers'])
    return results
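# A minimal sketch of the pooled alternative hinted at in the commented-out code above.
# The name get_answers_parallel and the default of 4 workers are assumptions, not part
# of the original script; process_url stays a module-level function so it can be
# pickled by multiprocessing.
def get_answers_parallel(results_a_elements, max_processes=4):
    """Hypothetical parallel counterpart to get_answers using multiprocessing.Pool."""
    from multiprocessing import Pool
    if not results_a_elements:
        print("No results found.")
        return []
    with Pool(processes=max_processes) as pool:
        return pool.map(process_url, results_a_elements)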
def get_search_results(query, num_pages):
    """Fetch search result links for the given query from Naver 지식iN."""
    results = []
    for page in range(1, num_pages + 1):
        url = f"https://kin.naver.com/search/list.naver?query={query}&page={page}"
        print("Starting the scraping process for:\n", url)
        try:
            response = r.get(url)
            soup = BeautifulSoup(response.text, "html.parser")
            results_a_elements = soup.find("ul", {"class": "basic1"}).find_all("a", {"class": "_searchListTitleAnchor"})
            results_a_elements = [a.get('href') for a in results_a_elements if a.get("href")]
            results += results_a_elements
        except Exception as e:
            print(f"Error while fetching search results: {e}")
    return results
def extract_data(query, num_pages=150) -> list[dict[str, object]]:
    """Run the full pipeline: search for the query, then scrape every result page."""
    results_a_elements = get_search_results(query, num_pages)
    answers = get_answers(results_a_elements, query)
    print("Total answers collected:", len(answers))
    return answers
# if __name__ == "__main__":
#     start = time.time()  # would need `import time` at the top if uncommented
#     query = "장래희망, 인공지능 개발자/연구원, 파이썬, 오픈 소스, 파이썬 설치, 도서 추천"  # career aspirations, AI developer/researcher, Python, open source, installing Python, book recommendations
#     answers = extract_data(query)
# print("Total answers collected:", len(answers))
# print("Time taken: ", time.time() - start)
# # print(answers)
# AJAX endpoint for fetching additional answer pages:
# https://kin.naver.com/ajax/detail/answerList.naver?
#     dirId=401030201&docId=292159869
#     &answerSortType=DEFAULT&answerViewType=DETAIL
#     &answerNo=&page=2&count=5&_=1736131792605
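# A minimal, untested sketch of how the AJAX endpoint noted above might be used to page
# through answers beyond the first batch. The parameter names come straight from the
# sample URL; fetch_answer_page itself and the JSON handling are assumptions, since the
# response format has not been inspected here.
def fetch_answer_page(dir_id, doc_id, page, count=5):
    """Hypothetical helper: request one page of answers from the answerList endpoint."""
    ajax_url = "https://kin.naver.com/ajax/detail/answerList.naver"
    params = {
        "dirId": dir_id,
        "docId": doc_id,
        "answerSortType": "DEFAULT",
        "answerViewType": "DETAIL",
        "answerNo": "",
        "page": page,
        "count": count,
    }
    response = r.get(ajax_url, params=params)
    try:
        # Assume a JSON body; fall back to the raw text (e.g., an HTML fragment) otherwise.
        return response.json()
    except ValueError:
        return response.text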