"""Scrape question/answer pairs from Stack Overflow and append them to a CSV file."""

import csv
import re
import time
from typing import List

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def scrape_question_with_answers(question_url: str) -> List[str]:
    """Scrape the title, question body, and top answer from a single question page."""
    url = 'https://stackoverflow.com' + question_url  # question_url already starts with '/'
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find('title').text.replace(' - Stack Overflow', '')
    question_div = soup.find('div', {'class': 'postcell post-layout--right'})
    question = question_div.find('p').text  # only the first paragraph of the question body
    # The first answer cell on the page is the top-ranked answer.
    answers_div = soup.find('div', {'class': 'answercell post-layout--right'})
    answer = answers_div.find('div', {'class': 's-prose js-post-body'}).text
    return [title, question, answer, url]
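
# Usage sketch, kept commented out so the module stays import-safe. The href
# below is a hypothetical placeholder, not a real question; in practice the
# hrefs come from scrape_questions_page() and look like '/questions/<id>/<slug>'.
# row = scrape_question_with_answers('/questions/12345/example-question-slug')
# row -> [title, first question paragraph, top answer text, full URL]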

def scrape_questions_page(url: str, min_votes: int, min_answers: int) -> List[List[str]]:
    """Scrape one page of a question listing, keeping questions that clear the vote and answer thresholds."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    posts_summaries = soup.find_all('div', {'class': 's-post-summary js-post-summary'})
    qa_data = []
    for summary in posts_summaries:
        stats_div = summary.find('div', {'class': 's-post-summary--stats'})
        # Vote count comes from the stats item titled e.g. "Score of 42".
        vote_div = stats_div.find('div', {
            'class': 's-post-summary--stats-item s-post-summary--stats-item__emphasized',
            'title': re.compile(r'^Score of \d+$')})
        if vote_div:
            vote_number = int(vote_div.find('span', {'class': 's-post summary--stats-item-number'.replace(' ', '-')}).text)
        else:
            vote_number = 0
        # Answer count comes from the stats item titled e.g. "5 answers";
        # the optional 's' also matches the singular "1 answer".
        answer_div = stats_div.find('div', {
            'class': 's-post-summary--stats-item',
            'title': re.compile(r'^\d+ answers?$')})
        if answer_div:
            answer_number = int(answer_div.find('span', {'class': 's-post-summary--stats-item-number'}).text)
        else:
            answer_number = 0
        question_href = summary.find('a', {'class': 's-link'})['href']
        if vote_number >= min_votes and answer_number >= min_answers:
            try:
                qa_data.append(scrape_question_with_answers(question_href))
            except Exception as error:
                # A page with an unexpected layout should not abort the whole crawl.
                print(error)
            time.sleep(1.5)  # Be polite: pause between question-page requests.
    return qa_data
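
# Usage sketch (assumption: the tag-page URL shape used in __main__ below):
# rows = scrape_questions_page(
#     'https://stackoverflow.com/questions/tagged/linux?tab=votes&page=1&pagesize=15',
#     min_votes=1,
#     min_answers=1,
# )
# Each element of rows is a [title, question, answer, url] list.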

def crawl_and_save_qa(
        filename: str,
        base_url: str,
        start_page: int,
        n_pages: int = 10,
        min_votes: int = 1,
        min_answers: int = 1,
):
    """Crawl n_pages of question listings starting at start_page and append rows to a CSV file."""
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if start_page == 1:
            # Write the header only when starting a fresh crawl from page 1.
            writer.writerow(['title', 'question', 'answer', 'url'])
        for page_num in tqdm(range(start_page, start_page + n_pages)):
            page_data = scrape_questions_page(
                base_url.format(page_num),
                min_votes,
                min_answers,
            )
            if page_data:
                for qa_data in page_data:
                    writer.writerow(qa_data)

if __name__ == '__main__':
    filename = '../datasets/stackoverflow_linux.csv'
    url = 'https://stackoverflow.com/questions/tagged/linux?tab=votes&page={}&pagesize=15'
    crawl_and_save_qa(
        filename=filename,
        base_url=url,
        start_page=21,
        n_pages=10,
        min_votes=1,
        min_answers=1,
    )
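    # Optional sanity check, an addition not in the original pipeline: load the
    # CSV with pandas (imported above) and report how many rows it holds.
    # Assumes the file was first created with start_page=1, so it has a header row.
    df = pd.read_csv(filename)
    print(f'{filename} now holds {len(df)} question/answer rows')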