Spaces:
Sleeping
Sleeping
| import pdb | |
| import numpy as np | |
| import pandas as pd | |
| from scrapy.selector import Selector | |
| from selenium import webdriver | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.common.keys import Keys | |
| import time | |
| from tqdm import tqdm | |
| import warnings | |
| import pdb | |
# Silence every warning category for the whole process.
warnings.filterwarnings("ignore")

# Module-level browser setup: a Chrome session is started as soon as this
# file is executed or imported, using the command-line switches below.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)
# The original script divides the review count by 25 to decide how many times
# to press "load more"; presumably each press adds 25 reviews -- TODO confirm.
REVIEWS_PER_PAGE = 25

# Output column name -> CSS query evaluated inside a single review container.
# The key order is the column order of the resulting DataFrame.
REVIEW_FIELD_SELECTORS = {
    'Review_Date': '.review-date::text',
    'Author': '.display-name-link a::text',
    'Rating': '.rating-other-user-rating span::text',
    'Review_Title': 'a.title::text',
    'Review': '.text.show-more__control::text',
    'Review_Url': 'a.title::attr(href)',
}


def parse_review_count(header_text):
    """Return the integer review count found at the start of *header_text*.

    Thousands separators are removed and the first whitespace-separated
    token is converted to ``int`` (e.g. ``"1,234 Reviews"`` -> ``1234``).

    Raises:
        ValueError: if *header_text* is ``None``, empty, or does not start
            with an integer. (Previously a missing header surfaced as an
            opaque ``AttributeError`` on ``None``.)
    """
    if header_text is None:
        raise ValueError("review count header not found on the page")
    tokens = header_text.replace(',', '').split()
    if not tokens:
        raise ValueError("review count header is empty")
    return int(tokens[0])


def load_more_reviews(browser, review_count):
    """Click the ``load-more-trigger`` element once per extra page of reviews.

    Clicking is best effort: a click that fails (element missing, not
    clickable, ...) is skipped and the loop moves on, as in the original
    script. Only Selenium errors are swallowed, so Ctrl-C still interrupts.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of the file.
    from selenium.common.exceptions import WebDriverException

    for _ in tqdm(range(review_count // REVIEWS_PER_PAGE)):
        try:
            browser.find_element(By.ID, 'load-more-trigger').click()
        except WebDriverException:
            # NOTE(review): there is no wait between clicks; if the page
            # loads reviews asynchronously some clicks may be lost here.
            continue


def extract_review(fragment):
    """Return a ``{column: value}`` dict for one review container.

    *fragment* is a selector over the container's HTML. A field whose CSS
    query matches nothing is stored as ``np.nan``; this is what the
    original per-field ``except`` branches intended but never did, because
    ``extract_first()`` returns ``None`` instead of raising.
    """
    return {
        column: fragment.css(query).extract_first(default=np.nan)
        for column, query in REVIEW_FIELD_SELECTORS.items()
    }


def build_review_frame(records):
    """Build the review DataFrame from a list of per-review dicts.

    Columns always follow ``REVIEW_FIELD_SELECTORS`` order, including when
    *records* is empty. Each row carries its own ``Review_Url``; the original
    passed a single scalar URL for the whole column.
    """
    return pd.DataFrame(records, columns=list(REVIEW_FIELD_SELECTORS))


def scrape_reviews(browser, url):
    """Open *url* in *browser*, expand the review list and scrape it.

    Returns:
        ``(review_df, error_url_list, error_msg_list)``. ``review_df`` has one
        row per successfully parsed ``div.review-container``. The two lists
        are parallel and hold *url* and the exception for every container
        that could not be read.

    Raises:
        ValueError: if the review count header cannot be parsed.
    """
    browser.get(url)
    header_text = (
        Selector(text=browser.page_source)
        .css('.lister .header span::text')
        .extract_first()
    )
    load_more_reviews(browser, parse_review_count(header_text))

    records = []
    error_url_list = []
    error_msg_list = []
    containers = browser.find_elements(By.CSS_SELECTOR, 'div.review-container')
    for container in tqdm(containers):
        try:
            fragment = Selector(text=container.get_attribute('innerHTML'))
            records.append(extract_review(fragment))
        except Exception as exc:
            # Keep going: one unreadable container must not lose the rest.
            error_url_list.append(url)
            error_msg_list.append(exc)
    return build_review_frame(records), error_url_list, error_msg_list


if __name__ == '__main__':
    url = "https://www.imdb.com/title/tt1517268/reviews/?ref_=tt_ov_rt"
    try:
        review_df, error_url_list, error_msg_list = scrape_reviews(driver, url)
        # NOTE(review): results are only inspected interactively here. If the
        # script is meant to run unattended, persist review_df instead of
        # dropping into the debugger.
        pdb.set_trace()
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()