import os
import time
import csv
import re

import pandas as pd
import plotly.express as px
import streamlit as st
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Only used by the commented-out comment-scraping section at the bottom of the file.
from youtube_comment_scraper_python import *
st.title('YouTube Channel Analysis')
st.write('YouTube web scraper')
# ------------------------------------------------------------------------------CHANNEL DATA------------------------------------------------------------------------
url = st.text_input('Paste the YouTube Channel Link', "")
if not url:
    st.warning('Please input a link.')
    st.stop()
st.success('Thank you for inputting a link.')
# Example: url = 'https://www.youtube.com/@YasoobKhalid/videos'

# Pull a readable channel name out of the URL for display
# (falls back to the raw URL if no capitalised word is found).
name = re.compile(r"[A-Z]\w+")
inp = name.findall(url)
out = inp[0] if inp else url
st.write('Getting data from the', out, 'channel')

# Start the browser only after a valid link has been provided.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url)
# url = input('Enter Youtube Video Url- ')
# driver.get(url)
# # "https://www.youtube.com/@YasoobKhalid/videos"
# channel_title = driver.find_element(By.XPATH, '//yt-formatted-string[contains(@class, "ytd-channel-name")]').text
handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text
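# The channel handle and subscriber count are scraped above but not otherwise used;
# showing them in the app is assumed to be the intended use of these values.
st.write('Channel handle:', handle)
st.write('Subscribers:', subscriber_count)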
WAIT_IN_SECONDS = 5
last_height = driver.execute_script("return document.documentElement.scrollHeight")

while True:
    # Scroll to the bottom of the page
    driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
    # Wait for new videos to show up
    time.sleep(WAIT_IN_SECONDS)
    # Calculate the new document height and compare it with the last height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
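# With the page fully scrolled, collect per-video metadata from the videos grid.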
thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
views = driver.find_elements(By.XPATH,'//div[@id="metadata-line"]/span[1]')
titles = driver.find_elements(By.ID, "video-title")
links = driver.find_elements(By.ID, "video-title-link")
# likes = driver.find_elements(By.ID, "video-title-link-likes")
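# Note: like counts are not exposed on the channel's Videos grid, which is
# presumably why the 'likes' lookup above is commented out.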
videos = []
for title, view, thumb, link in zip(titles, views, thumbnails, links):
    video_dict = {
        'title': title.text,
        'views': view.text,
        # 'likes': likes.text,
        'thumbnail': thumb.get_attribute('src'),
        'link': link.get_attribute('href')
    }
    videos.append(video_dict)
print(videos)
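# Safety check (added as an assumption): if the XPaths above no longer match
# YouTube's markup, videos will be empty and videos[0] below would raise an IndexError.
if not videos:
    st.error('No videos were found on the page; the channel layout may have changed.')
    st.stop()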
to_csv = videos
keys = to_csv[0].keys()
os.makedirs('output', exist_ok=True)  # make sure the output folder exists before writing
with open('output/people.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(to_csv)

df = pd.read_csv('output/people.csv')
st.dataframe(df)
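# The scraped 'views' values are strings such as '1.2K views', which Plotly would
# plot as categories rather than numbers. A rough parser (assumed view-count format)
# converts them to integers for the chart below.
def parse_views(text):
    # Convert strings like '1.2K views' or '3M views' to an integer count.
    match = re.search(r'([\d.]+)\s*([KM]?)', str(text))
    if not match:
        return 0
    value, suffix = float(match.group(1)), match.group(2)
    return int(value * {'': 1, 'K': 1_000, 'M': 1_000_000}[suffix])

df['views'] = df['views'].apply(parse_views)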
count = st.slider('Select how many videos to plot', 0, 607, 100)
st.write("You selected", count, 'videos')
fig = px.bar(df,
x="title",
y="views", height=600
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
# fig.update_yaxes(tickvals=['10k', '22k', '29k', '56k'])
tab1, tab2 = st.tabs(["Streamlit theme (default)", "Plotly native theme"])
with tab1:
    # Use the Streamlit theme.
    # This is the default, so you can also omit the theme argument.
    st.plotly_chart(fig, theme="streamlit", use_container_width=True)
with tab2:
    # Use the native Plotly theme.
    st.plotly_chart(fig, theme=None, use_container_width=True)
# ----------------------------------------------------------------------------COMMENTS------------------------------------------------------------------------------
# url = input('Enter Youtube Video Url- ')
# youtube.open(url)
# youtube.keypress("pagedown")
# data = []
# currentpagesource=youtube.get_page_source()
# lastpagesource=''
# while(True):
#     if(lastpagesource==currentpagesource):
#         break
#     lastpagesource=currentpagesource
#     response=youtube.video_comments()
#     for c in response['body']:
#         data.append(c)
#     youtube.scroll()
#     currentpagesource=youtube.get_page_source()
# df = pd.DataFrame(data)
# df = df.replace('\n',' ', regex=True)
# df = df[['Comment', 'Likes']].drop_duplicates(keep="first")
# # df = df[['Likes']].drop_duplicates(keep="first")
# df.to_csv('output/data.csv',index=False)
# df.head()