# Class to fetch news and stock data from the web for a specific ticker and combine them into a dataframe.
import pandas as pd
import yfinance as yf
from pygooglenews import GoogleNews
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

class DataLoader:
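    """Fetch stock prices and news headlines for a ticker, score the
    headlines with FinBERT sentiment, and merge everything into one
    daily DataFrame."""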
    def __init__(self, ticker, time_period_news, time_period_stock, news_decay_rate=0):
        self.ticker = ticker
        self.time_period_news = time_period_news
        self.time_period_stock = time_period_stock
        self.news_decay_rate = news_decay_rate
    def get_data(self):
        stock_data = self.get_stock_data()
        news_data = self.get_news_data()
        news_sentiment = self.get_sentiment(news_data)
        combined_data = self.combine_data(stock_data, news_sentiment)
        if self.news_decay_rate != 0:
            combined_data = self.news_decay(combined_data, self.news_decay_rate)
        return combined_data
    def get_stock_data(self):
        # Daily OHLC prices from Yahoo Finance for the requested period.
        data = yf.download(self.ticker, period=self.time_period_stock)
        df = pd.DataFrame()
        df['Open'] = data['Open']
        df['Close'] = data['Close']
        df['High'] = data['High']
        df['Low'] = data['Low']
        return df
    def get_news_data(self):
        # Search Google News for the ticker; 'when' limits recency (e.g. '7d').
        googlenews = GoogleNews()
        news_data = googlenews.search(self.ticker, when=self.time_period_news)
        news_data = pd.DataFrame(news_data['entries'])
        return news_data
    def get_sentiment(self, news_data):
        tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
        model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
        classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
        news_sentiment = []
        for i in range(len(news_data)):
            # top_k=None returns all labels sorted by score, so map scores by
            # label instead of relying on positional order.
            sentiment = classifier(news_data['title'][i], top_k=None)
            scores = {entry['label']: entry['score'] for entry in sentiment}
            reformatted_time_stamp = pd.to_datetime(news_data['published'][i]).date()
            news_sentiment.append({
                'Date': reformatted_time_stamp,
                'positive_score': scores['positive'],
                'negative_score': scores['negative'],
                'neutral_score': scores['neutral'],
            })
        return pd.DataFrame(news_sentiment)
    def combine_data(self, stock_data, news_sentiment):
        # Average the sentiment scores of all headlines published on the same day.
        news_sentiment = (
            news_sentiment
            .groupby('Date')
            .mean()
            .fillna(0)
            .sort_index()
        )
        # Ensure stock_data and news_sentiment have compatible indices: normalize
        # both to midnight timestamps so the date-typed news index aligns with
        # the stock DatetimeIndex when reindexing below.
        stock_data.index = pd.to_datetime(stock_data.index).normalize()
        news_sentiment.index = pd.to_datetime(news_sentiment.index).normalize()
        # Build a daily index spanning both datasets.
        common_index = pd.date_range(
            start=min(stock_data.index[0], news_sentiment.index[0]),
            end=max(stock_data.index[-1], news_sentiment.index[-1]),
            freq='D'
        )
        # Mark non-trading days with -1 so they can be dropped after the merge.
        stock_data = stock_data.reindex(common_index).fillna(-1)
        news_sentiment = news_sentiment.reindex(common_index).fillna(0)
        combined_data = pd.merge(
            stock_data,
            news_sentiment,
            how='left',
            left_index=True,
            right_index=True
        )
        # Drop all rows whose Close value is -1 (non-trading days), then fill
        # any remaining missing values with 0.
        combined_data = combined_data[combined_data['Close'] != -1]
        combined_data = combined_data.fillna(0)
        return combined_data
    def news_decay(self, combined_data, decay_rate):
        # Many days in the data have no news. Fill them with the previous day's
        # scores multiplied by decay_rate, giving a more continuous news signal.
        combined_data = combined_data.copy()
        news_columns = ['positive_score', 'negative_score', 'neutral_score']
        # Walk from the oldest row to the newest so decayed values propagate forward.
        for i in range(1, len(combined_data)):
            idx, prev_idx = combined_data.index[i], combined_data.index[i - 1]
            for column in news_columns:
                if combined_data.loc[idx, column] == 0:
                    # .loc assignment avoids writing through a chained-indexing copy.
                    combined_data.loc[idx, column] = combined_data.loc[prev_idx, column] * decay_rate
        return combined_data
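
# A minimal usage sketch. The ticker, time windows, and decay rate below are
# illustrative assumptions, not values prescribed by the class.
if __name__ == "__main__":
    loader = DataLoader(
        ticker="AAPL",            # any symbol yfinance recognizes
        time_period_news="7d",    # passed to pygooglenews' 'when' argument
        time_period_stock="1mo",  # passed to yfinance's 'period' argument
        news_decay_rate=0.5,      # carry half of the previous day's score forward
    )
    data = loader.get_data()
    print(data.head())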