# Class to fetch news and stock data from the web for a specific ticker
# and combine them into a DataFrame.
import pandas as pd
import yfinance as yf
from pygooglenews import GoogleNews
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


class DataLoader:
    def __init__(self, ticker, time_period_news, time_period_stock, news_decay_rate=0):
        self.ticker = ticker
        self.time_period_news = time_period_news
        self.time_period_stock = time_period_stock
        self.news_decay_rate = news_decay_rate

    def get_data(self):
        stock_data = self.get_stock_data()
        news_data = self.get_news_data()
        news_sentiment = self.get_sentiment(news_data)
        combined_data = self.combine_data(stock_data, news_sentiment)
        if self.news_decay_rate != 0:
            combined_data = self.news_decay(combined_data, self.news_decay_rate)
        return combined_data

    def get_stock_data(self):
        data = yf.download(self.ticker, period=self.time_period_stock)
        # Newer yfinance versions return MultiIndex columns even for a single
        # ticker; flatten them so the price columns can be selected by name.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)
        df = pd.DataFrame()
        df['Open'] = data['Open']
        df['Close'] = data['Close']
        df['High'] = data['High']
        df['Low'] = data['Low']
        return df

    def get_news_data(self):
        googlenews = GoogleNews()
        news_data = googlenews.search(self.ticker, when=self.time_period_news)
        news_data = pd.DataFrame(news_data['entries'])
        return news_data

    def get_sentiment(self, news_data):
        tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
        model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
        classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

        news_sentiment = []
        for i in range(len(news_data)):
            # With top_k=None the pipeline returns one entry per label, sorted
            # by score rather than by label, so look the scores up by label
            # name instead of relying on positional order.
            sentiment = classifier(news_data['title'].iloc[i], top_k=None)
            scores = {entry['label']: entry['score'] for entry in sentiment}
            reformatted_time_stamp = pd.to_datetime(news_data['published'].iloc[i]).date()
            news_sentiment.append({
                'Date': reformatted_time_stamp,
                'positive_score': scores['positive'],
                'negative_score': scores['negative'],
                'neutral_score': scores['neutral'],
            })
        return pd.DataFrame(news_sentiment)

    def combine_data(self, stock_data, news_sentiment):
        # Average the sentiment scores of all articles published on the same day.
        news_sentiment = (
            news_sentiment
            .groupby('Date')
            .mean()
            .fillna(0)
            .sort_index()
        )
        # The Date index holds datetime.date objects; convert it to a
        # DatetimeIndex so it aligns with the daily index built below.
        news_sentiment.index = pd.to_datetime(news_sentiment.index)

        # Build a daily index spanning both datasets.
        common_index = pd.date_range(
            start=min(pd.Timestamp(stock_data.index[0]), pd.Timestamp(news_sentiment.index[0])),
            end=max(pd.Timestamp(stock_data.index[-1]), pd.Timestamp(news_sentiment.index[-1])),
            freq='D'
        )
        # Mark non-trading days with -1 so they can be dropped after the merge.
        stock_data = stock_data.reindex(common_index).fillna(-1)
        news_sentiment = news_sentiment.reindex(common_index).fillna(0)

        # Ensure stock_data and news_sentiment have compatible indices.
        stock_data.index = pd.to_datetime(stock_data.index).normalize()
        news_sentiment.index = pd.to_datetime(news_sentiment.index).normalize()

        combined_data = pd.merge(
            stock_data,
            news_sentiment,
            how='left',
            left_index=True,
            right_index=True
        )

        # Drop all rows whose Close value is -1 (non-trading days).
        combined_data = combined_data[combined_data['Close'] != -1]
        # Fill all remaining missing values with 0.
        combined_data = combined_data.fillna(0)
        return combined_data

    def news_decay(self, combined_data, decay_rate):
        # Many days in the data have no news. Fill these days with the
        # previous day's news * decay_rate, which gives a more continuous
        # news signal.
        combined_data = combined_data.copy()
        news_columns = ['positive_score', 'negative_score', 'neutral_score']
        # Start from the oldest date and work towards the newest date,
        # writing through .loc to avoid chained-assignment pitfalls.
        for i in range(1, len(combined_data)):
            for column in news_columns:
                if combined_data[column].iloc[i] == 0:
                    combined_data.loc[combined_data.index[i], column] = (
                        combined_data[column].iloc[i - 1] * decay_rate
                    )
        return combined_data
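

# Example usage: a minimal sketch of how the class above might be driven.
# The ticker, news window, stock period, and decay rate are illustrative
# assumptions, not values mandated by the class; GoogleNews.search() accepts
# windows like '7d' and yf.download() accepts periods like '1mo'.
if __name__ == "__main__":
    loader = DataLoader(
        ticker="AAPL",            # hypothetical example ticker
        time_period_news="7d",    # passed to GoogleNews.search(when=...)
        time_period_stock="1mo",  # passed to yf.download(period=...)
        news_decay_rate=0.5,      # carry no-news days at half the previous day's score
    )
    data = loader.get_data()
    print(data.head())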