# Class to fetch news and stock data from the web for a specific ticker and combine them into a single DataFrame.

import pandas as pd
import yfinance as yf
from pygooglenews import GoogleNews
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


class DataLoader:
    def __init__(self, ticker, time_period_news, time_period_stock, news_decay_rate=0):
        self.ticker = ticker                        # e.g. "AAPL"
        self.time_period_news = time_period_news    # pygooglenews "when" window, e.g. "7d"
        self.time_period_stock = time_period_stock  # yfinance period string, e.g. "1mo"
        self.news_decay_rate = news_decay_rate      # 0 disables decay of sentiment scores

    def get_data(self):
        # Fetch prices and headlines, score the headlines, and merge everything
        # into one daily DataFrame.
        stock_data = self.get_stock_data()
        news_data = self.get_news_data()
        news_sentiment = self.get_sentiment(news_data)
        combined_data = self.combine_data(stock_data, news_sentiment)

        if self.news_decay_rate != 0:
            combined_data = self.news_decay(combined_data, self.news_decay_rate)

        return combined_data

    def get_stock_data(self):
        data = yf.download(self.ticker, period=self.time_period_stock)
        # Newer yfinance versions return MultiIndex columns even for a single
        # ticker; flatten them so the column selection below works.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)
        return data[['Open', 'Close', 'High', 'Low']].copy()

    def get_news_data(self):
        googlenews = GoogleNews()
        news_data = googlenews.search(self.ticker, when=self.time_period_news)
        news_data = pd.DataFrame(news_data['entries'])
        return news_data

    def get_sentiment(self, news_data):
        tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
        model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
        classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

        news_sentiment = []
        for i in range(len(news_data)):
            # With top_k=None the pipeline returns every label sorted by score,
            # not in a fixed positive/negative/neutral order, so look the
            # scores up by label instead of by position.
            sentiment = classifier(news_data['title'].iloc[i], top_k=None)
            scores = {entry['label']: entry['score'] for entry in sentiment}
            reformatted_time_stamp = pd.to_datetime(news_data['published'].iloc[i]).date()
            news_sentiment.append({
                'Date': reformatted_time_stamp,
                'positive_score': scores['positive'],
                'negative_score': scores['negative'],
                'neutral_score': scores['neutral'],
            })
        return pd.DataFrame(news_sentiment)

    def combine_data(self, stock_data, news_sentiment):
        # Average the sentiment scores of all headlines published on the same day.
        news_sentiment = (
            news_sentiment
            .groupby('Date')
            .mean()
            .fillna(0)
            .sort_index()
        )

        # Normalize both indices to midnight Timestamps *before* reindexing;
        # the sentiment index holds datetime.date objects, which would not
        # align with a DatetimeIndex and would silently zero out every score.
        stock_data.index = pd.to_datetime(stock_data.index).normalize()
        news_sentiment.index = pd.to_datetime(news_sentiment.index).normalize()

        # Build a daily index spanning both datasets.
        common_index = pd.date_range(
            start=min(stock_data.index[0], news_sentiment.index[0]),
            end=max(stock_data.index[-1], news_sentiment.index[-1]),
            freq='D'
        )
        # Mark non-trading days with -1 so they can be dropped after the merge.
        stock_data = stock_data.reindex(common_index).fillna(-1)
        news_sentiment = news_sentiment.reindex(common_index).fillna(0)

        combined_data = pd.merge(
            stock_data,
            news_sentiment,
            how='left',
            left_index=True,
            right_index=True
        )

        # Drop non-trading days (Close == -1) and zero-fill any remaining gaps.
        combined_data = combined_data[combined_data['Close'] != -1]
        combined_data = combined_data.fillna(0)

        return combined_data

    def news_decay(self, combined_data, decay_rate):
        # Many days in the data have no news. Fill those days with the previous
        # day's scores multiplied by decay_rate, giving a more continuous
        # sentiment signal that fades instead of dropping straight to zero.
        combined_data = combined_data.copy()
        news_columns = ['positive_score', 'negative_score', 'neutral_score']
        # Walk from the oldest date to the newest so the decay compounds forward.
        # Use .iat rather than chained indexing, which does not reliably write
        # back in modern pandas.
        for column in news_columns:
            col = combined_data.columns.get_loc(column)
            for i in range(1, len(combined_data)):
                if combined_data.iat[i, col] == 0:
                    combined_data.iat[i, col] = combined_data.iat[i - 1, col] * decay_rate
        return combined_data
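

# A minimal usage sketch (not part of the original file): the ticker "AAPL",
# the "7d" news window, the "1mo" price period, and the 0.5 decay rate are
# illustrative values, assuming network access and the dependencies above.
if __name__ == "__main__":
    loader = DataLoader(
        ticker="AAPL",            # hypothetical example ticker
        time_period_news="7d",    # pygooglenews relative window
        time_period_stock="1mo",  # yfinance period string
        news_decay_rate=0.5,      # carry half of yesterday's sentiment forward
    )
    df = loader.get_data()
    print(df.head())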