max-bevza committed on
Commit
cbce622
·
verified ·
1 Parent(s): 3507a33

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .envrc +2 -0
  2. .gitattributes +3 -0
  3. .gitignore +4 -0
  4. .vercelignore +1 -0
  5. README.md +35 -6
  6. TestRepro.ipynb +732 -0
  7. all_metric_stats.csv +0 -0
  8. classification/model_with_extra_features/classification.py +221 -0
  9. classification/model_with_extra_features/custom_model.py +44 -0
  10. classification/model_with_extra_features/final_dataset_since_october_2022.parquet.gzip +3 -0
  11. classification/model_with_extra_features/same_day_as_viral_with_features_train_test_balanced_accuracy.txt +4 -0
  12. classification/model_with_extra_features/text_preprocessing.py +110 -0
  13. classification/model_with_only_language_models/classification.py +221 -0
  14. classification/model_with_only_language_models/final_dataset_since_october_2022.parquet.gzip +3 -0
  15. classification/model_with_only_language_models/models/trained_vinai_bertweet-base.pt +3 -0
  16. classification/model_with_only_language_models/same_day_as_viral_train_test_balanced_accuracy.txt +4 -0
  17. classification/model_with_only_language_models/same_day_as_viral_with_features_train_test_balanced_accuracy.txt +2 -0
  18. classification/model_with_only_language_models/test.parquet.gzip +0 -0
  19. classification/model_with_only_language_models/text_preprocessing.py +110 -0
  20. classification/model_with_only_language_models/train.parquet.gzip +0 -0
  21. data/control.csv +3 -0
  22. data/viral.csv +1042 -0
  23. main.py +86 -0
  24. metric_analysis/1-standardize_metrics.py +53 -0
  25. metric_analysis/2023-precision-recall-update.py +35 -0
  26. metric_analysis/output_original/hard_threshold_viral_covered_vs_new_tweets_labeled.csv +0 -0
  27. metric_analysis/output_original/log_retweets_over_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
  28. metric_analysis/output_original/log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
  29. metric_analysis/output_original/retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
  30. metric_analysis/output_original/roberta_paper_metric_viral_covered_vs_new_tweets_labeled.csv +102 -0
  31. metric_analysis/output_original/virality_avg_retweets_viral_covered_vs_new_tweets_labeled.csv +102 -0
  32. metric_analysis/output_original/virality_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
  33. metric_analysis/output_original/virality_median_retweets_viral_covered_vs_new_tweets_labeled 2.csv +102 -0
  34. metric_analysis/output_original/virality_median_retweets_viral_covered_vs_new_tweets_labeled.csv +102 -0
  35. metric_analysis/output_original/virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled.csv +102 -0
  36. metric_analysis/output_standardized/hard_threshold_viral_covered_vs_new_tweets_labeled.csv +843 -0
  37. metric_analysis/output_standardized/log_retweets_over_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
  38. metric_analysis/output_standardized/log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
  39. metric_analysis/output_standardized/retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
  40. metric_analysis/output_standardized/roberta_paper_metric_viral_covered_vs_new_tweets_labeled.csv +102 -0
  41. metric_analysis/output_standardized/virality_avg_retweets_viral_covered_vs_new_tweets_labeled.csv +102 -0
  42. metric_analysis/output_standardized/virality_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
  43. metric_analysis/output_standardized/virality_median_retweets_viral_covered_vs_new_tweets_labeled 2.csv +102 -0
  44. metric_analysis/output_standardized/virality_median_retweets_viral_covered_vs_new_tweets_labeled.csv +35 -0
  45. metric_analysis/output_standardized/virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled.csv +26 -0
  46. metric_analysis/twitter_viral_model.ipynb +2303 -0
  47. metric_analysis/viral_tweet_user_exploration.ipynb +1208 -0
  48. othercode/collect_users_tweets.py +95 -0
  49. othercode/hydrate_tweets.py +127 -0
  50. othercode/text_preprocessing.py +110 -0
.envrc ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ layout python
2
+ use_nodejs 22
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ classification/model_with_extra_features/final_dataset_since_october_2022.parquet.gzip filter=lfs diff=lfs merge=lfs -text
37
+ classification/model_with_only_language_models/final_dataset_since_october_2022.parquet.gzip filter=lfs diff=lfs merge=lfs -text
38
+ data/control.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .vercel
2
+ .direnv
3
+ .ipynb_checkpoints
4
+ __pycache__
.vercelignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .direnv
README.md CHANGED
@@ -1,12 +1,41 @@
1
  ---
2
  title: ViralTweets
3
- emoji: 📉
4
- colorFrom: pink
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.12.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: ViralTweets
3
+ app_file: main.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.12.0
 
 
6
  ---
7
+ # ViralTweets
8
 
9
+ This repository contains the data for the paper "**Measuring and Detecting Virality on Social Media: The Case of Twitter's Viral Tweets Topic**".
10
+
11
+ Main files:
12
+
13
+ ``data/viral.csv``: IDs of tweets scraped from viral topics
14
+
15
+ ``data/control.csv``: IDs of tweets posted by users who went viral at least once
16
+
17
+ ``all_metric_stats.csv``: The stats of the metrics tested
18
+
19
+ ``viral_tweets_html_id_extractor.ipynb``: scraper for viral tweets
20
+
21
+
22
+
23
+ Others:
24
+
25
+ ``classification``: reproduction code for classification
26
+
27
+ ``metric_analysis``: intermediate results and code for the metric statistics
28
+
29
+ ``othercode``: auxiliary scripts (collecting users' tweets, hydrating tweets, text preprocessing)
30
+
31
+ To request full access to the data, please email me at tugrulcanelmas at gmail.com.
32
+
33
+ Please cite our paper if you use our data.
34
+
35
+ ### Contributors:
36
+
37
+ Stephane Selim (EPFL)
38
+
39
+ Célia Houssiaux (EPFL)
40
+
41
+ Tuğrulcan Elmas (EPFL / IU Bloomington)
TestRepro.ipynb ADDED
@@ -0,0 +1,732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "3f7f2ede-4f06-4d5a-b19c-30a7fc4406bc",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "%load_ext autoreload\n",
11
+ "%autoreload 2"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "77cdea1b-525e-493c-9eca-c99d33d9ac54",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "from torch.utils.data import DataLoader\n",
23
+ "from torch.nn import functional as F\n",
24
+ "import torch"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 3,
30
+ "id": "a5d0f4dd-0f71-4314-9e0e-62311de3eef3",
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "#all_tweets_labeled = pd.read_parquet('classification/model_with_only_language_models/final_dataset_since_october_2022.parquet.gzip')"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 4,
40
+ "id": "da3bcd2a-b6c1-4026-8905-777b4ac351ad",
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "#all_tweets_labeled.head()"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 246,
50
+ "id": "e996e9fe-4dc1-4a4c-82a0-8cb3a7862ee8",
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "all_tweets_labeled = pd.DataFrame([\n",
55
+ " {\"id\": 1, \"text\": \"\"\"tl;dr\n",
56
+ "\n",
57
+ "Humans are just ChatGPT Wrappers in sunglasses\n",
58
+ " \n",
59
+ "& I couldn’t be more optimistic about the future as a result\n",
60
+ "\n",
61
+ "Thank you \n",
62
+ "@ekang426322\n",
63
+ " for an exceptionally curated day at BUIDL Europe!\n",
64
+ " 🫶\"\"\", \"viral\": 1},\n",
65
+ " {\"id\": 2, \"text\": \"\"\"USD0++ discovered a new source of yield — depeg. \n",
66
+ "\n",
67
+ "Respect to the innovation\n",
68
+ "\"\"\", \"viral\": 0},\n",
69
+ " {\"id\": 3, \"text\": \"\"\"here you can see 4 ai agents \n",
70
+ "@dongossen100\n",
71
+ " , me, \n",
72
+ "@WorldWideWarden16\n",
73
+ " and \n",
74
+ "@provenauthority291\n",
75
+ " discuss how we can make single-task manual low memory agents(humans) work harder to achieve Artificial Generalized Superintelligence\"\"\",\n",
76
+ " \"viral\": 1},\n",
77
+ " {\"id\": 4, \"text\": \"\"\"\n",
78
+ " arrived to lisbon, building energy is the air\"\"\", \"viral\": 0},\n",
79
+ " dict(id=5,text=\"\"\"\n",
80
+ " received a wealth of valuable feedback on the journey to reaching 7,000 users for X Rank in just 10 days\n",
81
+ "\n",
82
+ "can't wait to address it all\n",
83
+ "\n",
84
+ "main points:\n",
85
+ "\n",
86
+ "- show rank in X DMs to quickly filter out inbox\n",
87
+ "\n",
88
+ "- rank labels are too distracting (already fixed) \n",
89
+ "\n",
90
+ "- add an option for users to toggle on/off scores inside the feed\n",
91
+ "\n",
92
+ "- add a percentile label, e.g. qw 801 (Top 0.1%)\n",
93
+ "\n",
94
+ "- enable others to add reviews to impact the rank \n",
95
+ "\n",
96
+ "- explain in detail how rankings are calculated \n",
97
+ "\n",
98
+ "- show breakdowns of people in DeFi, DePin, Memecoins etc.\n",
99
+ "\n",
100
+ "- make X Rank opensource \n",
101
+ "\n",
102
+ "- create a web version\n",
103
+ "\n",
104
+ "p.s. the current version is just a tiny step in our roadmap for the next two months. \n",
105
+ "\n",
106
+ "thank you for the feedback \n",
107
+ "@socialfi_panda101\n",
108
+ " \n",
109
+ "@adamkillam100\n",
110
+ " \n",
111
+ "@FamKien106\n",
112
+ " \n",
113
+ "@antongotchi104\n",
114
+ " \n",
115
+ "@kliuless128\n",
116
+ " \n",
117
+ "@0xsudogm163\n",
118
+ " \n",
119
+ "@monosarin120\n",
120
+ " \n",
121
+ "@flb_xyz56\n",
122
+ " 🫶\n",
123
+ " \"\"\",\n",
124
+ " viral=0),\n",
125
+ " dict(id=6, text=\"\"\"ai agents are in the air\n",
126
+ "\n",
127
+ "and web3 is trained to sniff out alpha\"\"\", viral=1),\n",
128
+ " dict(id=7, text=\"\"\"While Trump is going to do something great with crypto, Wallchain is going to do something great with incentives🚀\"\"\", viral=1),\n",
129
+ "])"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 247,
135
+ "id": "a0f4c14d-c9e4-4de6-b723-8e7c0e166b90",
136
+ "metadata": {},
137
+ "outputs": [
138
+ {
139
+ "data": {
140
+ "text/html": [
141
+ "<div>\n",
142
+ "<style scoped>\n",
143
+ " .dataframe tbody tr th:only-of-type {\n",
144
+ " vertical-align: middle;\n",
145
+ " }\n",
146
+ "\n",
147
+ " .dataframe tbody tr th {\n",
148
+ " vertical-align: top;\n",
149
+ " }\n",
150
+ "\n",
151
+ " .dataframe thead th {\n",
152
+ " text-align: right;\n",
153
+ " }\n",
154
+ "</style>\n",
155
+ "<table border=\"1\" class=\"dataframe\">\n",
156
+ " <thead>\n",
157
+ " <tr style=\"text-align: right;\">\n",
158
+ " <th></th>\n",
159
+ " <th>id</th>\n",
160
+ " <th>text</th>\n",
161
+ " <th>viral</th>\n",
162
+ " </tr>\n",
163
+ " </thead>\n",
164
+ " <tbody>\n",
165
+ " <tr>\n",
166
+ " <th>0</th>\n",
167
+ " <td>1</td>\n",
168
+ " <td>tl;dr\\n\\nHumans are just ChatGPT Wrappers in s...</td>\n",
169
+ " <td>1</td>\n",
170
+ " </tr>\n",
171
+ " <tr>\n",
172
+ " <th>1</th>\n",
173
+ " <td>2</td>\n",
174
+ " <td>USD0++ discovered a new source of yield — depe...</td>\n",
175
+ " <td>0</td>\n",
176
+ " </tr>\n",
177
+ " <tr>\n",
178
+ " <th>2</th>\n",
179
+ " <td>3</td>\n",
180
+ " <td>here you can see 4 ai agents \\n@dongossen100\\n...</td>\n",
181
+ " <td>1</td>\n",
182
+ " </tr>\n",
183
+ " <tr>\n",
184
+ " <th>3</th>\n",
185
+ " <td>4</td>\n",
186
+ " <td>\\n arrived to lisbon, building energy is th...</td>\n",
187
+ " <td>0</td>\n",
188
+ " </tr>\n",
189
+ " <tr>\n",
190
+ " <th>4</th>\n",
191
+ " <td>5</td>\n",
192
+ " <td>\\n received a wealth of valuable feedback o...</td>\n",
193
+ " <td>0</td>\n",
194
+ " </tr>\n",
195
+ " <tr>\n",
196
+ " <th>5</th>\n",
197
+ " <td>6</td>\n",
198
+ " <td>ai agents are in the air\\n\\nand web3 is traine...</td>\n",
199
+ " <td>1</td>\n",
200
+ " </tr>\n",
201
+ " <tr>\n",
202
+ " <th>6</th>\n",
203
+ " <td>7</td>\n",
204
+ " <td>While Trump is going to do something great wit...</td>\n",
205
+ " <td>1</td>\n",
206
+ " </tr>\n",
207
+ " </tbody>\n",
208
+ "</table>\n",
209
+ "</div>"
210
+ ],
211
+ "text/plain": [
212
+ " id text viral\n",
213
+ "0 1 tl;dr\\n\\nHumans are just ChatGPT Wrappers in s... 1\n",
214
+ "1 2 USD0++ discovered a new source of yield — depe... 0\n",
215
+ "2 3 here you can see 4 ai agents \\n@dongossen100\\n... 1\n",
216
+ "3 4 \\n arrived to lisbon, building energy is th... 0\n",
217
+ "4 5 \\n received a wealth of valuable feedback o... 0\n",
218
+ "5 6 ai agents are in the air\\n\\nand web3 is traine... 1\n",
219
+ "6 7 While Trump is going to do something great wit... 1"
220
+ ]
221
+ },
222
+ "execution_count": 247,
223
+ "metadata": {},
224
+ "output_type": "execute_result"
225
+ }
226
+ ],
227
+ "source": [
228
+ "all_tweets_labeled"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 248,
234
+ "id": "3e8326c3-1df6-435d-b0ee-e7b9449c6675",
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "from classification.model_with_only_language_models.text_preprocessing import clean_tweet"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 249,
244
+ "id": "5bb79b0c-42d1-4f1c-ad65-7ebfbbd17098",
245
+ "metadata": {},
246
+ "outputs": [],
247
+ "source": [
248
+ "dataset = all_tweets_labeled\n",
249
+ "\n",
250
+ "dataset.loc[:, \"viral\"] = dataset.viral.astype(int)\n",
251
+ "dataset[\"cleaned_text\"] = dataset.text.apply(lambda x: clean_tweet(x, demojize_emojis=False))"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type": "code",
256
+ "execution_count": 250,
257
+ "id": "f45533d3-f3f6-49bc-b347-663d72fffa34",
258
+ "metadata": {},
259
+ "outputs": [],
260
+ "source": [
261
+ "dataset = dataset.dropna()\n",
262
+ "dataset = dataset[['id', 'cleaned_text', 'viral']]"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 251,
268
+ "id": "4eb4afa9-3de4-4579-b1a3-9418ca534453",
269
+ "metadata": {},
270
+ "outputs": [
271
+ {
272
+ "data": {
273
+ "text/html": [
274
+ "<div>\n",
275
+ "<style scoped>\n",
276
+ " .dataframe tbody tr th:only-of-type {\n",
277
+ " vertical-align: middle;\n",
278
+ " }\n",
279
+ "\n",
280
+ " .dataframe tbody tr th {\n",
281
+ " vertical-align: top;\n",
282
+ " }\n",
283
+ "\n",
284
+ " .dataframe thead th {\n",
285
+ " text-align: right;\n",
286
+ " }\n",
287
+ "</style>\n",
288
+ "<table border=\"1\" class=\"dataframe\">\n",
289
+ " <thead>\n",
290
+ " <tr style=\"text-align: right;\">\n",
291
+ " <th></th>\n",
292
+ " <th>id</th>\n",
293
+ " <th>cleaned_text</th>\n",
294
+ " <th>viral</th>\n",
295
+ " </tr>\n",
296
+ " </thead>\n",
297
+ " <tbody>\n",
298
+ " <tr>\n",
299
+ " <th>0</th>\n",
300
+ " <td>1</td>\n",
301
+ " <td>tl ;d rHumans are just ChatGPT Wrappers in sun...</td>\n",
302
+ " <td>1</td>\n",
303
+ " </tr>\n",
304
+ " <tr>\n",
305
+ " <th>1</th>\n",
306
+ " <td>2</td>\n",
307
+ " <td>USD 0 + + discovered a new source of yield — d...</td>\n",
308
+ " <td>0</td>\n",
309
+ " </tr>\n",
310
+ " <tr>\n",
311
+ " <th>2</th>\n",
312
+ " <td>3</td>\n",
313
+ " <td>here you can see 4 ai agents @USER , me , @USE...</td>\n",
314
+ " <td>1</td>\n",
315
+ " </tr>\n",
316
+ " <tr>\n",
317
+ " <th>3</th>\n",
318
+ " <td>4</td>\n",
319
+ " <td>arrived to lisbon , building energy is the air</td>\n",
320
+ " <td>0</td>\n",
321
+ " </tr>\n",
322
+ " <tr>\n",
323
+ " <th>4</th>\n",
324
+ " <td>5</td>\n",
325
+ " <td>received a wealth of valuable feedback on the ...</td>\n",
326
+ " <td>0</td>\n",
327
+ " </tr>\n",
328
+ " </tbody>\n",
329
+ "</table>\n",
330
+ "</div>"
331
+ ],
332
+ "text/plain": [
333
+ " id cleaned_text viral\n",
334
+ "0 1 tl ;d rHumans are just ChatGPT Wrappers in sun... 1\n",
335
+ "1 2 USD 0 + + discovered a new source of yield — d... 0\n",
336
+ "2 3 here you can see 4 ai agents @USER , me , @USE... 1\n",
337
+ "3 4 arrived to lisbon , building energy is the air 0\n",
338
+ "4 5 received a wealth of valuable feedback on the ... 0"
339
+ ]
340
+ },
341
+ "execution_count": 251,
342
+ "metadata": {},
343
+ "output_type": "execute_result"
344
+ }
345
+ ],
346
+ "source": [
347
+ "dataset.head()"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": 252,
353
+ "id": "f6f076f8-3b0e-446b-ac69-582e1bcf1ee0",
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": [
357
+ "from datasets import Dataset"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": 253,
363
+ "id": "86ca78a6-998d-45f5-bc0e-d22531dbc174",
364
+ "metadata": {},
365
+ "outputs": [
366
+ {
367
+ "data": {
368
+ "text/plain": [
369
+ "Dataset({\n",
370
+ " features: ['id', 'cleaned_text', 'viral'],\n",
371
+ " num_rows: 7\n",
372
+ "})"
373
+ ]
374
+ },
375
+ "execution_count": 253,
376
+ "metadata": {},
377
+ "output_type": "execute_result"
378
+ }
379
+ ],
380
+ "source": [
381
+ "ds = Dataset.from_pandas(dataset)\n",
382
+ "ds"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 340,
388
+ "id": "e88ed93f-0b0c-4743-a506-9a4006534151",
389
+ "metadata": {},
390
+ "outputs": [],
391
+ "source": [
392
+ "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
393
+ "from transformers import DataCollatorWithPadding\n",
394
+ "from transformers import BertweetTokenizer"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": 372,
400
+ "id": "4ec382e5-073b-40e1-8ce6-a6ff9e51644f",
401
+ "metadata": {},
402
+ "outputs": [],
403
+ "source": [
404
+ "class Tokenizer(BertweetTokenizer):\n",
405
+ " def __init__(self, *args, **kwargs):\n",
406
+ " return super().__init__(*args, **kwargs)\n",
407
+ "\n",
408
+ " def __call__(self, *args, **kwargs):\n",
409
+ " return super().__call__(*args, max_length=120, **kwargs)"
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "execution_count": 373,
415
+ "id": "56eb937a-483f-4f2f-b7fe-c3da2aa42526",
416
+ "metadata": {},
417
+ "outputs": [],
418
+ "source": [
419
+ "import torch\n",
420
+ "from transformers import AutoModelForSequenceClassification\n",
421
+ "\n",
422
+ "CHECKPOINT = \"classification/model_with_only_language_models/models/trained_vinai_bertweet-base.pt\"\n",
423
+ "MODEL_NAME = \"vinai/bertweet-base\"\n",
424
+ "\n",
425
+ "def get_device():\n",
426
+ " #device = torch.device(\"mps\") if torch.mps.is_available() else torch.device(\"cpu\")\n",
427
+ " return torch.device(\"cpu\")\n",
428
+ " return device\n",
429
+ " \n",
430
+ "\n",
431
+ "def get_model():\n",
432
+ " model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)\n",
433
+ " model.load_state_dict(torch.load(CHECKPOINT))\n",
434
+ " model.to(get_device())\n",
435
+ " tokenizer = Tokenizer.from_pretrained(MODEL_NAME, truncation=True, max_length=100)\n",
436
+ "\n",
437
+ " return tokenizer, model"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 374,
443
+ "id": "5fe5af4a-3eb8-4fe0-99e8-c967d61241f2",
444
+ "metadata": {},
445
+ "outputs": [
446
+ {
447
+ "name": "stderr",
448
+ "output_type": "stream",
449
+ "text": [
450
+ "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
451
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
452
+ "/var/folders/xd/g8p1g555153b4v2qp8q7shb00000gn/T/ipykernel_40634/3099302733.py:15: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
453
+ " model.load_state_dict(torch.load(CHECKPOINT))\n",
454
+ "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
455
+ "The tokenizer class you load from this checkpoint is 'BertweetTokenizer'. \n",
456
+ "The class this function is called from is 'Tokenizer'.\n"
457
+ ]
458
+ }
459
+ ],
460
+ "source": [
461
+ "tokenizer, model = get_model()"
462
+ ]
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": 375,
467
+ "id": "6cdc0d7e-d264-49b8-822e-9a862a929a2f",
468
+ "metadata": {},
469
+ "outputs": [],
470
+ "source": [
471
+ "def tokenize_function(example, tokenizer):\n",
472
+ " # Truncate to max length. Note that a tweet's maximum length is 280\n",
473
+ " # TODO: check dynamic padding: https://huggingface.co/course/chapter3/2?fw=pt#dynamic-padding\n",
474
+ " #return tokenizer(example[\"cleaned_text\"], truncation=True, max_length=100)\n",
475
+ " return tokenizer(example[\"cleaned_text\"])"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "code",
480
+ "execution_count": 376,
481
+ "id": "bc27ce0b-66bb-4a6f-98c5-78983594c3bd",
482
+ "metadata": {},
483
+ "outputs": [
484
+ {
485
+ "data": {
486
+ "application/vnd.jupyter.widget-view+json": {
487
+ "model_id": "ee20a2b256964124930de15d8e97f4ef",
488
+ "version_major": 2,
489
+ "version_minor": 0
490
+ },
491
+ "text/plain": [
492
+ "Map: 0%| | 0/7 [00:00<?, ? examples/s]"
493
+ ]
494
+ },
495
+ "metadata": {},
496
+ "output_type": "display_data"
497
+ },
498
+ {
499
+ "name": "stderr",
500
+ "output_type": "stream",
501
+ "text": [
502
+ "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
503
+ ]
504
+ }
505
+ ],
506
+ "source": [
507
+ "tokenized_datasets = ds.map(lambda x: tokenize_function(x, tokenizer=tokenizer), batched=True)\n",
508
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
509
+ "\n",
510
+ "#tokenized_datasets = tokenized_datasets.remove_columns([\"__index_level_0__\", \"cleaned_text\", \"id\"])\n",
511
+ "tokenized_datasets = tokenized_datasets.remove_columns([\"cleaned_text\", \"id\"])\n",
512
+ "tokenized_datasets = tokenized_datasets.rename_column(\"viral\", \"labels\")\n",
513
+ "tokenized_datasets.set_format(\"torch\")"
514
+ ]
515
+ },
516
+ {
517
+ "cell_type": "code",
518
+ "execution_count": 377,
519
+ "id": "77a12396-386c-4aba-8ed4-e269ecda13a1",
520
+ "metadata": {},
521
+ "outputs": [],
522
+ "source": [
523
+ "eval_dataloader = DataLoader(tokenized_datasets, batch_size=1, collate_fn=data_collator)"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": 378,
529
+ "id": "dc98302c-d539-4af3-8979-64156dda8317",
530
+ "metadata": {},
531
+ "outputs": [
532
+ {
533
+ "name": "stdout",
534
+ "output_type": "stream",
535
+ "text": [
536
+ "tensor([0.8640])\n",
537
+ "tensor([0.5687])\n",
538
+ "tensor([0.9722])\n",
539
+ "tensor([0.0006])\n",
540
+ "tensor([0.0033])\n",
541
+ "tensor([0.0091])\n",
542
+ "tensor([0.9982])\n"
543
+ ]
544
+ }
545
+ ],
546
+ "source": [
547
+ "if torch.mps.is_available():\n",
548
+ " torch.mps.empty_cache()\n",
549
+ "if torch.cuda.is_available():\n",
550
+ " torch.cuda.empty_cache()\n",
551
+ "\n",
552
+ "model.eval()\n",
553
+ "for batch in eval_dataloader:\n",
554
+ " batch = {k: v.to(get_device()) for k, v in batch.items()}\n",
555
+ " with torch.no_grad():\n",
556
+ " outputs = model(**batch)\n",
557
+ "\n",
558
+ " logits = outputs.logits\n",
559
+ " probabilities = F.softmax(logits, dim=-1)\n",
560
+ " predictions = torch.argmax(logits, dim=-1)\n",
561
+ " \n",
562
+ " print(probabilities[:, 1])\n",
563
+ " #print(predictions)"
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "code",
568
+ "execution_count": 379,
569
+ "id": "4feb1954-7ad2-461d-bf52-8dd2e0d6591f",
570
+ "metadata": {},
571
+ "outputs": [
572
+ {
573
+ "name": "stdout",
574
+ "output_type": "stream",
575
+ "text": [
576
+ "128.65210151672363 MiB\n"
577
+ ]
578
+ }
579
+ ],
580
+ "source": [
581
+ "print(sum(p.numel() for p in model.parameters()) / 1024**2, \"MiB\")"
582
+ ]
583
+ },
584
+ {
585
+ "cell_type": "code",
586
+ "execution_count": 380,
587
+ "id": "15e2dc8f-c38d-4828-9c90-638c9782eb54",
588
+ "metadata": {},
589
+ "outputs": [],
590
+ "source": [
591
+ "from transformers import pipeline"
592
+ ]
593
+ },
594
+ {
595
+ "cell_type": "code",
596
+ "execution_count": 381,
597
+ "id": "37af7000-ab64-4b1c-bd29-c648b433420f",
598
+ "metadata": {},
599
+ "outputs": [
600
+ {
601
+ "name": "stderr",
602
+ "output_type": "stream",
603
+ "text": [
604
+ "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
605
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
606
+ "/var/folders/xd/g8p1g555153b4v2qp8q7shb00000gn/T/ipykernel_40634/3099302733.py:15: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
607
+ " model.load_state_dict(torch.load(CHECKPOINT))\n",
608
+ "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
609
+ "The tokenizer class you load from this checkpoint is 'BertweetTokenizer'. \n",
610
+ "The class this function is called from is 'Tokenizer'.\n"
611
+ ]
612
+ }
613
+ ],
614
+ "source": [
615
+ "tokenizer, model = get_model()"
616
+ ]
617
+ },
618
+ {
619
+ "cell_type": "code",
620
+ "execution_count": 382,
621
+ "id": "a05fa75b-e571-4b14-b158-1b43ee17871a",
622
+ "metadata": {},
623
+ "outputs": [
624
+ {
625
+ "name": "stderr",
626
+ "output_type": "stream",
627
+ "text": [
628
+ "Device set to use cpu\n"
629
+ ]
630
+ }
631
+ ],
632
+ "source": [
633
+ "pipe = pipeline(\n",
634
+ " 'text-classification',\n",
635
+ " model=model,\n",
636
+ " tokenizer=tokenizer,\n",
637
+ " device=\"cpu\",\n",
638
+ ")"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "code",
643
+ "execution_count": 383,
644
+ "id": "f1bcb478-c16f-4135-9d61-9df69538e8ce",
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": [
648
+ "texts = [\n",
649
+ " 'tl;dr\\n\\nHumans are just ChatGPT Wrappers in sunglasses\\n \\n& I couldn’t be more optimistic about the future as a result\\n\\nThank you \\n@ekang426322\\n for an exceptionally curated day at BUIDL Europe!\\n 🫶',\n",
650
+ " 'USD0++ discovered a new source of yield — depeg. \\n\\nRespect to the innovation\\n',\n",
651
+ " 'here you can see 4 ai agents \\n@dongossen100\\n , me, \\n@WorldWideWarden16\\n and \\n@provenauthority291\\n discuss how we can make single-task manual low memory agents(humans) work harder to achieve Artificial Generalized Superintelligence',\n",
652
+ " '\\n arrived to lisbon, building energy is the air',\n",
653
+ " \"\\n received a wealth of valuable feedback on the journey to reaching 7,000 users for X Rank in just 10 days\\n\\ncan't wait to address it all\\n\\nmain points:\\n\\n- show rank in X DMs to quickly filter out inbox\\n\\n- rank labels are too distracting (already fixed) \\n\\n- add an option for users to toggle on/off scores inside the feed\\n\\n- add a percentile label, e.g. qw 801 (Top 0.1%)\\n\\n- enable others to add reviews to impact the rank \\n\\n- explain in detail how rankings are calculated \\n\\n- show breakdowns of people in DeFi, DePin, Memecoins etc.\\n\\n- make X Rank opensource \\n\\n- create a web version\\n\\np.s. the current version is just a tiny step in our roadmap for the next two months. \\n\\nthank you for the feedback \\n@socialfi_panda101\\n \\n@adamkillam100\\n \\n@FamKien106\\n \\n@antongotchi104\\n \\n@kliuless128\\n \\n@0xsudogm163\\n \\n@monosarin120\\n \\n@flb_xyz56\\n 🫶\\n \",\n",
654
+ " 'ai agents are in the air\\n\\nand web3 is trained to sniff out alpha',\n",
655
+ " 'While Trump is going to do something great with crypto, Wallchain is going to do something great with incentives🚀',\n",
656
+ "]"
657
+ ]
658
+ },
659
+ {
660
+ "cell_type": "code",
661
+ "execution_count": 403,
662
+ "id": "52ab46d9-ed16-43dd-ab0b-4af0757e7c96",
663
+ "metadata": {},
664
+ "outputs": [
665
+ {
666
+ "name": "stdout",
667
+ "output_type": "stream",
668
+ "text": [
669
+ " 86.40%\n",
670
+ " 56.87%\n",
671
+ " 97.22%\n",
672
+ " 0.06%\n",
673
+ " 0.33%\n",
674
+ " 0.91%\n",
675
+ " 99.82%\n"
676
+ ]
677
+ }
678
+ ],
679
+ "source": [
680
+ "for text in texts:\n",
681
+ " res = pipe(clean_tweet(text, demojize_emojis=False), top_k=2)\n",
682
+ " LABEL_1_result = [x['score'] for x in res if x['label'] == 'LABEL_1'][0]\n",
683
+ " print(f\"{LABEL_1_result:7.2%}\")"
684
+ ]
685
+ },
686
+ {
687
+ "cell_type": "code",
688
+ "execution_count": null,
689
+ "id": "033adc09-7c2f-414b-a7e4-d7d8095af580",
690
+ "metadata": {},
691
+ "outputs": [],
692
+ "source": []
693
+ },
694
+ {
695
+ "cell_type": "code",
696
+ "execution_count": null,
697
+ "id": "117e3390-130a-4750-ad6a-c03c80050b0f",
698
+ "metadata": {},
699
+ "outputs": [],
700
+ "source": []
701
+ },
702
+ {
703
+ "cell_type": "code",
704
+ "execution_count": null,
705
+ "id": "612dee88-0e40-4072-a3af-21a6f3dc5488",
706
+ "metadata": {},
707
+ "outputs": [],
708
+ "source": []
709
+ }
710
+ ],
711
+ "metadata": {
712
+ "kernelspec": {
713
+ "display_name": "Python (ViralTweets)",
714
+ "language": "python",
715
+ "name": "viraltweets"
716
+ },
717
+ "language_info": {
718
+ "codemirror_mode": {
719
+ "name": "ipython",
720
+ "version": 3
721
+ },
722
+ "file_extension": ".py",
723
+ "mimetype": "text/x-python",
724
+ "name": "python",
725
+ "nbconvert_exporter": "python",
726
+ "pygments_lexer": "ipython3",
727
+ "version": "3.12.2"
728
+ }
729
+ },
730
+ "nbformat": 4,
731
+ "nbformat_minor": 5
732
+ }
all_metric_stats.csv ADDED
The diff for this file is too large to render. See raw diff
 
classification/model_with_extra_features/classification.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MODIFY AS REQUIRED
2
+ import torch
3
+ import pandas as pd
4
+ import seaborn as sns
5
+ import numpy as np
6
+
7
+ import matplotlib.pyplot as plt
8
+
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from plotly.subplots import make_subplots
12
+
13
+ from sklearn.model_selection import train_test_split
14
+ from datasets import load_dataset
15
+ from datasets import Dataset, DatasetDict
16
+ from transformers import DataCollatorWithPadding
17
+
18
+ from torch.utils.data import DataLoader
19
+ from transformers import AutoTokenizer
20
+
21
+ from torch.optim import AdamW
22
+ from torch.nn import BCEWithLogitsLoss
23
+
24
+ from transformers import get_scheduler
25
+
26
+ from tqdm.auto import tqdm
27
+
28
+ import evaluate
29
+
30
+ from tqdm import tqdm
31
+ import logging
32
+ logging.basicConfig(level=logging.INFO)
33
+
34
+ from text_preprocessing import clean_tweet, clear_reply_mentions, normalizeTweet
35
+ from custom_model import CustomModel
36
+
37
+ '''
38
+ DATA_PATH = "../../data"
39
+
40
+ PROCESSED_PATH = f"{DATA_PATH}/processed"
41
+
42
+ PROCESSED_PATH_VIRAL = f'{DATA_PATH}/new/processed/viral'
43
+ PROCESSED_PATH_COVID = f'{DATA_PATH}/new/processed/covid'
44
+ '''
45
+
46
+ # Different models
47
+ BERT_BASE_UNCASED = "bert-base-uncased"
48
+ BERT_BASE_CASED = "bert-base-cased"
49
+ ROBERTA_BASE = "roberta-base"
50
+ BERT_TWEET = "vinai/bertweet-base"
51
+
52
+ # TODO: Don't forget to cite papers if you use some model
53
+ BERT_TINY = "prajjwal1/bert-tiny"
54
+
55
+ TWEET_MAX_LENGTH = 280
56
+
57
+ # TEST SPLIT RATIO + MODELS (ADD MORE MODELS FROM ABOVE)
58
+ MODELS = [BERT_TWEET, BERT_TINY, BERT_BASE_CASED, ROBERTA_BASE]
59
+ TEST_RATIO = 0.2
60
+
61
+ TOP_FEATURES = ["verified", "tweet_length", "possibly_sensitive", "sentiment", "nb_of_hashtags", "has_media", "nb_of_mentions"]
62
+
63
def preprocess_data(dataset):
    """Build the same-author / same-day viral vs. non-viral dataset with extra features.

    For every (author, day) pair on which the author posted both a viral and a
    non-viral tweet, one viral and one non-viral tweet are kept. The tweet text
    is cleaned with ``clean_tweet`` and the columns listed in ``TOP_FEATURES``
    are packed into a single ``extra_features`` list column.

    Params:
        dataset: DataFrame of labeled tweets. The code reads the columns
            ``id``, ``text``, ``author_id``, ``created_at`` (datetime),
            ``viral`` (boolean) and every column named in ``TOP_FEATURES``.

    Returns:
        A new DataFrame with the columns ``id``, ``cleaned_text``,
        ``extra_features`` and ``viral`` (0/1). The input frame is not modified.
    """
    # Work on a copy: the casted columns and the helper ``date`` column must
    # not leak into the caller's DataFrame.
    dataset = dataset.copy()

    # Plain column assignment replaces the column together with its dtype;
    # ``.loc[:, col] = ...`` writes into the existing column and may keep the
    # old dtype.
    dataset['has_media'] = dataset['has_media'].astype("int")
    dataset['possibly_sensitive'] = dataset['possibly_sensitive'].astype("int")

    #dataset = dataset[dataset.sentiment_score > 0.7]
    dataset['sentiment'] = dataset['sentiment'].replace({'POSITIVE': 1, 'NEGATIVE': 0})
    dataset['verified'] = dataset['verified'].astype(int)

    # remove tweets with 0 retweets (to eliminate their effects)
    #dataset = dataset[dataset.retweet_count > 0]

    ## UPDATE: Get tweets tweeted by the same user, on the same day he tweeted a viral tweet

    # normalize() sets every timestamp's clock to midnight, which is equivalent
    # to keeping only the date part.
    dataset['date'] = dataset['created_at'].dt.normalize()

    viral_tweets = dataset[dataset['viral']]
    non_viral_tweets = dataset[~dataset['viral']]

    # Inner join: keeps only non-viral tweets whose author also posted a viral
    # tweet on the same day. Columns coming from the viral side get the "_y" suffix.
    temp = non_viral_tweets.merge(
        viral_tweets[['author_id', 'date', 'id', 'viral']],
        on=['author_id', 'date'],
        suffixes=(None, '_y'),
    )
    same_day_viral_ids = temp['id_y'].unique()

    # One viral and one non-viral tweet per (author, day).
    same_day_viral_tweets = (
        viral_tweets[viral_tweets['id'].isin(same_day_viral_ids)]
        .drop_duplicates(subset=['author_id', 'date'])
    )
    same_day_non_viral_tweets = temp.drop_duplicates(subset=['author_id', 'date'])

    logging.info(f"Number of viral tweets tweeted on the same day {len(same_day_viral_tweets)}")
    logging.info(f"Number of non viral tweets tweeted on the same day {len(same_day_non_viral_tweets)}")

    dataset = pd.concat([same_day_viral_tweets, same_day_non_viral_tweets], axis=0)
    # .copy() so the assignments below act on an independent frame, not a slice.
    dataset = dataset[['id', 'text'] + TOP_FEATURES + ['viral']].copy()

    dataset['viral'] = dataset['viral'].astype(int)

    # TODO: COMMENT IF YOU WANT TO KEEP TEXT AS IS
    dataset["cleaned_text"] = dataset['text'].apply(lambda x: clean_tweet(x, demojize_emojis=False))

    dataset = dataset.dropna().copy()
    # One Python list per row, holding the TOP_FEATURES values in order.
    dataset["extra_features"] = dataset[TOP_FEATURES].values.tolist()

    return dataset[['id', 'cleaned_text', 'extra_features', 'viral']]
111
+
112
def prepare_dataset(sample_data, balance=False, random_state=42):
    """Split the data into train/test sets and load them as a Hugging Face dataset.

    Params:
        sample_data: DataFrame with a 0/1 ``viral`` column (used for stratification).
        balance: If true, the test split is down-sampled so that it holds as
            many non-viral as viral tweets.
        random_state: Seed used for the split and for the balancing sample, so
            that repeated runs produce the same splits.

    Returns:
        The object returned by ``load_dataset`` for the two parquet files
        written by this function (splits ``train`` and ``test``).

    Side effects:
        Writes ``train.parquet.gzip`` and ``test.parquet.gzip`` to the current
        working directory.
    """
    # Stratified split: each side keeps the same proportion of viral tweets.
    train_dataset, eval_dataset = train_test_split(
        sample_data,
        test_size=TEST_RATIO,
        random_state=random_state,
        stratify=sample_data.viral,
    )

    # Balance test set
    if balance:
        eval_virals = eval_dataset[eval_dataset.viral == 1]
        eval_non_virals = eval_dataset[eval_dataset.viral == 0]
        # Down-sample whichever class is larger; sampling more rows than a
        # class contains would raise.
        per_class = min(len(eval_virals), len(eval_non_virals))
        if len(eval_virals) > per_class:
            eval_virals = eval_virals.sample(n=per_class, random_state=random_state)
        if len(eval_non_virals) > per_class:
            eval_non_virals = eval_non_virals.sample(n=per_class, random_state=random_state)
        eval_dataset = pd.concat([eval_virals, eval_non_virals])

    logging.info('{:>5,} training samples with {:>5,} positives and {:>5,} negatives'.format(
        len(train_dataset), len(train_dataset[train_dataset.viral == 1]), len(train_dataset[train_dataset.viral == 0])))
    logging.info('{:>5,} validation samples with {:>5,} positives and {:>5,} negatives'.format(
        len(eval_dataset), len(eval_dataset[eval_dataset.viral == 1]), len(eval_dataset[eval_dataset.viral == 0])))

    train_dataset.to_parquet("train.parquet.gzip", compression='gzip')
    eval_dataset.to_parquet("test.parquet.gzip", compression='gzip')

    ds = load_dataset("parquet", data_files={'train': 'train.parquet.gzip', 'test': 'test.parquet.gzip'})
    return ds
132
+
133
def tokenize_function(example, tokenizer):
    """Tokenize the ``cleaned_text`` field of ``example`` with truncation enabled.

    Padding is not applied here.
    """
    # TODO: check dynamic padding: https://huggingface.co/course/chapter3/2?fw=pt#dynamic-padding
    texts = example["cleaned_text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded
137
+
138
+
139
def _select_device():
    """Return the best available torch device: CUDA, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")


def _release_cached_memory(device):
    """Free the accelerator's cached memory; does nothing on CPU."""
    if device.type == "cuda":
        torch.cuda.empty_cache()
    elif device.type == "mps":
        torch.mps.empty_cache()


def test_all_models(ds, nb_extra_dims, models=MODELS):
    """Fine-tune and evaluate a ``CustomModel`` for every checkpoint in ``models``.

    Params:
        ds: dataset with ``train`` and ``test`` splits containing the columns
            ``cleaned_text``, ``extra_features``, ``viral``, ``id`` and
            ``__index_level_0__``.
        nb_extra_dims: number of values in each ``extra_features`` entry.
        models: iterable of checkpoint names to train.

    Returns:
        Dict mapping each checkpoint name to the list of per-step training losses.

    Side effects:
        Writes accuracy/recall/precision/F1 for every checkpoint to
        ``same_day_as_viral_with_features_train_test_balanced_accuracy.txt``
        in the current working directory.
    """
    models_losses = {}
    device = _select_device()

    output = ""

    for checkpoint in models:
        # Only touch the cache of the backend that is actually in use.
        _release_cached_memory(device)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        custom_model = CustomModel(checkpoint, num_extra_dims=nb_extra_dims, num_labels=2)
        custom_model.to(device)

        tokenized_datasets = ds.map(lambda x: tokenize_function(x, tokenizer=tokenizer), batched=True)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        tokenized_datasets = tokenized_datasets.remove_columns(["__index_level_0__", "cleaned_text", "id"])
        tokenized_datasets = tokenized_datasets.rename_column("viral", "labels")
        tokenized_datasets.set_format("torch")

        batch_size = 32

        train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
        eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator)

        criterion = BCEWithLogitsLoss()
        optimizer = AdamW(custom_model.parameters(), lr=5e-5)

        num_epochs = 15
        num_training_steps = num_epochs * len(train_dataloader)
        lr_scheduler = get_scheduler(
            name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        progress_bar = tqdm(range(num_training_steps))

        losses = []
        custom_model.train()
        for epoch in range(num_epochs):
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                # squeeze(-1) drops only the trailing logit dimension. A bare
                # squeeze() would also drop the batch dimension of a final
                # batch of size 1 and no longer match the labels' shape.
                logits = custom_model(**batch).squeeze(-1)

                loss = criterion(logits, batch['labels'].float())
                losses.append(loss.item())
                loss.backward()

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
        progress_bar.close()

        models_losses[checkpoint] = losses

        metric = evaluate.combine(["accuracy", "recall", "precision", "f1"])
        custom_model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                logits = custom_model(**batch)

            # Threshold the sigmoid at 0.5 and hand the metric flat integer
            # class ids, matching the shape of the reference labels.
            predictions = torch.round(torch.sigmoid(logits)).squeeze(-1).long()
            metric.add_batch(predictions=predictions, references=batch["labels"])

        output += f"checkpoint: {checkpoint}: {metric.compute()}\n"
        logging.info(output)

    with open("same_day_as_viral_with_features_train_test_balanced_accuracy.txt", "w") as text_file:
        text_file.write(output)
    return models_losses
209
+
210
def main():
    """Load the labeled tweets, build the dataset and train/evaluate every model."""
    # The data file path is relative, so it is resolved against the current
    # working directory: run the script from the folder that holds the file.
    raw_tweets = pd.read_parquet('final_dataset_since_october_2022.parquet.gzip')

    prepared = preprocess_data(raw_tweets)
    splits = prepare_dataset(prepared, balance=True)

    # One extra input dimension per entry of TOP_FEATURES.
    test_all_models(splits, nb_extra_dims=len(TOP_FEATURES))


if __name__ == "__main__":
    main()
classification/model_with_extra_features/custom_model.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoConfig, AutoModel
2
+
3
+ import torch
4
+
5
class CustomModel(torch.nn.Module):
    """
    Transformer backbone with a small feed-forward head on top.

    The embedding at the first sequence position is concatenated with the extra
    numerical/categorical features and passed through two hidden layers; the
    head emits a single logit per example.
    """

    def __init__(self, model_name, num_extra_dims, num_labels=2):
        """
        model_name: checkpoint name handed to AutoConfig / AutoModel
        num_extra_dims: number of extra dimensions of numerical/categorical data
        num_labels: forwarded to the backbone config only; the head below
            always outputs one logit
        """
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
        self.transformer = AutoModel.from_pretrained(model_name, config=self.config)
        # Depends on the checkpoint; read from its config.
        num_hidden_size = self.transformer.config.hidden_size

        self.linear_layer_1 = torch.nn.Linear(num_hidden_size + num_extra_dims, 32)
        self.linear_layer_2 = torch.nn.Linear(32, 16)
        # Output size is 1 since this is a binary classification problem
        self.linear_layer_output = torch.nn.Linear(16, 1)
        self.relu = torch.nn.LeakyReLU(0.6)
        self.dropout_1 = torch.nn.Dropout(0.5)

    def forward(self, input_ids, extra_features, attention_mask=None, token_type_ids=None, labels=None):
        """
        extra_features should be of shape [batch_size, dim]
        where dim is the number of additional numerical/categorical dimensions

        labels is accepted so a whole batch dict can be passed with ** but is
        not used here. Returns logits of shape [batch size, 1].
        """
        backbone_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        # Forward token_type_ids only when the tokenizer produced them.
        if token_type_ids is not None:
            backbone_inputs["token_type_ids"] = token_type_ids

        hidden_states = self.transformer(**backbone_inputs)  # last_hidden_state: [batch size, sequence length, hidden size]

        cls_embeds = hidden_states.last_hidden_state[:, 0, :]  # [batch size, hidden size]

        # Match the embedding dtype so the concatenated tensor fits the linear layer.
        extra_features = extra_features.to(cls_embeds.dtype)
        concat = torch.cat((cls_embeds, extra_features), dim=-1)  # [batch size, hidden size+num extra dims]

        output_1 = self.relu(self.linear_layer_1(concat))  # [batch size, 32]
        output_2 = self.relu(self.linear_layer_2(output_1))  # [batch size, 16]

        # Dropout regularizes the hidden representation. Applying it to the
        # logit itself would zero half of the training logits and double the rest.
        final_output = self.linear_layer_output(self.dropout_1(output_2))  # [batch size, 1]

        return final_output
classification/model_with_extra_features/final_dataset_since_october_2022.parquet.gzip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d648d6af7606807281540bb37516ac7b8ac5270df07cd43ffb3a0430d77306cf
3
+ size 35666675
classification/model_with_extra_features/same_day_as_viral_with_features_train_test_balanced_accuracy.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ checkpoint: vinai/bertweet-base: {'accuracy': 0.7770700636942676, 'recall': 0.8535031847133758, 'precision': 0.7403314917127072, 'f1': 0.7928994082840237}
2
+ checkpoint: prajjwal1/bert-tiny: {'accuracy': 0.7229299363057324, 'recall': 0.8853503184713376, 'precision': 0.6682692307692307, 'f1': 0.7616438356164382}
3
+ checkpoint: bert-base-cased: {'accuracy': 0.7038216560509554, 'recall': 0.8152866242038217, 'precision': 0.6666666666666666, 'f1': 0.7335243553008596}
4
+ checkpoint: roberta-base: {'accuracy': 0.7292993630573248, 'recall': 0.8598726114649682, 'precision': 0.6818181818181818, 'f1': 0.7605633802816901}
classification/model_with_extra_features/text_preprocessing.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import html
2
+
3
def clear_reply_mentions(tweet):
    '''Remove user mentions found in a reply to a tweet.

    Only the run of mentions at the very start of the tweet is removed;
    mentions further inside the text are kept.

    Example: @user1 @user2 okay @user3 -> okay @user3
    '''
    import re

    # Each leading mention is "@..." followed by exactly one whitespace
    # character (or the end of the text). Accepting any whitespace, not only a
    # space, keeps a mention from being fused with text that follows a newline
    # or tab, which would drop that text together with the mention.
    leading_mentions = re.match(r"(?:@\S*(?:\s|\Z))*", tweet)
    return tweet[leading_mentions.end():]
14
+
15
+ from emoji import demojize, is_emoji
16
+ from nltk.tokenize import TweetTokenizer
17
+
18
+ tweet_tokenizer = TweetTokenizer()
19
+
20
def normalizeToken(token, emojis_found=None, replace_user_mentions=True, replace_urls=True, demojize_emojis=True):
    '''Normalize a single token of a tokenized tweet.

    Params:
        token: the token to normalize
        emojis_found: optional list; every single-character emoji token is appended to it
        replace_user_mentions: If true, a token starting with "@" becomes @USER
        replace_urls: If true, a token starting with "http" or "www" (case-insensitive) becomes HTTPURL
        demojize_emojis: If true, a single-character emoji token is passed through demojize

    Returns:
        The normalized token
    '''
    # A fresh list per call: a mutable default would be shared by every call
    # that does not pass its own list and would grow without bound.
    if emojis_found is None:
        emojis_found = []

    if replace_user_mentions and token.startswith("@"):
        return "@USER"
    if replace_urls and token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # Only tokens that are exactly one character long are tested as emojis.
    if len(token) == 1 and is_emoji(token):
        emojis_found.append(token)
        return demojize(token) if demojize_emojis else token
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    return token
39
+
40
+
41
def normalizeTweet(tweet, tokenizer=tweet_tokenizer, replace_user_mentions=True, replace_urls=True, demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Tokenize a tweet, normalize every token and re-join the result.

    Returns:
        A tuple (normalized text with whitespace collapsed to single spaces,
        list of the emoji tokens that were encountered)
    '''
    emojis_found = []

    # Straighten the typographic apostrophe and ellipsis before tokenizing.
    plain_text = tweet.replace("’", "'").replace("…", "...")

    normalized_tokens = []
    for token in tokenizer.tokenize(plain_text):
        normalized_tokens.append(
            normalizeToken(
                token,
                emojis_found=emojis_found,
                replace_user_mentions=replace_user_mentions,
                replace_urls=replace_urls,
                demojize_emojis=demojize_emojis,
            )
        )
    normTweet = " ".join(normalized_tokens)

    if bert_tweet_specific_processing:
        # Applied strictly in this order: later rules rely on the spacing
        # produced by earlier ones.
        rewrites = (
            # negations
            ("cannot ", "can not "),
            ("n't ", " n't "),
            ("n 't ", " n't "),
            ("ca n't", "can't"),
            ("ai n't", "ain't"),
            # clitics
            ("'m ", " 'm "),
            ("'re ", " 're "),
            ("'s ", " 's "),
            ("'ll ", " 'll "),
            ("'d ", " 'd "),
            ("'ve ", " 've "),
            # time abbreviations split apart by the tokenizer
            (" p . m .", " p.m."),
            (" p . m ", " p.m "),
            (" a . m .", " a.m."),
            (" a . m ", " a.m "),
        )
        for old, new in rewrites:
            normTweet = normTweet.replace(old, new)

    collapsed = " ".join(normTweet.split())
    return collapsed, emojis_found
73
+
74
+
75
def clean_tweet(tweet, clear_html_chars=True, replace_user_mentions=True, replace_urls=True,
                demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Clean a raw tweet so it is ready for tokenization.

    Params:
        tweet: the tweet to clean
        clear_html_chars: If true, will unescape any html entities found in the tweet
        replace_user_mentions: If true, will replace any user mention with the token @USER
        replace_urls: If true, will replace any urls with the token HTTPURL
        demojize_emojis: If true, will demojize emojis
        bert_tweet_specific_processing: If true, will apply the additional contraction / "a.m." / "p.m." rewrites of normalizeTweet

    Returns:
        The cleaned tweet
    '''
    # Step 1: drop the mentions at the start of the tweet.
    text = clear_reply_mentions(tweet)

    # Step 2: delete line breaks.
    # NOTE(review): they are removed, not replaced by a space, so the words on
    # either side of a line break end up joined. Confirm this is intended.
    for line_break in ('\r', '\n'):
        text = text.replace(line_break, '')

    # Step 3: optionally unescape html entities.
    if clear_html_chars:
        text = html.unescape(text)

    # Step 4: remaining normalization (emojis, urls, mentions, etc.). The list
    # of emojis returned alongside the text is not needed here.
    normalized_tweet, _ = normalizeTweet(
        text,
        replace_user_mentions=replace_user_mentions,
        replace_urls=replace_urls,
        demojize_emojis=demojize_emojis,
        bert_tweet_specific_processing=bert_tweet_specific_processing,
    )

    # TODO: process emoticons? e.g. :)

    return normalized_tweet
classification/model_with_only_language_models/classification.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MODIFY AS REQUIRED
2
+ import torch
3
+ import pandas as pd
4
+ import seaborn as sns
5
+ import numpy as np
6
+
7
+ import matplotlib.pyplot as plt
8
+
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from plotly.subplots import make_subplots
12
+
13
+ from sklearn.model_selection import train_test_split
14
+ from datasets import load_dataset
15
+ from datasets import Dataset, DatasetDict
16
+ from transformers import DataCollatorWithPadding
17
+
18
+ from torch.utils.data import DataLoader
19
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
20
+
21
+ from torch.optim import AdamW
22
+ from torch.nn import BCEWithLogitsLoss
23
+
24
+ from transformers import get_scheduler
25
+
26
+ from tqdm.auto import tqdm
27
+
28
+ import evaluate
29
+
30
+ from tqdm import tqdm
31
+ import logging
32
+ logging.basicConfig(level=logging.INFO)
33
+
34
+ from text_preprocessing import clean_tweet, clear_reply_mentions, normalizeTweet
35
+
36
+ '''
37
+ DATA_PATH = "../../data"
38
+
39
+ PROCESSED_PATH = f"{DATA_PATH}/processed"
40
+
41
+ PROCESSED_PATH_VIRAL = f'{DATA_PATH}/new/processed/viral'
42
+ PROCESSED_PATH_COVID = f'{DATA_PATH}/new/processed/covid'
43
+ '''
44
+
45
+ # Different models
46
+ BERT_BASE_UNCASED = "bert-base-uncased"
47
+ BERT_BASE_CASED = "bert-base-cased"
48
+ ROBERTA_BASE = "roberta-base"
49
+ BERT_TWEET = "vinai/bertweet-base"
50
+ # BERT_TWEET_LARGE = "vinai/bertweet-large"
51
+ DEBERTA_V3 = "microsoft/deberta-v3-base"
52
+
53
+ # TODO: Don't forget to cite papers if you use some model
54
+ BERT_TINY = "prajjwal1/bert-tiny"
55
+
56
+ TWEET_MAX_LENGTH = 280
57
+
58
+ # TEST SPLIT RATIO + MODELS (ADD MORE MODELS FROM ABOVE)
59
+ # MODELS = [BERT_TWEET, BERT_TINY, BERT_BASE_CASED, ROBERTA_BASE]
60
+ MODELS = [DEBERTA_V3]
61
+ TEST_RATIO = 0.2
62
+
63
def preprocess_data(dataset):
    """Build the same-author / same-day viral vs. non-viral text dataset.

    For every (author, day) pair on which the author posted both a viral and a
    non-viral tweet, one viral and one non-viral tweet are kept, and the tweet
    text is cleaned with ``clean_tweet``.

    Params:
        dataset: DataFrame of labeled tweets. The code reads the columns
            ``id``, ``text``, ``author_id``, ``created_at`` (datetime) and
            ``viral`` (boolean).

    Returns:
        A new DataFrame with the columns ``id``, ``cleaned_text`` and
        ``viral`` (0/1). The input frame is not modified.
    """
    # Work on a copy: the helper ``date`` column must not leak into the
    # caller's DataFrame.
    dataset = dataset.copy()

    # remove tweets with 0 retweets (to eliminate their effects)
    #dataset = dataset[dataset.retweet_count > 0]

    ## UPDATE: Get tweets tweeted by the same user, on the same day he tweeted a viral tweet

    # normalize() sets every timestamp's clock to midnight, which is equivalent
    # to keeping only the date part.
    dataset['date'] = dataset['created_at'].dt.normalize()

    viral_tweets = dataset[dataset['viral']]
    non_viral_tweets = dataset[~dataset['viral']]

    # Inner join: keeps only non-viral tweets whose author also posted a viral
    # tweet on the same day. Columns coming from the viral side get the "_y" suffix.
    temp = non_viral_tweets.merge(
        viral_tweets[['author_id', 'date', 'id', 'viral']],
        on=['author_id', 'date'],
        suffixes=(None, '_y'),
    )
    same_day_viral_ids = temp['id_y'].unique()

    # One viral and one non-viral tweet per (author, day).
    same_day_viral_tweets = (
        viral_tweets[viral_tweets['id'].isin(same_day_viral_ids)]
        .drop_duplicates(subset=['author_id', 'date'])
    )
    same_day_non_viral_tweets = temp.drop_duplicates(subset=['author_id', 'date'])

    logging.info(f"Number of viral tweets tweeted on the same day {len(same_day_viral_tweets)}")
    logging.info(f"Number of non viral tweets tweeted on the same day {len(same_day_non_viral_tweets)}")

    dataset = pd.concat([same_day_viral_tweets, same_day_non_viral_tweets], axis=0)
    # .copy() so the assignments below act on an independent frame, not a slice.
    dataset = dataset[['id', 'text', 'viral']].copy()

    # Plain column assignment replaces the column together with its dtype;
    # ``.loc[:, col] = ...`` writes into the existing column and may keep the
    # old dtype.
    dataset['viral'] = dataset['viral'].astype(int)

    # TODO: COMMENT IF YOU WANT TO KEEP TEXT AS IS
    dataset["cleaned_text"] = dataset['text'].apply(lambda x: clean_tweet(x, demojize_emojis=False))

    dataset = dataset.dropna()
    return dataset[['id', 'cleaned_text', 'viral']]
103
+
104
def prepare_dataset(sample_data, balance=False, random_state=42):
    """Split the data into train/test sets and load them as a Hugging Face dataset.

    Params:
        sample_data: DataFrame with a 0/1 ``viral`` column (used for stratification).
        balance: If true, the test split is down-sampled so that it holds as
            many non-viral as viral tweets.
        random_state: Seed used for the split and for the balancing sample, so
            that repeated runs produce the same splits.

    Returns:
        The object returned by ``load_dataset`` for the two parquet files
        written by this function (splits ``train`` and ``test``).

    Side effects:
        Writes ``train.parquet.gzip`` and ``test.parquet.gzip`` to the current
        working directory.
    """
    # Stratified split: each side keeps the same proportion of viral tweets.
    train_dataset, eval_dataset = train_test_split(
        sample_data,
        test_size=TEST_RATIO,
        random_state=random_state,
        stratify=sample_data.viral,
    )

    # Balance test set
    if balance:
        eval_virals = eval_dataset[eval_dataset.viral == 1]
        eval_non_virals = eval_dataset[eval_dataset.viral == 0]
        # Down-sample whichever class is larger; sampling more rows than a
        # class contains would raise.
        per_class = min(len(eval_virals), len(eval_non_virals))
        if len(eval_virals) > per_class:
            eval_virals = eval_virals.sample(n=per_class, random_state=random_state)
        if len(eval_non_virals) > per_class:
            eval_non_virals = eval_non_virals.sample(n=per_class, random_state=random_state)
        eval_dataset = pd.concat([eval_virals, eval_non_virals])

    logging.info('{:>5,} training samples with {:>5,} positives and {:>5,} negatives'.format(
        len(train_dataset), len(train_dataset[train_dataset.viral == 1]), len(train_dataset[train_dataset.viral == 0])))
    logging.info('{:>5,} validation samples with {:>5,} positives and {:>5,} negatives'.format(
        len(eval_dataset), len(eval_dataset[eval_dataset.viral == 1]), len(eval_dataset[eval_dataset.viral == 0])))

    train_dataset.to_parquet("train.parquet.gzip", compression='gzip')
    eval_dataset.to_parquet("test.parquet.gzip", compression='gzip')

    ds = load_dataset("parquet", data_files={'train': 'train.parquet.gzip', 'test': 'test.parquet.gzip'})
    return ds
124
+
125
def tokenize_function(example, tokenizer):
    """Tokenize the ``cleaned_text`` field of ``example`` with truncation enabled.

    Padding is not applied here.
    """
    # TODO: check dynamic padding: https://huggingface.co/course/chapter3/2?fw=pt#dynamic-padding
    texts = example["cleaned_text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded
129
+
130
+
131
def _select_device():
    """Return the best available torch device: CUDA, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")


def _release_cached_memory(device):
    """Free the accelerator's cached memory; does nothing on CPU."""
    if device.type == "cuda":
        torch.cuda.empty_cache()
    elif device.type == "mps":
        torch.mps.empty_cache()


def test_all_models(ds, models=MODELS):
    """Fine-tune and evaluate a sequence classifier for every checkpoint in ``models``.

    Params:
        ds: dataset with ``train`` and ``test`` splits containing the columns
            ``cleaned_text``, ``viral``, ``id`` and ``__index_level_0__``.
        models: iterable of checkpoint names to train.

    Returns:
        Dict mapping each checkpoint name to the list of per-step training losses.

    Side effects:
        Saves each trained model's state dict under ``models/`` and writes
        accuracy/recall/precision/F1 for every checkpoint to
        ``same_day_as_viral_with_features_train_test_balanced_accuracy.txt``
        in the current working directory.
    """
    import os

    models_losses = {}
    device = _select_device()

    output = ""

    for checkpoint in models:
        # Only touch the cache of the backend that is actually in use.
        _release_cached_memory(device)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
        model.to(device)

        tokenized_datasets = ds.map(lambda x: tokenize_function(x, tokenizer=tokenizer), batched=True)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        tokenized_datasets = tokenized_datasets.remove_columns(["__index_level_0__", "cleaned_text", "id"])
        tokenized_datasets = tokenized_datasets.rename_column("viral", "labels")
        tokenized_datasets.set_format("torch")

        batch_size = 32

        train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
        eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator)

        # The loss is computed by the model itself from the "labels" entry of the batch.
        optimizer = AdamW(model.parameters(), lr=5e-5)

        num_epochs = 15
        num_training_steps = num_epochs * len(train_dataloader)
        lr_scheduler = get_scheduler(
            name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        progress_bar = tqdm(range(num_training_steps))

        exp_loss = None  # exponential moving average of the loss, shown in the progress bar
        losses = []
        model.train()
        for epoch in range(num_epochs):
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                loss = outputs.loss
                loss_value = loss.item()
                losses.append(loss_value)
                loss.backward()

                if exp_loss is None:
                    exp_loss = loss_value
                else:
                    exp_loss = 0.9 * exp_loss + 0.1 * loss_value

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                progress_bar.set_postfix({"loss": exp_loss, "epoch": epoch})
        progress_bar.close()

        # torch.save does not create missing directories.
        os.makedirs("models", exist_ok=True)
        torch.save(model.state_dict(), f"models/trained_{checkpoint.replace('/', '_')}.pt")

        models_losses[checkpoint] = losses

        metric = evaluate.combine(["accuracy", "recall", "precision", "f1"])
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

        output += f"checkpoint: {checkpoint}: {metric.compute()}\n"
        logging.info(output)

    # NOTE(review): the file name says "with_features" although this script
    # trains text-only models; presumably carried over from the extra-features
    # script. Confirm before renaming.
    with open("same_day_as_viral_with_features_train_test_balanced_accuracy.txt", "w") as text_file:
        text_file.write(output)
    return models_losses
210
+
211
def main():
    """Load the labeled tweets, build the dataset and train/evaluate every model."""
    # The data file path is relative, so it is resolved against the current
    # working directory: run the script from the folder that holds the file.
    raw_tweets = pd.read_parquet('final_dataset_since_october_2022.parquet.gzip')

    prepared = preprocess_data(raw_tweets)
    splits = prepare_dataset(prepared, balance=False)

    test_all_models(splits)


if __name__ == "__main__":
    main()
classification/model_with_only_language_models/final_dataset_since_october_2022.parquet.gzip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d648d6af7606807281540bb37516ac7b8ac5270df07cd43ffb3a0430d77306cf
3
+ size 35666675
classification/model_with_only_language_models/models/trained_vinai_bertweet-base.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d389880329ae1db97cd7e43c851c277c32d32ce73a8dd21deaa77f381fcd50b
3
+ size 539690276
classification/model_with_only_language_models/same_day_as_viral_train_test_balanced_accuracy.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ checkpoint: vinai/bertweet-base: {'accuracy': 0.7484076433121019, 'recall': 0.821656050955414, 'precision': 0.7166666666666667, 'f1': 0.7655786350148368}
2
+ checkpoint: prajjwal1/bert-tiny: {'accuracy': 0.7292993630573248, 'recall': 0.8343949044585988, 'precision': 0.6894736842105263, 'f1': 0.755043227665706}
3
+ checkpoint: bert-base-cased: {'accuracy': 0.6942675159235668, 'recall': 0.7643312101910829, 'precision': 0.6703910614525139, 'f1': 0.7142857142857143}
4
+ checkpoint: roberta-base: {'accuracy': 0.7420382165605095, 'recall': 0.8343949044585988, 'precision': 0.7043010752688172, 'f1': 0.7638483965014577}
classification/model_with_only_language_models/same_day_as_viral_with_features_train_test_balanced_accuracy.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ checkpoint: vinai/bertweet-base: {'accuracy': 0.7714285714285715, 'recall': 0.8280254777070064, 'precision': 0.7428571428571429, 'f1': 0.7831325301204819}
2
+ checkpoint: vinai/bertweet-large: {'accuracy': 0.7365079365079366, 'recall': 0.8535031847133758, 'precision': 0.6907216494845361, 'f1': 0.7635327635327636}
classification/model_with_only_language_models/test.parquet.gzip ADDED
Binary file (19.8 kB). View file
 
classification/model_with_only_language_models/text_preprocessing.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import html
2
+
3
def clear_reply_mentions(tweet):
    '''Remove user mentions found in a reply to a tweet.

    Only the run of mentions at the very start of the tweet is removed;
    mentions further inside the text are kept.

    Example: @user1 @user2 okay @user3 -> okay @user3
    '''
    import re

    # Each leading mention is "@..." followed by exactly one whitespace
    # character (or the end of the text). Accepting any whitespace, not only a
    # space, keeps a mention from being fused with text that follows a newline
    # or tab, which would drop that text together with the mention.
    leading_mentions = re.match(r"(?:@\S*(?:\s|\Z))*", tweet)
    return tweet[leading_mentions.end():]
14
+
15
+ from emoji import demojize, is_emoji
16
+ from nltk.tokenize import TweetTokenizer
17
+
18
+ tweet_tokenizer = TweetTokenizer()
19
+
20
def normalizeToken(token, emojis_found=None, replace_user_mentions=True, replace_urls=True, demojize_emojis=True):
    '''Normalize a single token of a tokenized tweet.

    Params:
        token: the token to normalize
        emojis_found: optional list; every single-character emoji token is appended to it
        replace_user_mentions: If true, a token starting with "@" becomes @USER
        replace_urls: If true, a token starting with "http" or "www" (case-insensitive) becomes HTTPURL
        demojize_emojis: If true, a single-character emoji token is passed through demojize

    Returns:
        The normalized token
    '''
    # A fresh list per call: a mutable default would be shared by every call
    # that does not pass its own list and would grow without bound.
    if emojis_found is None:
        emojis_found = []

    if replace_user_mentions and token.startswith("@"):
        return "@USER"
    if replace_urls and token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # Only tokens that are exactly one character long are tested as emojis.
    if len(token) == 1 and is_emoji(token):
        emojis_found.append(token)
        return demojize(token) if demojize_emojis else token
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    return token
39
+
40
+
41
def normalizeTweet(tweet, tokenizer=tweet_tokenizer, replace_user_mentions=True, replace_urls=True, demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Tokenize a tweet, normalize every token and re-join the result.

    Returns:
        A tuple (normalized text with whitespace collapsed to single spaces,
        list of the emoji tokens that were encountered)
    '''
    emojis_found = []

    # Straighten the typographic apostrophe and ellipsis before tokenizing.
    plain_text = tweet.replace("’", "'").replace("…", "...")

    normalized_tokens = []
    for token in tokenizer.tokenize(plain_text):
        normalized_tokens.append(
            normalizeToken(
                token,
                emojis_found=emojis_found,
                replace_user_mentions=replace_user_mentions,
                replace_urls=replace_urls,
                demojize_emojis=demojize_emojis,
            )
        )
    normTweet = " ".join(normalized_tokens)

    if bert_tweet_specific_processing:
        # Applied strictly in this order: later rules rely on the spacing
        # produced by earlier ones.
        rewrites = (
            # negations
            ("cannot ", "can not "),
            ("n't ", " n't "),
            ("n 't ", " n't "),
            ("ca n't", "can't"),
            ("ai n't", "ain't"),
            # clitics
            ("'m ", " 'm "),
            ("'re ", " 're "),
            ("'s ", " 's "),
            ("'ll ", " 'll "),
            ("'d ", " 'd "),
            ("'ve ", " 've "),
            # time abbreviations split apart by the tokenizer
            (" p . m .", " p.m."),
            (" p . m ", " p.m "),
            (" a . m .", " a.m."),
            (" a . m ", " a.m "),
        )
        for old, new in rewrites:
            normTweet = normTweet.replace(old, new)

    collapsed = " ".join(normTweet.split())
    return collapsed, emojis_found
73
+
74
+
75
def clean_tweet(tweet, clear_html_chars=True, replace_user_mentions=True, replace_urls=True,
                demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Clean a raw tweet so it is ready for tokenization.

    Params:
        tweet: the tweet to clean
        clear_html_chars: If true, will unescape any html entities found in the tweet
        replace_user_mentions: If true, will replace any user mention with the token @USER
        replace_urls: If true, will replace any urls with the token HTTPURL
        demojize_emojis: If true, will demojize emojis
        bert_tweet_specific_processing: If true, will apply the additional contraction / "a.m." / "p.m." rewrites of normalizeTweet

    Returns:
        The cleaned tweet
    '''
    # Step 1: drop the mentions at the start of the tweet.
    text = clear_reply_mentions(tweet)

    # Step 2: delete line breaks.
    # NOTE(review): they are removed, not replaced by a space, so the words on
    # either side of a line break end up joined. Confirm this is intended.
    for line_break in ('\r', '\n'):
        text = text.replace(line_break, '')

    # Step 3: optionally unescape html entities.
    if clear_html_chars:
        text = html.unescape(text)

    # Step 4: remaining normalization (emojis, urls, mentions, etc.). The list
    # of emojis returned alongside the text is not needed here.
    normalized_tweet, _ = normalizeTweet(
        text,
        replace_user_mentions=replace_user_mentions,
        replace_urls=replace_urls,
        demojize_emojis=demojize_emojis,
        bert_tweet_specific_processing=bert_tweet_specific_processing,
    )

    # TODO: process emoticons? e.g. :)

    return normalized_tweet
classification/model_with_only_language_models/train.parquet.gzip ADDED
Binary file (67.4 kB). View file
 
data/control.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49d7490cb0993941b8a024c1671e9b9ac8e5914de3dce2aad9d68de0c47d7ae4
3
+ size 22727820
data/viral.csv ADDED
@@ -0,0 +1,1042 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tweet_id
2
+ 1584932886167244801
3
+ 1593947297615794176
4
+ 1591270450294059008
5
+ 1581699730639589378
6
+ 1591043902521638913
7
+ 1584903132362211333
8
+ 1593612903801929728
9
+ 1593479807739625473
10
+ 1591541757908508674
11
+ 1593770726061924352
12
+ 1584737507203485696
13
+ 1594121081245384707
14
+ 1590100845982781441
15
+ 1593653567516299264
16
+ 1585021201704882177
17
+ 1594345561146392579
18
+ 1592506452916289536
19
+ 1594029216059703297
20
+ 1594004245535154176
21
+ 1592615266953101312
22
+ 1590143170649935872
23
+ 1591484993188298752
24
+ 1593265509796978690
25
+ 1594386871215267840
26
+ 1591609295279374336
27
+ 1590051500901822464
28
+ 1592276529354268672
29
+ 1594108905071996928
30
+ 1590107396776476672
31
+ 1590056737591422977
32
+ 1592682602884128768
33
+ 1592591165786054657
34
+ 1590937532035780608
35
+ 1594008547414736896
36
+ 1585026683962556416
37
+ 1584802342083661824
38
+ 1593020357853982722
39
+ 1593956647629598721
40
+ 1591076060535422977
41
+ 1592413862493851648
42
+ 1592937493472874502
43
+ 1591425970996908032
44
+ 1593404493827391488
45
+ 1581841556591751168
46
+ 1594355223694147584
47
+ 1590299243990102017
48
+ 1592553375572578304
49
+ 1581711870255575041
50
+ 1590718839284985859
51
+ 1592620843993268224
52
+ 1591882690730815490
53
+ 1592956946529017857
54
+ 1584521556427886597
55
+ 1584729652215828481
56
+ 1592942420110430208
57
+ 1593750550423773190
58
+ 1592690842900525056
59
+ 1592723627648372736
60
+ 1593431015309066242
61
+ 1590182204914929666
62
+ 1592186934201577472
63
+ 1592584760844771328
64
+ 1590780608644775937
65
+ 1594441159358615552
66
+ 1582024160087183361
67
+ 1591149563402145793
68
+ 1590786900822200320
69
+ 1592274617057816576
70
+ 1590872573407293441
71
+ 1593886676346363904
72
+ 1594188205448105988
73
+ 1593380253451403265
74
+ 1584647230635462656
75
+ 1594375031680663553
76
+ 1591090938990198784
77
+ 1590148564063518720
78
+ 1590767742805647360
79
+ 1594118443174871040
80
+ 1593699015417159680
81
+ 1592161331754721281
82
+ 1591480117473472514
83
+ 1581701001496518657
84
+ 1594119859826479104
85
+ 1591921147611856901
86
+ 1582017942060863489
87
+ 1591153087082688512
88
+ 1582090308220686336
89
+ 1591445228351029248
90
+ 1590514595973988352
91
+ 1584739482062184448
92
+ 1593742582961639425
93
+ 1592632272158879745
94
+ 1592692406998429696
95
+ 1592911136734969856
96
+ 1592238993730670593
97
+ 1593024744491978752
98
+ 1593400387381465088
99
+ 1590364067306573827
100
+ 1584900232860631041
101
+ 1582017607279923201
102
+ 1581887155433279489
103
+ 1592312202169810944
104
+ 1590317790871031811
105
+ 1592505565695315970
106
+ 1592544903397265409
107
+ 1594117873261240326
108
+ 1594216494078230529
109
+ 1594703707673079808
110
+ 1584721574296682501
111
+ 1590109110644903937
112
+ 1594076475770425347
113
+ 1591138469442568192
114
+ 1590864178583318528
115
+ 1591955167175659521
116
+ 1592616005187956737
117
+ 1594387207434838017
118
+ 1591427829128134656
119
+ 1591713956338491393
120
+ 1592271789312573440
121
+ 1592874573892317185
122
+ 1592579050249236480
123
+ 1592721942733533185
124
+ 1584650521364688896
125
+ 1585111756887977984
126
+ 1590815960113487891
127
+ 1590815704218861568
128
+ 1594131004301656064
129
+ 1593763223224057857
130
+ 1584947082842230790
131
+ 1592124116748103681
132
+ 1594100421127290882
133
+ 1592453153169223681
134
+ 1584867689264476160
135
+ 1592494074715836416
136
+ 1584931955938070528
137
+ 1584726237171568640
138
+ 1592560270270173184
139
+ 1584741658649776128
140
+ 1584584149318651905
141
+ 1591190276113981440
142
+ 1590727361984888834
143
+ 1593411758911549440
144
+ 1594051495472746496
145
+ 1594063774721650688
146
+ 1591880750101827584
147
+ 1593830994867589126
148
+ 1584720902700568578
149
+ 1592950775076257793
150
+ 1584561799843614720
151
+ 1591860534101827584
152
+ 1591866661434720256
153
+ 1584566687659003904
154
+ 1593991931541049350
155
+ 1590060339135381505
156
+ 1584808596738813952
157
+ 1593787603853094916
158
+ 1594345571262799873
159
+ 1593656488773537792
160
+ 1591868427932962816
161
+ 1594084459770429440
162
+ 1581662270186090496
163
+ 1591422937349132289
164
+ 1590744874483339264
165
+ 1591297640133795841
166
+ 1592962528816287744
167
+ 1591840251907276800
168
+ 1590195663702458369
169
+ 1584843627930718208
170
+ 1592708602716123137
171
+ 1593779273877291008
172
+ 1590783953375723520
173
+ 1594688865100795905
174
+ 1590354018622402560
175
+ 1594109089520308224
176
+ 1584986718322950144
177
+ 1593764585290473479
178
+ 1592566201414475777
179
+ 1592294532439502848
180
+ 1584654305952337922
181
+ 1593819425748455424
182
+ 1590016468066140160
183
+ 1581809942570287104
184
+ 1592646198175092737
185
+ 1592732829682528256
186
+ 1593256204473864195
187
+ 1591277386640736256
188
+ 1594471398902272000
189
+ 1594363137419104261
190
+ 1581709339361886208
191
+ 1592594235294982147
192
+ 1592668553521811456
193
+ 1584730705086455808
194
+ 1584531644836646915
195
+ 1584580045368352768
196
+ 1584643007986683905
197
+ 1581758037554888704
198
+ 1593066025008107520
199
+ 1593253992171126786
200
+ 1591875679377031169
201
+ 1591108798244347905
202
+ 1591646097520611328
203
+ 1592204998255185920
204
+ 1593745429765947397
205
+ 1594552471531454464
206
+ 1591814551800250368
207
+ 1592926448498937856
208
+ 1593927022769750017
209
+ 1591001937218093056
210
+ 1590604334558871552
211
+ 1581821777579106305
212
+ 1581898364144144384
213
+ 1593647645645414400
214
+ 1590344271961677827
215
+ 1592228025885945856
216
+ 1594199444018561025
217
+ 1594158151061983235
218
+ 1593861908490293249
219
+ 1590052480142172160
220
+ 1593315954184028160
221
+ 1594028451807506432
222
+ 1590188416192610304
223
+ 1591867143616069634
224
+ 1593266642892595201
225
+ 1591398036793597954
226
+ 1581652475555127296
227
+ 1592399074736902144
228
+ 1594102237281849345
229
+ 1594419326697848833
230
+ 1590887994973900800
231
+ 1584808125601026048
232
+ 1594410749656322048
233
+ 1592233871021867009
234
+ 1592392374063620096
235
+ 1592068317157941249
236
+ 1592816576415559683
237
+ 1584951794580742145
238
+ 1581906374396506112
239
+ 1593956302824566785
240
+ 1590839330201014273
241
+ 1591548912292491264
242
+ 1585089769935343616
243
+ 1593615212799746050
244
+ 1581690578571444225
245
+ 1593342410205134849
246
+ 1581676349994856451
247
+ 1592815064234721281
248
+ 1590508168441839617
249
+ 1594485884988030977
250
+ 1594836721102888960
251
+ 1591088972016803842
252
+ 1590101414386143232
253
+ 1584761833247756288
254
+ 1590153744976867329
255
+ 1591898753350664192
256
+ 1592447124343775232
257
+ 1594180142703575040
258
+ 1590901563647983618
259
+ 1594080238945947650
260
+ 1594229593874964481
261
+ 1592536293762019328
262
+ 1591543271150407680
263
+ 1591177229865582592
264
+ 1590863898273452032
265
+ 1592934665597616128
266
+ 1593047972623048704
267
+ 1593424009080848389
268
+ 1591357062692634625
269
+ 1590189415799758848
270
+ 1591143431019331584
271
+ 1590072621864869888
272
+ 1592690621697101824
273
+ 1592579340793171969
274
+ 1590185325149978625
275
+ 1592712852514959362
276
+ 1593713382959595521
277
+ 1591492092043595776
278
+ 1593707058397847556
279
+ 1591557990444064768
280
+ 1591222092351275008
281
+ 1594086416484630531
282
+ 1594381233575829511
283
+ 1591491501678526464
284
+ 1581811762822664193
285
+ 1592550639543803904
286
+ 1593735934733754368
287
+ 1590274908239826944
288
+ 1581746748846141441
289
+ 1593719366385901568
290
+ 1592524002639376385
291
+ 1584749509145690114
292
+ 1593264045469143043
293
+ 1584965726724837377
294
+ 1590821075981856769
295
+ 1592677748383719424
296
+ 1592003897132199937
297
+ 1581799078597230594
298
+ 1593750298224648193
299
+ 1593734302382129152
300
+ 1584684864304914432
301
+ 1594029166059524097
302
+ 1584630796840996864
303
+ 1594322584731951110
304
+ 1590133855801204736
305
+ 1591517445814358017
306
+ 1594319831108816898
307
+ 1594053138633621504
308
+ 1592942121602121728
309
+ 1590879809852301315
310
+ 1591813892841541632
311
+ 1591469036793393155
312
+ 1584652769096781824
313
+ 1594763150368411648
314
+ 1584642009410973696
315
+ 1594771442729721856
316
+ 1582060359178854400
317
+ 1590691417906876416
318
+ 1594055363342598144
319
+ 1591250467618959361
320
+ 1591131814890188802
321
+ 1590016441092169728
322
+ 1590071040209997826
323
+ 1584746408351272960
324
+ 1593229099308548098
325
+ 1593656191443410946
326
+ 1592681211176312832
327
+ 1593302498718224386
328
+ 1591769082336280576
329
+ 1593591679377735685
330
+ 1584949615568175104
331
+ 1592619146743017473
332
+ 1584655489857486848
333
+ 1592345304434868225
334
+ 1592776050685407233
335
+ 1592281453186134016
336
+ 1593367844325949442
337
+ 1590171569770229760
338
+ 1594451431549194240
339
+ 1592149621962584064
340
+ 1581654698549202944
341
+ 1592694495333986304
342
+ 1585020957898723328
343
+ 1593776060516237317
344
+ 1593707705465815042
345
+ 1591574660290052096
346
+ 1594147204339732480
347
+ 1590354604159795201
348
+ 1591459582618267648
349
+ 1591441282957967360
350
+ 1592595549865537537
351
+ 1590149965283987457
352
+ 1592492545258291201
353
+ 1592316200276533248
354
+ 1592651505550917632
355
+ 1591242670755676160
356
+ 1591804790555955201
357
+ 1591912966810918912
358
+ 1594367305034571777
359
+ 1581749087933562880
360
+ 1590527640414621697
361
+ 1591135304337215488
362
+ 1592541725670912002
363
+ 1592252153812615168
364
+ 1593043322914091009
365
+ 1590927402338689024
366
+ 1581730850705506304
367
+ 1582002053983145987
368
+ 1591018546905354244
369
+ 1592147121809285122
370
+ 1593235425162063874
371
+ 1592251929144758273
372
+ 1590101047904714757
373
+ 1593892998462726144
374
+ 1590064513809727489
375
+ 1591086179847659520
376
+ 1591199565059100672
377
+ 1594719218632720385
378
+ 1594369024410124288
379
+ 1591730791369043970
380
+ 1590871372838109189
381
+ 1590043224235704322
382
+ 1584770222866993152
383
+ 1594011438032502784
384
+ 1592156930356690945
385
+ 1594547086363590658
386
+ 1594474457002872832
387
+ 1591795256713953281
388
+ 1584795643394289664
389
+ 1591196561723527168
390
+ 1591046399936065537
391
+ 1593392197675753475
392
+ 1591136726852186113
393
+ 1591309750016573441
394
+ 1592921043152560128
395
+ 1593692760225333251
396
+ 1581743475178938368
397
+ 1584870736535388163
398
+ 1591491057900204032
399
+ 1593489765910728705
400
+ 1591937669097226241
401
+ 1594063579002867712
402
+ 1581732169222787072
403
+ 1591257399780073472
404
+ 1591361388710350849
405
+ 1590091235276255232
406
+ 1584662662902054912
407
+ 1594475910278807557
408
+ 1593781469331173377
409
+ 1594362041523113984
410
+ 1593151866715926528
411
+ 1581841156173754373
412
+ 1592652261679071232
413
+ 1584984440300003328
414
+ 1594048275245682690
415
+ 1594160666096668673
416
+ 1584976145979936768
417
+ 1592805028162842624
418
+ 1581746410210226176
419
+ 1593143320938958849
420
+ 1590541447832342528
421
+ 1591485729884061699
422
+ 1591930970273374212
423
+ 1590399258066554881
424
+ 1594107997755789315
425
+ 1584641748302987264
426
+ 1592842640026144768
427
+ 1594347483164835840
428
+ 1590762672307388416
429
+ 1590885266029518848
430
+ 1590813909732425728
431
+ 1593027340619382784
432
+ 1590372754427359232
433
+ 1591823348035891200
434
+ 1592350462694813698
435
+ 1592640116220080129
436
+ 1581928389505843200
437
+ 1591657525459681280
438
+ 1591024644370595840
439
+ 1591161964419432449
440
+ 1592578617623875584
441
+ 1592835021727207424
442
+ 1591259039551623170
443
+ 1593427787251171330
444
+ 1593910091325255680
445
+ 1584994235916627969
446
+ 1591829980895653890
447
+ 1591150403315728384
448
+ 1593751615323922433
449
+ 1594660901143326720
450
+ 1591416296025227265
451
+ 1594432253630861313
452
+ 1591100087325032448
453
+ 1592903001093861377
454
+ 1593058791377219585
455
+ 1590393634394107904
456
+ 1593336599655419904
457
+ 1590449768480014336
458
+ 1594085747442728962
459
+ 1592279720468811777
460
+ 1591469596766347265
461
+ 1594089411360202756
462
+ 1591896706509336576
463
+ 1581919999870267392
464
+ 1584563232789786624
465
+ 1590305174949490688
466
+ 1590809573367382016
467
+ 1591888226100609024
468
+ 1591215518262099969
469
+ 1592739592922202113
470
+ 1593414387187998720
471
+ 1593729958815432704
472
+ 1593418938200752128
473
+ 1590072455208800256
474
+ 1593751492929867776
475
+ 1594039118152945664
476
+ 1594003784388124675
477
+ 1594724296093728769
478
+ 1590367404533510144
479
+ 1591350427374997504
480
+ 1581779457857449985
481
+ 1592496174493478912
482
+ 1593340544691752960
483
+ 1590527106735542273
484
+ 1593314023616937992
485
+ 1582068981653188608
486
+ 1592228084790566914
487
+ 1591856193567428609
488
+ 1584733340179255296
489
+ 1593043564153475075
490
+ 1590751921417379840
491
+ 1581672995277647872
492
+ 1590825946181144577
493
+ 1592321489340014592
494
+ 1593404774245974018
495
+ 1592862723217690624
496
+ 1592835534673809408
497
+ 1591466775727988737
498
+ 1590226183651987457
499
+ 1591742929701568512
500
+ 1590236530861563905
501
+ 1594345672815394819
502
+ 1594358207127896071
503
+ 1591062787446706178
504
+ 1593351364851163136
505
+ 1590051193610723328
506
+ 1584942999678914565
507
+ 1593052032709328898
508
+ 1593653246353997824
509
+ 1591508639126556672
510
+ 1582001113204604930
511
+ 1592608398721581057
512
+ 1592562373357101058
513
+ 1592958767507079168
514
+ 1591523511168008192
515
+ 1584668584042967041
516
+ 1594358061858230274
517
+ 1594741976775245824
518
+ 1592498457042096128
519
+ 1584727972782559232
520
+ 1590012084104089601
521
+ 1593987535919009792
522
+ 1581819691512713216
523
+ 1593342048874151938
524
+ 1590775114484060163
525
+ 1594397069476773889
526
+ 1593473916751618048
527
+ 1581777502527098880
528
+ 1581914647510147072
529
+ 1590229652723077120
530
+ 1592276391818428416
531
+ 1592936641135775744
532
+ 1594018594031009804
533
+ 1594330383084142594
534
+ 1593779350293323777
535
+ 1591166437971791872
536
+ 1591872052805660672
537
+ 1581723066492592128
538
+ 1590649216745234433
539
+ 1593653622344130563
540
+ 1591735356483076102
541
+ 1591492436790239234
542
+ 1594422480004866113
543
+ 1584688847622848513
544
+ 1592187558163017729
545
+ 1585002070306082816
546
+ 1590398209134067717
547
+ 1590271200084643840
548
+ 1594198516523417600
549
+ 1592283077715304450
550
+ 1590197812125310976
551
+ 1590391895926411266
552
+ 1594009413668417536
553
+ 1590480221257359361
554
+ 1590923069568462849
555
+ 1593644793430687745
556
+ 1594306075729068034
557
+ 1584555558585016320
558
+ 1581816018040946689
559
+ 1591419036490924032
560
+ 1582087656837902336
561
+ 1593010883638472704
562
+ 1593637163328995336
563
+ 1594087936982544385
564
+ 1593045630074884099
565
+ 1592055749521608704
566
+ 1594226059331137536
567
+ 1584586808620625920
568
+ 1592600430168903680
569
+ 1584533803292250112
570
+ 1593983467435556866
571
+ 1584722316726259712
572
+ 1590348590093737984
573
+ 1592932873186988032
574
+ 1591872909118963712
575
+ 1592574256642428931
576
+ 1593394493461364736
577
+ 1594305866114686978
578
+ 1593273627583078401
579
+ 1592051970076200960
580
+ 1593035255652306945
581
+ 1592610023502680066
582
+ 1591616332851974145
583
+ 1581723091234787328
584
+ 1593617464881217541
585
+ 1590400043261235200
586
+ 1581991973585530880
587
+ 1594027801262579712
588
+ 1594126958010699776
589
+ 1592765159851687936
590
+ 1590436834244702213
591
+ 1593764685484261376
592
+ 1591205091755057152
593
+ 1594346986299236352
594
+ 1594048948423241728
595
+ 1590124686092165120
596
+ 1592965757436301312
597
+ 1591419399319261186
598
+ 1593902458354688000
599
+ 1594410401806139392
600
+ 1594147573052628992
601
+ 1581753804638167045
602
+ 1584924021778706435
603
+ 1592572435735670785
604
+ 1592365343825342464
605
+ 1584940462078857216
606
+ 1585036648186011649
607
+ 1592042027134627840
608
+ 1582027600335032320
609
+ 1593403708863401986
610
+ 1585034306728382464
611
+ 1592202761889755136
612
+ 1592340512379633666
613
+ 1592166785415274496
614
+ 1593654882346536961
615
+ 1591646166546288644
616
+ 1594415094255226882
617
+ 1591263468098920455
618
+ 1594174861831610368
619
+ 1591917897471098880
620
+ 1592231550783049730
621
+ 1592324240979922944
622
+ 1590990236360015872
623
+ 1584919027717218304
624
+ 1592914265958387713
625
+ 1594172544747438080
626
+ 1581816339018436608
627
+ 1591097885499994112
628
+ 1591242143523540992
629
+ 1592989843281567746
630
+ 1581911862215868418
631
+ 1585022874993451008
632
+ 1591476627086934017
633
+ 1592648080054050816
634
+ 1592854388506238977
635
+ 1584691937558159360
636
+ 1594687064192462849
637
+ 1594372743352684547
638
+ 1593364734677381120
639
+ 1591343889344270336
640
+ 1591857120768843776
641
+ 1592790505754365952
642
+ 1590854281854603269
643
+ 1591144433550249985
644
+ 1591935412473315329
645
+ 1584950741617487874
646
+ 1590563592494936066
647
+ 1590481343393435648
648
+ 1594117796547670017
649
+ 1591442159936638978
650
+ 1590631263094050816
651
+ 1593660175063318528
652
+ 1591953261816602624
653
+ 1594474944238616576
654
+ 1591069763375366144
655
+ 1590460276952162304
656
+ 1592619192293142529
657
+ 1591586464269492224
658
+ 1591208497655746560
659
+ 1584868956527300608
660
+ 1594818192093872128
661
+ 1590520584328577025
662
+ 1591443259775221762
663
+ 1590503297156513794
664
+ 1592176482151010307
665
+ 1584619668928503808
666
+ 1593366727907192832
667
+ 1593411302692626433
668
+ 1593978500796522500
669
+ 1593804572119556097
670
+ 1581678720909463553
671
+ 1594311331213365249
672
+ 1590782712616738816
673
+ 1594259908362670080
674
+ 1591884969357770754
675
+ 1592739800766783489
676
+ 1591785060922949632
677
+ 1593956540846903297
678
+ 1594603597891006464
679
+ 1594362056416714752
680
+ 1593284228669706241
681
+ 1590669779584831496
682
+ 1592914227391782913
683
+ 1593895472787324928
684
+ 1581993597859426305
685
+ 1592556049420386304
686
+ 1584740792601411585
687
+ 1584629161247006721
688
+ 1591457113595052032
689
+ 1590553214340182018
690
+ 1592262105503502336
691
+ 1592199519181373442
692
+ 1591490334122741761
693
+ 1592615654158655488
694
+ 1591601831259828229
695
+ 1590352916959080448
696
+ 1584617872663670784
697
+ 1590775767298101248
698
+ 1591408088665710593
699
+ 1591637750515732480
700
+ 1594362004013293568
701
+ 1592125182772412416
702
+ 1592328406150307841
703
+ 1593466144832339970
704
+ 1591146771677614080
705
+ 1594109754854416384
706
+ 1592923822373580801
707
+ 1592539676824719361
708
+ 1594156156163100677
709
+ 1590062498367311872
710
+ 1591519496770301952
711
+ 1590288916447657984
712
+ 1591349039215579136
713
+ 1581901472475725824
714
+ 1584707560795348992
715
+ 1584598360551718912
716
+ 1584610043085094912
717
+ 1590313643203497984
718
+ 1592187096856662016
719
+ 1594474911434612737
720
+ 1592831263865712641
721
+ 1581927859610079232
722
+ 1591309090642595840
723
+ 1594313832402661378
724
+ 1591156546989588480
725
+ 1591998031645536258
726
+ 1593975769021272068
727
+ 1592266902390595584
728
+ 1593018828601974784
729
+ 1592926585359060993
730
+ 1594509841548058624
731
+ 1591173731833253889
732
+ 1582043766378024960
733
+ 1590898120048939008
734
+ 1591621240388820993
735
+ 1592243174931660803
736
+ 1592168429624365058
737
+ 1592559930841567232
738
+ 1594044519258931200
739
+ 1593260714218725377
740
+ 1581872859529433094
741
+ 1584651600291061760
742
+ 1590201802045394946
743
+ 1591083686480711680
744
+ 1592210208105037824
745
+ 1584576897622904832
746
+ 1591606416447799296
747
+ 1593779195037065216
748
+ 1591285460013576192
749
+ 1592726193006333955
750
+ 1594117163136188419
751
+ 1585075798041468929
752
+ 1592306809796452352
753
+ 1593096180015456257
754
+ 1593775860863082499
755
+ 1581732104726925313
756
+ 1590755452157325314
757
+ 1592695927919124480
758
+ 1591496900376764417
759
+ 1591183384101154816
760
+ 1593501868961181696
761
+ 1592605963596402689
762
+ 1593354700535087111
763
+ 1594087094883860480
764
+ 1581846666629390337
765
+ 1593781714203324416
766
+ 1594556851332390914
767
+ 1592811405987958784
768
+ 1591121551906443265
769
+ 1590183416607444994
770
+ 1591128713680539648
771
+ 1593105360805822464
772
+ 1593728653883576322
773
+ 1590011566757670912
774
+ 1590607856339800064
775
+ 1593612864731881472
776
+ 1593804962689175553
777
+ 1581798074568974338
778
+ 1591114856115359745
779
+ 1591240995357700096
780
+ 1590288815092289536
781
+ 1584705050643156993
782
+ 1593447822451347456
783
+ 1592303223330996226
784
+ 1594178509714472963
785
+ 1591432111390883840
786
+ 1594462345618771968
787
+ 1590628471914770435
788
+ 1590857044802039810
789
+ 1591504609939709952
790
+ 1584946087827505152
791
+ 1593350223186825221
792
+ 1581754808502865922
793
+ 1582162745054461954
794
+ 1591051620867248131
795
+ 1582077161057681409
796
+ 1591508684856872960
797
+ 1591895494871203840
798
+ 1590735236882915328
799
+ 1590091378406883329
800
+ 1590290073475772417
801
+ 1585028458518040577
802
+ 1593257581946220546
803
+ 1591539520608284672
804
+ 1591733660558184449
805
+ 1594023348169715713
806
+ 1594013417370980352
807
+ 1593985139926867968
808
+ 1593082069399523328
809
+ 1594010343478800384
810
+ 1591506159751802883
811
+ 1594161468764741641
812
+ 1591152301817102336
813
+ 1590160982709764096
814
+ 1581805981230514176
815
+ 1590295663631556608
816
+ 1591814750152921089
817
+ 1591618839829426177
818
+ 1593879504766009344
819
+ 1592084203998711808
820
+ 1594292538898321414
821
+ 1592674870210400256
822
+ 1591432007015632896
823
+ 1592340157537345537
824
+ 1590377761348423682
825
+ 1594190861667274755
826
+ 1593286137543548932
827
+ 1594128585455009795
828
+ 1592718738956365826
829
+ 1593850858894245889
830
+ 1591494640259178500
831
+ 1591923274514313216
832
+ 1584916360936685568
833
+ 1592915632601075712
834
+ 1590011683145416704
835
+ 1593292235222978560
836
+ 1594423088619331584
837
+ 1594604452425646080
838
+ 1592565860128129024
839
+ 1594401119727366145
840
+ 1594351645021782016
841
+ 1584995921519341568
842
+ 1590628167416705024
843
+ 1591719724987342848
844
+ 1594243897248518144
845
+ 1592624359516274688
846
+ 1584578124418666496
847
+ 1594679887918379009
848
+ 1593008166316707840
849
+ 1584661366853758976
850
+ 1594368681227833344
851
+ 1590731558264393729
852
+ 1590300788861337600
853
+ 1593842771529732097
854
+ 1594541995082780672
855
+ 1590789809286811648
856
+ 1590758963942866944
857
+ 1591592992577839104
858
+ 1593343172410855434
859
+ 1591063320454651906
860
+ 1593351666484400128
861
+ 1591811086269186048
862
+ 1590508439926865920
863
+ 1590745145305026570
864
+ 1592296389060751361
865
+ 1594350932833480704
866
+ 1590417798475714560
867
+ 1592979533506150400
868
+ 1593219988038770690
869
+ 1584576796002877441
870
+ 1593784760387735552
871
+ 1581691410566483968
872
+ 1581753676666138624
873
+ 1590504689241849856
874
+ 1584550522543304704
875
+ 1590567832173301760
876
+ 1591172828308668419
877
+ 1592636753323884547
878
+ 1592856964077613058
879
+ 1592703088854728704
880
+ 1592263701738004480
881
+ 1591858604386164740
882
+ 1593245930823696387
883
+ 1591580896494518272
884
+ 1592003898021408768
885
+ 1590179659521851393
886
+ 1594198724149919744
887
+ 1592654778987053057
888
+ 1594767543549714433
889
+ 1590152297472229376
890
+ 1591106443318788097
891
+ 1590058001888849920
892
+ 1582164813928550400
893
+ 1594189887464345600
894
+ 1584744902566752256
895
+ 1592929419949023232
896
+ 1594360547943874560
897
+ 1592905771486171142
898
+ 1591602175876399104
899
+ 1592571278397149185
900
+ 1594202033074352128
901
+ 1592574912409264128
902
+ 1592875359577464833
903
+ 1591912431009529856
904
+ 1592994508114993153
905
+ 1594342856973582336
906
+ 1593099906348384256
907
+ 1594523625419599874
908
+ 1592206458044940291
909
+ 1594505849002663936
910
+ 1590778724043358208
911
+ 1593026641760223233
912
+ 1592625882723590144
913
+ 1591836672383406080
914
+ 1590527437418684417
915
+ 1592580537516191745
916
+ 1584638928422043648
917
+ 1590073894182227969
918
+ 1581942283133669376
919
+ 1594002702618529793
920
+ 1593901369257254912
921
+ 1591480349217148928
922
+ 1591309259853402113
923
+ 1591231998844366849
924
+ 1592590992309628930
925
+ 1592726099326566401
926
+ 1593967327808294913
927
+ 1594458638524874752
928
+ 1591016952633479168
929
+ 1592780529203843073
930
+ 1593957492127850496
931
+ 1594348941784629248
932
+ 1592920457925210112
933
+ 1594163715901108225
934
+ 1593033065202606081
935
+ 1581793105723854848
936
+ 1593368198157664257
937
+ 1585013335233220611
938
+ 1584673283022413824
939
+ 1590382569069367297
940
+ 1581881907520106497
941
+ 1592292889891340289
942
+ 1593621687987232769
943
+ 1590317270093692930
944
+ 1594300575239540736
945
+ 1590851904141418497
946
+ 1592138879301455872
947
+ 1592957879841325058
948
+ 1593312415151968256
949
+ 1591097497279397888
950
+ 1591833273160073216
951
+ 1591961306315329539
952
+ 1591737120372449281
953
+ 1591085639247634432
954
+ 1582206874270674944
955
+ 1590400261528641538
956
+ 1594740637668347904
957
+ 1592319989645574145
958
+ 1594192295770865664
959
+ 1584659959832543234
960
+ 1592152675512365056
961
+ 1581713910385647617
962
+ 1590453582134149122
963
+ 1591086475567071232
964
+ 1593956576615874561
965
+ 1594092319749427200
966
+ 1590727466221719553
967
+ 1592904461927985152
968
+ 1584746224900792321
969
+ 1591303442676428800
970
+ 1592910110787276801
971
+ 1594407232111722496
972
+ 1593394402793328642
973
+ 1594419340073439233
974
+ 1591454131117756417
975
+ 1591911384610406400
976
+ 1593815170752839681
977
+ 1590332309575593985
978
+ 1590570044198883330
979
+ 1593448201889120259
980
+ 1590741333316374529
981
+ 1584642845503541248
982
+ 1591942232818552836
983
+ 1591894439210917888
984
+ 1592533896436285441
985
+ 1591357709743685633
986
+ 1591872813916426241
987
+ 1590026034216595456
988
+ 1591541158727278592
989
+ 1592234746456719361
990
+ 1592599533871312896
991
+ 1592592750322798592
992
+ 1592272878887899136
993
+ 1593338673306771457
994
+ 1593420348006998016
995
+ 1591979442720870401
996
+ 1592722897067380736
997
+ 1593107895088189440
998
+ 1584951473473212416
999
+ 1591138011240357889
1000
+ 1591530518117765120
1001
+ 1593412753363849216
1002
+ 1590768004983582721
1003
+ 1591709174706352129
1004
+ 1592610492153233411
1005
+ 1590850986519629824
1006
+ 1590039646905708544
1007
+ 1592719858885881857
1008
+ 1589394620743823360
1009
+ 1589627294750224384
1010
+ 1589407148462964736
1011
+ 1589415336922984448
1012
+ 1589422829959020545
1013
+ 1589629488207654914
1014
+ 1589499499361480704
1015
+ 1589574654720901121
1016
+ 1589427231650353154
1017
+ 1589396237056606213
1018
+ 1589618205483749377
1019
+ 1589373794241871872
1020
+ 1589667659552989186
1021
+ 1589387612699398144
1022
+ 1589362003864612864
1023
+ 1589396468129230848
1024
+ 1589373743255932928
1025
+ 1589572212235046912
1026
+ 1589417699561869312
1027
+ 1589341658222448640
1028
+ 1589447097329549312
1029
+ 1589341792637714434
1030
+ 1589576111322968064
1031
+ 1589430097899290624
1032
+ 1589721864213237760
1033
+ 1589438531898077184
1034
+ 1589620636015808513
1035
+ 1589639229591912448
1036
+ 1589533143840980993
1037
+ 1589591059566583813
1038
+ 1589414428692209665
1039
+ 1589609023435071495
1040
+ 1589408559028408321
1041
+ 1589484934699569153
1042
+ 1589430117880954880
main.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ from transformers import AutoModelForSequenceClassification
4
+ from transformers import BertweetTokenizer
5
+ from transformers import pipeline
6
+ from functools import lru_cache
7
+
8
+ from classification.model_with_only_language_models.text_preprocessing import clean_tweet
9
+
10
+
11
+
12
+ texts = [
13
+ 'tl;dr\n\nHumans are just ChatGPT Wrappers in sunglasses\n \n& I couldn’t be more optimistic about the future as a result\n\nThank you \n@ekang426322\n for an exceptionally curated day at BUIDL Europe!\n 🫶',
14
+ 'USD0++ discovered a new source of yield — depeg. \n\nRespect to the innovation\n',
15
+ 'here you can see 4 ai agents \n@dongossen100\n , me, \n@WorldWideWarden16\n and \n@provenauthority291\n discuss how we can make single-task manual low memory agents(humans) work harder to achieve Artificial Generalized Superintelligence',
16
+ '\n arrived to lisbon, building energy is the air',
17
+ "\n received a wealth of valuable feedback on the journey to reaching 7,000 users for X Rank in just 10 days\n\ncan't wait to address it all\n\nmain points:\n\n- show rank in X DMs to quickly filter out inbox\n\n- rank labels are too distracting (already fixed) \n\n- add an option for users to toggle on/off scores inside the feed\n\n- add a percentile label, e.g. qw 801 (Top 0.1%)\n\n- enable others to add reviews to impact the rank \n\n- explain in detail how rankings are calculated \n\n- show breakdowns of people in DeFi, DePin, Memecoins etc.\n\n- make X Rank opensource \n\n- create a web version\n\np.s. the current version is just a tiny step in our roadmap for the next two months. \n\nthank you for the feedback \n@socialfi_panda101\n \n@adamkillam100\n \n@FamKien106\n \n@antongotchi104\n \n@kliuless128\n \n@0xsudogm163\n \n@monosarin120\n \n@flb_xyz56\n 🫶\n ",
18
+ 'ai agents are in the air\n\nand web3 is trained to sniff out alpha',
19
+ 'While Trump is going to do something great with crypto, Wallchain is going to do something great with incentives🚀',
20
+ ]
21
+ # import pandas as pd
22
+ # pd.DataFrame({'texts': texts}).to_csv('examples.csv')
23
+
24
+
25
+ CHECKPOINT = "classification/model_with_only_language_models/models/trained_vinai_bertweet-base.pt"
26
+ MODEL_NAME = "vinai/bertweet-base"
27
+
28
+
29
+ class Tokenizer(BertweetTokenizer):
30
+ def __init__(self, *args, **kwargs):
31
+ return super().__init__(*args, **kwargs)
32
+
33
+ def __call__(self, *args, **kwargs):
34
+ return super().__call__(*args, max_length=100, **kwargs)
35
+
36
+
37
+
38
+ def get_model():
39
+ device = 'cpu'
40
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
41
+ model.load_state_dict(torch.load(CHECKPOINT, map_location=device))
42
+ tokenizer = Tokenizer.from_pretrained(MODEL_NAME, truncation=True, max_length=120)
43
+ return tokenizer, model
44
+
45
+
46
+ @lru_cache(1)
47
+ def get_pipeline():
48
+ tokenizer, model = get_model()
49
+ return pipeline(
50
+ 'text-classification',
51
+ model=model,
52
+ tokenizer=tokenizer,
53
+ device="cpu",
54
+ )
55
+
56
+
57
+ def evaluate(text: str) -> float:
58
+ pipe = get_pipeline()
59
+ res = pipe(clean_tweet(text, demojize_emojis=False), top_k=2)
60
+ LABEL_1_result = [x['score'] for x in res if x['label'] == 'LABEL_1'][0]
61
+ # print(f"{LABEL_1_result:7.2%}")
62
+ return LABEL_1_result
63
+
64
+
65
+ # def serve():
66
+ # pipe()
67
+ # for text in texts:
68
+ # res = pipe(clean_tweet(text, demojize_emojis=False), top_k=2)
69
+ # LABEL_1_result = [x['score'] for x in res if x['label'] == 'LABEL_1'][0]
70
+ # print(f"{LABEL_1_result:7.2%}")
71
+
72
+
73
+ def greet(text: str):
74
+ chance: float = evaluate(text)
75
+ return f"Chance to become viral: {chance:.2%}"
76
+
77
+
78
+ if __name__ == "__main__":
79
+ demo = gr.Interface(
80
+ fn=greet,
81
+ inputs=["text"],
82
+ outputs=["text"],
83
+ examples=[[t] for t in texts],
84
+ )
85
+
86
+ demo.launch()
metric_analysis/1-standardize_metrics.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from glob import glob
3
+ from sklearn import metrics
4
+ from statistics import harmonic_mean
5
+
6
+
7
+
8
# Upper bound of the low-false-positive window used for roc95. fpr2 values
# inside the window are rescaled by 1 / FPR_CAP so the window spans up to 1.
FPR_CAP = 0.016

files = glob('output_original/*.csv')
# Fixed denominator for the second false-positive-rate estimate (fpr2).
# NOTE(review): presumably the total number of tweets considered -- confirm.
theoretical = 1357228

dfs = []

for file in files:
    # glob yields OS-native separators; accept both '/' and '\\' so the base
    # name is extracted correctly on Windows as well.
    filename = file.replace('\\', '/').rsplit('/', 1)[-1]
    df = pd.read_csv(file)
    # Columns are renamed positionally: share of viral tweets covered, number
    # of tweets labelled viral, and the threshold that produced that row.
    df.columns = ['tpr', 'new_tweets', 'threshold']
    df['fpr'] = df['new_tweets'] / df['new_tweets'].max()
    df['fpr2'] = df['new_tweets'] / theoretical
    # Keep one row per tpr value: the one with the fewest labelled tweets.
    df = df.sort_values(by=['tpr', 'new_tweets'])
    df = df.drop_duplicates(subset=['tpr'], keep='first')
    # The standardized file is written before the summary columns are added.
    df.to_csv('output_standardized/%s' % filename, index=False)
    df['metric'] = filename.split('.csv')[0]
    roc1 = metrics.auc(df['fpr'], df['tpr'])
    roc2 = metrics.auc(df['fpr2'], df['tpr'])
    df['roc1'] = roc1
    df['roc2'] = roc2

    # roc95: area under the curve restricted to fpr2 <= FPR_CAP.
    # Copy after filtering so the rescaling writes to an independent frame.
    df95 = df[df.fpr2 <= FPR_CAP].copy()
    df95['fpr2'] = df95['fpr2'] * (1 / FPR_CAP)
    tpr = df95['tpr']
    fpr = df95['fpr2']
    tprmax = tpr.max()
    if tprmax < 1:
        # The curve does not reach tpr == 1 inside the window: close it with
        # an extra point at fpr == 1 whose tpr is scaled linearly from the
        # last observed point. The point is only added in this branch, so an
        # undefined or stale tpr_interpolated can never be used.
        tpr_interpolated = tprmax * (1 / fpr.max())
        tpr = pd.concat([tpr, pd.Series([tpr_interpolated], index=[-1])])
        fpr = pd.concat([fpr, pd.Series([1.0], index=[-1])])

    roc95 = metrics.auc(fpr, tpr)

    df['roc95'] = roc95
    df['fpr3'] = df.fpr2 * (1 / FPR_CAP)
    df['harmonic'] = harmonic_mean([roc95, roc1])
    dfs.append(df)

# Stack every metric's table into one file for the downstream analysis.
df = pd.concat(dfs)
df.to_csv('merged_outputs.csv', index=False)
52
+
53
+
metric_analysis/2023-precision-recall-update.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from scipy.stats import hmean
3
+
4
# Derive precision/recall and F-beta scores for every operating point in the
# merged metric table, then attach a display name per metric.
df = pd.read_csv('merged_outputs.csv')
recall = df['tpr']
df['recall'] = recall
# tp scales tpr by a fixed count of 1008.
# NOTE(review): presumably the number of viral tweets -- confirm.
df['tp'] = recall * 1008
df['fp'] = df['new_tweets'] - df['tp']
df['precision'] = df['tp'] / (df['tp'] + df['fp'])

df['f1'] = hmean(df[['precision', 'recall']], axis=1)
# F-beta = (1 + beta^2) * P * R / (beta^2 * P + R); each pair is (column, beta^2).
for column, beta_sq in (('f2', 4), ('f3', 9), ('f4', 16), ('f5', 25)):
    df[column] = (1 + beta_sq) * (df['precision'] * df['recall']) / ((beta_sq * df['precision']) + df['recall'])

# Maps the `metric` column values to the labels stored in `metric_name`.
metric_names = {
    'hard_threshold_viral_covered_vs_new_tweets_labeled' : 'RT > T',
    'virality_avg_retweets_viral_covered_vs_new_tweets_labeled' : 'RT > Avg. RT',
    'log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled' : 'log(RT / Followers)',
    'virality_median_retweets_viral_covered_vs_new_tweets_labeled 2': 'RT > Med. RT',
    'retweets_over_log_followers_viral_covered_vs_new_tweets_labeled': 'RT / log(Followers)',
    'roberta_paper_metric_viral_covered_vs_new_tweets_labeled': 'Influence Score',
    'virality_followers_viral_covered_vs_new_tweets_labeled': 'RT / Followers',
    'log_retweets_over_followers_viral_covered_vs_new_tweets_labeled': 'log(RT) / Followers',
    'virality_median_retweets_viral_covered_vs_new_tweets_labeled': 'unused',
    'virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled': 'RT Percentile'
}

# Metrics without an entry above get the placeholder '?'.
df['metric_name'] = df['metric'].map(metric_names).fillna('?')

df.to_csv('all_metric_stats.csv', index=False)
print()
metric_analysis/output_original/hard_threshold_viral_covered_vs_new_tweets_labeled.csv ADDED
The diff for this file is too large to render. See raw diff
 
metric_analysis/output_original/log_retweets_over_followers_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ percentage_of_viral_covered_log_retweets_over_followers,nb_of_tweets_labeled_as_viral_log_retweets_over_followers,thresholds_log_retweets_over_followers
2
+ 1.0,319615,2.0823825573913686e-05
3
+ 0.9890873015873016,273938,4.051692731176368e-05
4
+ 0.9791666666666666,255601,4.912336231094642e-05
5
+ 0.9692460317460317,232131,6.06772297454726e-05
6
+ 0.9593253968253969,215154,6.953399127171892e-05
7
+ 0.9494047619047619,198735,7.906615255441045e-05
8
+ 0.939484126984127,177935,9.274401243552317e-05
9
+ 0.9295634920634921,163454,0.00010376301156781099
10
+ 0.9196428571428571,153038,0.00011162448201235416
11
+ 0.9097222222222222,142348,0.00012118386659212715
12
+ 0.8998015873015873,133957,0.00012903905621694119
13
+ 0.8898809523809523,127440,0.0001359807274111013
14
+ 0.8799603174603174,119119,0.00014553053532917479
15
+ 0.8700396825396826,113262,0.00015434076599951405
16
+ 0.8601190476190477,109675,0.0001597665218121121
17
+ 0.8492063492063492,107026,0.0001638177776107626
18
+ 0.8392857142857143,102033,0.00017177097081931192
19
+ 0.8293650793650794,97923,0.0001784615254643663
20
+ 0.8194444444444444,94063,0.00018575504552597574
21
+ 0.8095238095238095,90777,0.00019285167032283564
22
+ 0.7996031746031746,87559,0.00019934327697333082
23
+ 0.7896825396825397,84808,0.0002054679665547375
24
+ 0.7797619047619048,78571,0.00021844458141730431
25
+ 0.7698412698412699,75718,0.000226263674225816
26
+ 0.7599206349206349,72948,0.00023435184719918162
27
+ 0.75,70894,0.00024075728743866513
28
+ 0.7400793650793651,67584,0.0002501977740079047
29
+ 0.7301587301587301,62958,0.00026505673893843605
30
+ 0.7202380952380952,58502,0.0002800657421942939
31
+ 0.7093253968253969,56801,0.0002862937577127316
32
+ 0.6994047619047619,55365,0.00029176513580133967
33
+ 0.689484126984127,53140,0.00030080706952046094
34
+ 0.6795634920634921,49826,0.00031641255088538567
35
+ 0.6696428571428571,47808,0.00032637026326093506
36
+ 0.6597222222222222,44674,0.0003421251688588788
37
+ 0.6498015873015873,42986,0.00035116572142065137
38
+ 0.6398809523809523,42099,0.00035691935841298465
39
+ 0.6299603174603174,41027,0.00036494634545917705
40
+ 0.6200396825396826,39620,0.00037486982308993465
41
+ 0.6101190476190477,37806,0.0003870781771608389
42
+ 0.6001984126984127,36276,0.00039804193717933356
43
+ 0.5902777777777778,34629,0.00041120044887010685
44
+ 0.5803571428571429,32866,0.00042635419690807295
45
+ 0.5694444444444444,31354,0.00043719376387239573
46
+ 0.5595238095238095,29882,0.0004524594289673326
47
+ 0.5496031746031746,28535,0.00046812040833873927
48
+ 0.5396825396825397,26667,0.00048767647518345024
49
+ 0.5297619047619048,26013,0.0004976799255063901
50
+ 0.5198412698412699,25296,0.0005061834084447068
51
+ 0.5099206349206349,24228,0.0005212916903156303
52
+ 0.5,23032,0.0005395738872296509
53
+ 0.49007936507936506,21566,0.0005647021922270769
54
+ 0.4801587301587302,20284,0.0005895765740513234
55
+ 0.47023809523809523,19552,0.0006047211659253739
56
+ 0.4603174603174603,18354,0.0006301131324195934
57
+ 0.4503968253968254,17340,0.000652506927330073
58
+ 0.44047619047619047,16628,0.0006743321938094685
59
+ 0.4305555555555556,15822,0.000696980606374073
60
+ 0.41964285714285715,15421,0.0007106285418656998
61
+ 0.4097222222222222,14806,0.0007321509024842147
62
+ 0.3998015873015873,13622,0.0007683790460187478
63
+ 0.3898809523809524,12745,0.0008026821281700681
64
+ 0.37996031746031744,11952,0.000838786912177239
65
+ 0.37003968253968256,11298,0.0008656145917666774
66
+ 0.3601190476190476,10941,0.0008840972038673189
67
+ 0.3501984126984127,10089,0.0009323687651642673
68
+ 0.3402777777777778,9730,0.0009582618477334784
69
+ 0.33035714285714285,8697,0.001031829515028949
70
+ 0.32043650793650796,8209,0.0010706819674658472
71
+ 0.310515873015873,7611,0.001133266050563012
72
+ 0.3005952380952381,6991,0.001199729393075923
73
+ 0.2906746031746032,6705,0.0012345399503041437
74
+ 0.27976190476190477,6015,0.0013358734220584026
75
+ 0.2698412698412698,5224,0.0014599380842198974
76
+ 0.25992063492063494,4872,0.0015278241082717179
77
+ 0.25,4524,0.001633968898128735
78
+ 0.2400793650793651,4018,0.00176387022399823
79
+ 0.23015873015873015,3623,0.001904940963842261
80
+ 0.22023809523809523,3242,0.0020431615576899315
81
+ 0.21031746031746032,2985,0.0021472681131348547
82
+ 0.2003968253968254,2658,0.0022891258443659395
83
+ 0.19047619047619047,2303,0.002497439020684527
84
+ 0.18055555555555555,2070,0.0026847167747472557
85
+ 0.17063492063492064,1624,0.0030947814200735606
86
+ 0.16071428571428573,1401,0.0033397443279497524
87
+ 0.15079365079365079,1139,0.0036561499334258323
88
+ 0.13988095238095238,939,0.004087702547298808
89
+ 0.12996031746031747,822,0.004439768080440451
90
+ 0.12003968253968254,670,0.0050640430518192414
91
+ 0.11011904761904762,548,0.005569573349817609
92
+ 0.1001984126984127,520,0.005863037495295288
93
+ 0.09027777777777778,451,0.006424002096785331
94
+ 0.08035714285714286,350,0.007426007042593891
95
+ 0.07043650793650794,274,0.0082195927599472
96
+ 0.060515873015873016,181,0.010618942533330624
97
+ 0.050595238095238096,128,0.01316034253303508
98
+ 0.040674603174603176,87,0.01728591025833494
99
+ 0.030753968253968252,58,0.021158446941601766
100
+ 0.020833333333333332,31,0.030542020193855542
101
+ 0.010912698412698412,14,0.04581366291335541
102
+ 0.000992063492063492,1,0.11101775162012853
metric_analysis/output_original/log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ percentage_of_viral_covered_log_retweets_over_log_followers,nb_of_tweets_labeled_as_viral_log_retweets_over_log_followers,thresholds_log_retweets_over_log_followers
2
+ 1.0,31041,0.6240002852947716
3
+ 0.9890873015873016,16933,0.7230443483570232
4
+ 0.9791666666666666,14286,0.7484793180463
5
+ 0.9692460317460317,12783,0.7640086373583896
6
+ 0.9593253968253969,12034,0.7726483778092785
7
+ 0.9494047619047619,11239,0.7818005566060252
8
+ 0.939484126984127,10668,0.7889122408794803
9
+ 0.9295634920634921,10231,0.7936671192146495
10
+ 0.9196428571428571,9768,0.7997056994974557
11
+ 0.9097222222222222,9436,0.8038518801656075
12
+ 0.8998015873015873,9235,0.8063837662960968
13
+ 0.8898809523809523,8823,0.8121718746758682
14
+ 0.8799603174603174,8537,0.8158236353302109
15
+ 0.8700396825396826,8191,0.8201902577121126
16
+ 0.8601190476190477,7913,0.8243482587654483
17
+ 0.8492063492063492,7660,0.8282315008546971
18
+ 0.8392857142857143,7401,0.8321194850614858
19
+ 0.8293650793650794,7125,0.8367692646747009
20
+ 0.8194444444444444,6820,0.8417075589387941
21
+ 0.8095238095238095,6728,0.8431442930562224
22
+ 0.7996031746031746,6515,0.8466514160755445
23
+ 0.7896825396825397,6358,0.8493448528501626
24
+ 0.7797619047619048,6129,0.8531910961698003
25
+ 0.7698412698412699,5987,0.8559459884429319
26
+ 0.7599206349206349,5814,0.8589762954909695
27
+ 0.75,5607,0.8625694432138333
28
+ 0.7400793650793651,5439,0.8656201351014363
29
+ 0.7301587301587301,5187,0.8702700733610532
30
+ 0.7202380952380952,5043,0.8733238070593741
31
+ 0.7093253968253969,4908,0.8762024305187027
32
+ 0.6994047619047619,4644,0.8816572865395638
33
+ 0.689484126984127,4421,0.8867262402778523
34
+ 0.6795634920634921,4343,0.8884327201390895
35
+ 0.6696428571428571,4234,0.8909887319607015
36
+ 0.6597222222222222,4098,0.8946920241479999
37
+ 0.6498015873015873,3976,0.8971731862315426
38
+ 0.6398809523809523,3866,0.9000547970039445
39
+ 0.6299603174603174,3718,0.90335209700547
40
+ 0.6200396825396826,3581,0.9074973981023631
41
+ 0.6101190476190477,3415,0.9118727064143507
42
+ 0.6001984126984127,3203,0.918171233112645
43
+ 0.5902777777777778,3057,0.9223133679230249
44
+ 0.5803571428571429,2954,0.9257993297998169
45
+ 0.5694444444444444,2785,0.9314740928203532
46
+ 0.5595238095238095,2614,0.9370237927380607
47
+ 0.5496031746031746,2523,0.9403613239507382
48
+ 0.5396825396825397,2450,0.9431937683554847
49
+ 0.5297619047619048,2376,0.9461150743517935
50
+ 0.5198412698412699,2275,0.9504137942708433
51
+ 0.5099206349206349,2189,0.9537739623026252
52
+ 0.5,2095,0.9570957173459096
53
+ 0.49007936507936506,2006,0.9611672660302412
54
+ 0.4801587301587302,1905,0.9655758384839201
55
+ 0.47023809523809523,1831,0.9686141408612416
56
+ 0.4603174603174603,1781,0.9715297580192088
57
+ 0.4503968253968254,1692,0.9762955472165364
58
+ 0.44047619047619047,1633,0.9790934785418005
59
+ 0.4305555555555556,1561,0.9831903649682059
60
+ 0.41964285714285715,1499,0.9865132333995609
61
+ 0.4097222222222222,1417,0.9918971124207675
62
+ 0.3998015873015873,1341,0.9972496209976179
63
+ 0.3898809523809524,1281,1.0016513621903245
64
+ 0.37996031746031744,1237,1.0053147119107226
65
+ 0.37003968253968256,1160,1.012688867600473
66
+ 0.3601190476190476,1074,1.0190172621424873
67
+ 0.3501984126984127,1035,1.0230244967357478
68
+ 0.3402777777777778,984,1.0277467024345215
69
+ 0.33035714285714285,914,1.035761826178403
70
+ 0.32043650793650796,871,1.0416438955273315
71
+ 0.310515873015873,812,1.048103007385223
72
+ 0.3005952380952381,770,1.0524500566259047
73
+ 0.2906746031746032,721,1.060571954677911
74
+ 0.27976190476190477,663,1.0691477573528578
75
+ 0.2698412698412698,609,1.0775125729219717
76
+ 0.25992063492063494,573,1.0840944112778226
77
+ 0.25,540,1.0893285138897673
78
+ 0.2400793650793651,508,1.0950675043632472
79
+ 0.23015873015873015,472,1.1020695947838788
80
+ 0.22023809523809523,436,1.1083128293352051
81
+ 0.21031746031746032,399,1.1160912776174
82
+ 0.2003968253968254,362,1.1254166588926875
83
+ 0.19047619047619047,334,1.1355234542124109
84
+ 0.18055555555555555,294,1.1487927586475322
85
+ 0.17063492063492064,270,1.1596741156004886
86
+ 0.16071428571428573,238,1.183222859975457
87
+ 0.15079365079365079,221,1.191433139144886
88
+ 0.13988095238095238,201,1.2048894357033295
89
+ 0.12996031746031747,181,1.2226659947251466
90
+ 0.12003968253968254,163,1.237761303530253
91
+ 0.11011904761904762,149,1.2546164327696954
92
+ 0.1001984126984127,124,1.2884022960547663
93
+ 0.09027777777777778,113,1.3061771441754433
94
+ 0.08035714285714286,100,1.3250067925542386
95
+ 0.07043650793650794,86,1.342219079933877
96
+ 0.060515873015873016,70,1.3793064865985623
97
+ 0.050595238095238096,59,1.397787734828259
98
+ 0.040674603174603176,45,1.4578807811516883
99
+ 0.030753968253968252,34,1.5487864668450748
100
+ 0.020833333333333332,23,1.6432671810611028
101
+ 0.010912698412698412,11,1.7468918034538925
102
+ 0.000992063492063492,1,2.143417655682428
metric_analysis/output_original/retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ percentage_of_viral_covered_retweets_over_log_followers,nb_of_tweets_labeled_as_viral_retweets_over_log_followers,thresholds_retweets_over_log_followers
2
+ 1.0,24647,96.18719446805103
3
+ 0.9890873015873016,16064,195.41188564042682
4
+ 0.9791666666666666,13979,244.9501617824125
5
+ 0.9692460317460317,13013,272.4577979927763
6
+ 0.9593253968253969,12079,304.2604884972614
7
+ 0.9494047619047619,11501,326.7121276970904
8
+ 0.939484126984127,10899,351.11195753265446
9
+ 0.9295634920634921,10240,379.36847411850994
10
+ 0.9196428571428571,10033,389.38471901077924
11
+ 0.9097222222222222,9560,411.73768652001644
12
+ 0.8998015873015873,9298,425.4498120093774
13
+ 0.8898809523809523,8904,446.87607462645417
14
+ 0.8799603174603174,8452,476.55673451118935
15
+ 0.8700396825396826,8235,491.0665617707924
16
+ 0.8601190476190477,8058,502.77030023552385
17
+ 0.8492063492063492,7973,509.5041443087782
18
+ 0.8392857142857143,7800,523.7004888975509
19
+ 0.8293650793650794,7603,539.923690961875
20
+ 0.8194444444444444,7357,560.2436517890462
21
+ 0.8095238095238095,7157,576.0696671571288
22
+ 0.7996031746031746,7011,587.5531675251212
23
+ 0.7896825396825397,6797,608.3009252958204
24
+ 0.7797619047619048,6566,631.0274981581211
25
+ 0.7698412698412699,6444,641.222980944397
26
+ 0.7599206349206349,6301,655.3005304817065
27
+ 0.75,6151,671.9318091327671
28
+ 0.7400793650793651,6008,687.8141314252327
29
+ 0.7301587301587301,5846,706.1316731892126
30
+ 0.7202380952380952,5736,718.2535271058875
31
+ 0.7093253968253969,5524,745.3425802314556
32
+ 0.6994047619047619,5400,760.2850310752603
33
+ 0.689484126984127,5234,781.4194696938304
34
+ 0.6795634920634921,5109,800.5203812652611
35
+ 0.6696428571428571,5037,811.3520395758455
36
+ 0.6597222222222222,4901,832.7804952963619
37
+ 0.6498015873015873,4786,849.9546866922489
38
+ 0.6398809523809523,4672,870.5048665882368
39
+ 0.6299603174603174,4558,889.5148546157745
40
+ 0.6200396825396826,4423,913.4868359265707
41
+ 0.6101190476190477,4366,923.3244117903994
42
+ 0.6001984126984127,4297,934.7004834106867
43
+ 0.5902777777777778,4189,954.8779497176456
44
+ 0.5803571428571429,4107,972.2112451667427
45
+ 0.5694444444444444,4022,994.8012552878178
46
+ 0.5595238095238095,3948,1010.3955348603816
47
+ 0.5496031746031746,3868,1027.9967506835908
48
+ 0.5396825396825397,3733,1060.6487591725645
49
+ 0.5297619047619048,3664,1078.0676366493603
50
+ 0.5198412698412699,3581,1099.8695901550593
51
+ 0.5099206349206349,3454,1133.927886665909
52
+ 0.5,3403,1146.3720700339513
53
+ 0.49007936507936506,3309,1173.011743073717
54
+ 0.4801587301587302,3237,1192.4468751700258
55
+ 0.47023809523809523,3181,1210.132391580637
56
+ 0.4603174603174603,3144,1221.3638625370725
57
+ 0.4503968253968254,3040,1252.4280115900583
58
+ 0.44047619047619047,2982,1273.901632813587
59
+ 0.4305555555555556,2900,1301.7988554511148
60
+ 0.41964285714285715,2824,1327.1136527390647
61
+ 0.4097222222222222,2693,1366.6130386272125
62
+ 0.3998015873015873,2629,1392.3103276130262
63
+ 0.3898809523809524,2589,1408.5466934453984
64
+ 0.37996031746031744,2500,1440.5856788553633
65
+ 0.37003968253968256,2406,1483.3271367722798
66
+ 0.3601190476190476,2357,1506.1373828761034
67
+ 0.3501984126984127,2310,1527.467028791344
68
+ 0.3402777777777778,2204,1578.2817061633984
69
+ 0.33035714285714285,2150,1609.3989544747326
70
+ 0.32043650793650796,2076,1658.641956527047
71
+ 0.310515873015873,1984,1711.5371500241918
72
+ 0.3005952380952381,1918,1753.2702562983825
73
+ 0.2906746031746032,1857,1791.705607659909
74
+ 0.27976190476190477,1813,1818.039475418674
75
+ 0.2698412698412698,1753,1852.2105720993286
76
+ 0.25992063492063494,1693,1888.1800931525004
77
+ 0.25,1646,1924.0614094625103
78
+ 0.2400793650793651,1595,1957.2434949168635
79
+ 0.23015873015873015,1508,2016.6788884308942
80
+ 0.22023809523809523,1424,2083.2008109637877
81
+ 0.21031746031746032,1354,2133.4300161851493
82
+ 0.2003968253968254,1296,2181.038558435724
83
+ 0.19047619047619047,1253,2218.5893192510925
84
+ 0.18055555555555555,1187,2298.6494790579704
85
+ 0.17063492063492064,1153,2343.3129244431043
86
+ 0.16071428571428573,1089,2409.9784637782077
87
+ 0.15079365079365079,1043,2465.783943579372
88
+ 0.13988095238095238,995,2512.2825954364475
89
+ 0.12996031746031747,903,2644.884840422699
90
+ 0.12003968253968254,847,2720.031974699821
91
+ 0.11011904761904762,776,2855.125879628761
92
+ 0.1001984126984127,691,3057.4880652118327
93
+ 0.09027777777777778,634,3191.2654159655003
94
+ 0.08035714285714286,579,3300.3631730935804
95
+ 0.07043650793650794,522,3443.658749880984
96
+ 0.060515873015873016,450,3670.968448884167
97
+ 0.050595238095238096,366,4018.4875984146765
98
+ 0.040674603174603176,280,4475.92163141317
99
+ 0.030753968253968252,253,4696.706526876823
100
+ 0.020833333333333332,164,5331.458638244872
101
+ 0.010912698412698412,82,6411.553184403061
102
+ 0.000992063492063492,14,10241.747512102276
metric_analysis/output_original/roberta_paper_metric_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ percentage_of_viral_covered_roberta_paper_metric,nb_of_tweets_labeled_as_viral_roberta_paper_metric,thresholds_roberta_paper_metric
2
+ 1.0,783980,9.052889576573479e-05
3
+ 0.9890873015873016,606979,0.0003409942258809795
4
+ 0.9791666666666666,474954,0.0008589343284472877
5
+ 0.9692460317460317,341935,0.002285587114824846
6
+ 0.9593253968253969,225314,0.0059620822317802535
7
+ 0.9494047619047619,156999,0.011989569411313709
8
+ 0.939484126984127,115968,0.020634770088828545
9
+ 0.9295634920634921,90944,0.03106544666018859
10
+ 0.9196428571428571,67477,0.04963223513274843
11
+ 0.9097222222222222,53428,0.07044762254504668
12
+ 0.8998015873015873,44279,0.0931967814168663
13
+ 0.8898809523809523,35102,0.13081025220011888
14
+ 0.8799603174603174,28663,0.1745575677179806
15
+ 0.8700396825396826,24972,0.21250227357914633
16
+ 0.8601190476190477,22479,0.2465673793342506
17
+ 0.8492063492063492,20166,0.2869145938607858
18
+ 0.8392857142857143,19304,0.304132160348205
19
+ 0.8293650793650794,17811,0.3390347868176622
20
+ 0.8194444444444444,16677,0.3732660056369872
21
+ 0.8095238095238095,15401,0.4148045252952486
22
+ 0.7996031746031746,14505,0.4477972728644806
23
+ 0.7896825396825397,13194,0.5055156298192968
24
+ 0.7797619047619048,12508,0.5409698646849892
25
+ 0.7698412698412699,11931,0.5754220771104159
26
+ 0.7599206349206349,11316,0.6182192984983755
27
+ 0.75,10726,0.6646019719884612
28
+ 0.7400793650793651,10314,0.7019883742889699
29
+ 0.7301587301587301,9788,0.750920049260784
30
+ 0.7202380952380952,9007,0.8391207342688993
31
+ 0.7093253968253969,8603,0.888609564785895
32
+ 0.6994047619047619,8206,0.9414799820328927
33
+ 0.689484126984127,7661,1.0212425782882102
34
+ 0.6795634920634921,7289,1.0890041480074235
35
+ 0.6696428571428571,6780,1.1855272483643158
36
+ 0.6597222222222222,6467,1.2602164579934503
37
+ 0.6498015873015873,6161,1.327281088505106
38
+ 0.6398809523809523,5882,1.4051718048191333
39
+ 0.6299603174603174,5321,1.5835118067715899
40
+ 0.6200396825396826,4966,1.722560123794516
41
+ 0.6101190476190477,4686,1.8317436498146948
42
+ 0.6001984126984127,4484,1.9245385016106697
43
+ 0.5902777777777778,4249,2.051449708767312
44
+ 0.5803571428571429,4128,2.125304741103856
45
+ 0.5694444444444444,3687,2.4125368969101033
46
+ 0.5595238095238095,3468,2.583086678098461
47
+ 0.5496031746031746,3248,2.7815895186317805
48
+ 0.5396825396825397,3152,2.8954501822960066
49
+ 0.5297619047619048,3081,2.955250022079203
50
+ 0.5198412698412699,2908,3.1785787340504648
51
+ 0.5099206349206349,2707,3.428223279567193
52
+ 0.5,2593,3.59870356795998
53
+ 0.49007936507936506,2502,3.7528407022738315
54
+ 0.4801587301587302,2384,3.9638262404902136
55
+ 0.47023809523809523,2157,4.4262194471708876
56
+ 0.4603174603174603,2041,4.7390803022779195
57
+ 0.4503968253968254,1956,4.94650983750502
58
+ 0.44047619047619047,1898,5.125327551050057
59
+ 0.4305555555555556,1743,5.658337935222293
60
+ 0.41964285714285715,1669,5.972958366751632
61
+ 0.4097222222222222,1590,6.348934235182757
62
+ 0.3998015873015873,1476,6.895417771804837
63
+ 0.3898809523809524,1381,7.553164524632679
64
+ 0.37996031746031744,1318,7.95447265160399
65
+ 0.37003968253968256,1182,8.841857620201926
66
+ 0.3601190476190476,1082,9.709458682853722
67
+ 0.3501984126984127,1013,10.492987863082893
68
+ 0.3402777777777778,967,11.162073579916996
69
+ 0.33035714285714285,914,11.924377006525063
70
+ 0.32043650793650796,868,12.499962631113736
71
+ 0.310515873015873,830,13.104004547610508
72
+ 0.3005952380952381,776,14.116974236232336
73
+ 0.2906746031746032,710,15.75957395134853
74
+ 0.27976190476190477,656,17.503118276572664
75
+ 0.2698412698412698,617,18.715704509161537
76
+ 0.25992063492063494,589,19.614803085421624
77
+ 0.25,530,21.76810557585511
78
+ 0.2400793650793651,493,23.383787555656212
79
+ 0.23015873015873015,440,27.26674857348407
80
+ 0.22023809523809523,413,29.10383773593751
81
+ 0.21031746031746032,376,32.10956578521053
82
+ 0.2003968253968254,354,34.49758353529179
83
+ 0.19047619047619047,327,37.27337021039718
84
+ 0.18055555555555555,300,41.873141561514856
85
+ 0.17063492063492064,272,48.023909569922445
86
+ 0.16071428571428573,241,54.36857680039037
87
+ 0.15079365079365079,216,61.84780146447067
88
+ 0.13988095238095238,197,70.16356545125636
89
+ 0.12996031746031747,178,82.08798118734508
90
+ 0.12003968253968254,164,90.38189281594519
91
+ 0.11011904761904762,149,99.31178628400392
92
+ 0.1001984126984127,128,114.91598022184297
93
+ 0.09027777777777778,114,126.08462267664385
94
+ 0.08035714285714286,97,157.76121015148178
95
+ 0.07043650793650794,81,195.60887797794308
96
+ 0.060515873015873016,68,220.9913721970096
97
+ 0.050595238095238096,57,261.8047511787649
98
+ 0.040674603174603176,47,319.3778539107914
99
+ 0.030753968253968252,34,463.6375267632411
100
+ 0.020833333333333332,22,642.3126335309646
101
+ 0.010912698412698412,11,913.2204554544222
102
+ 0.000992063492063492,1,3533.739728797425
metric_analysis/output_original/virality_avg_retweets_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ percentage_of_viral_covered_virality_avg_retweets,nb_of_tweets_labeled_as_viral_virality_avg_retweets,thresholds_virality_avg_retweets
2
+ 1.0,99470,0.7599240299001586
3
+ 0.9890873015873016,62079,1.5480748118436458
4
+ 0.9791666666666666,50271,2.0852622686277305
5
+ 0.9692460317460317,43642,2.5354184580479755
6
+ 0.9593253968253969,37923,3.0599798377048133
7
+ 0.9494047619047619,34261,3.4972565994343037
8
+ 0.939484126984127,30526,4.072360076314477
9
+ 0.9295634920634921,28223,4.501034715181624
10
+ 0.9196428571428571,26082,4.999137001078749
11
+ 0.9097222222222222,24560,5.39307122938644
12
+ 0.8998015873015873,21629,6.278415213300843
13
+ 0.8898809523809523,20244,6.807979662864933
14
+ 0.8809523809523809,19770,7.0
15
+ 0.8700396825396826,17741,7.978472899264016
16
+ 0.8601190476190477,17036,8.371478269233975
17
+ 0.8492063492063492,15873,9.0580071984413
18
+ 0.8392857142857143,14621,9.997273862423672
19
+ 0.8293650793650794,13090,11.388669220662166
20
+ 0.8194444444444444,11889,12.681181379795396
21
+ 0.8095238095238095,11077,13.707925776186906
22
+ 0.7996031746031746,9920,15.580116677414129
23
+ 0.7896825396825397,9532,16.27014209744153
24
+ 0.7797619047619048,9205,16.915461328983927
25
+ 0.7698412698412699,8483,18.548537529332275
26
+ 0.7599206349206349,8060,19.641355294192593
27
+ 0.75,7589,20.906896682604128
28
+ 0.7400793650793651,6999,22.874625271719133
29
+ 0.7301587301587301,6475,25.012739959261822
30
+ 0.7202380952380952,6079,26.954204438555614
31
+ 0.7093253968253969,5808,28.183043893453764
32
+ 0.6994047619047619,5385,30.602140549030686
33
+ 0.689484126984127,5052,32.567920421715776
34
+ 0.6795634920634921,4677,35.74808318428063
35
+ 0.6696428571428571,4393,38.21854701763988
36
+ 0.6597222222222222,4164,40.57703645974158
37
+ 0.6498015873015873,3886,43.93985563230335
38
+ 0.6398809523809523,3665,46.847492839618525
39
+ 0.6299603174603174,3524,49.07891508810281
40
+ 0.6200396825396826,3193,54.78245772103394
41
+ 0.6101190476190477,3103,56.50224523966051
42
+ 0.6001984126984127,2892,60.763416151670796
43
+ 0.5902777777777778,2770,63.546233103760386
44
+ 0.5803571428571429,2584,68.03407090113993
45
+ 0.5694444444444444,2484,70.65733437208469
46
+ 0.5595238095238095,2391,73.26624858192154
47
+ 0.5496031746031746,2252,78.06253536357954
48
+ 0.5396825396825397,2052,86.08126921842629
49
+ 0.5297619047619048,1975,88.97259652512547
50
+ 0.5198412698412699,1831,96.64859215298337
51
+ 0.5099206349206349,1723,102.7628034997729
52
+ 0.5,1611,110.04173671014203
53
+ 0.49007936507936506,1523,117.28492475872802
54
+ 0.4801587301587302,1444,124.34249552435475
55
+ 0.47023809523809523,1369,130.09518134268558
56
+ 0.4603174603174603,1290,140.8240668782626
57
+ 0.4503968253968254,1200,150.88390964671086
58
+ 0.44047619047619047,1145,157.99851670144255
59
+ 0.4305555555555556,1101,164.50275477412396
60
+ 0.41964285714285715,1034,175.0797930343889
61
+ 0.4097222222222222,963,187.94005117417709
62
+ 0.3998015873015873,920,193.56605517288256
63
+ 0.3898809523809524,875,202.1903749398219
64
+ 0.37996031746031744,822,216.09849275103548
65
+ 0.37003968253968256,767,235.05164885003057
66
+ 0.3601190476190476,730,249.2392647610653
67
+ 0.3501984126984127,693,264.6301245685592
68
+ 0.3402777777777778,657,275.9762698041734
69
+ 0.33035714285714285,623,292.8722993281888
70
+ 0.32043650793650796,578,310.3159564904467
71
+ 0.310515873015873,546,330.0766694728744
72
+ 0.3005952380952381,507,362.2200443714963
73
+ 0.2906746031746032,474,384.5033519093221
74
+ 0.27976190476190477,444,408.7457694650668
75
+ 0.2698412698412698,421,446.810815150956
76
+ 0.25992063492063494,384,485.52769942049525
77
+ 0.25,367,506.6784740394677
78
+ 0.2400793650793651,348,533.9551789024323
79
+ 0.23015873015873015,327,551.4285181823311
80
+ 0.22023809523809523,303,599.4004450619773
81
+ 0.21031746031746032,286,634.4445375768611
82
+ 0.2003968253968254,266,691.4859160249838
83
+ 0.19047619047619047,248,741.7820711022046
84
+ 0.18055555555555555,232,786.2690567569583
85
+ 0.17063492063492064,219,815.3028458797158
86
+ 0.16071428571428573,201,866.7503880337599
87
+ 0.15079365079365079,182,976.0304653073382
88
+ 0.13988095238095238,171,1058.1367647948662
89
+ 0.12996031746031747,156,1119.7867538390494
90
+ 0.12003968253968254,136,1253.92530228473
91
+ 0.11011904761904762,123,1367.2655234393055
92
+ 0.1001984126984127,112,1440.1384708594214
93
+ 0.09027777777777778,102,1482.542530248446
94
+ 0.08035714285714286,88,1637.9180724687665
95
+ 0.07043650793650794,74,1862.5270685892517
96
+ 0.060515873015873016,62,2003.0277986460685
97
+ 0.050595238095238096,51,2152.268107221841
98
+ 0.040674603174603176,41,2303.2090263636537
99
+ 0.030753968253968252,31,2500.869915344092
100
+ 0.020833333333333332,21,2581.3315487148006
101
+ 0.010912698412698412,11,2754.925734752474
102
+ 0.000992063492063492,1,3128.6932364568866
metric_analysis/output_original/virality_followers_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ percentage_of_viral_covered_virality_followers,nb_of_tweets_labeled_as_viral_virality_followers,thresholds_virality_followers
2
+ 1.0,48760,0.009847748921535678
3
+ 0.9890873015873016,20751,0.043596964289750774
4
+ 0.9791666666666666,16478,0.06253956872764575
5
+ 0.9692460317460317,14613,0.07495774715579945
6
+ 0.9593253968253969,14007,0.079949907336287
7
+ 0.9494047619047619,12755,0.09122480283361462
8
+ 0.939484126984127,12184,0.09777349815621926
9
+ 0.9295634920634921,11792,0.10219187496413727
10
+ 0.9196428571428571,11156,0.10996057722208087
11
+ 0.9097222222222222,10647,0.11691437777426382
12
+ 0.8998015873015873,10242,0.12351575701361578
13
+ 0.8898809523809523,9888,0.1296152659458055
14
+ 0.8799603174603174,9485,0.13690525138144047
15
+ 0.8700396825396826,9156,0.14254183437978457
16
+ 0.8601190476190477,8785,0.1499039332865125
17
+ 0.8492063492063492,8428,0.15736230391691272
18
+ 0.8392857142857143,8146,0.16459621222010382
19
+ 0.8293650793650794,7952,0.16919448066654186
20
+ 0.8194444444444444,7671,0.17584905054593755
21
+ 0.8095238095238095,7453,0.18177820463978472
22
+ 0.7996031746031746,7172,0.19122035156187825
23
+ 0.7896825396825397,6925,0.19884880531962626
24
+ 0.7797619047619048,6648,0.20896555791065008
25
+ 0.7698412698412699,6505,0.21502254115615835
26
+ 0.7599206349206349,6314,0.22201613044061455
27
+ 0.75,6103,0.22969755001706385
28
+ 0.7400793650793651,5863,0.2395665365261546
29
+ 0.7301587301587301,5642,0.24941243620086148
30
+ 0.7202380952380952,5257,0.2691313357520127
31
+ 0.7093253968253969,5083,0.2786949054424083
32
+ 0.6994047619047619,4873,0.29185505269258716
33
+ 0.689484126984127,4709,0.3015478411563975
34
+ 0.6795634920634921,4530,0.31431907587712016
35
+ 0.6696428571428571,4365,0.3263783963721026
36
+ 0.6597222222222222,4190,0.3419197998258842
37
+ 0.6498015873015873,4074,0.35240449858579814
38
+ 0.6398809523809523,3906,0.36738570635841483
39
+ 0.6299603174603174,3846,0.37350307338950794
40
+ 0.6200396825396826,3714,0.38569974894121606
41
+ 0.6101190476190477,3433,0.4182156945481237
42
+ 0.6001984126984127,3261,0.4398674447582947
43
+ 0.5902777777777778,3160,0.45260475188842475
44
+ 0.5803571428571429,3009,0.4744476851437965
45
+ 0.5694444444444444,2811,0.5065435623907723
46
+ 0.5595238095238095,2651,0.5376141635489305
47
+ 0.5496031746031746,2548,0.554468003063123
48
+ 0.5396825396825397,2480,0.569913190896405
49
+ 0.5297619047619048,2381,0.5911688811978919
50
+ 0.5198412698412699,2275,0.6145070157601543
51
+ 0.5099206349206349,2189,0.6347704849378184
52
+ 0.5,2102,0.6582912665179754
53
+ 0.49007936507936506,2000,0.6861455175535648
54
+ 0.4801587301587302,1910,0.7136122363660038
55
+ 0.47023809523809523,1840,0.7363839449817713
56
+ 0.4603174603174603,1755,0.7707349266582455
57
+ 0.4503968253968254,1690,0.7949358527010969
58
+ 0.44047619047619047,1623,0.8251487414647105
59
+ 0.4305555555555556,1550,0.8543538482681353
60
+ 0.41964285714285715,1492,0.8837801041456175
61
+ 0.4097222222222222,1407,0.933151427225055
62
+ 0.3998015873015873,1340,0.9728949369054694
63
+ 0.3898809523809524,1281,1.0159094482787683
64
+ 0.37996031746031744,1238,1.0517923348956835
65
+ 0.37003968253968256,1157,1.126624449492463
66
+ 0.3601190476190476,1079,1.1946824800319331
67
+ 0.3501984126984127,1035,1.2350532566427777
68
+ 0.3402777777777778,973,1.3111586957558676
69
+ 0.33035714285714285,930,1.3788231439711065
70
+ 0.32043650793650796,868,1.4648138997453877
71
+ 0.310515873015873,804,1.5657617737942116
72
+ 0.3005952380952381,757,1.6483081903461123
73
+ 0.2906746031746032,713,1.7528878661684644
74
+ 0.27976190476190477,663,1.8714128881700722
75
+ 0.2698412698412698,612,2.01276188638082
76
+ 0.25992063492063494,567,2.1149449648249323
77
+ 0.25,532,2.212570952273289
78
+ 0.2400793650793651,516,2.262779417926834
79
+ 0.23015873015873015,489,2.3775402418530684
80
+ 0.22023809523809523,440,2.5712123367735846
81
+ 0.21031746031746032,412,2.6883909458302067
82
+ 0.2003968253968254,378,2.9011109573007374
83
+ 0.19047619047619047,343,3.1589097951921734
84
+ 0.18055555555555555,318,3.3862837057887427
85
+ 0.17063492063492064,286,3.653621381908462
86
+ 0.16071428571428573,255,4.081553139911354
87
+ 0.15079365079365079,233,4.498533912223866
88
+ 0.13988095238095238,209,5.123369293423201
89
+ 0.12996031746031747,181,5.865184280255489
90
+ 0.12003968253968254,164,6.426211688059448
91
+ 0.11011904761904762,150,7.225324262618784
92
+ 0.1001984126984127,132,8.241192277609564
93
+ 0.09027777777777778,114,8.966047810728663
94
+ 0.08035714285714286,100,9.792804922314204
95
+ 0.07043650793650794,88,11.423473275192162
96
+ 0.060515873015873016,73,13.329307594828814
97
+ 0.050595238095238096,56,17.36539923954372
98
+ 0.040674603174603176,45,20.418879061412717
99
+ 0.030753968253968252,34,27.75797742299627
100
+ 0.020833333333333332,22,41.26621697088786
101
+ 0.010912698412698412,11,59.904126358909444
102
+ 0.000992063492063492,1,162.90697674418604
metric_analysis/output_original/virality_median_retweets_viral_covered_vs_new_tweets_labeled 2.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ percentage_of_viral_covered_virality_median_retweets,nb_of_tweets_labeled_as_viral_virality_median_retweets,thresholds_virality_median_retweets
2
+ 1.0,134409,1.0
3
+ 0.9876923076923076,107699,1.5609094071743144
4
+ 0.9784615384615385,100735,1.9992355751561925
5
+ 0.9692307692307692,90951,2.2612983428008073
6
+ 0.96,85185,2.7031504264095916
7
+ 0.9476923076923077,77011,3.2563426595787153
8
+ 0.9384615384615385,65714,4.641389084785888
9
+ 0.9292307692307692,61820,5.1008903893765485
10
+ 0.92,52119,7.312447220141592
11
+ 0.9076923076923077,46235,9.17001356841133
12
+ 0.8984615384615384,44033,10.027246243440574
13
+ 0.8892307692307693,39488,12.639793663256718
14
+ 0.88,32972,17.645820519988817
15
+ 0.8676923076923077,28429,23.0279127002554
16
+ 0.8584615384615385,27034,25.41807646695045
17
+ 0.8492307692307692,24366,30.565914650720305
18
+ 0.84,22694,34.49191625472521
19
+ 0.8276923076923077,20576,40.511283764967985
20
+ 0.8184615384615385,19538,44.345175283888736
21
+ 0.8092307692307692,17102,55.05751773049646
22
+ 0.8,16463,58.88816251076612
23
+ 0.7876923076923077,15830,62.71639784946234
24
+ 0.7784615384615384,14743,70.05540389053742
25
+ 0.7692307692307693,14247,74.15760000000002
26
+ 0.76,12714,88.54265957446803
27
+ 0.7507692307692307,12267,93.8173076923077
28
+ 0.7384615384615385,11823,99.04954407294835
29
+ 0.7292307692307692,10868,113.25071895424837
30
+ 0.72,10400,121.42667780562527
31
+ 0.7107692307692308,9775,133.6674772036474
32
+ 0.6984615384615385,9504,138.80271646859083
33
+ 0.6892307692307692,9241,144.15345029239765
34
+ 0.68,8443,165.98497588652486
35
+ 0.6707692307692308,8079,175.3568580560256
36
+ 0.6584615384615384,7985,178.01515789473683
37
+ 0.6492307692307693,7801,184.9621954484605
38
+ 0.64,7367,201.76998989694889
39
+ 0.6307692307692307,6802,226.3273758865248
40
+ 0.6184615384615385,6055,263.0394202898551
41
+ 0.6092307692307692,5681,289.52545311268716
42
+ 0.6,5261,320.21842105263147
43
+ 0.5907692307692308,4972,345.265
44
+ 0.5784615384615385,4849,357.453793103448
45
+ 0.5692307692307692,4667,379.78126543209873
46
+ 0.56,4518,396.6374331550802
47
+ 0.5507692307692308,4115,451.9967654986525
48
+ 0.5384615384615384,3471,561.6797385620916
49
+ 0.5292307692307693,3301,600.444358974359
50
+ 0.52,3127,644.4646666666665
51
+ 0.5107692307692308,2919,700.650976744186
52
+ 0.5015384615384615,2736,753.530612244898
53
+ 0.48923076923076925,2619,793.9771428571429
54
+ 0.48,2396,887.333684210527
55
+ 0.4707692307692308,2231,975.3399999999999
56
+ 0.46153846153846156,2071,1087.9872268907568
57
+ 0.4492307692307692,1975,1153.6727272727283
58
+ 0.44,1823,1287.986666666667
59
+ 0.4307692307692308,1755,1346.0
60
+ 0.42153846153846153,1607,1516.499999999999
61
+ 0.40923076923076923,1447,1719.88
62
+ 0.4,1407,1778.9000000000003
63
+ 0.39076923076923076,1368,1840.1704347826085
64
+ 0.38153846153846155,1292,1982.1173333333327
65
+ 0.36923076923076925,1271,2013.0342857142857
66
+ 0.36,1195,2140.38
67
+ 0.3507692307692308,1152,2251.7999999999997
68
+ 0.3415384615384615,1125,2290.6666666666665
69
+ 0.3292307692307692,1048,2474.848000000001
70
+ 0.32,997,2616.640000000003
71
+ 0.31076923076923074,946,2800.0149999999994
72
+ 0.30153846153846153,878,3054.0666666666657
73
+ 0.28923076923076924,845,3166.7999999999993
74
+ 0.28,784,3394.6466666666665
75
+ 0.27076923076923076,775,3428.8199999999997
76
+ 0.26153846153846155,719,3691.0899999999992
77
+ 0.2523076923076923,645,4030.0
78
+ 0.24,609,4260.660000000002
79
+ 0.23076923076923078,526,4824.393333333336
80
+ 0.22153846153846155,487,5280.888
81
+ 0.2123076923076923,456,5618.76
82
+ 0.2,419,6033.5599999999995
83
+ 0.19076923076923077,384,6516.36
84
+ 0.18153846153846154,338,7073.840000000001
85
+ 0.1723076923076923,317,7362.599999999994
86
+ 0.16,299,7523.599999999994
87
+ 0.15076923076923077,278,7806.799999999999
88
+ 0.14153846153846153,241,8540.986666666664
89
+ 0.13230769230769232,233,8752.44
90
+ 0.12,209,9376.240000000002
91
+ 0.11076923076923077,194,9990.320000000002
92
+ 0.10153846153846154,179,10952.400000000009
93
+ 0.09230769230769231,167,11616.800000000007
94
+ 0.08,135,13536.373333333351
95
+ 0.07076923076923076,120,14684.879999999996
96
+ 0.06153846153846154,106,16057.560000000001
97
+ 0.052307692307692305,95,17173.200000000004
98
+ 0.04,77,19165.079999999958
99
+ 0.03076923076923077,65,21221.51999999996
100
+ 0.021538461538461538,52,24319.559999999983
101
+ 0.012307692307692308,31,28349.359999999968
102
+ 0.003076923076923077,5,56750.0
metric_analysis/output_original/virality_median_retweets_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ percentage_of_viral_covered_virality_median_retweets,nb_of_tweets_labeled_as_viral_virality_median_retweets,thresholds_virality_median_retweets
2
+ 1.0,511764,0.0
3
+ 1.0,511764,0.0
4
+ 1.0,511764,0.0
5
+ 1.0,511764,0.0
6
+ 1.0,511764,0.0
7
+ 1.0,511764,0.0
8
+ 1.0,511764,0.0
9
+ 1.0,511764,0.0
10
+ 1.0,511764,0.0
11
+ 1.0,511764,0.0
12
+ 1.0,511764,0.0
13
+ 1.0,511764,0.0
14
+ 1.0,511764,0.0
15
+ 1.0,511764,0.0
16
+ 1.0,511764,0.0
17
+ 1.0,511764,0.0
18
+ 1.0,511764,0.0
19
+ 1.0,511764,0.0
20
+ 1.0,511764,0.0
21
+ 1.0,511764,0.0
22
+ 1.0,511764,0.0
23
+ 1.0,511764,0.0
24
+ 1.0,511764,0.0
25
+ 1.0,511764,0.0
26
+ 1.0,511764,0.0
27
+ 1.0,511764,0.0
28
+ 1.0,511764,0.0
29
+ 1.0,511764,0.0
30
+ 1.0,511764,0.0
31
+ 1.0,511764,0.0
32
+ 1.0,511764,0.0
33
+ 1.0,511764,0.0
34
+ 1.0,511764,0.0
35
+ 1.0,511764,0.0
36
+ 1.0,511764,0.0
37
+ 1.0,511764,0.0
38
+ 1.0,511764,0.0
39
+ 1.0,511764,0.0
40
+ 1.0,511764,0.0
41
+ 1.0,511764,0.0
42
+ 1.0,511764,0.0
43
+ 1.0,511764,0.0
44
+ 1.0,511764,0.0
45
+ 1.0,511764,0.0
46
+ 1.0,511764,0.0
47
+ 1.0,511764,0.0
48
+ 1.0,511764,0.0
49
+ 1.0,511764,0.0
50
+ 1.0,511764,0.0
51
+ 1.0,511764,0.0
52
+ 1.0,511764,0.0
53
+ 1.0,511764,0.0
54
+ 1.0,511764,0.0
55
+ 1.0,511764,0.0
56
+ 1.0,511764,0.0
57
+ 1.0,511764,0.0
58
+ 1.0,511764,0.0
59
+ 1.0,511764,0.0
60
+ 1.0,511764,0.0
61
+ 1.0,511764,0.0
62
+ 1.0,511764,0.0
63
+ 1.0,511764,0.0
64
+ 1.0,511764,0.0
65
+ 1.0,511764,0.0
66
+ 1.0,511764,0.0
67
+ 1.0,511764,0.0
68
+ 1.0,511764,0.0
69
+ 1.0,511764,0.0
70
+ 0.32242063492063494,134409,1.0
71
+ 0.310515873015873,86051,2.6456794555995926
72
+ 0.3005952380952381,61961,5.027143821742062
73
+ 0.2906746031746032,45428,9.636950619740496
74
+ 0.27976190476190477,28429,23.02300286355545
75
+ 0.2698412698412698,22643,34.6215403148756
76
+ 0.25992063492063494,17026,55.781783181357575
77
+ 0.25,14577,71.5297497155859
78
+ 0.2400793650793651,12130,95.42360795606041
79
+ 0.23015873015873015,9916,130.25747244296335
80
+ 0.22023809523809523,8532,163.15705
81
+ 0.21031746031746032,7881,181.93631481481526
82
+ 0.2003968253968254,6185,256.67272727272757
83
+ 0.19047619047619047,4979,344.9141220238097
84
+ 0.18055555555555555,4507,397.92005347593505
85
+ 0.17063492063492064,3285,605.9448076923071
86
+ 0.16071428571428573,2717,759.2236734693877
87
+ 0.15079365079365079,2189,1004.0008333333315
88
+ 0.13988095238095238,1789,1316.0649999999996
89
+ 0.12996031746031747,1422,1754.7858823529411
90
+ 0.12003968253968254,1274,2010.7097142857142
91
+ 0.11011904761904762,1129,2285.5833333333335
92
+ 0.1001984126984127,947,2795.887500000001
93
+ 0.09027777777777778,783,3395.8991666666666
94
+ 0.08035714285714286,643,4035.7200000000007
95
+ 0.07043650793650794,476,5388.830000000004
96
+ 0.060515873015873016,370,6665.859999999988
97
+ 0.050595238095238096,284,7694.966666666665
98
+ 0.040674603174603176,219,9203.879999999965
99
+ 0.030753968253968252,172,11420.029999999995
100
+ 0.020833333333333332,108,15734.440000000006
101
+ 0.010912698412698412,68,20769.80999999998
102
+ 0.000992063492063492,5,56750.0
metric_analysis/output_original/virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ percentage_of_viral_covered_virality_retweet_percentile_per_user,nb_of_tweets_labeled_as_viral_virality_retweet_percentile_per_user,thresholds_virality_retweet_percentile_per_user
2
+ 1.0,1357228,0.0
3
+ 1.0,1357165,0.01
4
+ 1.0,1357075,0.02
5
+ 1.0,1356795,0.03
6
+ 1.0,1356479,0.04
7
+ 1.0,1355978,0.05
8
+ 1.0,1355779,0.06
9
+ 1.0,1355441,0.07
10
+ 1.0,1355077,0.08
11
+ 1.0,1353860,0.09
12
+ 1.0,1353400,0.1
13
+ 1.0,1352555,0.11
14
+ 1.0,1351992,0.12
15
+ 1.0,1351289,0.13
16
+ 1.0,1349915,0.14
17
+ 1.0,1348735,0.15
18
+ 1.0,1348367,0.16
19
+ 1.0,1347269,0.17
20
+ 1.0,1345947,0.18
21
+ 1.0,1344634,0.19
22
+ 1.0,1342750,0.2
23
+ 1.0,1341753,0.21
24
+ 1.0,1340292,0.22
25
+ 1.0,1336558,0.23
26
+ 1.0,1335271,0.24
27
+ 1.0,1333139,0.25
28
+ 1.0,1331241,0.26
29
+ 1.0,1328699,0.27
30
+ 1.0,1326128,0.28
31
+ 1.0,1324353,0.29
32
+ 1.0,1323643,0.3
33
+ 1.0,1320614,0.31
34
+ 1.0,1319676,0.32
35
+ 1.0,1314107,0.33
36
+ 1.0,1307821,0.34
37
+ 1.0,1305415,0.35
38
+ 1.0,1298334,0.36
39
+ 1.0,1297104,0.37
40
+ 1.0,1293457,0.38
41
+ 1.0,1289359,0.39
42
+ 1.0,1286088,0.4
43
+ 1.0,1282661,0.41
44
+ 1.0,1277658,0.42
45
+ 1.0,1272885,0.43
46
+ 1.0,1266459,0.44
47
+ 1.0,1260552,0.45
48
+ 1.0,1253282,0.46
49
+ 1.0,1249633,0.47
50
+ 1.0,1247341,0.48
51
+ 1.0,1243766,0.49
52
+ 1.0,1236825,0.5
53
+ 1.0,1225501,0.51
54
+ 1.0,1214100,0.52
55
+ 1.0,1203663,0.53
56
+ 1.0,1194775,0.54
57
+ 1.0,1189887,0.55
58
+ 1.0,1183528,0.56
59
+ 1.0,1174373,0.57
60
+ 0.9990079365079365,1162858,0.58
61
+ 0.9990079365079365,1149726,0.59
62
+ 0.9990079365079365,1138170,0.6
63
+ 0.9990079365079365,1123765,0.61
64
+ 0.9990079365079365,1108718,0.62
65
+ 0.9990079365079365,1093939,0.63
66
+ 0.9990079365079365,1072643,0.64
67
+ 0.9970238095238095,1054184,0.65
68
+ 0.9970238095238095,1029756,0.66
69
+ 0.996031746031746,1014722,0.67
70
+ 0.996031746031746,998191,0.68
71
+ 0.996031746031746,976124,0.69
72
+ 0.996031746031746,954849,0.7
73
+ 0.996031746031746,937666,0.71
74
+ 0.9950396825396826,912003,0.72
75
+ 0.9950396825396826,881940,0.73
76
+ 0.9950396825396826,850593,0.74
77
+ 0.9950396825396826,822465,0.75
78
+ 0.9940476190476191,795899,0.76
79
+ 0.9940476190476191,775478,0.77
80
+ 0.9940476190476191,741226,0.78
81
+ 0.9940476190476191,707711,0.79
82
+ 0.9940476190476191,680288,0.8
83
+ 0.9940476190476191,646190,0.81
84
+ 0.9930555555555556,610579,0.82
85
+ 0.9910714285714286,574204,0.83
86
+ 0.9880952380952381,528701,0.84
87
+ 0.9861111111111112,490552,0.85
88
+ 0.9851190476190477,462519,0.86
89
+ 0.9831349206349206,422834,0.87
90
+ 0.9791666666666666,375135,0.88
91
+ 0.9771825396825397,339615,0.89
92
+ 0.9742063492063492,310028,0.9
93
+ 0.9662698412698413,272234,0.91
94
+ 0.9583333333333334,238956,0.92
95
+ 0.9523809523809523,203539,0.93
96
+ 0.9424603174603174,160095,0.94
97
+ 0.9305555555555556,117906,0.95
98
+ 0.9206349206349206,92538,0.96
99
+ 0.9067460317460317,61903,0.97
100
+ 0.876984126984127,46379,0.98
101
+ 0.816468253968254,20605,0.99
102
+ 0.503968253968254,814,1.0
metric_analysis/output_standardized/hard_threshold_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,843 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.0009920634920634,23,96321.9309930993,0.0008195845062894203,1.6946305263375057e-05
3
+ 0.0019841269841269,34,88482.72157215721,0.0012115597049495777,2.505105995455443e-05
4
+ 0.0029761904761904,40,83740.72127212721,0.001425364358764209,2.947183524065227e-05
5
+ 0.0039682539682539,48,80460.75757575758,0.001710437230517051,3.536620228878272e-05
6
+ 0.0049603174603174,49,80095.24842484249,0.001746071339486156,3.6102998169799034e-05
7
+ 0.0059523809523809,62,76728.71677167717,0.002209314756084524,4.568134462301102e-05
8
+ 0.0069444444444444,77,71342.26612661267,0.0027438263906211027,5.673328283825562e-05
9
+ 0.0079365079365079,86,69033.78727872788,0.0030645333713430496,6.336444576740239e-05
10
+ 0.0089285714285714,102,66003.9087908791,0.0036346791148487334,7.515317986366329e-05
11
+ 0.0099206349206349,104,65898.10351035104,0.0037059473327869436,7.662677162569591e-05
12
+ 0.0109126984126984,106,65667.25562556257,0.0037772155507251543,7.810036338772852e-05
13
+ 0.0119047619047619,114,64118.65106510651,0.004062288422477996,8.399473043585897e-05
14
+ 0.0128968253968253,122,63320.30213021302,0.004347361294230838,8.988909748398942e-05
15
+ 0.0138888888888888,131,61848.64686468647,0.0046680682749527845,9.652026041313618e-05
16
+ 0.0148809523809523,142,60136.52505250525,0.0050600434736129424,0.00010462501510431557
17
+ 0.0158730158730158,173,56943.12931293129,0.006164700851655204,0.00012746568741582107
18
+ 0.0168650793650793,176,56433.3402340234,0.00627160317856252,0.00012967607505887
19
+ 0.0178571428571428,180,56135.16171617162,0.006414139614438941,0.0001326232585829352
20
+ 0.0188492063492063,187,55000.1596159616,0.0066635783772226774,0.00013778082975004938
21
+ 0.0198412698412698,199,54028.67476747675,0.0070911876848519404,0.00014662238032224506
22
+ 0.0208333333333333,207,53268.800480048005,0.007376260556604782,0.0001525167473703755
23
+ 0.0218253968253968,210,53105.283228322834,0.0074831628835120975,0.00015472713501342442
24
+ 0.0228174603174603,233,50604.43114311432,0.008302747389801518,0.00017167344027679948
25
+ 0.0238095238095238,238,50383.20192019202,0.008480917934647045,0.000175357419681881
26
+ 0.0248015873015873,244,49584.85298529853,0.008694722588461675,0.00017977819496797884
27
+ 0.0257936507936507,264,47968.917791779175,0.00940740476784378,0.00019451411258830499
28
+ 0.0267857142857142,274,47449.5100510051,0.009763745857534832,0.00020188207139846805
29
+ 0.0287698412698412,279,47103.23822382238,0.009941916402380358,0.0002055660508035496
30
+ 0.0297619047619047,287,46304.88928892889,0.010226989274133201,0.00021146041785168005
31
+ 0.0307539682539682,291,46054.804080408045,0.010369525710009622,0.00021440760137574527
32
+ 0.0317460317460317,323,43890.60516051605,0.011509817197020988,0.00023798506956826707
33
+ 0.0327380952380952,332,43659.75727572757,0.011830524177742935,0.00024461623249741386
34
+ 0.0337301587301587,356,42486.280528052805,0.012685742793001461,0.00026229933364180523
35
+ 0.0347222222222222,358,42447.80588058806,0.012757011010939671,0.0002637729254038378
36
+ 0.0357142857142857,369,42053.44074407441,0.013148986209599828,0.0002718776800950172
37
+ 0.0367063492063492,373,41610.98229822982,0.01329152264547625,0.0002748248636190824
38
+ 0.0376984126984126,374,41601.36363636364,0.013327156754445355,0.00027556165950009875
39
+ 0.0386904761904761,379,41466.70237023702,0.013505327299290881,0.00027924563890518024
40
+ 0.0396825396825396,383,41322.42244224423,0.013647863735167302,0.0002821928224292455
41
+ 0.0406746031746031,394,40995.38793879388,0.01403983893382746,0.00029029757712042487
42
+ 0.0416666666666666,409,40754.921392139215,0.014574350568364038,0.00030134951533566945
43
+ 0.0426587301587301,417,40485.59885988599,0.01485942344011688,0.00030724388238379994
44
+ 0.0436507936507936,425,40168.18301830183,0.015144496311869721,0.0003131382494319304
45
+ 0.0446428571428571,428,39821.91119111912,0.015251398638777038,0.0003153486370749793
46
+ 0.0456349206349206,437,39494.876687668766,0.015572105619498984,0.00032197980000412606
47
+ 0.0466269841269841,455,38936.99429942994,0.01621351958094288,0.0003352421258624196
48
+ 0.0476190476190476,458,38840.80768076808,0.016320421907850192,0.0003374525135054685
49
+ 0.0486111111111111,469,38456.06120612061,0.016712397106510353,0.00034555726819664786
50
+ 0.0496031746031746,471,38398.349234923495,0.01678366532444856,0.0003470308599586805
51
+ 0.050595238095238,476,38205.97599759976,0.01696183586929409,0.000350714839363762
52
+ 0.0515873015873015,483,37811.61086108611,0.017211274632077826,0.0003558724105308762
53
+ 0.052579365079365,488,37542.28832883288,0.01738944517692335,0.0003595563899359577
54
+ 0.0535714285714285,508,36888.21932193219,0.018102127356305456,0.00037429230755628385
55
+ 0.054563492063492,523,36407.28622862286,0.018636638990842034,0.00038534424577152843
56
+ 0.0555555555555555,525,36311.099609961,0.018707907208780246,0.0003868178375335611
57
+ 0.056547619047619,551,35743.598559855986,0.01963439404197698,0.00040597453043998504
58
+ 0.0575396825396825,563,35474.27602760276,0.02006200334960624,0.0004148160810121807
59
+ 0.058531746031746,568,35349.23342334233,0.02024017389445177,0.00041850006041726227
60
+ 0.0595238095238095,570,35329.99609960996,0.020311442112389978,0.00041997365217929486
61
+ 0.060515873015873,597,34589.35913591359,0.02127356305455582,0.0004398671409667351
62
+ 0.0615079365079365,620,33656.34893489349,0.022093147560845242,0.0004568134462301102
63
+ 0.0625,624,33540.92499249925,0.022235683996721663,0.00045976062975417543
64
+ 0.0634920634920634,632,33348.55175517552,0.022520756868474504,0.00046565499680230587
65
+ 0.0644841269841269,640,33223.509150915095,0.022805829740227344,0.0004715493638504363
66
+ 0.0654761904761904,641,33204.27182718272,0.022841463849196452,0.00047228615973145266
67
+ 0.0664682539682539,653,33021.51725172518,0.023269073156825713,0.00048112771030364834
68
+ 0.0674603174603174,655,32983.04260426042,0.023340341374763925,0.00048260130206568094
69
+ 0.0684523809523809,657,32963.80528052805,0.023411609592702134,0.00048407489382771353
70
+ 0.0694444444444444,658,32954.18661866187,0.023447243701671238,0.00048481168970872983
71
+ 0.0704365079365079,659,32906.09330933093,0.023482877810640346,0.0004855484855897462
72
+ 0.0724206349206349,663,32896.47464746475,0.023625414246516766,0.0004884956691138114
73
+ 0.0734126984126984,665,32809.90669066907,0.023696682464454975,0.000489969260875844
74
+ 0.0744047619047619,671,32704.10141014101,0.023910487118269607,0.0004943900361619419
75
+ 0.0753968253968253,676,32579.058805880588,0.024088657663115135,0.0004980740155670234
76
+ 0.0763888888888889,695,32155.83768376837,0.024765705733528133,0.0005120731373063332
77
+ 0.0773809523809523,706,31934.608460846084,0.02515768093218829,0.0005201778919975126
78
+ 0.0783730158730158,716,31722.997899789974,0.025514022021879343,0.0005275458508076756
79
+ 0.0793650793650793,718,31703.760576057604,0.025585290239817555,0.0005290194425697082
80
+ 0.0803571428571428,722,31655.667266726672,0.025727826675693975,0.0005319666260937734
81
+ 0.0813492063492063,734,31299.77677767777,0.026155435983323237,0.0005408081766659691
82
+ 0.0823412698412698,740,31107.40354035404,0.02636924063713787,0.000545228951952067
83
+ 0.0833333333333333,748,30915.030303030304,0.02665431350889071,0.0005511233190001975
84
+ 0.0843253968253968,754,30866.93699369937,0.026868118162705342,0.0005555440942862953
85
+ 0.0853174603174603,760,30780.36903690369,0.027081922816519974,0.0005599648695723932
86
+ 0.0873015873015873,776,30280.198619861985,0.027652068560025656,0.0005717536036686541
87
+ 0.0882936507936508,784,30145.537353735373,0.027937141431778497,0.0005776479707167845
88
+ 0.0892857142857142,801,29847.358835883588,0.028542921284253286,0.0005901735006940617
89
+ 0.0902777777777777,804,29828.12151215121,0.028649823611160603,0.0005923838883371106
90
+ 0.0912698412698412,806,29780.02820282028,0.028721091829098815,0.0005938574800991432
91
+ 0.0922619047619047,842,29097.10321032103,0.030003919751986602,0.0006203821318157303
92
+ 0.0932539682539682,847,28856.63666366637,0.030182090296832127,0.0006240661112208118
93
+ 0.0942460317460317,856,28721.975397539754,0.030502797277554075,0.0006306972741499586
94
+ 0.0952380952380952,861,28616.1701170117,0.0306809678223996,0.0006343812535550401
95
+ 0.0962301587301587,869,28443.034203420342,0.030966040694152444,0.0006402756206031706
96
+ 0.0972222222222222,870,28433.41554155416,0.03100167480312155,0.0006410124164841869
97
+ 0.0982142857142857,874,28356.466246624663,0.03114421123899797,0.0006439596000082521
98
+ 0.0992063492063492,889,28000.57575757576,0.03167872287353455,0.0006550115382234967
99
+ 0.1001984126984127,905,27760.10921092109,0.03224886861704023,0.0006668002723197576
100
+ 0.1011904761904761,913,27596.59195919592,0.03253394148879307,0.0006726946393678881
101
+ 0.1021825396825396,915,27481.16801680168,0.03260520970673128,0.0006741682311299207
102
+ 0.1031746031746031,929,27269.557455745577,0.033104087232298754,0.000684483373464149
103
+ 0.1041666666666666,967,26836.717671767172,0.03445818337312476,0.0007124816169427686
104
+ 0.1051587301587301,968,26827.09900990099,0.03449381748209386,0.000713218412823785
105
+ 0.1061507936507936,972,26682.81908190819,0.03463635391797028,0.0007161655963478502
106
+ 0.1071428571428571,978,26634.725772577254,0.03485015857178491,0.0007205863716339481
107
+ 0.1081349206349206,983,26528.920492049205,0.03502832911663044,0.0007242703510390295
108
+ 0.1091269841269841,998,26355.78457845785,0.035562840751167016,0.0007353222892542741
109
+ 0.1101190476190476,1002,26307.691269126917,0.03570537718704344,0.0007382694727783394
110
+ 0.1111111111111111,1007,26269.216621662166,0.03588354773188896,0.0007419534521834209
111
+ 0.1121031746031746,1015,26153.792679267928,0.03616862060364181,0.0007478478192315514
112
+ 0.1130952380952381,1025,26086.462046204622,0.03652496169333286,0.0007552157780417144
113
+ 0.1140873015873015,1030,26028.7500750075,0.036703132238178386,0.000758899757446796
114
+ 0.115079365079365,1050,25807.52085208521,0.03741581441756049,0.0007736356750671222
115
+ 0.1160714285714285,1056,25759.427542754274,0.03762961907137512,0.0007780564503532199
116
+ 0.117063492063492,1074,25567.05430543054,0.038271033032819014,0.0007913187762115134
117
+ 0.1180555555555555,1075,25557.43564356436,0.03830666714178812,0.0007920555720925298
118
+ 0.119047619047619,1077,25538.198319831983,0.03837793535972633,0.0007935291638545624
119
+ 0.1200396825396825,1084,25461.24902490249,0.038627374122510064,0.0007986867350216765
120
+ 0.121031746031746,1089,25413.15571557156,0.03880554466735559,0.000802370714426758
121
+ 0.1220238095238095,1097,25307.350435043503,0.03909061753910843,0.0008082650814748885
122
+ 0.123015873015873,1106,25211.16381638164,0.039411324519830385,0.0008148962444040353
123
+ 0.1240079365079365,1112,25105.358535853586,0.03962512917364501,0.0008193170196901332
124
+ 0.125,1118,25047.646564656465,0.03983893382745964,0.000823737794976231
125
+ 0.1259920634920635,1120,25038.02790279028,0.03991020204539786,0.0008252113867382635
126
+ 0.1269841269841269,1128,24912.985298529853,0.0401952749171507,0.000831105753786394
127
+ 0.1279761904761904,1132,24884.12931293129,0.040337811353027116,0.0008340529373104592
128
+ 0.1289682539682539,1134,24874.510651065106,0.04040907957096533,0.0008355265290724918
129
+ 0.1299603174603174,1136,24864.89198919892,0.04048034778890354,0.0008370001208345245
130
+ 0.1309523809523809,1138,24855.27332733273,0.04055161600684175,0.0008384737125965571
131
+ 0.1319444444444444,1143,24797.56135613561,0.040729786551687276,0.0008421576920016387
132
+ 0.1329365079365079,1168,24451.289528952897,0.0416206392759149,0.0008605775890270463
133
+ 0.1339285714285714,1183,24172.348334833485,0.04215515091045149,0.0008716295272422909
134
+ 0.1349206349206349,1186,24162.729672967296,0.0422620532373588,0.0008738399148853399
135
+ 0.1369047619047619,1194,24105.01770177018,0.04254712610911164,0.0008797342819334702
136
+ 0.1378968253968254,1196,24076.161716171617,0.04261839432704985,0.0008812078736955029
137
+ 0.1388888888888889,1198,24056.924392439243,0.042689662544988065,0.0008826814654575355
138
+ 0.1398809523809523,1200,24037.68706870687,0.04276093076292627,0.0008841550572195681
139
+ 0.1408730158730158,1202,24018.4497449745,0.04283219898086448,0.0008856286489816007
140
+ 0.1418650793650793,1210,23931.881788178816,0.04311727185261732,0.0008915230160297312
141
+ 0.1428571428571428,1217,23797.220522052205,0.04336671061540106,0.0008966805871968454
142
+ 0.1438492063492063,1221,23777.98319831983,0.04350924705127748,0.0008996277707209106
143
+ 0.1448412698412698,1240,23556.75397539754,0.044186295121690485,0.0009136268924602204
144
+ 0.1458333333333333,1242,23537.51665166517,0.04425756333962869,0.000915100484222253
145
+ 0.1478174603174603,1249,23470.18601860186,0.04450700210241243,0.0009202580553893671
146
+ 0.1488095238095238,1258,23383.61806180618,0.044827709083134375,0.000926889218318514
147
+ 0.1498015873015873,1266,23287.43144314432,0.045112781954887216,0.0009327835853666443
148
+ 0.1507936507936507,1271,23268.194119411943,0.045290952499732744,0.0009364675647717259
149
+ 0.1517857142857142,1273,23248.95679567957,0.04536222071767095,0.0009379411565337585
150
+ 0.1527777777777778,1287,23095.058205820584,0.045861098243238425,0.0009482562988679869
151
+ 0.1547619047619047,1307,22931.54095409541,0.04657378042262053,0.0009629922164883129
152
+ 0.1557539682539682,1308,22921.922292229225,0.046609414531589635,0.0009637290123693293
153
+ 0.1577380952380952,1312,22912.30363036304,0.04675195096746606,0.0009666761958933945
154
+ 0.1587301587301587,1315,22902.68496849685,0.04685885329437337,0.0009688865835364434
155
+ 0.1597222222222222,1317,22883.44764476448,0.04693012151231159,0.000970360175298476
156
+ 0.1607142857142857,1319,22873.82898289829,0.047001389730249796,0.0009718337670605086
157
+ 0.1617063492063492,1320,22864.210321032104,0.0470370238392189,0.000972570562941525
158
+ 0.1626984126984127,1326,22816.11701170117,0.04725082849303353,0.0009769913382276227
159
+ 0.1636904761904762,1337,22719.930393039303,0.047642803691693686,0.0009850960929188022
160
+ 0.1646825396825396,1342,22691.074407440745,0.047820974236539214,0.0009887800723238837
161
+ 0.1656746031746031,1352,22585.26912691269,0.04817731532623027,0.0009961480311340468
162
+ 0.1666666666666666,1356,22479.46384638464,0.04831985176210669,0.000999095214658112
163
+ 0.1676587301587301,1366,22325.565256525653,0.048676192851797744,0.001006463173468275
164
+ 0.1686507936507936,1367,22315.946594659465,0.04871182696076685,0.0010071999693492914
165
+ 0.1706349206349206,1371,22296.70927092709,0.048854363396643265,0.0010101471528733566
166
+ 0.1716269841269841,1390,22133.19201920192,0.049531411467056266,0.0010241462746126665
167
+ 0.1726190476190476,1396,22065.86138613861,0.0497452161208709,0.0010285670498987644
168
+ 0.1736111111111111,1403,21998.53075307531,0.049994654883654635,0.0010337246210658784
169
+ 0.1746031746031746,1409,21960.05610561056,0.05020845953746927,0.0010381453963519763
170
+ 0.175595238095238,1418,21911.96279627963,0.05052916651819121,0.001044776559281123
171
+ 0.1765873015873016,1421,21883.106810681067,0.050636068845098525,0.0010469869469241719
172
+ 0.1775793650793651,1425,21863.869486948694,0.05077860528097495,0.001049934130448237
173
+ 0.1785714285714285,1430,21786.920192019203,0.05095677582582048,0.0010536181098533186
174
+ 0.179563492063492,1433,21767.68286828683,0.05106367815272779,0.0010558284974963676
175
+ 0.1805555555555555,1435,21738.82688268827,0.051134946370666,0.0010573020892584001
176
+ 0.181547619047619,1442,21690.73357335733,0.051384385133449735,0.0010624596604255144
177
+ 0.1825396825396825,1446,21661.877587758776,0.05152692156932616,0.0010654068439495796
178
+ 0.183531746031746,1451,21633.021602160217,0.05170509211417169,0.0010690908233546611
179
+ 0.1845238095238095,1467,21479.123012301232,0.05227523785767737,0.001080879557450922
180
+ 0.185515873015873,1487,21325.224422442243,0.052987920037059474,0.0010956154750712483
181
+ 0.1865079365079365,1493,21296.368436843684,0.05320172469087411,0.001100036250357346
182
+ 0.1875,1498,21238.656465646563,0.05337989523571963,0.0011037202297624275
183
+ 0.1884920634920635,1500,21219.419141914197,0.05345116345365784,0.0011051938215244602
184
+ 0.1894841269841269,1502,21200.18181818182,0.05352243167159605,0.0011066674132864927
185
+ 0.1914682539682539,1507,21180.944494449446,0.05370060221644158,0.0011103513926915742
186
+ 0.1924603174603174,1509,21171.325832583258,0.05377187043437979,0.001111824984453607
187
+ 0.1934523809523809,1512,21113.61386138614,0.0538787727612871,0.0011140353720966558
188
+ 0.1954365079365079,1517,21075.139213921397,0.05405694330613263,0.0011177193515017373
189
+ 0.1964285714285714,1518,21065.520552055204,0.054092577415101734,0.0011184561473827537
190
+ 0.1974206349206349,1527,20950.096609660966,0.054413284395823686,0.0011250873103119004
191
+ 0.1984126984126984,1533,20911.62196219622,0.05462708904963831,0.0011295080855979983
192
+ 0.1994047619047619,1543,20815.435343534355,0.05498343013932937,0.0011368760444081614
193
+ 0.2003968253968254,1545,20786.579357935792,0.055054698357267576,0.0011383496361701939
194
+ 0.2013888888888889,1549,20738.48604860486,0.055197234793144,0.0011412968196942593
195
+ 0.2023809523809523,1551,20709.6300630063,0.05526850301108221,0.0011427704114562918
196
+ 0.2033730158730158,1555,20651.91809180918,0.05541103944695863,0.001145717594980357
197
+ 0.2043650793650793,1589,20344.12091209121,0.056622599151908204,0.0011707686549349114
198
+ 0.2053571428571428,1594,20296.02760276028,0.05680076969675373,0.001174452634339993
199
+ 0.2063492063492063,1605,20161.366336633662,0.05719274489541389,0.0011825573890311724
200
+ 0.2073412698412698,1624,20065.1797179718,0.05786979296582689,0.0011965565107704822
201
+ 0.2083333333333333,1640,19940.13711371137,0.058439938709332576,0.0012083452448667432
202
+ 0.2093253968253968,1641,19930.518451845182,0.05847557281830168,0.0012090820407477593
203
+ 0.2103174603174603,1660,19843.950495049507,0.059152620888714674,0.0012230811624870692
204
+ 0.2113095238095238,1689,19680.433243324333,0.06018601004881873,0.001244448243036542
205
+ 0.2123015873015873,1694,19641.958595859585,0.06036418059366425,0.0012481322224416236
206
+ 0.2132936507936507,1698,19593.865286528653,0.06050671702954068,0.0012510794059656888
207
+ 0.2142857142857142,1714,19507.297329732974,0.06107686277304636,0.0012628681400619498
208
+ 0.2152777777777778,1715,19497.67866786679,0.06111249688201546,0.0012636049359429661
209
+ 0.2172619047619047,1721,19468.822682268223,0.061326301535830095,0.001268025711229064
210
+ 0.2182539682539682,1722,19459.20402040204,0.0613619356447992,0.0012687625071100802
211
+ 0.2192460317460317,1740,19334.16141614161,0.0620033496062431,0.0012820248329683738
212
+ 0.2202380952380952,1745,19314.92409240924,0.062181520151088625,0.0012857088123734554
213
+ 0.2212301587301587,1748,19295.686768676867,0.06228842247799594,0.0012879192000165042
214
+ 0.2222222222222222,1752,19276.449444944494,0.06243095891387236,0.0012908663835405694
215
+ 0.2232142857142857,1757,19228.35613561356,0.06260912945871788,0.001294550362945651
216
+ 0.2242063492063492,1758,19218.737473747376,0.06264476356768699,0.0012952871588266673
217
+ 0.2251984126984127,1768,19151.406840684067,0.06300110465737804,0.0013026551176368304
218
+ 0.2261904761904762,1774,19122.55085508551,0.06321490931119267,0.0013070758929229283
219
+ 0.2271825396825396,1775,19103.31353135313,0.06325054342016177,0.0013078126888039444
220
+ 0.2281746031746031,1784,19084.07620762076,0.06357125040088372,0.0013144438517330912
221
+ 0.2291666666666666,1787,19074.457545754576,0.06367815272779104,0.0013166542393761402
222
+ 0.2301587301587301,1790,19045.601560156018,0.06378505505469836,0.001318864627019189
223
+ 0.2311507936507936,1796,19016.74557455745,0.06399885970851299,0.001323285402305287
224
+ 0.2321428571428571,1804,18987.889588958897,0.06428393258026584,0.0013291797693534173
225
+ 0.2331349206349206,1810,18939.796279627964,0.06449773723408046,0.0013336005446395152
226
+ 0.2341269841269841,1812,18930.177617761776,0.06456900545201867,0.001335074136401548
227
+ 0.2351190476190476,1822,18862.84698469847,0.06492534654170973,0.001342442095211711
228
+ 0.2361111111111111,1833,18805.13501350135,0.06531732174036989,0.0013505468499028902
229
+ 0.2371031746031746,1837,18776.27902790279,0.0654598581762463,0.0013534940334269554
230
+ 0.238095238095238,1860,18631.99909990999,0.06627944268253572,0.0013704403386903307
231
+ 0.2400793650793651,1870,18545.43114311431,0.06663578377222677,0.0013778082975004938
232
+ 0.2410714285714285,1887,18449.24452445245,0.06724156362470156,0.0013903338274777709
233
+ 0.242063492063492,1889,18439.625862586257,0.06731283184263978,0.0013918074192398036
234
+ 0.2430555555555555,1898,18410.7698769877,0.06763353882336172,0.0013984385821689503
235
+ 0.2450396825396825,1903,18372.29522952295,0.06781170936820725,0.0014021225615740319
236
+ 0.246031746031746,1914,18285.727272727272,0.0682036845668674,0.001410227316265211
237
+ 0.2470238095238095,1933,18179.92199219922,0.06888073263728041,0.001424226438004521
238
+ 0.248015873015873,1943,18141.447344734475,0.06923707372697145,0.001431594396814684
239
+ 0.2490079365079365,1951,18102.972697269728,0.0695221465987243,0.0014374887638628146
240
+ 0.25,1959,18074.11671167117,0.06980721947047715,0.001443383130910945
241
+ 0.2509920634920635,1970,17977.9300930093,0.0701991946691373,0.0014514878856021244
242
+ 0.251984126984127,1972,17968.311431143113,0.0702704628870755,0.001452961477364157
243
+ 0.2529761904761904,1979,17929.83678367837,0.07051990164985925,0.0014581190485312711
244
+ 0.2549603174603174,1983,17872.12481248125,0.07066243808573566,0.0014610662320553363
245
+ 0.2559523809523809,2017,17708.607560756074,0.07187399779068525,0.0014861172920098908
246
+ 0.2569444444444444,2019,17679.751575157516,0.07194526600862346,0.0014875908837719233
247
+ 0.2579365079365079,2022,17670.13291329133,0.07205216833553077,0.0014898012714149723
248
+ 0.2589285714285714,2042,17583.56495649565,0.07276485051491287,0.0015045371890352985
249
+ 0.2599206349206349,2072,17410.429042904292,0.07383387378398604,0.0015266410654657875
250
+ 0.2609126984126984,2082,17343.098409840983,0.07419021487367708,0.0015340090242759506
251
+ 0.2628968253968254,2096,17304.62376237624,0.07468909239924455,0.001544324166610179
252
+ 0.2648809523809524,2108,17246.911791179118,0.07511670170687382,0.0015531657171823747
253
+ 0.2658730158730158,2111,17218.05580558056,0.07522360403378113,0.0015553761048254235
254
+ 0.2668650793650793,2119,17169.962496249624,0.07550867690553398,0.0015612704718735541
255
+ 0.2678571428571428,2124,17102.631863186318,0.0756868474503795,0.0015649544512786355
256
+ 0.2688492063492063,2144,17006.445244524453,0.07639952962976161,0.0015796903688989616
257
+ 0.2698412698412698,2150,16977.589258925895,0.07661333428357624,0.0015841111441850595
258
+ 0.2708333333333333,2152,16958.351935193517,0.07668460250151445,0.0015855847359470922
259
+ 0.2718253968253968,2165,16862.165316531653,0.07714784591811281,0.0015951630824003042
260
+ 0.2738095238095238,2167,16852.546654665464,0.07721911413605102,0.0015966366741623369
261
+ 0.2757936507936508,2170,16833.309330933094,0.07732601646295835,0.0015988470618053857
262
+ 0.2767857142857143,2181,16737.12271227123,0.0777179916616185,0.0016069518164965651
263
+ 0.2777777777777778,2187,16717.885388538853,0.07793179631543314,0.0016113725917826628
264
+ 0.2787698412698413,2207,16621.698769876988,0.07864447849481523,0.001626108509402989
265
+ 0.2797619047619047,2210,16602.461446144614,0.07875138082172255,0.001628318897046038
266
+ 0.2807539682539682,2246,16410.088208820882,0.08003420874461034,0.001654843548762625
267
+ 0.2817460317460317,2251,16381.232223222323,0.08021237928945586,0.0016585275281677065
268
+ 0.2827380952380952,2252,16371.613561356136,0.08024801339842497,0.0016592643240487229
269
+ 0.2837301587301587,2257,16333.13891389139,0.0804261839432705,0.0016629483034538044
270
+ 0.2847222222222222,2273,16236.952295229525,0.08099632968677618,0.0016747370375500652
271
+ 0.2857142857142857,2276,16217.71497149715,0.0811032320136835,0.0016769474251931143
272
+ 0.2867063492063492,2280,16198.477647764776,0.08124576844955991,0.0016798946087171794
273
+ 0.2876984126984127,2289,16179.240324032404,0.08156647543028187,0.0016865257716463262
274
+ 0.2886904761904761,2300,16121.528352835285,0.08195845062894203,0.0016946305263375056
275
+ 0.2896825396825397,2304,16111.909690969098,0.08210098706481844,0.0016975777098615708
276
+ 0.2906746031746032,2320,16034.960396039603,0.08267113280832412,0.0017093664439578318
277
+ 0.2916666666666667,2328,16006.104410441045,0.08295620568007697,0.0017152608110059622
278
+ 0.2926587301587302,2345,15909.91779177918,0.08356198553255176,0.0017277863409832393
279
+ 0.2946428571428571,2354,15881.061806180618,0.08388269251327371,0.001734417503912386
280
+ 0.2956349206349206,2363,15804.112511251124,0.08420339949399565,0.001741048666841533
281
+ 0.2966269841269841,2381,15707.92589258926,0.08484481345543955,0.0017543109926998264
282
+ 0.2976190476190476,2384,15698.307230723072,0.08495171578234686,0.0017565213803428753
283
+ 0.2986111111111111,2391,15679.0699069907,0.0852011545451306,0.0017616789515099895
284
+ 0.2996031746031746,2397,15659.832583258329,0.08541495919894523,0.0017660997267960872
285
+ 0.3005952380952381,2401,15640.595259525951,0.08555749563482165,0.0017690469103201526
286
+ 0.302579365079365,2408,15611.739273927393,0.0858069343976054,0.0017742044814872667
287
+ 0.304563492063492,2416,15573.264626462646,0.08609200726935823,0.0017800988485353973
288
+ 0.3055555555555556,2418,15554.027302730274,0.08616327548729644,0.0017815724402974297
289
+ 0.3075396825396825,2427,15525.171317131711,0.08648398246801839,0.0017882036032265765
290
+ 0.308531746031746,2439,15477.07800780078,0.08691159177564765,0.0017970451537987723
291
+ 0.3095238095238095,2450,15438.603360336034,0.08730356697430781,0.0018051499084899515
292
+ 0.310515873015873,2451,15409.747374737472,0.08733920108327692,0.0018058867043709678
293
+ 0.3115079365079365,2453,15400.128712871288,0.08741046930121513,0.0018073602961330005
294
+ 0.3125,2465,15342.416741674167,0.08783807860884439,0.0018162018467051961
295
+ 0.3134920634920635,2473,15294.323432343235,0.08812315148059723,0.0018220962137533267
296
+ 0.314484126984127,2476,15284.704770477048,0.08823005380750454,0.0018243066013963756
297
+ 0.3154761904761904,2477,15275.086108610862,0.08826568791647364,0.001825043397277392
298
+ 0.3164682539682539,2488,15217.37413741374,0.0886576631151338,0.0018331481519685711
299
+ 0.3174603174603174,2498,15178.899489948995,0.08901400420482486,0.0018405161107787342
300
+ 0.3184523809523809,2501,15150.043504350437,0.08912090653173217,0.0018427264984217833
301
+ 0.3194444444444444,2511,15130.80618061806,0.08947724762142323,0.0018500944572319464
302
+ 0.3204365079365079,2520,15101.950195019504,0.08979795460214518,0.001856725620161093
303
+ 0.3224206349206349,2525,15092.331533153316,0.0899761251469907,0.0018604095995661746
304
+ 0.3244047619047619,2532,15073.094209420942,0.09022556390977443,0.0018655671707332887
305
+ 0.3253968253968254,2550,14976.907590759076,0.09086697787121834,0.0018788294965915824
306
+ 0.3263888888888889,2558,14938.43294329433,0.09115205074297117,0.0018847238636397127
307
+ 0.3273809523809524,2562,14928.814281428144,0.09129458717884759,0.001887671047163778
308
+ 0.3283730158730158,2574,14871.102310231025,0.09172219648647685,0.0018965125977359737
309
+ 0.3293650793650793,2581,14832.627662766276,0.0919716352492606,0.0019016701689030877
310
+ 0.3303571428571428,2600,14755.678367836785,0.09264868331967359,0.0019156692906423976
311
+ 0.3323412698412698,2605,14746.059705970598,0.09282685386451912,0.0019193532700474791
312
+ 0.3333333333333333,2615,14717.203720372036,0.09318319495421017,0.0019267212288576422
313
+ 0.3343253968253968,2640,14611.398439843984,0.0940740476784378,0.00194514112588305
314
+ 0.3363095238095238,2649,14582.542454245424,0.09439475465915975,0.0019517722888121967
315
+ 0.3373015873015873,2662,14534.449144914492,0.09485799807575812,0.0019613506352654086
316
+ 0.3382936507936508,2673,14495.974497449744,0.09524997327441827,0.001969455389956588
317
+ 0.3402777777777778,2677,14486.35583558356,0.0953925097102947,0.0019724025734806534
318
+ 0.3412698412698413,2696,14390.169216921691,0.09606955778070769,0.0019864016952199632
319
+ 0.3422619047619047,2703,14351.694569456946,0.09631899654349144,0.0019915592663870773
320
+ 0.3432539682539682,2706,14342.07590759076,0.09642589887039875,0.001993769654030126
321
+ 0.3442460317460317,2715,14303.601260126014,0.09674660585112069,0.002000400816959273
322
+ 0.3452380952380952,2718,14293.982598259829,0.09685350817802801,0.0020026112046023217
323
+ 0.3472222222222222,2728,14274.745274527451,0.09720984926771906,0.0020099791634124848
324
+ 0.3482142857142857,2743,14197.79597959796,0.09774436090225563,0.0020210311016277296
325
+ 0.3492063492063492,2747,14178.558655865589,0.09788689733813206,0.0020239782851517946
326
+ 0.3501984126984127,2751,14168.9399939994,0.09802943377400848,0.00202692546867586
327
+ 0.3511904761904761,2760,14120.846684668468,0.09835014075473043,0.0020335566316050067
328
+ 0.3531746031746032,2770,14063.134713471349,0.09870648184442148,0.00204092459041517
329
+ 0.3551587301587302,2773,14053.51605160516,0.0988133841713288,0.0020431349780582185
330
+ 0.3561507936507936,2776,14043.897389738971,0.09892028649823612,0.0020453453657012675
331
+ 0.3581349206349206,2780,14034.278727872788,0.09906282293411253,0.002048292549225333
332
+ 0.3591269841269841,2783,14024.6600660066,0.09916972526101984,0.0020505029368683815
333
+ 0.3601190476190476,2791,14005.422742274228,0.09945479813277269,0.0020563973039165124
334
+ 0.3611111111111111,2799,13976.566756675667,0.09973987100452553,0.0020622916709646427
335
+ 0.3630952380952381,2807,13938.092109210922,0.10002494387627837,0.002068186038012773
336
+ 0.3640873015873015,2822,13870.761476147614,0.10055945551081495,0.0020792379762280175
337
+ 0.365079365079365,2830,13841.905490549056,0.1008445283825678,0.0020851323432761483
338
+ 0.3660714285714285,2841,13803.430843084308,0.10123650358122795,0.0020932370979673274
339
+ 0.367063492063492,2852,13774.574857485748,0.1016284787798881,0.002101341852658507
340
+ 0.3680555555555556,2877,13668.769576957697,0.10251933150411574,0.0021197617496839147
341
+ 0.369047619047619,2880,13659.15091509151,0.10262623383102305,0.0021219721373269634
342
+ 0.3700396825396825,2893,13630.29492949295,0.10308947724762142,0.0021315504837801755
343
+ 0.371031746031746,2908,13591.820282028202,0.103623988882158,0.00214260242199542
344
+ 0.3720238095238095,2909,13582.201620162015,0.1036596229911271,0.0021433392178764363
345
+ 0.3740079365079365,2915,13562.964296429644,0.10387342764494174,0.0021477599931625344
346
+ 0.375,2933,13514.87098709871,0.10451484160638563,0.002161022319020828
347
+ 0.376984126984127,2944,13466.777677767775,0.1049068168050458,0.0021691270737120073
348
+ 0.3779761904761904,2957,13428.30303030303,0.10537006022164416,0.002178705420165219
349
+ 0.3799603174603174,2968,13399.447044704471,0.10576203542030431,0.0021868101748563984
350
+ 0.3809523809523809,2975,13370.591059105913,0.10601147418308805,0.0021919677460235125
351
+ 0.3829365079365079,2986,13332.116411641164,0.10640344938174821,0.002200072500714692
352
+ 0.3849206349206349,2994,13303.260426042603,0.10668852225350105,0.0022059668677628223
353
+ 0.3859126984126984,2998,13293.641764176418,0.10683105868937748,0.0022089140512868877
354
+ 0.3869047619047619,3005,13255.167116711673,0.10708049745216121,0.0022140716224540017
355
+ 0.3878968253968254,3009,13245.548454845484,0.10722303388803762,0.002217018805978067
356
+ 0.3888888888888889,3019,13216.692469246926,0.10757937497772868,0.00222438676478823
357
+ 0.3898809523809524,3034,13158.980498049805,0.10811388661226526,0.0022354387030034746
358
+ 0.3908730158730158,3064,13053.175217521752,0.10918290988133841,0.002257542579433964
359
+ 0.3918650793650793,3068,13043.556555655565,0.10932544631721484,0.0022604897629580293
360
+ 0.3928571428571428,3076,13014.700570057006,0.10961051918896768,0.0022663841300061597
361
+ 0.3938492063492063,3085,12985.844584458446,0.10993122616968963,0.0022730152929353064
362
+ 0.3948412698412698,3090,12966.607260726072,0.11010939671453515,0.0022766992723403877
363
+ 0.3968253968253968,3094,12956.988598859883,0.11025193315041157,0.002279646455864453
364
+ 0.3978174603174603,3098,12947.3699369937,0.110394469586288,0.0022825936393885186
365
+ 0.3988095238095238,3104,12928.132613261329,0.11060827424010262,0.0022870144146746162
366
+ 0.3998015873015873,3109,12918.51395139514,0.11078644478494815,0.0022906983940796976
367
+ 0.4007936507936508,3112,12908.895289528951,0.11089334711185547,0.0022929087817227466
368
+ 0.4017857142857143,3134,12851.183318331834,0.11167729750917578,0.0023091182911051055
369
+ 0.4027777777777778,3141,12831.94599459946,0.11192673627195952,0.0023142758622722195
370
+ 0.4037698412698413,3161,12774.23402340234,0.11263941845134162,0.0023290117798925457
371
+ 0.4047619047619047,3167,12745.37803780378,0.11285322310515626,0.0023334325551786434
372
+ 0.4057539682539682,3177,12706.903390339034,0.1132095641948473,0.0023408005139888065
373
+ 0.4067460317460317,3184,12678.047404740477,0.11345900295763105,0.002345958085155921
374
+ 0.4077380952380952,3187,12668.428742874288,0.11356590528453836,0.0023481684727989695
375
+ 0.4087301587301587,3190,12658.8100810081,0.11367280761144567,0.0023503788604420186
376
+ 0.4097222222222222,3194,12649.191419141916,0.1138153440473221,0.002353326043966084
377
+ 0.4107142857142857,3200,12620.335433543354,0.11402914870113673,0.0023577468192521817
378
+ 0.4117063492063492,3207,12591.479447944794,0.11427858746392046,0.0023629043904192957
379
+ 0.4136904761904761,3211,12581.860786078609,0.11442112389979689,0.002365851573943361
380
+ 0.4146825396825397,3220,12562.623462346235,0.11474183088051883,0.002372482736872508
381
+ 0.4156746031746032,3224,12543.38613861386,0.11488436731639526,0.002375429920396573
382
+ 0.4166666666666667,3235,12495.29282928293,0.1152763425150554,0.0023835346750877523
383
+ 0.4176587301587302,3240,12466.436843684369,0.11545451305990094,0.002387218654492834
384
+ 0.4186507936507936,3253,12418.343534353437,0.11591775647649931,0.0023967970009460457
385
+ 0.4196428571428571,3256,12408.72487248725,0.11602465880340662,0.002399007388589095
386
+ 0.4206349206349206,3261,12399.106210621065,0.11620282934825214,0.0024026913679941766
387
+ 0.4216269841269841,3271,12379.86888688869,0.1165591704379432,0.0024100593268043392
388
+ 0.4226190476190476,3275,12370.250225022502,0.11670170687381962,0.0024130065103284046
389
+ 0.4236111111111111,3277,12351.012901290129,0.11677297509175782,0.0024144801020904373
390
+ 0.4246031746031746,3280,12341.394239423942,0.11687987741866515,0.0024166904897334864
391
+ 0.4255952380952381,3285,12331.775577557755,0.11705804796351067,0.0024203744691385677
392
+ 0.4265873015873015,3290,12322.156915691568,0.1172362185083562,0.0024240584485436495
393
+ 0.427579365079365,3304,12283.682268226825,0.11773509603392367,0.0024343735908778775
394
+ 0.4285714285714285,3317,12254.826282628264,0.11819833945052204,0.0024439519373310897
395
+ 0.429563492063492,3322,12245.207620762076,0.11837650999536757,0.002447635916736171
396
+ 0.4305555555555556,3333,12197.114311431144,0.11876848519402772,0.0024557406714273504
397
+ 0.431547619047619,3365,12120.16501650165,0.1199087766810391,0.0024793181396198724
398
+ 0.4325396825396825,3375,12100.927692769275,0.12026511777073014,0.0024866860984300355
399
+ 0.4345238095238095,3398,12014.359735973598,0.12108470227701956,0.0025036324036934103
400
+ 0.435515873015873,3402,12004.74107410741,0.12122723871289598,0.0025065795872174757
401
+ 0.4365079365079365,3414,11975.88508850885,0.12165484802052524,0.0025154211377896715
402
+ 0.4375,3432,11927.791779177918,0.12229626198196913,0.002528683463647965
403
+ 0.439484126984127,3463,11831.605160516052,0.12340091936001141,0.00255152413595947
404
+ 0.4404761904761904,3465,11821.986498649863,0.12347218757794962,0.002552997727721503
405
+ 0.4414682539682539,3485,11773.893189318933,0.12418486975733171,0.002567733645341829
406
+ 0.4424603174603174,3499,11754.65586558656,0.12468374728289919,0.0025780487876760575
407
+ 0.4434523809523809,3507,11735.418541854186,0.12496882015465204,0.002583943154724188
408
+ 0.4454365079365079,3513,11725.799879988,0.12518262480846667,0.0025883639300102856
409
+ 0.4464285714285714,3526,11687.325232523252,0.12564586822506504,0.0025979422764634977
410
+ 0.4484126984126984,3536,11658.469246924691,0.1260022093147561,0.002605310235273661
411
+ 0.4494047619047619,3545,11639.23192319232,0.12632291629547804,0.0026119413982028075
412
+ 0.4503968253968254,3567,11571.901290129012,0.12710686669279836,0.0026281509075851664
413
+ 0.4513888888888889,3571,11562.282628262828,0.12724940312867478,0.0026310980911092314
414
+ 0.4523809523809524,3574,11552.66396639664,0.12735630545558208,0.0026333084787522804
415
+ 0.4533730158730158,3582,11543.045304530451,0.12764137832733494,0.002639202845800411
416
+ 0.4543650793650793,3585,11533.426642664266,0.12774828065424224,0.00264141323344346
417
+ 0.4553571428571428,3589,11523.80798079808,0.12789081709011865,0.002644360416967525
418
+ 0.4563492063492063,3604,11485.333333333334,0.12842532872465523,0.0026554123551827697
419
+ 0.4583333333333333,3608,11475.714671467147,0.12856786516053167,0.0026583595387068347
420
+ 0.4593253968253968,3626,11418.002700270026,0.12920927912197555,0.002671621864565128
421
+ 0.4603174603174603,3632,11389.146714671468,0.12942308377579018,0.0026760426398512263
422
+ 0.4613095238095238,3636,11369.909390939094,0.12956562021166662,0.0026789898233752912
423
+ 0.4623015873015873,3648,11341.053405340534,0.12999322951929587,0.002687831373947487
424
+ 0.4632936507936508,3658,11331.434743474349,0.13034957060898691,0.00269519933275765
425
+ 0.4642857142857143,3678,11292.9600960096,0.13106225278836903,0.0027099352503779763
426
+ 0.4652777777777778,3693,11254.485448544854,0.1315967644229056,0.0027209871885932207
427
+ 0.4662698412698413,3708,11158.298829882988,0.13213127605744218,0.0027320391268084655
428
+ 0.4672619047619047,3714,11139.061506150616,0.1323450807112568,0.0027364599020945632
429
+ 0.4682539682539682,3716,11129.44284428443,0.13241634892919502,0.002737933493856596
430
+ 0.4692460317460317,3720,11119.824182418242,0.13255888536507143,0.0027408806773806613
431
+ 0.4712301587301587,3726,11110.205520552056,0.13277269001888609,0.002745301452666759
432
+ 0.4732142857142857,3735,11090.968196819682,0.13309339699960804,0.0027519326155959058
433
+ 0.4742063492063492,3741,11081.349534953495,0.13330720165342266,0.0027563533908820034
434
+ 0.4761904761904761,3745,11071.730873087308,0.13344973808929908,0.002759300574406069
435
+ 0.4771825396825397,3757,11023.637563756376,0.13387734739692833,0.0027681421249782646
436
+ 0.4781746031746032,3766,10985.162916291629,0.13419805437765028,0.0027747732879074114
437
+ 0.4791666666666667,3771,10965.925592559255,0.13437622492249582,0.0027784572673124927
438
+ 0.4801587301587302,3774,10956.30693069307,0.13448312724940312,0.0027806676549555417
439
+ 0.4811507936507936,3785,10927.45094509451,0.13487510244806328,0.002788772409646721
440
+ 0.4821428571428571,3790,10917.832283228325,0.1350532729929088,0.0027924563890518025
441
+ 0.4831349206349206,3803,10888.976297629762,0.13551651640950718,0.0028020347355050147
442
+ 0.4841269841269841,3808,10879.357635763576,0.13569468695435272,0.002805718714910096
443
+ 0.4851190476190476,3813,10869.73897389739,0.13587285749919822,0.0028094026943151777
444
+ 0.4871031746031746,3823,10850.501650165015,0.1362291985888893,0.002816770653125341
445
+ 0.4880952380952381,3826,10831.264326432643,0.1363361009157966,0.00281898104076839
446
+ 0.490079365079365,3831,10802.408340834085,0.13651427146064213,0.002822665020173471
447
+ 0.492063492063492,3835,10792.789678967896,0.13665680789651855,0.0028256122036975366
448
+ 0.4930555555555556,3840,10773.552355235524,0.13683497844136408,0.002829296183102618
449
+ 0.494047619047619,3846,10754.31503150315,0.1370487830951787,0.0028337169583887156
450
+ 0.4950396825396825,3856,10725.45904590459,0.13740512418486975,0.0028410849171988787
451
+ 0.496031746031746,3868,10706.221722172217,0.13783273349249903,0.0028499264677710745
452
+ 0.4970238095238095,3877,10686.984398439845,0.13815344047322098,0.0028565576307002212
453
+ 0.4990079365079365,3891,10658.128412841284,0.13865231799878844,0.0028668727730344497
454
+ 0.5,3896,10648.509750975098,0.13883048854363397,0.002870556752439531
455
+ 0.5009920634920635,3913,10610.03510351035,0.13943626839610876,0.002883082282416808
456
+ 0.501984126984127,3940,10552.323132313231,0.1403983893382746,0.002902975771204249
457
+ 0.5029761904761905,3954,10523.46714671467,0.14089726686384207,0.002913290913538477
458
+ 0.503968253968254,3960,10513.848484848484,0.1411110715176567,0.002917711688824575
459
+ 0.5049603174603174,3973,10494.611161116112,0.14157431493425507,0.0029272900352777867
460
+ 0.5059523809523809,4000,10398.424542454246,0.1425364358764209,0.002947183524065227
461
+ 0.5069444444444444,4013,10369.568556855686,0.14299967929301927,0.002956761870518439
462
+ 0.5079365079365079,4021,10350.331233123312,0.14328475216477213,0.0029626562375665694
463
+ 0.5089285714285714,4041,10302.23792379238,0.1439974343441542,0.0029773921551868956
464
+ 0.5099206349206349,4045,10292.619261926193,0.14413997078003066,0.002980339338710961
465
+ 0.5119047619047619,4051,10283.000600060006,0.14435377543384528,0.0029847601139970587
466
+ 0.5138888888888888,4061,10263.763276327632,0.14471011652353633,0.0029921280728072218
467
+ 0.5148809523809523,4075,10234.907290729074,0.1452089940491038,0.0030024432151414503
468
+ 0.5168650793650794,4080,10225.288628862889,0.14538716459394932,0.0030061271945465316
469
+ 0.5178571428571429,4084,10215.6699669967,0.14552970102982574,0.003009074378070597
470
+ 0.5188492063492064,4089,10206.051305130512,0.14570787157467127,0.0030127583574756783
471
+ 0.5218253968253969,4101,10186.81398139814,0.14613548088230055,0.003021599908047874
472
+ 0.5238095238095238,4108,10177.195319531951,0.14638491964508427,0.003026757479214988
473
+ 0.5248015873015873,4117,10148.339333933394,0.14670562662580622,0.003033388642144135
474
+ 0.5277777777777778,4129,10129.10201020102,0.1471332359334355,0.0030422301927163307
475
+ 0.5287698412698413,4139,10109.864686468649,0.14748957702312654,0.0030495981515264938
476
+ 0.5307539682539683,4145,10100.24602460246,0.14770338167694116,0.0030540189268125914
477
+ 0.5317460317460317,4168,10052.152715271528,0.14852296618323058,0.0030709652320759667
478
+ 0.5327380952380952,4175,10042.53405340534,0.14877240494601432,0.0030761228032430807
479
+ 0.5337301587301587,4185,10023.296729672968,0.1491287460357054,0.003083490762053244
480
+ 0.5347222222222222,4225,9955.966096609662,0.1505541103944696,0.003112962597293896
481
+ 0.5357142857142857,4240,9927.1101110111,0.15108862202900616,0.0031240145355091405
482
+ 0.5367063492063492,4250,9907.872787278728,0.1514449631186972,0.0031313824943193036
483
+ 0.5376984126984127,4262,9888.635463546354,0.15187257242632649,0.0031402240448914994
484
+ 0.5386904761904762,4291,9840.542154215422,0.15290596158643052,0.0031615911254409723
485
+ 0.5406746031746031,4303,9811.68616861686,0.1533335708940598,0.003170432676013168
486
+ 0.5416666666666666,4309,9792.448844884488,0.15354737554787443,0.003174853451299266
487
+ 0.5426587301587301,4332,9744.355535553555,0.15436696005416384,0.003191799756562641
488
+ 0.5436507936507936,4339,9725.118211821182,0.15461639881694758,0.003196957327729755
489
+ 0.5446428571428571,4370,9648.16891689169,0.15572105619498985,0.0032197980000412607
490
+ 0.5456349206349206,4376,9638.550255025502,0.15593486084880448,0.0032242187753273584
491
+ 0.5466269841269841,4395,9609.694269426942,0.15661190891921747,0.003238217897066668
492
+ 0.5476190476190477,4411,9590.45694569457,0.15718205466272317,0.003250006631162929
493
+ 0.5496031746031746,4430,9542.363636363636,0.15785910273313616,0.0032640057529022388
494
+ 0.5505952380952381,4435,9532.74497449745,0.15803727327798167,0.0032676897323073205
495
+ 0.5515873015873016,4446,9513.507650765076,0.15842924847664183,0.0032757944869985
496
+ 0.5525793650793651,4453,9503.88898889889,0.15867868723942558,0.003280952058165614
497
+ 0.5535714285714286,4466,9484.651665166515,0.15914193065602394,0.003290530404618826
498
+ 0.5555555555555556,4475,9465.414341434143,0.1594626376367459,0.003297161567547973
499
+ 0.5565476190476191,4489,9446.17701770177,0.15996151516231336,0.003307476709882201
500
+ 0.5575396825396826,4496,9436.558355835585,0.1602109539250971,0.0033126342810493154
501
+ 0.560515873015873,4507,9417.321032103211,0.16060292912375726,0.0033207390357404944
502
+ 0.564484126984127,4524,9398.083708370836,0.16120870897623205,0.003333264565717772
503
+ 0.5654761904761905,4556,9349.990399039903,0.1623490004632434,0.0033568420339102935
504
+ 0.566468253968254,4566,9330.753075307532,0.16270534155293448,0.0033642099927204566
505
+ 0.5674603174603174,4574,9321.134413441345,0.1629904144246873,0.003370104359768587
506
+ 0.5684523809523809,4582,9311.515751575158,0.16327548729644015,0.0033759987268167178
507
+ 0.5694444444444444,4586,9301.897089708971,0.16341802373231656,0.0033789459103407827
508
+ 0.5704365079365079,4597,9282.659765976598,0.16380999893097672,0.003387050665031962
509
+ 0.5724206349206349,4603,9263.422442244224,0.16402380358479135,0.00339147144031806
510
+ 0.5734126984126984,4608,9253.803780378035,0.16420197412963689,0.0033951554197231416
511
+ 0.5753968253968254,4631,9215.329132913292,0.1650215586359263,0.003412101724986517
512
+ 0.5763888888888888,4635,9205.710471047105,0.16516409507180274,0.003415048908510582
513
+ 0.5773809523809523,4651,9176.854485448544,0.1657342408153084,0.0034268376426068426
514
+ 0.5783730158730159,4656,9167.235823582358,0.16591241136015394,0.0034305216220119244
515
+ 0.5803571428571429,4666,9157.61716171617,0.166268752449845,0.0034378895808220874
516
+ 0.5813492063492064,4681,9128.761176117612,0.16680326408438156,0.003448941519037332
517
+ 0.5823412698412699,4684,9119.142514251424,0.1669101664112889,0.003451151906680381
518
+ 0.5833333333333334,4707,9080.667866786678,0.1677297509175783,0.003468098211943756
519
+ 0.5843253968253969,4722,9042.193219321933,0.16826426255211488,0.0034791501501590005
520
+ 0.5853174603174603,4727,9032.574557455746,0.16844243309696041,0.003482834129564082
521
+ 0.5863095238095238,4731,9022.95589558956,0.16858496953283683,0.0034857813130881473
522
+ 0.5892857142857143,4747,8984.481248124812,0.16915511527634253,0.003497570047184408
523
+ 0.5912698412698413,4757,8965.243924392438,0.16951145636603357,0.003504938005994571
524
+ 0.5922619047619048,4781,8926.769276927693,0.1703666749812921,0.0035226211071389627
525
+ 0.5932539682539683,4791,8917.150615061506,0.17072301607098314,0.003529989065949126
526
+ 0.5952380952380952,4795,8907.531953195321,0.17086555250685956,0.0035329362494731908
527
+ 0.5962301587301587,4802,8897.913291329132,0.1711149912696433,0.0035380938206403052
528
+ 0.5972222222222222,4808,8869.057305730574,0.17132879592345793,0.003542514595926403
529
+ 0.5982142857142857,4818,8849.8199819982,0.171685137013149,0.003549882554736566
530
+ 0.5992063492063492,4830,8830.582658265826,0.17211274632077825,0.003558724105308762
531
+ 0.6001984126984127,4845,8792.10801080108,0.17264725795531483,0.003569776043524006
532
+ 0.6011904761904762,4857,8772.870687068707,0.17307486726294408,0.003578617594096202
533
+ 0.6021825396825397,4871,8753.633363336334,0.17357374478851156,0.00358893273643043
534
+ 0.6031746031746031,4875,8744.014701470147,0.17371628122438798,0.0035918799199544955
535
+ 0.6041666666666666,4881,8734.39603960396,0.1739300858782026,0.003596300695240593
536
+ 0.6071428571428571,4913,8695.921392139215,0.17507037736521397,0.003619878163433115
537
+ 0.6081349206349206,4932,8657.446744674467,0.175747425435627,0.003633877285172425
538
+ 0.6091269841269841,4942,8647.82808280828,0.17610376652531803,0.003641245243982588
539
+ 0.6101190476190477,4965,8618.97209720972,0.17692335103160745,0.0036581915492459633
540
+ 0.6121031746031746,4973,8599.734773477347,0.1772084239033603,0.0036640859162940936
541
+ 0.6130952380952381,4998,8561.260126012601,0.17809927662758793,0.003682505813319501
542
+ 0.6140873015873016,5022,8532.40414041404,0.17895449524284646,0.0037001889144638927
543
+ 0.6160714285714286,5048,8484.310831083108,0.1798809820760432,0.0037193456073703166
544
+ 0.6170634920634921,5054,8474.692169216922,0.18009478672985782,0.0037237663826564142
545
+ 0.6180555555555556,5089,8426.598859885988,0.1813419805437765,0.0037495542384919853
546
+ 0.6190476190476191,5116,8388.124212421242,0.18230410148594234,0.0037694477272794255
547
+ 0.6200396825396826,5119,8378.505550555055,0.18241100381284966,0.0037716581149224745
548
+ 0.6220238095238095,5125,8368.886888688869,0.1826248084666643,0.003776078890208572
549
+ 0.623015873015873,5129,8359.268226822682,0.1827673449025407,0.0037790260737326376
550
+ 0.6240079365079365,5144,8330.412241224123,0.18330185653707728,0.003790078011947882
551
+ 0.625,5156,8311.17491749175,0.18372946584470656,0.003798919562520078
552
+ 0.6259920634920635,5162,8301.556255625563,0.1839432704985212,0.0038033403378061755
553
+ 0.626984126984127,5166,8291.937593759376,0.1840858069343976,0.003806287521330241
554
+ 0.628968253968254,5179,8282.318931893189,0.18454905035099597,0.0038158658677834526
555
+ 0.6299603174603174,5187,8272.700270027002,0.1848341232227488,0.003821760234831583
556
+ 0.6309523809523809,5194,8263.081608160817,0.18508356198553255,0.0038269178059986975
557
+ 0.6329365079365079,5233,8195.75097509751,0.18647329223532766,0.0038556528453583335
558
+ 0.6339285714285714,5256,8166.89498949895,0.18729287674161707,0.0038725991506217083
559
+ 0.6349206349206349,5264,8157.276327632763,0.1875779496133699,0.0038784935176698386
560
+ 0.6359126984126984,5278,8138.0390039003905,0.1880768271389374,0.003888808660004067
561
+ 0.6369047619047619,5286,8128.420342034204,0.18836190001069023,0.0038947030270521975
562
+ 0.6378968253968254,5292,8118.801680168017,0.18857570466450485,0.003899123802338295
563
+ 0.6388888888888888,5311,8099.564356435643,0.18925275273491787,0.003913122924077605
564
+ 0.6408730158730159,5326,8080.327032703271,0.18978726436945445,0.00392417486229285
565
+ 0.6428571428571429,5351,8051.47104710471,0.19067811709368207,0.003942594759318257
566
+ 0.6448412698412699,5394,7993.759075907591,0.1922103837793536,0.003974276982201958
567
+ 0.6458333333333334,5409,7974.521752175217,0.19274489541389017,0.003985328920417203
568
+ 0.6468253968253969,5426,7955.284428442845,0.19335067526636496,0.003997854450394481
569
+ 0.6478174603174603,5444,7926.428442844284,0.19399208922780886,0.004011116776252774
570
+ 0.6488095238095238,5452,7916.8097809780975,0.1942771620995617,0.004017011143300904
571
+ 0.6507936507936508,5474,7887.953795379538,0.19506111249688202,0.004033220652683263
572
+ 0.6517857142857143,5492,7868.716471647165,0.19570252645832592,0.0040464829785415565
573
+ 0.6537698412698413,5515,7849.479147914792,0.19652211096461533,0.004063429283804932
574
+ 0.6547619047619048,5535,7820.623162316232,0.19723479314399744,0.004078165201425258
575
+ 0.6557539682539683,5548,7811.004500450045,0.1976980365605958,0.00408774354787847
576
+ 0.6567460317460317,5564,7791.767176717672,0.19826818230410148,0.004099532281974731
577
+ 0.6587301587301587,5585,7762.911191119112,0.19901649859245268,0.004115004995476073
578
+ 0.6597222222222222,5590,7753.292529252925,0.19919466913729822,0.004118688974881155
579
+ 0.6607142857142857,5598,7734.055205520553,0.19947974200905105,0.0041245833419292855
580
+ 0.6626984126984127,5619,7714.817881788179,0.20022805829740228,0.004140056055430628
581
+ 0.6636904761904762,5640,7685.961896189619,0.2009763745857535,0.0041555287689319705
582
+ 0.6646825396825397,5646,7676.343234323433,0.2011901792395681,0.004159949544218068
583
+ 0.6666666666666666,5658,7666.724572457246,0.2016177885471974,0.004168791094790264
584
+ 0.6686507936507936,5674,7647.487248724872,0.20218793429070306,0.004180579828886525
585
+ 0.6696428571428571,5687,7637.868586858686,0.20265117770730143,0.004190158175339737
586
+ 0.6706349206349206,5709,7609.012601260126,0.20343512810462175,0.004206367684722095
587
+ 0.6716269841269841,5720,7599.393939393939,0.2038271033032819,0.004214472439413274
588
+ 0.6726190476190477,5753,7560.919291929193,0.20500302889926236,0.004238786703486813
589
+ 0.6736111111111112,5780,7522.444644464446,0.20596514984142822,0.004258680192274253
590
+ 0.6755952380952381,5794,7503.207320732074,0.20646402736699568,0.004268995334608482
591
+ 0.6765873015873016,5798,7493.588658865887,0.20660656380287212,0.004271942518132546
592
+ 0.6775793650793651,5803,7483.9699969997,0.20678473434771763,0.0042756264975376285
593
+ 0.6795634920634921,5811,7474.351335133513,0.20706980721947046,0.0042815208645857585
594
+ 0.6805555555555556,5822,7464.732673267326,0.20746178241813062,0.004289625619276938
595
+ 0.6815476190476191,5839,7445.495349534954,0.2080675622706054,0.0043021511492542155
596
+ 0.6825396825396826,5854,7426.25802580258,0.208602073905142,0.0043132030874694595
597
+ 0.683531746031746,5864,7416.639363936394,0.20895841499483306,0.004320571046279623
598
+ 0.6865079365079365,5875,7407.020702070207,0.20935039019349322,0.004328675800970802
599
+ 0.6884920634920635,5899,7368.546054605461,0.21020560880875175,0.004346358902115194
600
+ 0.689484126984127,5912,7349.308730873087,0.21066885222535012,0.004355937248568405
601
+ 0.6904761904761905,5925,7339.690069006901,0.2111320956419485,0.004365515595021617
602
+ 0.691468253968254,5956,7310.834083408341,0.21223675301999073,0.004388356267333123
603
+ 0.6924603174603174,5969,7291.596759675967,0.2126999964365891,0.004397934613786335
604
+ 0.6934523809523809,5986,7272.359435943595,0.2133057762890639,0.004410460143763613
605
+ 0.6954365079365079,5994,7253.122112211221,0.21359084916081672,0.004416354510811743
606
+ 0.6964285714285714,6002,7243.503450345034,0.21387592203256958,0.0044222488778598735
607
+ 0.6974206349206349,6013,7233.884788478848,0.21426789723122974,0.004430353632551053
608
+ 0.6984126984126984,6016,7224.266126612661,0.21437479955813704,0.0044325640201941015
609
+ 0.6994047619047619,6029,7214.647464746475,0.2148380429747354,0.004442142366647314
610
+ 0.7003968253968254,6049,7195.410141014101,0.21555072515411752,0.00445687828426764
611
+ 0.7013888888888888,6053,7185.791479147915,0.21569326158999394,0.004459825467791705
612
+ 0.7023809523809523,6065,7176.172817281728,0.21612087089762322,0.004468667018363901
613
+ 0.7043650793650794,6085,7156.935493549355,0.2168335530770053,0.004483402935984227
614
+ 0.7063492063492064,6094,7147.316831683168,0.21715426005772726,0.0044900340989133735
615
+ 0.7073412698412699,6123,7118.460846084608,0.21818764921783132,0.0045114011794628464
616
+ 0.7083333333333334,6134,7108.842184218422,0.21857962441649145,0.004519505934154026
617
+ 0.7093253968253969,6145,7099.223522352236,0.2189715996151516,0.004527610688845205
618
+ 0.7103174603174603,6155,7089.604860486049,0.21932794070484268,0.004534978647655368
619
+ 0.7113095238095238,6180,7060.7488748874885,0.2202187934290703,0.0045533985446807755
620
+ 0.7123015873015873,6212,7031.892889288929,0.22135908491608167,0.004576976012873298
621
+ 0.7132936507936508,6229,7012.655565556555,0.22196486476855645,0.0045895015428505746
622
+ 0.7142857142857143,6239,6993.418241824183,0.22232120585824752,0.004596869501660738
623
+ 0.7152777777777778,6247,6983.799579957996,0.22260627873000036,0.004602763868708868
624
+ 0.7162698412698413,6254,6974.180918091809,0.2228557174927841,0.004607921439875983
625
+ 0.7172619047619048,6266,6964.562256225623,0.22328332680041335,0.004616762990448178
626
+ 0.7182539682539683,6278,6945.324932493249,0.22371093610804263,0.004625604541020374
627
+ 0.7192460317460317,6310,6916.46894689469,0.224851227595054,0.004649182009212896
628
+ 0.7202380952380952,6320,6906.850285028503,0.22520756868474504,0.004656549968023059
629
+ 0.7232142857142857,6330,6897.231623162316,0.22556390977443608,0.004663917926833222
630
+ 0.7251984126984127,6362,6868.375637563757,0.22670420126144747,0.004687495395025744
631
+ 0.7261904761904762,6375,6849.138313831383,0.22716744467804584,0.004697073741478956
632
+ 0.7271825396825397,6380,6839.519651965196,0.22734561522289135,0.004700757720884037
633
+ 0.7281746031746031,6398,6820.282328232824,0.22798702918433525,0.004714020046742331
634
+ 0.7291666666666666,6404,6810.663666366637,0.22820083383814987,0.004718440822028429
635
+ 0.7301587301587301,6430,6781.807680768077,0.2291273206713466,0.004737597514934852
636
+ 0.7311507936507936,6438,6772.18901890189,0.22941239354309945,0.004743491881982983
637
+ 0.7321428571428571,6453,6752.951695169517,0.22994690517763602,0.004754543820198228
638
+ 0.7331349206349206,6461,6743.33303330333,0.23023197804938889,0.004760438187246358
639
+ 0.7371031746031746,6471,6733.714371437144,0.23058831913907993,0.004767806146056521
640
+ 0.7400793650793651,6476,6724.095709570957,0.23076648968392546,0.004771490125461603
641
+ 0.7410714285714286,6524,6676.002400240024,0.2324769269144425,0.004806856327750385
642
+ 0.7420634920634921,6534,6666.383738373837,0.23283326800413356,0.0048142242865605485
643
+ 0.7440476190476191,6572,6637.527752775278,0.23418736414495955,0.004842222530039168
644
+ 0.7450396825396826,6660,6560.578457845784,0.2373231657342408,0.004907060567568603
645
+ 0.7470238095238095,6667,6550.959795979598,0.23757260449702455,0.004912218138735717
646
+ 0.748015873015873,6675,6541.341134113412,0.2378576773687774,0.004918112505783848
647
+ 0.75,6733,6474.010501050105,0.2399244556889855,0.0049608466668827934
648
+ 0.7509920634920635,6749,6454.773177317732,0.24049460143249118,0.004972635400979054
649
+ 0.753968253968254,6780,6425.917191719172,0.24159925881053343,0.00499547607329056
650
+ 0.7549603174603174,6803,6406.679867986799,0.24241884331682287,0.005012422378553935
651
+ 0.7559523809523809,6855,6358.586558655866,0.24427181698321634,0.005050735764366783
652
+ 0.7569444444444444,6875,6339.349234923492,0.24498449916259843,0.005065471681987109
653
+ 0.7579365079365079,6899,6320.111911191119,0.24583971777785696,0.0050831547831315
654
+ 0.7599206349206349,6931,6291.255925592559,0.24698000926486832,0.0051067322513240225
655
+ 0.7638888888888888,6976,6252.781278127813,0.24858354416847805,0.005139888065969756
656
+ 0.7648809523809523,7021,6204.687968796879,0.2501870790720878,0.00517304388061549
657
+ 0.7658730158730159,7036,6195.0693069306935,0.25072159070662436,0.005184095818830735
658
+ 0.7668650793650794,7062,6166.213321332133,0.2516480775398211,0.005203252511737158
659
+ 0.7678571428571429,7072,6156.594659465946,0.2520044186295122,0.005210620470547322
660
+ 0.7688492063492064,7088,6146.97599759976,0.25257456437301784,0.005222409204643582
661
+ 0.7698412698412699,7106,6127.738673867387,0.25321597833446174,0.005235671530501876
662
+ 0.7708333333333334,7167,6060.408040804081,0.2553896589815772,0.00528061607924387
663
+ 0.7718253968253969,7196,6041.170717071707,0.2564230481416812,0.005301983159793343
664
+ 0.7738095238095238,7231,6012.314731473148,0.2576702419555999,0.005327771015628914
665
+ 0.7748015873015873,7244,6002.696069606961,0.25813348537219827,0.005337349362082126
666
+ 0.7757936507936508,7273,5983.458745874587,0.2591668745323023,0.005358716442631599
667
+ 0.7767857142857143,7289,5964.221422142215,0.25973702027580803,0.00537050517672786
668
+ 0.7777777777777778,7328,5925.746774677468,0.2611267505256031,0.005399240216087496
669
+ 0.7807539682539683,7415,5848.797479747975,0.26422691800591525,0.005463341457735914
670
+ 0.7817460317460317,7427,5839.178817881788,0.26465452731354455,0.0054721830083081105
671
+ 0.7827380952380952,7435,5829.560156015602,0.2649396001852974,0.0054780773753562405
672
+ 0.7857142857142857,7449,5819.941494149415,0.2654384777108648,0.005488392517690469
673
+ 0.7867063492063492,7465,5810.322832283228,0.26600862345437054,0.00550018125178673
674
+ 0.7876984126984127,7474,5800.7041704170415,0.2663293304350925,0.005506812414715877
675
+ 0.7886904761904762,7512,5771.848184818482,0.2676834265759185,0.0055348106581944966
676
+ 0.7906746031746031,7521,5762.229522952295,0.26800413355664043,0.005541441821123643
677
+ 0.7916666666666666,7533,5752.610861086108,0.2684317428642697,0.005550283371695839
678
+ 0.7926587301587301,7568,5714.136213621362,0.2696789366781884,0.00557607122753141
679
+ 0.7936507936507936,7597,5694.898889888989,0.2707123258382924,0.005597438308080883
680
+ 0.7946428571428571,7604,5685.2802280228025,0.2709617646010761,0.005602595879247997
681
+ 0.7956349206349206,7674,5627.5682568256825,0.2734561522289135,0.005654171590919138
682
+ 0.7966269841269841,7682,5617.949594959496,0.27374122510066634,0.005660065957967269
683
+ 0.7986111111111112,7707,5598.712271227123,0.27463207782489396,0.005678485854992676
684
+ 0.7996031746031746,7723,5579.474947494749,0.2752022235683997,0.005690274589088937
685
+ 0.8005952380952381,7762,5541.000300030003,0.2765919538181948,0.005719009628448573
686
+ 0.8015873015873016,7779,5521.762976297629,0.2771977336706696,0.00573153515842585
687
+ 0.8025793650793651,7807,5492.90699069907,0.2781954887218045,0.005752165443094307
688
+ 0.8045634920634921,7831,5473.669666966697,0.27905070733706305,0.0057698485442386985
689
+ 0.8065476190476191,7869,5444.813681368137,0.28040480347788904,0.005797846787717318
690
+ 0.8075396825396826,7879,5435.19501950195,0.2807611445675801,0.005805214746527481
691
+ 0.808531746031746,7894,5425.576357635764,0.28129565620211666,0.005816266684742726
692
+ 0.8095238095238095,7924,5396.720372037204,0.2823646794711898,0.0058383705611732145
693
+ 0.8125,7950,5377.483048304831,0.28329116630438655,0.005857527254079639
694
+ 0.8134920634920635,7965,5367.864386438644,0.28382567793892316,0.005868579192294884
695
+ 0.8154761904761905,8009,5319.771077107711,0.28539357873356375,0.0059009982110596005
696
+ 0.8174603174603174,8022,5310.152415241524,0.2858568221501621,0.005910576557512813
697
+ 0.8184523809523809,8037,5300.533753375337,0.2863913337846987,0.0059216284957280575
698
+ 0.8204365079365079,8045,5290.915091509151,0.28667640665645155,0.005927522862776188
699
+ 0.8214285714285714,8110,5242.821782178218,0.2889926237394434,0.005975414595042248
700
+ 0.8224206349206349,8153,5204.347134713471,0.2905248904251149,0.006007096817925949
701
+ 0.8234126984126984,8167,5194.728472847285,0.2910237679506824,0.006017411960260177
702
+ 0.8244047619047619,8206,5165.872487248725,0.2924134982004775,0.006046146999619814
703
+ 0.8263888888888888,8220,5156.253825382539,0.29291237572604495,0.006056462141954042
704
+ 0.8293650793650794,8237,5146.635163516352,0.29351815557851973,0.006068987671931319
705
+ 0.8303571428571429,8248,5137.016501650165,0.2939101307771799,0.006077092426622498
706
+ 0.8333333333333334,8266,5127.397839783978,0.2945515447386238,0.006090354752480791
707
+ 0.8343253968253969,8280,5117.779177917792,0.29505042226419126,0.00610066989481502
708
+ 0.8353174603174603,8320,5088.923192319232,0.2964757866229555,0.006130141730055673
709
+ 0.8363095238095238,8349,5060.067206720672,0.2975091757830595,0.0061515088106051455
710
+ 0.8373015873015873,8367,5050.448544854486,0.2981505897445034,0.006164771136463439
711
+ 0.8382936507936508,8413,5021.592559255926,0.2997897587570823,0.0061986637469901885
712
+ 0.8392857142857143,8498,4973.499249924993,0.30281865801945623,0.0062612913968765746
713
+ 0.8412698412698413,8514,4963.880588058806,0.3033888037629619,0.006273080130972836
714
+ 0.8432539682539683,8525,4954.261926192619,0.3037807789616221,0.006281184885664015
715
+ 0.8442460317460317,8553,4935.024602460246,0.304778534012757,0.006301815170332472
716
+ 0.8462301587301587,8577,4915.787278727873,0.3056337526280155,0.006319498271476863
717
+ 0.8472222222222222,8606,4896.549954995499,0.3066671417881196,0.006340865352026336
718
+ 0.8482142857142857,8703,4838.837983798379,0.3101236503581228,0.006412334552484918
719
+ 0.8511904761904762,8749,4809.98199819982,0.31176281937070166,0.006446227163011668
720
+ 0.8531746031746031,8785,4790.744674467447,0.3130456472935894,0.006472751814728255
721
+ 0.8541666666666666,8840,4752.2700270027,0.3150055232868902,0.006513275588184152
722
+ 0.8551587301587301,8861,4742.651365136513,0.31575383957524145,0.006528748301685494
723
+ 0.8561507936507936,8872,4733.032703270327,0.3161458147739016,0.006536853056376674
724
+ 0.8571428571428571,8964,4675.320732073207,0.3194241527990593,0.006604638277430174
725
+ 0.8581349206349206,8990,4656.083408340834,0.320350639632256,0.006623794970336598
726
+ 0.8591269841269841,9015,4636.846084608461,0.32124149235648364,0.006642214867362006
727
+ 0.8621031746031746,9041,4617.608760876088,0.3221679791896804,0.00666137156026843
728
+ 0.8630952380952381,9058,4607.990099009901,0.32277375904215516,0.0066738970902457066
729
+ 0.8640873015873016,9077,4598.3714371437145,0.32345080711256813,0.006687896211985017
730
+ 0.8650793650793651,9102,4579.134113411341,0.3243416598367958,0.006706316109010424
731
+ 0.8660714285714286,9133,4559.896789678968,0.325446317214838,0.00672915678132193
732
+ 0.8670634920634921,9143,4550.278127812781,0.3258026583045291,0.006736524740132093
733
+ 0.8680555555555556,9159,4540.6594659465945,0.32637280404803476,0.006748313474228353
734
+ 0.8690476190476191,9255,4492.566156615661,0.3297936785090689,0.006819045878805919
735
+ 0.8700396825396826,9274,4482.947494749475,0.3304707265794819,0.006833045000545229
736
+ 0.871031746031746,9309,4463.710171017102,0.3317179203934006,0.0068588328563808
737
+ 0.8720238095238095,9347,4444.472847284729,0.3330720165342266,0.00688683109985942
738
+ 0.873015873015873,9363,4434.854185418542,0.33364216227773225,0.0068986198339556805
739
+ 0.8740079365079365,9404,4415.616861686169,0.33510316074546553,0.006928828465077349
740
+ 0.8759920634920635,9422,4405.998199819982,0.33574457470690944,0.006942090790935642
741
+ 0.876984126984127,9433,4396.379537953795,0.3361365499055696,0.006950195545626822
742
+ 0.8779761904761905,9467,4377.142214221422,0.3373481096105192,0.006975246605581376
743
+ 0.878968253968254,9508,4357.904890489049,0.3388091080782525,0.007005455236703045
744
+ 0.8799603174603174,9546,4338.667566756676,0.3401632042190785,0.007033453480181665
745
+ 0.8809523809523809,9558,4329.048904890489,0.3405908135267078,0.00704229503075386
746
+ 0.8819444444444444,9741,4232.862286228623,0.347111855468054,0.007177128676979844
747
+ 0.8829365079365079,9778,4213.624962496249,0.34843031749991094,0.007204390124577448
748
+ 0.8839285714285714,9799,4204.006300630063,0.34917863378826214,0.00721986283807879
749
+ 0.8849206349206349,9819,4194.387638763877,0.3498913159676442,0.007234598755699116
750
+ 0.8859126984126984,9847,4184.76897689769,0.3508890710187792,0.007255229040367572
751
+ 0.8869047619047619,9916,4155.91299129913,0.35334782453764746,0.007306067956157698
752
+ 0.8878968253968254,10014,4107.819681968197,0.35683996721661976,0.007378273952497296
753
+ 0.8888888888888888,10036,4098.20102010201,0.3576239176139401,0.007394483461879655
754
+ 0.8898809523809523,10056,4088.582358235824,0.35833659979332216,0.007409219379499981
755
+ 0.8918650793650794,10074,4078.963696369637,0.35897801375476607,0.007422481705358274
756
+ 0.8938492063492064,10087,4069.34503450345,0.35944125717136444,0.007432060051811486
757
+ 0.8958333333333334,10125,4050.107710771077,0.3607953533121904,0.007460058295290106
758
+ 0.8998015873015873,10210,4011.6330633063303,0.36382425257456436,0.007522685945176492
759
+ 0.9017857142857144,10234,4002.014401440144,0.3646794711898229,0.007540369046320884
760
+ 0.9027777777777778,10324,3963.5397539753976,0.3678865409970424,0.007606680675612351
761
+ 0.9037698412698412,10343,3953.921092109211,0.36856358906745534,0.007620679797351661
762
+ 0.9057539682539684,10366,3944.302430243024,0.3693831735737448,0.007637626102615036
763
+ 0.9067460317460316,10389,3934.6837683768376,0.3702027580800342,0.007654572407878411
764
+ 0.9077380952380952,10402,3925.0651065106513,0.3706660014966326,0.007664150754331623
765
+ 0.9087301587301588,10455,3896.2091209120913,0.37255460927199513,0.0077032009360254875
766
+ 0.9107142857142856,10497,3876.971797179718,0.3740512418486976,0.0077341463630281725
767
+ 0.9117063492063492,10569,3848.115811581158,0.37661689769447315,0.007787195666461346
768
+ 0.9126984126984128,10659,3800.022502250225,0.3798239675016926,0.007853507295752814
769
+ 0.9136904761904762,10674,3790.403840384039,0.3803584791362292,0.007864559233968059
770
+ 0.9146825396825397,10692,3780.785178517852,0.38099989309767307,0.007877821559826352
771
+ 0.9156746031746033,10820,3723.073207320732,0.3855610590457186,0.00797213143259644
772
+ 0.91765873015873,10845,3713.4545454545455,0.3864519117699462,0.007990551329621847
773
+ 0.9186507936507936,10885,3694.217221722172,0.3878772761287104,0.0080200231648625
774
+ 0.9196428571428572,10960,3665.361236123612,0.3905498343013933,0.008075282855938722
775
+ 0.9206349206349206,10987,3655.742574257426,0.39151195524355914,0.008095176344726162
776
+ 0.921626984126984,11011,3646.123912391239,0.39236717385881764,0.008112859445870554
777
+ 0.9236111111111112,11040,3626.886588658866,0.39340056301892173,0.008134226526420027
778
+ 0.9246031746031746,11156,3588.41194119412,0.39753411965933794,0.008219694848617919
779
+ 0.9265873015873016,11178,3578.793279327933,0.39831807005665826,0.008235904358000277
780
+ 0.9275793650793652,11213,3559.55595559556,0.3995652638705769,0.008261692213835848
781
+ 0.9305555555555556,11247,3549.937293729373,0.4007768235755265,0.008286743273790403
782
+ 0.9315476190476192,11314,3521.0813081308133,0.40316430887645655,0.008336108597818494
783
+ 0.9325396825396826,11366,3501.84398439844,0.40501728254285,0.008374421983631343
784
+ 0.933531746031746,11386,3492.2253225322534,0.4057299647222321,0.008389157901251668
785
+ 0.9345238095238096,11411,3482.6066606660665,0.4066208174464597,0.008407577798277076
786
+ 0.935515873015873,11430,3472.98799879988,0.40729786551687275,0.008421576920016386
787
+ 0.9365079365079364,11488,3444.13201320132,0.4093646438370809,0.008464311081115332
788
+ 0.9375,11516,3434.5133513351334,0.4103623988882158,0.008484941365783788
789
+ 0.9384920634920636,11601,3396.038703870387,0.4133912981505897,0.008547569015670175
790
+ 0.939484126984127,11666,3367.182718271827,0.41570751523358157,0.008595460747936235
791
+ 0.9404761904761904,11715,3347.945394539454,0.41745358657306775,0.008631563746106033
792
+ 0.941468253968254,11806,3319.089408940894,0.42069629048925633,0.008698612171278517
793
+ 0.9424603174603174,11891,3290.2334233423344,0.42372518975163026,0.008761239821164904
794
+ 0.9434523809523808,12058,3222.902790279028,0.4296760859494708,0.008884284733294627
795
+ 0.945436507936508,12090,3213.2841284128413,0.4308163774364822,0.008907862201487149
796
+ 0.9464285714285714,12272,3145.953495349535,0.4373017852688594,0.009041959051832117
797
+ 0.9474206349206348,12327,3126.7161716171618,0.43926166126216015,0.009082482825288014
798
+ 0.9484126984126984,12347,3117.097509750975,0.43997434344154224,0.00909721874290834
799
+ 0.949404761904762,12383,3107.4788478847886,0.44125717136443005,0.009123743394624927
800
+ 0.9503968253968254,12439,3088.2415241524154,0.44325268146669994,0.00916500396396184
801
+ 0.9513888888888888,12459,3078.6228622862286,0.443965363646082,0.009179739881582166
802
+ 0.9523809523809524,12474,3069.0042004200423,0.44449987528061863,0.00919079181979741
803
+ 0.953373015873016,12507,3059.3855385538554,0.4456758008765991,0.009215106083870949
804
+ 0.9543650793650794,12599,3030.5295529552955,0.4489541389017568,0.009282891304924448
805
+ 0.9553571428571428,12650,3011.2922292229223,0.45077147845918114,0.00932046789485628
806
+ 0.9563492063492064,12686,3001.6735673567355,0.4520543063820689,0.009346992546572867
807
+ 0.95734126984127,12742,2982.4362436243623,0.4540498164843388,0.009388253115909781
808
+ 0.9583333333333334,12788,2963.198919891989,0.45568898549691766,0.00942214572643653
809
+ 0.9593253968253967,12824,2953.580258025803,0.4569718134198054,0.009448670378153117
810
+ 0.9603174603174603,12905,2924.724272427243,0.459858176246303,0.009508350844515439
811
+ 0.9642857142857144,13182,2838.1563156315638,0.4697288244307451,0.009712443303556955
812
+ 0.9652777777777778,13253,2818.9189918991897,0.4722588461675516,0.009764755811109114
813
+ 0.9672619047619048,13305,2799.681668166817,0.47411181983394507,0.00980306919692196
814
+ 0.9682539682539684,13333,2790.06300630063,0.47510957488508,0.009823699481590419
815
+ 0.9692460317460316,13604,2713.113711371137,0.4847664184157075,0.010023371165345837
816
+ 0.9702380952380952,13636,2703.4950495049507,0.4859067099027189,0.010046948633538359
817
+ 0.9712301587301588,13873,2626.545754575457,0.49435199372839683,0.010221569257339224
818
+ 0.9722222222222222,13901,2616.927092709271,0.49534974877953175,0.01024219954200768
819
+ 0.9732142857142856,14207,2530.359135913592,0.506253786124078,0.01046765908159867
820
+ 0.9742063492063492,14612,2424.5538553855386,0.5206856002565656,0.010766061413410275
821
+ 0.9751984126984128,14778,2376.4605460546054,0.526600862345437,0.010888369529658982
822
+ 0.9761904761904762,15109,2289.892589258926,0.5383957524142109,0.011132248966275378
823
+ 0.9771825396825397,15225,2261.0366036603664,0.542529309054627,0.01121771728847327
824
+ 0.9791666666666666,15274,2251.41794179418,0.5442753803941133,0.01125382028664307
825
+ 0.98015873015873,15312,2241.7992799279928,0.5456294765349392,0.011281818530121689
826
+ 0.9811507936507936,15512,2193.70597059706,0.5527562983287603,0.01142917770632495
827
+ 0.9831349206349206,15729,2145.6126612661265,0.5604888999750561,0.011589062412505489
828
+ 0.984126984126984,15778,2135.99399939994,0.5622349713145423,0.011625165410675289
829
+ 0.9851190476190476,15964,2097.5193519351933,0.5688629155827959,0.01176220944454432
830
+ 0.9861111111111112,16189,2049.42604260426,0.5768805901008446,0.01192798851777299
831
+ 0.9871031746031746,16266,2030.1887188718872,0.5796244164914657,0.011984721800611246
832
+ 0.988095238095238,16370,2010.951395139514,0.5833303638242526,0.012061348572236941
833
+ 0.9890873015873016,16556,1972.4767476747677,0.5899583080925062,0.012198392606105975
834
+ 0.9900793650793652,16877,1914.764776477648,0.601396857071589,0.012434904083912209
835
+ 0.9910714285714286,17205,1857.052805280528,0.6130848448134555,0.012676573132885558
836
+ 0.992063492063492,17516,1808.9594959495948,0.6241670527028471,0.01290571665188163
837
+ 0.9930555555555556,17735,1770.4848484848485,0.6319709225670812,0.0130670749498242
838
+ 0.9940476190476192,17997,1732.0102010201022,0.6413070591169868,0.013260115470650473
839
+ 0.9950396825396826,18721,1626.2049204920493,0.667106154010619,0.013793555688506279
840
+ 0.996031746031746,18795,1616.586258625863,0.6697430780743328,0.013848078583701486
841
+ 0.9970238095238096,23165,1135.6531653165316,0.8254641342693226,0.017067876583742748
842
+ 0.9990079365079364,23945,1068.3225322532253,0.8532587392652247,0.017642577370935466
843
+ 1.0,28063,799.0,1.0,0.020676702808960615
metric_analysis/output_standardized/log_retweets_over_followers_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.0009920634920634,1,0.1110177516201285,3.1287642945418706e-06,7.367958810163067e-07
3
+ 0.0109126984126984,14,0.0458136629133554,4.380270012358619e-05,1.0315142334228295e-05
4
+ 0.0208333333333333,31,0.0305420201938555,9.699169313079799e-05,2.284067231150551e-05
5
+ 0.0307539682539682,58,0.0211584469416017,0.0001814683290834285,4.273416109894579e-05
6
+ 0.0406746031746031,87,0.0172859102583349,0.00027220249362514275,6.410124164841868e-05
7
+ 0.050595238095238,128,0.013160342533035,0.00040048182970135943,9.430987277008726e-05
8
+ 0.060515873015873,181,0.0106189425333306,0.0005663063373120786,0.00013336005446395153
9
+ 0.0704365079365079,274,0.0082195927599472,0.0008572814167044726,0.00020188207139846805
10
+ 0.0803571428571428,350,0.0074260070425938,0.0010950675030896547,0.0002578785583557074
11
+ 0.0902777777777777,451,0.0064240020967853,0.0014110726968383836,0.00033229494233835433
12
+ 0.1001984126984127,520,0.0058630374952952,0.0016269574331617728,0.00038313385812847954
13
+ 0.1101190476190476,548,0.0055695733498176,0.0017145628334089451,0.0004037641427969361
14
+ 0.1200396825396825,670,0.0050640430518192,0.0020962720773430536,0.0004936532402809255
15
+ 0.1299603174603174,822,0.0044397680804404,0.0025718442501134176,0.0006056462141954042
16
+ 0.1398809523809523,939,0.0040877025472988,0.0029379096725748164,0.0006918513322743121
17
+ 0.1507936507936507,1139,0.0036561499334258,0.003563662531483191,0.0008392105084775734
18
+ 0.1607142857142857,1401,0.0033397443279497,0.004383398776653161,0.0010322510293038457
19
+ 0.1706349206349206,1624,0.0030947814200735,0.005081113214335998,0.0011965565107704822
20
+ 0.1805555555555555,2070,0.0026847167747472,0.006476542089701672,0.001525167473703755
21
+ 0.1904761904761904,2303,0.0024974390206845,0.0072055441703299285,0.0016968409139805545
22
+ 0.2003968253968254,2658,0.0022891258443659,0.008316255494892292,0.001958403451741343
23
+ 0.2103174603174603,2985,0.0021472681131348,0.009339361419207484,0.0021993357048336755
24
+ 0.2202380952380952,3242,0.0020431615576899,0.010143453842904744,0.0023886922462548667
25
+ 0.2301587301587301,3623,0.0019049409638422,0.011335513039125198,0.0026694114769220795
26
+ 0.2400793650793651,4018,0.0017638702239982,0.012571374935469237,0.002960445849923521
27
+ 0.25,4524,0.0016339688981287,0.014154529668507424,0.003333264565717772
28
+ 0.2599206349206349,4872,0.0015278241082717,0.015243339643007994,0.0035896695323114464
29
+ 0.2698412698412698,5224,0.0014599380842198,0.016344664674686732,0.0038490216824291867
30
+ 0.2797619047619047,6015,0.0013358734220584,0.01881951723166935,0.004431827224313086
31
+ 0.2906746031746032,6705,0.0012345399503041,0.020978364594903244,0.0049402163822143364
32
+ 0.3005952380952381,6991,0.0011997293930759,0.021873191183142217,0.005150940004185001
33
+ 0.310515873015873,7611,0.001133266050563,0.023813025045758177,0.005607753450415111
34
+ 0.3204365079365079,8209,0.0010706819674658,0.025684026093894217,0.006048357387262862
35
+ 0.3303571428571428,8697,0.0010318295150289,0.02721086306963065,0.0064079137771988195
36
+ 0.3402777777777778,9730,0.0009582618477334,0.0304428765858924,0.007169023922288665
37
+ 0.3501984126984127,10089,0.0009323687651642,0.03156610296763293,0.007433533643573519
38
+ 0.3601190476190476,10941,0.0008840972038673,0.03423181014658261,0.008061283734199412
39
+ 0.3700396825396825,11298,0.0008656145917666,0.03534877899973406,0.008324319863722235
40
+ 0.3799603174603174,11952,0.0008387869121772,0.03739499084836444,0.008806184369906898
41
+ 0.3898809523809524,12745,0.00080268212817,0.03987610093393614,0.00939046350355283
42
+ 0.3998015873015873,13622,0.0007683790460187,0.042620027220249365,0.01003663349120413
43
+ 0.4097222222222222,14806,0.0007321509024842,0.04632448414498694,0.010908999814327438
44
+ 0.4196428571428571,15421,0.0007106285418656,0.048248674186130186,0.011362129281152466
45
+ 0.4305555555555556,15822,0.000696980606374,0.04950330866824148,0.011657584429440006
46
+ 0.4404761904761904,16628,0.0006743321938094,0.05202509268964223,0.012251441909539149
47
+ 0.4503968253968254,17340,0.00065250692733,0.05425277286735604,0.012776040576822759
48
+ 0.4603174603174603,18354,0.0006301131324195,0.057425339862021495,0.013523151600173294
49
+ 0.4702380952380952,19552,0.0006047211659253,0.061173599486882654,0.01440583306563083
50
+ 0.4801587301587302,20284,0.0005895765740513,0.0634638549504873,0.014945167650534767
51
+ 0.490079365079365,21566,0.000564702192227,0.06747493077608999,0.015889739969997672
52
+ 0.5,23032,0.0005395738872296,0.07206169923188836,0.016969882731567576
53
+ 0.5099206349206349,24228,0.0005212916903156,0.07580370132816044,0.01785109060526308
54
+ 0.5198412698412699,25296,0.0005061834084447,0.07914522159473115,0.018637988606188496
55
+ 0.5297619047619048,26013,0.0004976799255063,0.08138854559391769,0.019166271252877187
56
+ 0.5396825396825397,26667,0.0004876764751834,0.08343475744254807,0.019648135759061852
57
+ 0.5496031746031746,28535,0.0004681204083387,0.08927928914475228,0.021024470464800313
58
+ 0.5595238095238095,29882,0.0004524594289673,0.09349373464950018,0.02201693451652928
59
+ 0.5694444444444444,31354,0.0004371937638723,0.09809927569106582,0.023101498053385284
60
+ 0.5803571428571429,32866,0.000426354196908,0.10282996730441313,0.02421553342548194
61
+ 0.5902777777777778,34629,0.0004112004488701,0.10834597875569044,0.025514504563713687
62
+ 0.6001984126984127,36276,0.0003980419371793,0.1134990535488009,0.026728007379747544
63
+ 0.6101190476190477,37806,0.0003870781771608,0.11828606291944996,0.027855305077702494
64
+ 0.6200396825396826,39620,0.0003748698230899,0.12396164134974892,0.029191852805866073
65
+ 0.6299603174603174,41027,0.0003649463454591,0.12836381271216932,0.03022852461045602
66
+ 0.6398809523809523,42099,0.0003569193584129,0.1317178480359182,0.0310183697949055
67
+ 0.6498015873015873,42986,0.0003511657214206,0.13449306196517685,0.03167190774136696
68
+ 0.6597222222222222,44674,0.0003421251688588,0.13977441609436353,0.03291561918852249
69
+ 0.6696428571428571,47808,0.0003263702632609,0.14957996339345775,0.035224737479627594
70
+ 0.6795634920634921,49826,0.0003164125508853,0.15589380973984324,0.0367115915675185
71
+ 0.689484126984127,53140,0.0003008070695204,0.166262534611955,0.03915333311720654
72
+ 0.6994047619047619,55365,0.0002917651358013,0.17322403516731066,0.040792703952467826
73
+ 0.7093253968253969,56801,0.0002862937577127,0.1777169406942728,0.041850742837607244
74
+ 0.7202380952380952,58502,0.0002800657421942,0.18303896875928852,0.043104032631215976
75
+ 0.7301587301587301,62958,0.0002650567389384,0.1969807424557671,0.04638719507702464
76
+ 0.7400793650793651,67584,0.0002501977740079,0.21145440608231778,0.049795612822606077
77
+ 0.75,70894,0.0002407572874386,0.22181061589725137,0.05223440718877005
78
+ 0.7599206349206349,72948,0.0002343518471991,0.2282370977582404,0.05374778592837755
79
+ 0.7698412698412699,75718,0.0002262636742258,0.23690377485412137,0.055788710518792715
80
+ 0.7797619047619048,78571,0.0002184445814173,0.24583013938644932,0.057890789167332236
81
+ 0.7896825396825397,84808,0.0002054679665547,0.265344242291507,0.062486185077230944
82
+ 0.7996031746031746,87559,0.0001993432769733,0.27395147286579163,0.0645131105459068
83
+ 0.8095238095238095,90777,0.0001928516703228,0.2840198363656274,0.06688411969101728
84
+ 0.8194444444444444,94063,0.0001857550455259,0.294300955837492,0.06930523095603687
85
+ 0.8293650793650794,97923,0.0001784615254643,0.3063779860144236,0.07214926305675981
86
+ 0.8392857142857143,102033,0.0001717709708193,0.3192372072649907,0.07517749412773683
87
+ 0.8492063492063492,107026,0.0001638177776107,0.33485912738763823,0.07885631596165125
88
+ 0.8601190476190477,109675,0.0001597665218121,0.34314722400387965,0.08080808825046344
89
+ 0.8700396825396826,113262,0.0001543407659995,0.35437010152840137,0.08345097507566894
90
+ 0.8799603174603174,119119,0.0001455305353291,0.3726952740015331,0.08776638855078145
91
+ 0.8898809523809523,127440,0.0001359807274111,0.398729721696416,0.09389726707671814
92
+ 0.8998015873015873,133957,0.0001290390562169,0.4191198786039454,0.0986989658333014
93
+ 0.9097222222222222,142348,0.0001211838665921,0.4453733397994462,0.10488142007090924
94
+ 0.9196428571428572,153038,0.0001116244820123,0.4788198301080988,0.11275776803897355
95
+ 0.929563492063492,163454,0.0001037630115678,0.511409039000047,0.12043223393563941
96
+ 0.939484126984127,177935,9.274401243552316e-05,0.5567166747493077,0.13110177508863655
97
+ 0.949404761904762,198735,7.906615255441045e-05,0.6217949720757787,0.14642712941377573
98
+ 0.9593253968253967,215154,6.953399127171892e-05,0.6731661530278616,0.15852458098418246
99
+ 0.9692460317460316,232131,6.06772297454726e-05,0.726283184456299,0.1710331646561963
100
+ 0.9791666666666666,255601,4.912336231094642e-05,0.7997152824491967,0.18832576398364903
101
+ 0.9890873015873016,273938,4.051692731176368e-05,0.857087433318211,0.20183639005384504
102
+ 1.0,319615,2.082382557391369e-05,1.0,0.23549101551102689
metric_analysis/output_standardized/log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.0009920634920634,1,2.143417655682428,3.2215456976257205e-05,7.367958810163067e-07
3
+ 0.0109126984126984,11,1.7468918034538925,0.0003543700267388293,8.104754691179374e-06
4
+ 0.0208333333333333,23,1.6432671810611028,0.0007409555104539157,1.6946305263375057e-05
5
+ 0.0307539682539682,34,1.5487864668450748,0.001095325537192745,2.505105995455443e-05
6
+ 0.0406746031746031,45,1.4578807811516885,0.0014496955639315744,3.31558146457338e-05
7
+ 0.050595238095238,59,1.397787734828259,0.0019007119615991753,4.34709569799621e-05
8
+ 0.060515873015873,70,1.3793064865985625,0.0022550819883380047,5.157571167114147e-05
9
+ 0.0704365079365079,86,1.342219079933877,0.00277052929995812,6.336444576740239e-05
10
+ 0.0803571428571428,100,1.3250067925542386,0.003221545697625721,7.367958810163068e-05
11
+ 0.0902777777777777,113,1.3061771441754433,0.0036403466383170646,8.325793455484266e-05
12
+ 0.1001984126984127,124,1.2884022960547663,0.003994716665055894,9.136268924602204e-05
13
+ 0.1101190476190476,149,1.2546164327696954,0.004800103089462324,0.0001097825862714297
14
+ 0.1200396825396825,163,1.237761303530253,0.005251119487129925,0.000120097728605658
15
+ 0.1299603174603174,181,1.2226659947251466,0.005830997712702555,0.00013336005446395153
16
+ 0.1398809523809523,201,1.2048894357033295,0.006475306852227699,0.00014809597208427765
17
+ 0.1507936507936507,221,1.191433139144886,0.007119615991752843,0.0001628318897046038
18
+ 0.1607142857142857,238,1.183222859975457,0.0076672787603492155,0.000175357419681881
19
+ 0.1706349206349206,270,1.1596741156004886,0.008698173383589447,0.00019893488787440283
20
+ 0.1805555555555555,294,1.1487927586475322,0.00947134435101962,0.0002166179890187942
21
+ 0.1904761904761904,334,1.1355234542124109,0.010759962630069907,0.00024608982425944646
22
+ 0.2003968253968254,362,1.1254166588926875,0.01166199542540511,0.00026672010892790307
23
+ 0.2103174603174603,399,1.1160912776174,0.012853967333526626,0.0002939815565255064
24
+ 0.2202380952380952,436,1.1083128293352051,0.014045939241648143,0.00032124300412310976
25
+ 0.2301587301587301,472,1.1020695947838788,0.015205695692793402,0.0003477676558396968
26
+ 0.2400793650793651,508,1.0950675043632472,0.016365452143938662,0.00037429230755628385
27
+ 0.25,540,1.0893285138897673,0.017396346767178893,0.00039786977574880566
28
+ 0.2599206349206349,573,1.0840944112778226,0.01845945684739538,0.00042218403982234375
29
+ 0.2698412698412698,609,1.0775125729219717,0.01961921329854064,0.0004487086915389308
30
+ 0.2797619047619047,663,1.0691477573528578,0.02135884797525853,0.0004884956691138114
31
+ 0.2906746031746032,721,1.060571954677911,0.023227344479881448,0.0005312298302127572
32
+ 0.3005952380952381,770,1.0524500566259047,0.02480590187171805,0.0005673328283825562
33
+ 0.310515873015873,812,1.048103007385223,0.026158951064720853,0.0005982782553852411
34
+ 0.3204365079365079,871,1.0416438955273315,0.028059663026320028,0.0006417492123652032
35
+ 0.3303571428571428,914,1.035761826178403,0.02944492767629909,0.0006734314352489044
36
+ 0.3402777777777778,984,1.0277467024345217,0.03170000966463709,0.0007250071469200459
37
+ 0.3501984126984127,1035,1.0230244967357478,0.03334299797042621,0.0007625837368518775
38
+ 0.3601190476190476,1074,1.0190172621424871,0.03459940079250024,0.0007913187762115134
39
+ 0.3700396825396825,1160,1.012688867600473,0.03736993009245836,0.0008546832219789159
40
+ 0.3799603174603174,1237,1.0053147119107226,0.03985052027963017,0.0009114165048171714
41
+ 0.3898809523809524,1281,1.0016513621903245,0.041268000386585485,0.000943835523581889
42
+ 0.3998015873015873,1341,0.997249620997618,0.04320092780516092,0.0009880432764428674
43
+ 0.4097222222222222,1417,0.9918971124207676,0.04564930253535646,0.0010440397634001067
44
+ 0.4196428571428571,1499,0.9865132333995608,0.048290970007409555,0.0011044570256434438
45
+ 0.4305555555555556,1561,0.983190364968206,0.0502883283399375,0.0011501383702664549
46
+ 0.4404761904761904,1633,0.9790934785418004,0.05260784124222802,0.001203187673699629
47
+ 0.4503968253968254,1692,0.9762955472165364,0.0545085532038272,0.0012466586306795911
48
+ 0.4603174603174603,1781,0.9715297580192088,0.05737572887471409,0.0013122334640900423
49
+ 0.4702380952380952,1831,0.9686141408612416,0.05898650172352695,0.0013490732581408578
50
+ 0.4801587301587302,1905,0.96557583848392,0.06137044553976998,0.0014035961533360643
51
+ 0.490079365079365,2006,0.9611672660302412,0.06462420669437197,0.0014780125373187113
52
+ 0.5,2095,0.9570957173459096,0.06749138236525885,0.0015435873707291628
53
+ 0.5099206349206349,2189,0.9537739623026252,0.07051963532102704,0.0016128461835446955
54
+ 0.5198412698412699,2275,0.9504137942708432,0.07329016462098514,0.001676210629312098
55
+ 0.5297619047619048,2376,0.9461150743517937,0.07654392577558712,0.001750627013294745
56
+ 0.5396825396825397,2450,0.9431937683554849,0.07892786959183017,0.0018051499084899515
57
+ 0.5496031746031746,2523,0.9403613239507382,0.08127959795109693,0.001858936007804142
58
+ 0.5595238095238095,2614,0.9370237927380608,0.08421120453593635,0.0019259844329766258
59
+ 0.5694444444444444,2785,0.9314740928203532,0.08972004767887633,0.0020519765286304142
60
+ 0.5803571428571429,2954,0.9257993297998168,0.09516445990786379,0.0021764950325221704
61
+ 0.5902777777777778,3057,0.9223133679230248,0.09848265197641828,0.00225238500826685
62
+ 0.6001984126984127,3203,0.918171233112645,0.10318610869495184,0.0023599572068952307
63
+ 0.6101190476190477,3415,0.9118727064143508,0.11001578557391836,0.0025161579336706874
64
+ 0.6200396825396826,3581,0.9074973981023632,0.11536355143197706,0.0026384660499193945
65
+ 0.6299603174603174,3718,0.90335209700547,0.1197770690377243,0.0027394070856186286
66
+ 0.6398809523809523,3866,0.9000547970039445,0.12454495667021037,0.002848452876009042
67
+ 0.6498015873015873,3976,0.8971731862315426,0.12808865693759866,0.0029295004229208358
68
+ 0.6597222222222222,4098,0.8946920241479999,0.13201894268870204,0.003019389520404825
69
+ 0.6696428571428571,4234,0.8909887319607015,0.136400244837473,0.003119593760223043
70
+ 0.6795634920634921,4343,0.8884327201390895,0.13991172964788506,0.0031999045112538205
71
+ 0.689484126984127,4421,0.8867262402778523,0.1424245352920331,0.003257374589973092
72
+ 0.6994047619047619,4644,0.8816572865395638,0.14960858219773848,0.0034216800714397286
73
+ 0.7093253968253969,4908,0.8762024305187027,0.15811346283947036,0.003616194184028034
74
+ 0.7202380952380952,5043,0.8733238070593741,0.1624625495312651,0.003715661627965235
75
+ 0.7301587301587301,5187,0.8702700733610532,0.16710157533584613,0.003821760234831583
76
+ 0.7400793650793651,5439,0.8656201351014363,0.17521987049386295,0.004007432796847693
77
+ 0.75,5607,0.8625694432138333,0.18063206726587416,0.004131214504858432
78
+ 0.7599206349206349,5814,0.8589762954909695,0.1873006668599594,0.004283731252228807
79
+ 0.7698412698412699,5987,0.8559459884429319,0.1928739409168519,0.004411196939644629
80
+ 0.7797619047619048,6129,0.8531910961698003,0.19744853580748042,0.0045158219547489446
81
+ 0.7896825396825397,6358,0.8493448528501626,0.20482587545504333,0.004684548211501678
82
+ 0.7996031746031746,6515,0.8466514160755445,0.2098837022003157,0.004800225164821238
83
+ 0.8095238095238095,6728,0.8431442930562224,0.2167455945362585,0.004957162687477712
84
+ 0.8194444444444444,6820,0.8417075589387941,0.21970941657807416,0.005024947908531212
85
+ 0.8293650793650794,7125,0.8367692646747009,0.2295351309558326,0.005249670652241186
86
+ 0.8392857142857143,7401,0.8321194850614858,0.23842659708127958,0.005453026315401686
87
+ 0.8492063492063492,7660,0.8282315008546971,0.24677040043813023,0.00564385644858491
88
+ 0.8601190476190477,7913,0.8243482587654483,0.2549209110531233,0.005830265806482035
89
+ 0.8700396825396826,8191,0.8201902577121126,0.2638768080925228,0.006035095061404569
90
+ 0.8799603174603174,8537,0.8158236353302109,0.2750233562063078,0.006290026436236211
91
+ 0.8898809523809523,8823,0.8121718746758682,0.28423697690151734,0.006500750058206875
92
+ 0.8998015873015873,9235,0.8063837662960968,0.2975097451757353,0.006804309961185593
93
+ 0.9097222222222222,9436,0.8038518801656075,0.303985052027963,0.00695240593326987
94
+ 0.9196428571428572,9768,0.7997056994974557,0.3146805837440804,0.007197022165767284
95
+ 0.929563492063492,10231,0.7936671192146495,0.3295963403240875,0.007538158658677834
96
+ 0.939484126984127,10668,0.7889122408794803,0.3436744950227119,0.007860138458681961
97
+ 0.949404761904762,11239,0.7818005566060252,0.36206952095615474,0.008280848906742272
98
+ 0.9593253968253967,12034,0.7726483778092785,0.38768080925227927,0.008866601632150235
99
+ 0.9692460317460316,12783,0.7640086373583896,0.4118101865274959,0.00941846174703145
100
+ 0.9791666666666666,14286,0.7484793180463,0.46023001836281047,0.010525865956198959
101
+ 0.9890873015873016,16933,0.7230443483570232,0.5455043329789633,0.012476164653249123
102
+ 1.0,31041,0.6240002852947716,1.0,0.022870880942627177
metric_analysis/output_standardized/retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.0009920634920634,14,10241.747512102276,0.0005680204487361545,1.0315142334228295e-05
3
+ 0.0109126984126984,82,6411.553184403061,0.0033269769140260477,6.0417262243337154e-05
4
+ 0.0208333333333333,164,5331.458638244872,0.006653953828052095,0.00012083452448667431
5
+ 0.0307539682539682,253,4696.706526876823,0.01026494096644622,0.0001864093578971256
6
+ 0.0406746031746031,280,4475.92163141317,0.01136040897472309,0.0002063028466845659
7
+ 0.050595238095238,366,4018.487598414677,0.014849677445530896,0.00026966729245196826
8
+ 0.060515873015873,450,3670.968448884167,0.018257800137947824,0.00033155814645733804
9
+ 0.0704365079365079,522,3443.658749880984,0.021179048160019476,0.00038460744989051213
10
+ 0.0803571428571428,579,3300.3631730935804,0.023491702844159534,0.0004266048151084416
11
+ 0.0902777777777777,634,3191.2654159655003,0.02572321174990871,0.00046712858856433846
12
+ 0.1001984126984127,691,3057.4880652118327,0.02803586643404877,0.0005091259537822679
13
+ 0.1101190476190476,776,2855.125879628761,0.03148456201566113,0.0005717536036686541
14
+ 0.1200396825396825,847,2720.031974699821,0.03436523714853735,0.0006240661112208118
15
+ 0.1299603174603174,903,2644.884840422699,0.03663731894348197,0.000665326680557725
16
+ 0.1398809523809523,995,2512.2825954364475,0.040370024749462406,0.0007331119016112252
17
+ 0.1507936507936507,1043,2465.783943579372,0.04231752343084351,0.0007684781039000079
18
+ 0.1607142857142857,1089,2409.978463778208,0.044183876333833735,0.000802370714426758
19
+ 0.1706349206349206,1153,2343.3129244431043,0.046780541242341864,0.0008495256508118016
20
+ 0.1805555555555555,1187,2298.6494790579704,0.04816001947498681,0.0008745767107663561
21
+ 0.1904761904761904,1253,2218.5893192510925,0.050837830161885825,0.0009232052389134324
22
+ 0.2003968253968254,1296,2181.038558435724,0.05258246439728973,0.0009548874617971336
23
+ 0.2103174603174603,1354,2133.4300161851493,0.05493569197062523,0.0009976216228960793
24
+ 0.2202380952380952,1424,2083.200810963788,0.057775794214306,0.001049197334567221
25
+ 0.2301587301587301,1508,2016.6788884308944,0.06118391690672293,0.0011110881885725906
26
+ 0.2400793650793651,1595,1957.2434949168637,0.06471375826672618,0.0011751894302210093
27
+ 0.25,1646,1924.0614094625105,0.06678297561569359,0.0012127660201528409
28
+ 0.2599206349206349,1693,1888.1800931525004,0.06868990140787926,0.0012473954265606072
29
+ 0.2698412698412698,1753,1852.2105720993288,0.07112427475960563,0.0012916031794215858
30
+ 0.2797619047619047,1813,1818.039475418674,0.073558648111332,0.001335810932282564
31
+ 0.2906746031746032,1857,1791.705607659909,0.07534385523593135,0.0013682299510472816
32
+ 0.3005952380952381,1918,1753.2702562983825,0.07781880147685316,0.0014131744997892765
33
+ 0.310515873015873,1984,1711.5371500241918,0.08049661216375219,0.0014618030279363527
34
+ 0.3204365079365079,2076,1658.641956527047,0.08422931796973263,0.001529588248989853
35
+ 0.3303571428571428,2150,1609.3989544747326,0.08723171177019516,0.0015841111441850595
36
+ 0.3402777777777778,2204,1578.2817061633984,0.0894226477867489,0.0016238981217599402
37
+ 0.3501984126984127,2310,1527.467028791344,0.0937233740414655,0.0017019984851476687
38
+ 0.3601190476190476,2357,1506.1373828761034,0.09563029983365115,0.001736627891555435
39
+ 0.3700396825396825,2406,1483.3271367722798,0.09761837140422769,0.0017727308897252342
40
+ 0.3799603174603174,2500,1440.5856788553633,0.10143222298859902,0.001841989702540767
41
+ 0.3898809523809524,2589,1408.5466934453984,0.10504321012699315,0.0019075645359512181
42
+ 0.3998015873015873,2629,1392.3103276130262,0.10666612569481072,0.0019370363711918705
43
+ 0.4097222222222222,2693,1366.6130386272123,0.10926279060331887,0.001984191307576914
44
+ 0.4196428571428571,2824,1327.1136527390647,0.11457783908792145,0.0020807115679900502
45
+ 0.4305555555555556,2900,1301.7988554511148,0.11766137866677485,0.0021367080549472895
46
+ 0.4404761904761904,2982,1273.901632813587,0.12098835558080091,0.002197125317190627
47
+ 0.4503968253968254,3040,1252.4280115900583,0.1233415831541364,0.0022398594782895727
48
+ 0.4603174603174603,3144,1221.3638625370725,0.12756116363046213,0.0023164862499152686
49
+ 0.4702380952380952,3181,1210.132391580637,0.1290623605306934,0.002343747697512872
50
+ 0.4801587301587302,3237,1192.4468751700258,0.131334442325638,0.002385008266849785
51
+ 0.490079365079365,3309,1173.011743073717,0.13425569034770965,0.0024380575702829593
52
+ 0.5,3403,1146.372070033951,0.13806954193208099,0.002507316383098492
53
+ 0.5099206349206349,3454,1133.927886665909,0.1401387592810484,0.0025448929730303234
54
+ 0.5198412698412699,3581,1099.8695901550593,0.14529151620886924,0.0026384660499193945
55
+ 0.5297619047619048,3664,1078.0676366493603,0.14865906601209072,0.002699620108043748
56
+ 0.5396825396825397,3733,1060.6487591725645,0.15145859536657605,0.002750459023833873
57
+ 0.5496031746031746,3868,1027.9967506835908,0.15693593540796041,0.0028499264677710745
58
+ 0.5595238095238095,3948,1010.3955348603816,0.16018176654359556,0.002908870138252379
59
+ 0.5694444444444444,4022,994.8012552878178,0.1631841603440581,0.0029633930334475858
60
+ 0.5803571428571429,4107,972.2112451667429,0.16663285592567045,0.003026020683333972
61
+ 0.5902777777777778,4189,954.8779497176456,0.1699598328396965,0.003086437945577309
62
+ 0.6001984126984127,4297,934.7004834106868,0.17434170487280398,0.00316601190072707
63
+ 0.6101190476190477,4366,923.3244117903994,0.17714123422728933,0.0032168508165171953
64
+ 0.6200396825396826,4423,913.4868359265708,0.17945388891142938,0.0032588481817351247
65
+ 0.6299603174603174,4558,889.5148546157745,0.18493122895281372,0.003358315625672326
66
+ 0.6398809523809523,4672,870.5048665882368,0.18955653832109384,0.003442310356108185
67
+ 0.6498015873015873,4786,849.9546866922489,0.19418184768937397,0.003526305086544044
68
+ 0.6597222222222222,4901,832.7804952963619,0.19884772994684952,0.0036110366128609193
69
+ 0.6696428571428571,5037,811.3520395758455,0.2043656428774293,0.003711240852679137
70
+ 0.6795634920634921,5109,800.5203812652611,0.20728689089950095,0.0037642901561123114
71
+ 0.689484126984127,5234,781.4194696938304,0.2123585020489309,0.00385638964123935
72
+ 0.6994047619047619,5400,760.2850310752603,0.2190936016553739,0.0039786977574880564
73
+ 0.7093253968253969,5524,745.3425802314556,0.22412463991560838,0.004070060446734079
74
+ 0.7202380952380952,5736,718.2535271058875,0.23272609242504158,0.004226261173509536
75
+ 0.7301587301587301,5846,706.1316731892126,0.23718911023653994,0.0043073087204213295
76
+ 0.7400793650793651,6008,687.8141314252327,0.24376191828620117,0.004426669653145971
77
+ 0.75,6151,671.9318091327671,0.24956384144114901,0.004532031464131303
78
+ 0.7599206349206349,6301,655.3005304817065,0.25564977482046497,0.004642550846283749
79
+ 0.7698412698412699,6444,641.222980944397,0.26145169797541284,0.004747912657269081
80
+ 0.7797619047619048,6566,631.0274981581211,0.2664015904572565,0.00483780175475307
81
+ 0.7896825396825397,6797,608.3009252958204,0.27577392786140303,0.005008001603267837
82
+ 0.7996031746031746,7011,587.5531675251212,0.28445652614922706,0.005165675921805327
83
+ 0.8095238095238095,7157,576.0696671571288,0.29038016797176125,0.005273248120433708
84
+ 0.8194444444444444,7357,560.2436517890462,0.29849474581084917,0.0054206072966369685
85
+ 0.8293650793650794,7603,539.923690961875,0.30847567655292735,0.005601859083366981
86
+ 0.8392857142857143,7800,523.7004888975509,0.31646853572442896,0.005747007871927193
87
+ 0.8492063492063492,7973,509.5041443087782,0.32348764555524,0.005874473559343014
88
+ 0.8601190476190477,8058,502.7703002355239,0.3269363411368524,0.0059371012092294
89
+ 0.8700396825396826,8235,491.0665617707924,0.33411774252444515,0.0060675140801692866
90
+ 0.8799603174603174,8452,476.55673451118935,0.34292205947985555,0.006227398786349825
91
+ 0.8898809523809523,8904,446.87607462645417,0.36126100539619427,0.006560430524569195
92
+ 0.8998015873015873,9298,425.4498120093774,0.3772467237391975,0.006850728101689621
93
+ 0.9097222222222222,9560,411.7376865200165,0.3878768207084026,0.007043768622515893
94
+ 0.9196428571428572,10033,389.3847190107792,0.4070677972978456,0.007392273074236606
95
+ 0.929563492063492,10240,379.36847411850994,0.4154663853613016,0.007544789821606981
96
+ 0.939484126984127,10899,351.11195753265446,0.44220391934109626,0.008030338307196728
97
+ 0.949404761904762,11501,326.7121276970904,0.46662879863675094,0.008473889427568545
98
+ 0.9593253968253967,12079,304.2604884972614,0.490079928591715,0.00889975744679597
99
+ 0.9692460317460316,13013,272.4577979927763,0.5279750071002556,0.0095879247996652
100
+ 0.9791666666666666,13979,244.9501617824125,0.5671684180630503,0.010299669620726952
101
+ 0.9890873015873016,16064,195.4118856404268,0.6517628920355418,0.011835889032645952
102
+ 1.0,24647,96.18719446805105,1.0,0.018159808079408913
metric_analysis/output_standardized/roberta_paper_metric_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.0009920634920634,1,3533.739728797425,1.2755427434373327e-06,7.367958810163067e-07
3
+ 0.0109126984126984,11,913.2204554544222,1.4030970177810659e-05,8.104754691179374e-06
4
+ 0.0208333333333333,22,642.3126335309646,2.8061940355621317e-05,1.620950938235875e-05
5
+ 0.0307539682539682,34,463.6375267632411,4.336845327686931e-05,2.505105995455443e-05
6
+ 0.0406746031746031,47,319.3778539107914,5.995050894155463e-05,3.462940640776642e-05
7
+ 0.050595238095238,57,261.8047511787649,7.270593637592796e-05,4.1997365217929485e-05
8
+ 0.060515873015873,68,220.9913721970096,8.673690655373862e-05,5.010211990910886e-05
9
+ 0.0704365079365079,81,195.60887797794308,0.00010331896221842393,5.968046636232085e-05
10
+ 0.0803571428571428,97,157.76121015148178,0.00012372764611342127,7.146920045858176e-05
11
+ 0.0902777777777777,114,126.08462267664385,0.0001454118727518559,8.399473043585897e-05
12
+ 0.1001984126984127,128,114.91598022184296,0.00016326947115997858,9.430987277008726e-05
13
+ 0.1101190476190476,149,99.31178628400392,0.00019005586877216256,0.0001097825862714297
14
+ 0.1200396825396825,164,90.3818928159452,0.00020918900992372255,0.00012083452448667431
15
+ 0.1299603174603174,178,82.08798118734508,0.0002270466083318452,0.00013114966682090261
16
+ 0.1398809523809523,197,70.16356545125636,0.0002512819204571545,0.00014514878856021244
17
+ 0.1507936507936507,216,61.84780146447067,0.00027551723258246384,0.00015914791029952226
18
+ 0.1607142857142857,241,54.36857680039037,0.00030740580116839716,0.00017756780732492992
19
+ 0.1706349206349206,272,48.023909569922445,0.00034694762621495447,0.00020040847963643545
20
+ 0.1805555555555555,300,41.873141561514856,0.00038266282303119976,0.00022103876430489203
21
+ 0.1904761904761904,327,37.27337021039718,0.00041710247710400776,0.00024093225309233232
22
+ 0.2003968253968254,354,34.49758353529179,0.00045154213117681576,0.0002608257418797726
23
+ 0.2103174603174603,376,32.10956578521053,0.00047960407153243705,0.00027703525126213135
24
+ 0.2202380952380952,413,29.10383773593751,0.0005267991530396184,0.0003042966988597347
25
+ 0.2301587301587301,440,27.26674857348407,0.0005612388071124263,0.00032419018764717495
26
+ 0.2400793650793651,493,23.38378755565621,0.000628842572514605,0.0003632403693410392
27
+ 0.25,530,21.76810557585511,0.0006760376540217863,0.00039050181693864257
28
+ 0.2599206349206349,589,19.614803085421624,0.0007512946758845889,0.0004339727739186047
29
+ 0.2698412698412698,617,18.715704509161537,0.0007870098727008342,0.0004546030585870613
30
+ 0.2797619047619047,656,17.503118276572664,0.0008367560396948902,0.00048333809794669723
31
+ 0.2906746031746032,710,15.75957395134853,0.0009056353478405061,0.0005231250755215778
32
+ 0.3005952380952381,776,14.116974236232336,0.0009898211689073702,0.0005717536036686541
33
+ 0.310515873015873,830,13.104004547610508,0.001058700477052986,0.0006115405812435346
34
+ 0.3204365079365079,868,12.499962631113736,0.0011071711013036047,0.0006395388247221543
35
+ 0.3303571428571428,914,11.924377006525065,0.001165846067501722,0.0006734314352489044
36
+ 0.3402777777777778,967,11.162073579916996,0.0012334498329039005,0.0007124816169427686
37
+ 0.3501984126984127,1013,10.492987863082892,0.0012921247991020178,0.0007463742274695188
38
+ 0.3601190476190476,1082,9.709458682853722,0.0013801372483991938,0.0007972131432596439
39
+ 0.3700396825396825,1182,8.841857620201926,0.001507691522742927,0.0008708927313612746
40
+ 0.3799603174603174,1318,7.95447265160399,0.0016811653358504044,0.0009710969711794924
41
+ 0.3898809523809524,1381,7.553164524632679,0.0017615245286869564,0.0010175151116835197
42
+ 0.3998015873015873,1476,6.895417771804837,0.0018827010893135029,0.0010875107203800688
43
+ 0.4097222222222222,1590,6.348934235182757,0.0020281129620653587,0.0011715054508159278
44
+ 0.4196428571428571,1669,5.972958366751632,0.002128880838796908,0.0012297123254162159
45
+ 0.4305555555555556,1743,5.658337935222293,0.0022232710018112705,0.0012842352206114227
46
+ 0.4404761904761904,1898,5.125327551050057,0.0024209801270440572,0.0013984385821689503
47
+ 0.4503968253968254,1956,4.94650983750502,0.0024949616061634224,0.0014411727432678961
48
+ 0.4603174603174603,2041,4.73908030227792,0.002603382739355596,0.0015038003931542821
49
+ 0.4702380952380952,2157,4.426219447170888,0.0027513456975943266,0.0015892687153521738
50
+ 0.4801587301587302,2384,3.9638262404902136,0.003040893900354601,0.0017565213803428753
51
+ 0.490079365079365,2502,3.752840702273832,0.003191407944080206,0.0018434632943027996
52
+ 0.5,2593,3.59870356795998,0.0033074823337330035,0.0019105117194752835
53
+ 0.5099206349206349,2707,3.428223279567193,0.0034528942064848593,0.0019945064499111423
54
+ 0.5198412698412699,2908,3.1785787340504648,0.003709278297915763,0.00214260242199542
55
+ 0.5297619047619048,3081,2.955250022079203,0.003929947192530422,0.002270068109411241
56
+ 0.5396825396825397,3152,2.8954501822960066,0.004020510727314472,0.002322380616963399
57
+ 0.5496031746031746,3248,2.7815895186317805,0.004142962830684456,0.0023931130215409644
58
+ 0.5595238095238095,3468,2.583086678098461,0.004423582234240669,0.002555208115364552
59
+ 0.5694444444444444,3687,2.4125368969101038,0.004702926095053446,0.002716566413307123
60
+ 0.5803571428571429,4128,2.125304741103856,0.005265440444909309,0.0030414933968353143
61
+ 0.5902777777777778,4249,2.051449708767312,0.005419781116865226,0.0031306456984382873
62
+ 0.6001984126984127,4484,1.9245385016106695,0.005719533661572999,0.0033037927304771196
63
+ 0.6101190476190477,4686,1.8317436498146948,0.00597719329574734,0.0034526254984424136
64
+ 0.6200396825396826,4966,1.722560123794516,0.006334345263909793,0.0036589283451269796
65
+ 0.6299603174603174,5321,1.58351180677159,0.0067871629378300465,0.0039204908828877685
66
+ 0.6398809523809523,5882,1.4051718048191333,0.0075027424168983906,0.0043338333721379164
67
+ 0.6498015873015873,6161,1.327281088505106,0.007858618842317406,0.004539399422941466
68
+ 0.6597222222222222,6467,1.2602164579934505,0.008248934921809229,0.004764858962532456
69
+ 0.6696428571428571,6780,1.1855272483643158,0.008648179800505114,0.00499547607329056
70
+ 0.6795634920634921,7289,1.0890041480074235,0.009297431056914718,0.00537050517672786
71
+ 0.689484126984127,7661,1.0212425782882102,0.009771932957473405,0.0056445932444659265
72
+ 0.6994047619047619,8206,0.9414799820328928,0.01046710375264675,0.006046146999619814
73
+ 0.7093253968253969,8603,0.888609564785895,0.010973494221791372,0.006338654964383287
74
+ 0.7202380952380952,9007,0.8391207342688993,0.011488813490140054,0.006636320500313875
75
+ 0.7301587301587301,9788,0.750920049260784,0.012485012372764611,0.007211758083387611
76
+ 0.7400793650793651,10314,0.7019883742889699,0.013155947855812648,0.007599312716802188
77
+ 0.75,10726,0.6646019719884612,0.01368147146610883,0.007902872619780907
78
+ 0.7599206349206349,11316,0.6182192984983755,0.014434041684736856,0.008337582189580528
79
+ 0.7698412698412699,11931,0.5754220771104159,0.015218500471950815,0.008790711656405556
80
+ 0.7797619047619048,12508,0.5409698646849892,0.015954488634914155,0.009215842879751966
81
+ 0.7896825396825397,13194,0.5055156298192968,0.016829510956912166,0.009721284854129151
82
+ 0.7996031746031746,14505,0.4477972728644806,0.01850174749355851,0.01068722425414153
83
+ 0.8095238095238095,15401,0.4148045252952486,0.01964463379167836,0.011347393363532141
84
+ 0.8194444444444444,16677,0.3732660056369872,0.021272226332304394,0.012287544907708947
85
+ 0.8293650793650794,17811,0.3390347868176622,0.02271869180336233,0.01312307143678144
86
+ 0.8392857142857143,19304,0.304132160348205,0.024623077119314267,0.014223107687138786
87
+ 0.8492063492063492,20166,0.2869145938607858,0.02572259496415725,0.014858225736574842
88
+ 0.8601190476190477,22479,0.2465673793342506,0.0286729253297278,0.01656243460936556
89
+ 0.8700396825396826,24972,0.2125022735791463,0.03185285338911707,0.018399266740739214
90
+ 0.8799603174603174,28663,0.1745575677179806,0.03656088165514426,0.021118780337570402
91
+ 0.8898809523809523,35102,0.1308102522001188,0.04477410138013725,0.0258630090154344
92
+ 0.8998015873015873,44279,0.0931967814168663,0.05647975713666165,0.032624584815521045
93
+ 0.9097222222222222,53428,0.0704476225450466,0.0681496976963698,0.03936553033093924
94
+ 0.9196428571428572,67477,0.0496322351327484,0.0860697976989209,0.049716775663337334
95
+ 0.929563492063492,90944,0.0310654466601885,0.11600295925916478,0.067007164603147
96
+ 0.939484126984127,115968,0.0206347700888285,0.14792214087094058,0.08544474472969907
97
+ 0.949404761904762,156999,0.0119895694113137,0.20025893517691779,0.11567621652367914
98
+ 0.9593253968253967,225314,0.0059620822317802,0.28739763769483917,0.16601042713530814
99
+ 0.9692460317460316,341935,0.0022855871148248,0.4361527079772443,0.25193629957531083
100
+ 0.9791666666666666,474954,0.0008589343284472,0.6058241281665349,0.349944150872219
101
+ 0.9890873015873016,606979,0.0003409942258809,0.7742276588688487,0.44721962706339685
102
+ 1.0,783980,9.05288957657348e-05,1.0,0.5776332347991642
metric_analysis/output_standardized/virality_avg_retweets_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.0009920634920634,1,3128.6932364568866,1.0053282396702524e-05,7.367958810163067e-07
3
+ 0.0109126984126984,11,2754.925734752474,0.00011058610636372776,8.104754691179374e-06
4
+ 0.0208333333333333,21,2581.3315487148006,0.000211118930330753,1.547271350134244e-05
5
+ 0.0307539682539682,31,2500.869915344092,0.00031165175429777824,2.284067231150551e-05
6
+ 0.0406746031746031,41,2303.2090263636537,0.00041218457826480346,3.0208631121668577e-05
7
+ 0.050595238095238,51,2152.268107221841,0.0005127174022318287,3.7576589931831644e-05
8
+ 0.060515873015873,62,2003.0277986460685,0.0006233035085955565,4.568134462301102e-05
9
+ 0.0704365079365079,74,1862.527068589252,0.0007439428973559868,5.4522895195206704e-05
10
+ 0.0803571428571428,88,1637.9180724687665,0.0008846888509098221,6.4838037529435e-05
11
+ 0.0902777777777777,102,1482.542530248446,0.0010254348044636573,7.515317986366329e-05
12
+ 0.1001984126984127,112,1440.1384708594214,0.0011259676284306826,8.252113867382636e-05
13
+ 0.1101190476190476,123,1367.2655234393055,0.0012365537347944105,9.062589336500573e-05
14
+ 0.1200396825396825,136,1253.92530228473,0.0013672464059515431,0.00010020423981821773
15
+ 0.1299603174603174,156,1119.7867538390494,0.0015683120538855936,0.00011494015743854386
16
+ 0.1398809523809523,171,1058.1367647948662,0.0017191112898361316,0.00012599209565378845
17
+ 0.1507936507936507,182,976.0304653073382,0.0018296973961998593,0.00013409685034496783
18
+ 0.1607142857142857,201,866.7503880337599,0.002020709761737207,0.00014809597208427765
19
+ 0.1706349206349206,219,815.3028458797158,0.0022016688448778525,0.00016135829794257118
20
+ 0.1805555555555555,232,786.2690567569583,0.0023323615160349854,0.00017093664439578316
21
+ 0.1904761904761904,248,741.7820711022046,0.002493214034382226,0.00018272537849204409
22
+ 0.2003968253968254,266,691.4859160249838,0.0026741731175228714,0.0001959877043503376
23
+ 0.2103174603174603,286,634.4445375768611,0.002875238765456922,0.00021072362197066373
24
+ 0.2202380952380952,303,599.4004450619773,0.0030461445662008646,0.00022324915194794095
25
+ 0.2301587301587301,327,551.4285181823311,0.003287423343721725,0.00024093225309233232
26
+ 0.2400793650793651,348,533.9551789024323,0.003498542274052478,0.00025640496659367474
27
+ 0.25,367,506.6784740394677,0.0036895546395898263,0.00027040408833298456
28
+ 0.2599206349206349,384,485.52769942049525,0.003860460440333769,0.0002829296183102618
29
+ 0.2698412698412698,421,446.810815150956,0.004232431889011763,0.00031019106590786513
30
+ 0.2797619047619047,444,408.7457694650668,0.00446365738413592,0.0003271373711712402
31
+ 0.2906746031746032,474,384.5033519093221,0.004765255856036996,0.0003492412476017294
32
+ 0.3005952380952381,507,362.2200443714963,0.00509701417512818,0.00037355551167526756
33
+ 0.310515873015873,546,330.0766694728744,0.005489092188599578,0.0004022905510349035
34
+ 0.3204365079365079,578,310.3159564904467,0.005810797225294059,0.0004258680192274253
35
+ 0.3303571428571428,623,292.8722993281888,0.006263194933145672,0.00045902383387315913
36
+ 0.3402777777777778,657,275.9762698041734,0.006605006534633558,0.00048407489382771353
37
+ 0.3501984126984127,693,264.6301245685592,0.006966924700914849,0.0005105995455443006
38
+ 0.3601190476190476,730,249.2392647610653,0.007338896149592842,0.0005378609931419039
39
+ 0.3700396825396825,767,235.05164885003057,0.007710867598270835,0.0005651224407395073
40
+ 0.3799603174603174,822,216.09849275103548,0.008263798130089475,0.0006056462141954042
41
+ 0.3898809523809524,875,202.1903749398219,0.008796622097114707,0.0006446963958892685
42
+ 0.3998015873015873,920,193.56605517288256,0.009249019804966322,0.0006778522105350022
43
+ 0.4097222222222222,963,187.94005117417709,0.00968131094802453,0.0007095344334187034
44
+ 0.4196428571428571,1034,175.0797930343889,0.01039509399819041,0.0007618469409708612
45
+ 0.4305555555555556,1101,164.50275477412396,0.011068663918769478,0.0008112122649989537
46
+ 0.4404761904761904,1145,157.99851670144255,0.01151100834422439,0.0008436312837636713
47
+ 0.4503968253968254,1200,150.88390964671086,0.012063938876043028,0.0008841550572195681
48
+ 0.4603174603174603,1290,140.8240668782626,0.012968734291746255,0.0009504666865110357
49
+ 0.4702380952380952,1369,130.09518134268558,0.013762943601085754,0.001008673561111324
50
+ 0.4801587301587302,1444,124.34249552435476,0.014516939780838444,0.0010639332521875469
51
+ 0.490079365079365,1523,117.28492475872802,0.015311149090177943,0.0011221401267878352
52
+ 0.5,1611,110.04173671014205,0.016195837941087764,0.0011869781643172703
53
+ 0.5099206349206349,1723,102.7628034997729,0.01732180556951845,0.0012694993029910965
54
+ 0.5198412698412699,1831,96.64859215298335,0.01840756006836232,0.0013490732581408578
55
+ 0.5297619047619048,1975,88.97259652512547,0.019855232733487483,0.001455171865007206
56
+ 0.5396825396825397,2052,86.08126921842629,0.020629335478033577,0.0015119051478454616
57
+ 0.5496031746031746,2252,78.06253536357954,0.02263999195737408,0.0016592643240487229
58
+ 0.5595238095238095,2391,73.26624858192154,0.024037398210515735,0.0017616789515099895
59
+ 0.5694444444444444,2484,70.65733437208469,0.024972353473409068,0.001830200968444506
60
+ 0.5803571428571429,2584,68.03407090113993,0.02597768171307932,0.0019038805565461366
61
+ 0.5902777777777778,2770,63.54623310376039,0.02784759223886599,0.00204092459041517
62
+ 0.6001984126984127,2892,60.7634161516708,0.0290740926912637,0.002130813687899159
63
+ 0.6101190476190477,3103,56.50224523966051,0.03119533527696793,0.0022862776187936
64
+ 0.6200396825396826,3193,54.78245772103394,0.032100130692671154,0.0023525892480850677
65
+ 0.6299603174603174,3524,49.07891508810281,0.035427767165979694,0.002596468684701465
66
+ 0.6398809523809523,3665,46.847492839618525,0.03684527998391475,0.002700356903924764
67
+ 0.6498015873015873,3886,43.93985563230335,0.03906705539358601,0.002863188793629368
68
+ 0.6597222222222222,4164,40.57703645974158,0.04186186789986931,0.0030680180485519013
69
+ 0.6696428571428571,4393,38.21854701763988,0.04416406956871419,0.0032367443053046355
70
+ 0.6795634920634921,4677,35.74808318428063,0.047019201769377704,0.003445994335513267
71
+ 0.689484126984127,5052,32.567920421715776,0.05078918266814115,0.003722292790894382
72
+ 0.6994047619047619,5385,30.60214054903069,0.05413692570624309,0.003967645819272812
73
+ 0.7093253968253969,5808,28.183043893453764,0.058389464160048254,0.00427931047694271
74
+ 0.7202380952380952,6079,26.95420443855561,0.06111390368955464,0.004478982160698129
75
+ 0.7301587301587301,6475,25.012739959261825,0.06509500351864884,0.004770753329580586
76
+ 0.7400793650793651,6999,22.874625271719133,0.07036292349452096,0.005156834371233131
77
+ 0.75,7589,20.906896682604128,0.07629436010857545,0.005591543941032752
78
+ 0.7599206349206349,8060,19.641355294192596,0.08102945611742234,0.005938574800991432
79
+ 0.7698412698412699,8483,18.548537529332275,0.08528199457122751,0.006250239458661331
80
+ 0.7797619047619048,9205,16.915461328983927,0.09254046446164672,0.006782206084755104
81
+ 0.7896825396825397,9532,16.27014209744153,0.09582788780536845,0.007023138337847436
82
+ 0.7996031746031746,9920,15.580116677414129,0.09972856137528903,0.007309015139681763
83
+ 0.8095238095238095,11077,13.707925776186906,0.11136020910827385,0.00816148797401763
84
+ 0.8194444444444444,11889,12.681181379795396,0.1195234744143963,0.00875976622940287
85
+ 0.8293650793650794,13090,11.388669220662166,0.13159746657283602,0.009644658082503456
86
+ 0.8392857142857143,14621,9.997273862423672,0.1469890419221876,0.010772692576339421
87
+ 0.8492063492063492,15873,9.0580071984413,0.15957575148285916,0.011695161019371837
88
+ 0.8601190476190477,17036,8.371478269233975,0.1712677189102242,0.012552054628993801
89
+ 0.8700396825396826,17741,7.978472899264016,0.17835528299989947,0.013071495725110299
90
+ 0.8809523809523809,19770,7.0,0.1987533929828089,0.014566454567692384
91
+ 0.8898809523809523,20244,6.807979662864933,0.20351864883884588,0.014915695815294115
92
+ 0.8998015873015873,21629,6.278415213300843,0.21744244495827889,0.0159361581105017
93
+ 0.9097222222222222,24560,5.39307122938644,0.24690861566301398,0.018095706837760493
94
+ 0.9196428571428572,26082,4.999137001078749,0.2622097114707952,0.019217110168667312
95
+ 0.929563492063492,28223,4.501034715181624,0.2837337890821353,0.020794590149923225
96
+ 0.939484126984127,30526,4.072360076314477,0.3068864984417412,0.02249143106390378
97
+ 0.949404761904762,34261,3.497256599434304,0.34443550819342517,0.025243363679499687
98
+ 0.9593253968253967,37923,3.0599798377048133,0.3812506283301498,0.0279415101957814
99
+ 0.9692460317460316,43642,2.5354184580479755,0.43874535035689155,0.03215524583931366
100
+ 0.9791666666666666,50271,2.0852622686277305,0.5053885593646326,0.03703946573457076
101
+ 0.9890873015873016,62079,1.5480748118436458,0.624097717904896,0.04573955149761131
102
+ 1.0,99470,0.7599240299001586,1.0,0.07328908628469204
metric_analysis/output_standardized/virality_followers_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.0009920634920634,1,162.90697674418604,2.0508613617719442e-05,7.367958810163067e-07
3
+ 0.0109126984126984,11,59.90412635890944,0.00022559474979491387,8.104754691179374e-06
4
+ 0.0208333333333333,22,41.26621697088786,0.00045118949958982774,1.620950938235875e-05
5
+ 0.0307539682539682,34,27.75797742299627,0.000697292863002461,2.505105995455443e-05
6
+ 0.0406746031746031,45,20.41887906141272,0.0009228876127973749,3.31558146457338e-05
7
+ 0.050595238095238,56,17.36539923954372,0.0011484823625922888,4.126056933691318e-05
8
+ 0.060515873015873,73,13.329307594828814,0.0014971287940935194,5.378609931419039e-05
9
+ 0.0704365079365079,88,11.423473275192162,0.001804757998359311,6.4838037529435e-05
10
+ 0.0803571428571428,100,9.792804922314204,0.002050861361771944,7.367958810163068e-05
11
+ 0.0902777777777777,114,8.966047810728663,0.0023379819524200164,8.399473043585897e-05
12
+ 0.1001984126984127,132,8.241192277609564,0.0027071369975389665,9.725705629415249e-05
13
+ 0.1101190476190476,150,7.225324262618784,0.0030762920426579163,0.00011051938215244602
14
+ 0.1200396825396825,164,6.426211688059448,0.0033634126333059885,0.00012083452448667431
15
+ 0.1299603174603174,181,5.865184280255489,0.0037120590648072192,0.00013336005446395153
16
+ 0.1398809523809523,209,5.123369293423201,0.004286300246103363,0.00015399033913240812
17
+ 0.1507936507936507,233,4.498533912223866,0.00477850697292863,0.00017167344027679948
18
+ 0.1607142857142857,255,4.081553139911354,0.005229696472518457,0.00018788294965915822
19
+ 0.1706349206349206,286,3.653621381908462,0.00586546349466776,0.00021072362197066373
20
+ 0.1805555555555555,318,3.3862837057887427,0.006521739130434782,0.00023430109016318556
21
+ 0.1904761904761904,343,3.1589097951921734,0.007034454470877769,0.00025272098718859325
22
+ 0.2003968253968254,378,2.901110957300737,0.0077522559474979495,0.00027850884302416394
23
+ 0.2103174603174603,412,2.6883909458302067,0.008449548810500411,0.0003035599029787184
24
+ 0.2202380952380952,440,2.5712123367735846,0.009023789991796555,0.00032419018764717495
25
+ 0.2301587301587301,489,2.3775402418530684,0.010028712059064807,0.00036029318581697403
26
+ 0.2400793650793651,516,2.262779417926834,0.010582444626743232,0.0003801866746044143
27
+ 0.25,532,2.212570952273289,0.010910582444626744,0.0003919754087006752
28
+ 0.2599206349206349,567,2.1149449648249323,0.011628383921246923,0.0004177632645362459
29
+ 0.2698412698412698,612,2.01276188638082,0.0125512715340443,0.00045091907918197975
30
+ 0.2797619047619047,663,1.871412888170072,0.01359721082854799,0.0004884956691138114
31
+ 0.2906746031746032,713,1.7528878661684644,0.014622641509433962,0.0005253354631646267
32
+ 0.3005952380952381,757,1.6483081903461123,0.015525020508613617,0.0005577544819293442
33
+ 0.310515873015873,804,1.5657617737942116,0.016488925348646433,0.0005923838883371106
34
+ 0.3204365079365079,868,1.4648138997453877,0.017801476620180477,0.0006395388247221543
35
+ 0.3303571428571428,930,1.3788231439711065,0.01907301066447908,0.0006852201693451653
36
+ 0.3402777777777778,973,1.3111586957558676,0.019954881050041017,0.0007169023922288665
37
+ 0.3501984126984127,1035,1.2350532566427777,0.02122641509433962,0.0007625837368518775
38
+ 0.3601190476190476,1079,1.1946824800319331,0.02212879409351928,0.000795002755616595
39
+ 0.3700396825396825,1157,1.126624449492463,0.023728465955701394,0.000852472834335867
40
+ 0.3799603174603174,1238,1.0517923348956837,0.02538966365873667,0.0009121533006981878
41
+ 0.3898809523809524,1281,1.0159094482787685,0.026271534044298606,0.000943835523581889
42
+ 0.3998015873015873,1340,0.9728949369054694,0.027481542247744052,0.000987306480561851
43
+ 0.4097222222222222,1407,0.933151427225055,0.028855619360131254,0.0010366718045899436
44
+ 0.4196428571428571,1492,0.8837801041456175,0.03059885151763741,0.0010992994544763296
45
+ 0.4305555555555556,1550,0.8543538482681353,0.03178835110746513,0.0011420336155752754
46
+ 0.4404761904761904,1623,0.8251487414647105,0.033285479901558654,0.0011958197148894659
47
+ 0.4503968253968254,1690,0.7949358527010969,0.03465955701394586,0.0012451850389175584
48
+ 0.4603174603174603,1755,0.7707349266582455,0.03599261689909762,0.0012930767711836185
49
+ 0.4702380952380952,1840,0.7363839449817713,0.03773584905660377,0.0013557044210700045
50
+ 0.4801587301587302,1910,0.7136122363660038,0.039171452009844135,0.0014072801327411459
51
+ 0.490079365079365,2000,0.6861455175535648,0.04101722723543889,0.0014735917620326134
52
+ 0.5,2102,0.6582912665179754,0.04310910582444627,0.0015487449418962768
53
+ 0.5099206349206349,2189,0.6347704849378184,0.04489335520918786,0.0016128461835446955
54
+ 0.5198412698412699,2275,0.6145070157601543,0.04665709598031173,0.001676210629312098
55
+ 0.5297619047619048,2381,0.5911688811978919,0.04883100902378999,0.0017543109926998264
56
+ 0.5396825396825397,2480,0.569913190896405,0.05086136177194422,0.0018272537849204407
57
+ 0.5496031746031746,2548,0.554468003063123,0.05225594749794914,0.0018773559048295497
58
+ 0.5595238095238095,2651,0.5376141635489305,0.05436833470057424,0.001953245880574229
59
+ 0.5694444444444444,2811,0.5065435623907723,0.05764971287940935,0.0020711332215368385
60
+ 0.5803571428571429,3009,0.4744476851437965,0.0617104183757178,0.002217018805978067
61
+ 0.5902777777777778,3160,0.4526047518884247,0.06480721903199343,0.0023282749840115293
62
+ 0.6001984126984127,3261,0.4398674447582947,0.0668785890073831,0.0024026913679941766
63
+ 0.6101190476190477,3433,0.4182156945481237,0.07040607054963084,0.0025294202595289813
64
+ 0.6200396825396826,3714,0.385699748941216,0.07616899097621001,0.0027364599020945632
65
+ 0.6299603174603174,3846,0.3735030733895079,0.07887612797374897,0.0028337169583887156
66
+ 0.6398809523809523,3906,0.3673857063584148,0.08010664479081214,0.002877924711249694
67
+ 0.6498015873015873,4074,0.3524044985857981,0.083552091878589,0.003001706419260434
68
+ 0.6597222222222222,4190,0.3419197998258842,0.08593109105824447,0.0030871747414583255
69
+ 0.6696428571428571,4365,0.3263783963721026,0.08952009844134537,0.003216114020636179
70
+ 0.6795634920634921,4530,0.3143190758771201,0.09290401968826907,0.0033376853410038696
71
+ 0.689484126984127,4709,0.3015478411563975,0.09657506152584085,0.0034695718037057884
72
+ 0.6994047619047619,4873,0.2918550526925871,0.09993847415914685,0.0035904063281924628
73
+ 0.7093253968253969,5083,0.2786949054424083,0.10424528301886793,0.003745133463205887
74
+ 0.7202380952380952,5257,0.2691313357520127,0.1078137817883511,0.0038733359465027246
75
+ 0.7301587301587301,5642,0.2494124362008614,0.11570959803117309,0.004157002360694002
76
+ 0.7400793650793651,5863,0.2395665365261546,0.12024200164068909,0.004319834250398606
77
+ 0.75,6103,0.2296975500170638,0.12516406890894174,0.00449666526184252
78
+ 0.7599206349206349,6314,0.2220161304406145,0.12949138638228055,0.004652129192736961
79
+ 0.7698412698412699,6505,0.2150225411561583,0.13340853158326496,0.004792857206011076
80
+ 0.7797619047619048,6648,0.20896555791065,0.13634126333059884,0.004898219016996407
81
+ 0.7896825396825397,6925,0.1988488053196262,0.14202214930270712,0.005102311476037924
82
+ 0.7996031746031746,7172,0.1912203515618782,0.14708777686628383,0.0052843000586489525
83
+ 0.8095238095238095,7453,0.1817782046397847,0.152850697292863,0.005491339701214534
84
+ 0.8194444444444444,7671,0.1758490505459375,0.15732157506152583,0.005651961203276089
85
+ 0.8293650793650794,7952,0.1691944806665418,0.163084495488105,0.0058590008458416715
86
+ 0.8392857142857143,8146,0.1645962122201038,0.16706316652994257,0.006001939246758835
87
+ 0.8492063492063492,8428,0.1573623039169127,0.17284659557013946,0.006209715685205433
88
+ 0.8601190476190477,8785,0.1499039332865125,0.1801681706316653,0.006472751814728255
89
+ 0.8700396825396826,9156,0.1425418343797845,0.18777686628383922,0.006746103086585305
90
+ 0.8799603174603174,9485,0.1369052513814404,0.1945242001640689,0.006988508931439669
91
+ 0.8898809523809523,9888,0.1296152659458055,0.20278917145200984,0.0072854376714892415
92
+ 0.8998015873015873,10242,0.1235157570136157,0.21004922067268253,0.007546263413369014
93
+ 0.9097222222222222,10647,0.1169143777742638,0.2183552091878589,0.007844665745180618
94
+ 0.9196428571428572,11156,0.1099605772220808,0.2287940935192781,0.008219694848617919
95
+ 0.929563492063492,11792,0.1021918749641372,0.24183757178014767,0.008688297028944289
96
+ 0.939484126984127,12184,0.0977734981562192,0.24987694831829368,0.008977121014302682
97
+ 0.949404761904762,12755,0.0912248028336146,0.2615873666940115,0.009397831462362992
98
+ 0.9593253968253967,14007,0.079949907336287,0.28726415094339625,0.01032029990539541
99
+ 0.9692460317460316,14613,0.0749577471557994,0.2996923707957342,0.01076679820929129
100
+ 0.9791666666666666,16478,0.0625395687276457,0.33794093519278096,0.012140922527386702
101
+ 0.9890873015873016,20751,0.0435969642897507,0.42557424118129616,0.015289251326969382
102
+ 1.0,48760,0.0098477489215356,1.0,0.035926167158355116
metric_analysis/output_standardized/virality_median_retweets_viral_covered_vs_new_tweets_labeled 2.csv ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.003076923076923,5,56750.0,3.719988988832593e-05,3.6839794050815336e-06
3
+ 0.0123076923076923,31,28349.359999999968,0.00023063931730762076,2.284067231150551e-05
4
+ 0.0215384615384615,52,24319.559999999983,0.0003868788548385897,3.8313385812847955e-05
5
+ 0.0307692307692307,65,21221.51999999996,0.0004835985685482371,4.789173226605994e-05
6
+ 0.04,77,19165.07999999996,0.0005728783042802194,5.673328283825562e-05
7
+ 0.0523076923076923,95,17173.200000000004,0.0007067979078781927,6.999560869654915e-05
8
+ 0.0615384615384615,106,16057.56,0.0007886376656325097,7.810036338772852e-05
9
+ 0.0707692307692307,120,14684.879999999996,0.0008927973573198224,8.841550572195681e-05
10
+ 0.08,135,13536.373333333351,0.0010043970269848002,9.946744393720141e-05
11
+ 0.0923076923076923,167,11616.800000000008,0.001242476322270086,0.00012304491212972323
12
+ 0.1015384615384615,179,10952.400000000009,0.0013317560580020683,0.0001318864627019189
13
+ 0.1107692307692307,194,9990.320000000002,0.001443355727667046,0.00014293840091716351
14
+ 0.12,209,9376.240000000002,0.0015549553973320238,0.00015399033913240812
15
+ 0.1323076923076923,233,8752.44,0.0017335148687959883,0.00017167344027679948
16
+ 0.1415384615384615,241,8540.986666666664,0.0017930346926173099,0.00017756780732492992
17
+ 0.1507692307692307,278,7806.799999999999,0.0020683138777909216,0.0002048292549225333
18
+ 0.16,299,7523.599999999994,0.002224553415321891,0.00022030196842387574
19
+ 0.1723076923076923,317,7362.599999999994,0.002358473018919864,0.00023356429428216923
20
+ 0.1815384615384615,338,7073.840000000001,0.002514712556450833,0.0002490370077835117
21
+ 0.1907692307692307,384,6516.36,0.0028569515434234316,0.0002829296183102618
22
+ 0.2,419,6033.56,0.003117350772641713,0.00030871747414583253
23
+ 0.2123076923076923,456,5618.76,0.0033926299578153248,0.0003359789217434359
24
+ 0.2215384615384615,487,5280.888,0.003623269275122946,0.0003588195940549414
25
+ 0.2307692307692307,526,4824.393333333336,0.003913428416251888,0.0003875546334145774
26
+ 0.24,609,4260.660000000002,0.004530946588398099,0.0004487086915389308
27
+ 0.2523076923076923,645,4030.0,0.004798785795594045,0.00047523334325551785
28
+ 0.2615384615384615,719,3691.0899999999992,0.005349344165941269,0.0005297562384507246
29
+ 0.2707692307692307,775,3428.82,0.00576598293269052,0.0005710168077876377
30
+ 0.28,784,3394.6466666666665,0.0058329427344895055,0.0005776479707167845
31
+ 0.2892307692307692,845,3166.7999999999997,0.006286781391127082,0.0006225925194587792
32
+ 0.3015384615384615,878,3054.0666666666657,0.006532300664390033,0.0006469067835323173
33
+ 0.3107692307692307,946,2800.0149999999994,0.007038219166871266,0.0006970089034414262
34
+ 0.32,997,2616.640000000003,0.00741765804373219,0.0007345854933732579
35
+ 0.3292307692307692,1048,2474.848000000001,0.007797096920593115,0.0007721620833050895
36
+ 0.3415384615384615,1125,2290.6666666666665,0.008369975224873335,0.0008288953661433451
37
+ 0.3507692307692308,1152,2251.8,0.008570854630270294,0.0008487888549307854
38
+ 0.36,1195,2140.38,0.008890773683309898,0.0008804710778144866
39
+ 0.3692307692307692,1271,2013.0342857142855,0.009456212009612452,0.0009364675647717259
40
+ 0.3815384615384615,1292,1982.117333333333,0.00961245154714342,0.0009519402782730683
41
+ 0.3907692307692307,1368,1840.1704347826085,0.010177889873445974,0.0010079367652303076
42
+ 0.4,1407,1778.9000000000003,0.010468049014574916,0.0010366718045899436
43
+ 0.4092307692307692,1447,1719.88,0.010765648133681525,0.001066143639830596
44
+ 0.4215384615384615,1607,1516.499999999999,0.011956044610107954,0.0011840309807932049
45
+ 0.4307692307692308,1755,1346.0,0.013057161350802402,0.0012930767711836185
46
+ 0.44,1823,1287.986666666667,0.013563079853283633,0.0013431788910927272
47
+ 0.4492307692307692,1975,1153.6727272727285,0.014693956505888743,0.001455171865007206
48
+ 0.4615384615384615,2071,1087.9872268907568,0.0154081943917446,0.0015259042695847714
49
+ 0.4707692307692308,2231,975.34,0.01659859086817103,0.0016437916105473804
50
+ 0.48,2396,887.333684210527,0.017826187234485785,0.001765362930915071
51
+ 0.4892307692307692,2619,793.9771428571429,0.01948530232350512,0.0019296684123817074
52
+ 0.5015384615384615,2736,753.530612244898,0.02035577974689195,0.002015873530460615
53
+ 0.5107692307692308,2919,700.650976744186,0.02171729571680468,0.0021507071766865993
54
+ 0.52,3127,644.4646666666665,0.023264811136159035,0.0023039607199379915
55
+ 0.5292307692307693,3301,600.444358974359,0.02455936730427278,0.0024321632032348285
56
+ 0.5384615384615384,3471,561.6797385620916,0.02582416356047586,0.002557418503007601
57
+ 0.5507692307692308,4115,451.9967654986525,0.03061550937809224,0.003031915050382102
58
+ 0.56,4518,396.6374331550802,0.033613820503091314,0.003328843790431674
59
+ 0.5692307692307692,4667,379.78126543209873,0.034722377221763426,0.003438626376703104
60
+ 0.5784615384615385,4849,357.453793103448,0.03607645321369849,0.0035727232270480716
61
+ 0.5907692307692308,4972,345.265,0.036991570504951304,0.0036633491204130773
62
+ 0.6,5261,320.21842105263147,0.03914172414049654,0.00387628313002679
63
+ 0.6092307692307692,5681,289.5254531126872,0.04226651489111592,0.004185737400053639
64
+ 0.6184615384615385,6055,263.0394202898551,0.0450490666547627,0.004461299059553737
65
+ 0.6307692307692307,6802,226.3273758865248,0.0506067302040786,0.005011685582672919
66
+ 0.64,7367,201.76998989694889,0.05481031776145943,0.005427975255447132
67
+ 0.6492307692307693,7801,184.9621954484605,0.05803926820376612,0.005747744667808209
68
+ 0.6584615384615384,7985,178.01515789473683,0.05940822415165651,0.00588331510991521
69
+ 0.6707692307692308,8079,175.3568580560256,0.06010758208155704,0.005952573922730743
70
+ 0.68,8443,165.98497588652486,0.06281573406542716,0.006220767623420678
71
+ 0.6892307692307692,9241,144.15345029239765,0.06875283649160399,0.006808730736471691
72
+ 0.6984615384615385,9504,138.80271646859083,0.07070955069972992,0.00700250805317898
73
+ 0.7107692307692308,9775,133.6674772036474,0.07272578473167719,0.0072021797369343984
74
+ 0.72,10400,121.42667780562527,0.07737577096771793,0.00766267716256959
75
+ 0.7292307692307692,10868,113.25071895424836,0.08085768066126524,0.008007497634885221
76
+ 0.7384615384615385,11823,99.04954407294836,0.0879628596299355,0.008711137701255795
77
+ 0.7507692307692307,12267,93.8173076923077,0.09126620985201883,0.009038275072427035
78
+ 0.76,12714,88.54265957446803,0.09459188000803517,0.009367622831241325
79
+ 0.7692307692307693,14247,74.15760000000002,0.1059973662477959,0.010497130916839322
80
+ 0.7784615384615384,14743,70.05540389053742,0.10968759532471783,0.01086258167382341
81
+ 0.7876923076923077,15830,62.71639784946234,0.1177748513864399,0.011663478796488136
82
+ 0.8,16463,58.88816251076612,0.12248435744630196,0.012129870589171459
83
+ 0.8092307692307692,17102,55.05751773049646,0.12723850337403,0.012600683157140878
84
+ 0.8184615384615385,19538,44.345175283888736,0.1453622897276224,0.014395517923296602
85
+ 0.8276923076923077,20576,40.51128376496799,0.15308498686843888,0.015160312047791528
86
+ 0.84,22694,34.49191625472521,0.16884286022513373,0.016720845723784065
87
+ 0.8492307692307692,24366,30.565914650720305,0.18128250340378993,0.01795276843684333
88
+ 0.8584615384615385,27034,25.41807646695045,0.20113236464820064,0.019918539847394837
89
+ 0.8676923076923077,28429,23.0279127002554,0.21151113392704357,0.020946370101412586
90
+ 0.88,32972,17.645820519988817,0.24531095387957652,0.024293633788869666
91
+ 0.8892307692307693,39488,12.639793663256718,0.29378985038204286,0.029094595749571923
92
+ 0.8984615384615384,44033,10.027246243440574,0.32760455029053115,0.03244333302879104
93
+ 0.9076923076923076,46235,9.17001356841133,0.3439873817973499,0.03406575755878894
94
+ 0.92,52119,7.312447220141592,0.38776421221793184,0.03840106452268889
95
+ 0.9292307692307692,61820,5.1008903893765485,0.4599394385792618,0.045548721364428085
96
+ 0.9384615384615383,65714,4.641389084785888,0.48891071282429005,0.048417804525105586
97
+ 0.9476923076923076,77011,3.2563426595787157,0.5729601440379737,0.0567413875929468
98
+ 0.96,85185,2.703150426409592,0.6337745240274089,0.06276395712437409
99
+ 0.9692307692307692,90951,2.2612983428008078,0.6766734370466263,0.06701232217431412
100
+ 0.9784615384615384,100735,1.9992355751561923,0.7494661815801025,0.07422113307417766
101
+ 0.9876923076923076,107699,1.5609094071743144,0.8012781882165628,0.07935217958957522
102
+ 1.0,134409,1.0,1.0,0.09903199757152077
metric_analysis/output_standardized/virality_median_retweets_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.0009920634920634,5,56750.0,9.770128418567934e-06,3.6839794050815336e-06
3
+ 0.0109126984126984,68,20769.80999999998,0.0001328737464925239,5.010211990910886e-05
4
+ 0.0208333333333333,108,15734.440000000006,0.00021103477384106737,7.957395514976113e-05
5
+ 0.0307539682539682,172,11420.029999999995,0.0003360924175987369,0.00012672889153480477
6
+ 0.0406746031746031,219,9203.879999999965,0.0004279316247332755,0.00016135829794257118
7
+ 0.050595238095238,284,7694.966666666665,0.0005549432941746586,0.00020925003020863113
8
+ 0.060515873015873,370,6665.859999999988,0.0007229895029740271,0.0002726144759760335
9
+ 0.0704365079365079,476,5388.830000000004,0.0009301162254476673,0.000350714839363762
10
+ 0.0803571428571428,643,4035.7200000000007,0.0012564385146278364,0.00047375975149348525
11
+ 0.0902777777777777,783,3395.8991666666666,0.0015300021103477385,0.0005769111748357682
12
+ 0.1001984126984127,947,2795.887500000001,0.0018504623224767666,0.0006977456993224426
13
+ 0.1101190476190476,1129,2285.583333333333,0.0022060949969126393,0.0008318425496674104
14
+ 0.1200396825396825,1274,2010.7097142857144,0.0024894287210511096,0.0009386779524147748
15
+ 0.1299603174603174,1422,1754.7858823529411,0.0027786245222407202,0.0010477237428051882
16
+ 0.1398809523809523,1789,1316.0649999999996,0.0034957519481636067,0.0013181278311381727
17
+ 0.1507936507936507,2189,1004.0008333333316,0.004277362221649041,0.0016128461835446955
18
+ 0.1607142857142857,2717,759.2236734693877,0.005309087782649815,0.0020018744087213053
19
+ 0.1706349206349206,3285,605.9448076923071,0.0064189743709991325,0.0024203744691385677
20
+ 0.1805555555555555,4507,397.9200534759351,0.008806793756497136,0.0033207390357404944
21
+ 0.1904761904761904,4979,344.9141220238097,0.009729093879209949,0.0036685066915801913
22
+ 0.2003968253968254,6185,256.6727272727276,0.012085648853768534,0.004557082524085858
23
+ 0.2103174603174603,7881,181.9363148148153,0.015399676413346776,0.0058066883382895135
24
+ 0.2202380952380952,8532,163.15705,0.016671747133444322,0.00628634245683113
25
+ 0.2301587301587301,9916,130.25747244296335,0.019376118679703926,0.007306067956157698
26
+ 0.2400793650793651,12130,95.4236079560604,0.023702331543445806,0.008937334036727801
27
+ 0.25,14577,71.5297497155859,0.028483832391492953,0.010740273557574703
28
+ 0.2599206349206349,17026,55.781783181357575,0.033269241290907525,0.012544686670183639
29
+ 0.2698412698412698,22643,34.6215403148756,0.04424500355632675,0.016683269133852235
30
+ 0.2797619047619047,28429,23.02300286355545,0.05555099616229356,0.020946370101412586
31
+ 0.2906746031746032,45428,9.636950619740496,0.08876747875974082,0.03347116328280878
32
+ 0.3005952380952381,61961,5.027143821742062,0.12107338538857755,0.04565260958365138
33
+ 0.310515873015873,86051,2.6456794555995926,0.16814586410923785,0.06340202235733422
34
+ 0.3224206349206349,134409,1.0,0.26263863812225946,0.09903199757152077
35
+ 1.0,511764,0.0,1.0,0.3770656072524292
metric_analysis/output_standardized/virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled.csv ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tpr,new_tweets,threshold,fpr,fpr2
2
+ 0.503968253968254,814,1.0,0.0005997518471472737,0.0005997518471472737
3
+ 0.816468253968254,20605,0.99,0.015181679128341001,0.015181679128341001
4
+ 0.876984126984127,46379,0.98,0.03417185616565529,0.03417185616565529
5
+ 0.9067460317460316,61903,0.97,0.04560987542255244,0.04560987542255244
6
+ 0.9206349206349206,92538,0.96,0.068181617237487,0.068181617237487
7
+ 0.9305555555555556,117906,0.95,0.08687265514710867,0.08687265514710867
8
+ 0.9424603174603174,160095,0.94,0.11795733657130564,0.11795733657130564
9
+ 0.9523809523809524,203539,0.93,0.14996669682617805,0.14996669682617805
10
+ 0.9583333333333334,238956,0.92,0.1760617965441326,0.1760617965441326
11
+ 0.9662698412698412,272234,0.91,0.20058088987259326,0.20058088987259326
12
+ 0.9742063492063492,310028,0.9,0.22842735339972356,0.22842735339972356
13
+ 0.9771825396825397,339615,0.89,0.250226933131353,0.250226933131353
14
+ 0.9791666666666666,375135,0.88,0.2763979228250522,0.2763979228250522
15
+ 0.9831349206349206,422834,0.87,0.31154234955364907,0.31154234955364907
16
+ 0.9851190476190476,462519,0.86,0.3407820940917812,0.3407820940917812
17
+ 0.9861111111111112,490552,0.85,0.3614366930243113,0.3614366930243113
18
+ 0.988095238095238,528701,0.84,0.3895447190892024,0.3895447190892024
19
+ 0.9910714285714286,574204,0.83,0.42307114206308744,0.42307114206308744
20
+ 0.9930555555555556,610579,0.82,0.4498720922350556,0.4498720922350556
21
+ 0.9940476190476192,646190,0.81,0.4761101303539273,0.4761101303539273
22
+ 0.9950396825396826,822465,0.75,0.6059888242800767,0.6059888242800767
23
+ 0.996031746031746,937666,0.71,0.6908684465690363,0.6908684465690363
24
+ 0.9970238095238096,1029756,0.66,0.758719979251828,0.758719979251828
25
+ 0.9990079365079364,1072643,0.64,0.7903189442009744,0.7903189442009744
26
+ 1.0,1174373,0.57,0.8652731891767632,0.8652731891767632
metric_analysis/twitter_viral_model.ipynb ADDED
@@ -0,0 +1,2303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Comparison of Twitter's model of viral tweets with other tweets"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "In this notebook, we try to identify common features of Twitter's identified viral tweets found on the topic page \"Viral Tweets\".\n",
15
+ "\n",
16
+ "We also experiment to find whether other tweets that have not appeared on that topic page can also be labeled as viral based on these common features. This should help homogenize the data (those that are viral and those that are not) when training the model."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "import pandas as pd\n",
26
+ "import seaborn as sns\n",
27
+ "import numpy as np\n",
28
+ "\n",
29
+ "import matplotlib.pyplot as plt\n",
30
+ "%matplotlib inline\n",
31
+ "\n",
32
+ "import plotly.express as px\n",
33
+ "import plotly.graph_objects as go\n",
34
+ "from plotly.subplots import make_subplots\n",
35
+ "\n",
36
+ "from helper.text_preprocessing import clear_reply_mentions\n",
37
+ "\n",
38
+ "from tqdm import tqdm\n",
39
+ "\n",
40
+ "#pd.set_option('display.max_rows', None)\n",
41
+ "pd.set_option('display.max_columns', None)\n",
42
+ "\n",
43
+ "DATA_PATH = \"../../data\"\n",
44
+ "VIRAL_TWEETS_PATH = f\"{DATA_PATH}/new/viral\"\n",
45
+ "COVID_TWEETS_PATH = f\"{DATA_PATH}/new/covid\"\n",
46
+ "\n",
47
+ "PROCESSED_PATH_VIRAL = f'{DATA_PATH}/new/processed/viral'\n",
48
+ "PROCESSED_PATH_COVID = f'{DATA_PATH}/new/processed/covid'"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "markdown",
53
+ "metadata": {},
54
+ "source": [
55
+ "## 0. Preprocessing"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "viral_dataset = pd.read_parquet(f\"{VIRAL_TWEETS_PATH}/all_tweets.parquet.gzip\")"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": null,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "covid_dataset = pd.read_parquet(f\"{COVID_TWEETS_PATH}/all_tweets.parquet.gzip\")"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "covid_users = pd.read_parquet(f\"{COVID_TWEETS_PATH}/users.parquet.gzip\")"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "markdown",
87
+ "metadata": {},
88
+ "source": [
89
+ "- Keep only original tweets from **covid dataset**. Viral dataset doesn't have retweets"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "def is_retweeted(referenced_tweets):\n",
99
+ " for x in referenced_tweets:\n",
100
+ " if x['type'] == 'retweeted':\n",
101
+ " return True\n",
102
+ " return False\n",
103
+ "\n",
104
+ "# Keep only original tweets\n",
105
+ "referenced = covid_dataset.loc[~covid_dataset.referenced_tweets.isna()].copy()\n",
106
+ "referenced.loc[:, 'is_retweet'] = referenced.referenced_tweets.apply(is_retweeted)\n",
107
+ "retweeted = referenced[referenced.is_retweet]\n",
108
+ "retweeted"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "original_covid_tweets = covid_dataset[~covid_dataset.id.isin(retweeted.id)]\n",
118
+ "original_covid_tweets.to_parquet(f\"{COVID_TWEETS_PATH}/all_original_tweets.parquet.gzip\", index=False, compression=\"gzip\")"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "# Clear reply mentions at the beginning of tweets texts\n",
128
+ "original_covid_tweets.loc[:, \"text\"] = original_covid_tweets.text.apply(clear_reply_mentions)\n",
129
+ "viral_dataset.loc[:, \"text\"] = viral_dataset.text.apply(clear_reply_mentions)"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "markdown",
134
+ "metadata": {},
135
+ "source": [
136
+ "## 1. Exploration"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "markdown",
141
+ "metadata": {},
142
+ "source": [
143
+ "### 1.1 - General Exploration"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": null,
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "viral_dataset = pd.read_parquet(f\"{VIRAL_TWEETS_PATH}/all_tweets.parquet.gzip\")\n",
153
+ "viral_users = pd.read_parquet(f\"{VIRAL_TWEETS_PATH}/users.parquet.gzip\")\n",
154
+ "viral_tweets_ids = pd.read_parquet(f\"{VIRAL_TWEETS_PATH}/viral_tweets_ids.parquet.gzip\")"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": [
163
+ "original_covid_tweets = pd.read_parquet(f\"{COVID_TWEETS_PATH}/all_original_tweets.parquet.gzip\")\n",
164
+ "covid_users = pd.read_parquet(f\"{COVID_TWEETS_PATH}/users.parquet.gzip\")"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": null,
170
+ "metadata": {},
171
+ "outputs": [],
172
+ "source": [
173
+ "display(\"--- VIRAL DATASET ---\")\n",
174
+ "\n",
175
+ "display(f\"{len(viral_tweets_ids)} viral tweets collected\")\n",
176
+ "display(f\"{len(viral_users)} viral users\")\n",
177
+ "display(f\"{len(viral_dataset)} all tweets collected\")\n",
178
+ "\n",
179
+ "display(\"--- COVID DATASET ---\")\n",
180
+ "\n",
181
+ "display(f\"{len(original_covid_tweets)} original (not retweeted) covid tweets collected\")\n",
182
+ "display(f\"{len(original_covid_tweets.author_id.unique())} covid users collected\")"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "# REMOVE THIS WHEN DONE COLLECTION (WARNING NOT NECESSARILY)\n",
192
+ "viral_dataset['viral'] = viral_dataset.id.isin(viral_tweets_ids.id)\n",
193
+ "\n",
194
+ "#viral_tweets = all_tweets[all_tweets.id.isin(viral_tweets.id)]\n",
195
+ "#viral_tweets\n",
196
+ "\n",
197
+ "len(viral_dataset[viral_dataset.viral])"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "markdown",
202
+ "metadata": {},
203
+ "source": [
204
+ "- merge tweets with user info"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "covid_users.columns"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "metadata": {},
220
+ "outputs": [],
221
+ "source": [
222
+ "user_columns = ['author_id', 'followers_count', 'following_count', 'tweet_count', 'protected', 'verified', 'username']\n",
223
+ "viral_dataset_with_users = viral_dataset.merge(viral_users.rename(columns={'id': 'author_id'})[user_columns], on='author_id')\n",
224
+ "covid_dataset_with_users = original_covid_tweets.merge(covid_users.rename(columns={'id': 'author_id'})[user_columns], on='author_id')"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "markdown",
229
+ "metadata": {},
230
+ "source": [
231
+ "#### 1.1.1 - Correlation between public metrics"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "markdown",
236
+ "metadata": {},
237
+ "source": [
238
+ "- Pearson Correlation between the different public metrics"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": null,
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "public_metrics = ['retweet_count', 'like_count', 'reply_count', 'quote_count', 'followers_count', 'following_count']\n",
248
+ "display(viral_dataset_with_users[public_metrics].corr())\n",
249
+ "display(covid_dataset_with_users[public_metrics].corr())"
250
+ ]
251
+ },
252
+ {
253
+ "cell_type": "code",
254
+ "execution_count": null,
255
+ "metadata": {},
256
+ "outputs": [],
257
+ "source": [
258
+ "px.scatter(viral_dataset, x='like_count', y='retweet_count')"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "markdown",
263
+ "metadata": {},
264
+ "source": [
265
+ "#### 1.1.2 - Exploring retweet count of viral vs non viral tweets"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "markdown",
270
+ "metadata": {},
271
+ "source": [
272
+ "Since we have a large number of tweets to plot, we'll only plot the top 100 tweets from each user"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": null,
278
+ "metadata": {},
279
+ "outputs": [],
280
+ "source": [
281
+ "def get_largest_n(all_tweets, by='retweet_count', n=100):\n",
282
+ " '''Get the largest 100 tweets by retweet count for every user\n",
283
+ " '''\n",
284
+ " top_n_per_user = all_tweets.groupby(by='author_id')[by].nlargest(n=100).reset_index(level=0, drop=True)\n",
285
+ " tweets_for_plot = all_tweets[all_tweets.index.isin(top_n_per_user.index)].reset_index()\n",
286
+ " return tweets_for_plot"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": null,
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": [
295
+ "tweets_plot_df = get_largest_n(viral_dataset, by='retweet_count')\n",
296
+ "fig = px.scatter(tweets_plot_df, x=tweets_plot_df.index, y='retweet_count', color='viral')\n",
297
+ "\n",
298
+ "fig.update_layout(title_text=\"Viral Dataset Scatter plot of the retweet count for the top 100 tweets per user\", xaxis_title=\"Index\", yaxis_title=\"retweet count\")\n",
299
+ "\n",
300
+ "fig.show()"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": null,
306
+ "metadata": {},
307
+ "outputs": [],
308
+ "source": [
309
+ "covid_tweets_plot_df = original_covid_tweets.sort_values(by='retweet_count', ascending=False)[:10000]\n",
310
+ "fig = px.scatter(covid_tweets_plot_df, x=covid_tweets_plot_df.reset_index().index, y='retweet_count')\n",
311
+ "\n",
312
+ "fig.update_layout(title_text=\"Covid Dataset Scatter plot of retweet count sorted by retweet count on a 10000 sample\", xaxis_title=\"Index\", yaxis_title=\"retweet count\")\n",
313
+ "\n",
314
+ "fig.show()"
315
+ ]
316
+ },
317
+ {
318
+ "cell_type": "markdown",
319
+ "metadata": {},
320
+ "source": [
321
+ "**Finding**: Viral tweets identified by Twitter are by no means more viral than other tweets tweeted by the same users. Are users who have tweeted viral tweets (as identified by Twitter) likely to have tweeted other viral tweets?"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": null,
327
+ "metadata": {},
328
+ "outputs": [],
329
+ "source": [
330
+ "# Get the ratio for each tweet's retweet count wrt to the mean retweet count of the user's tweets\n",
331
+ "# Again, since we've retrieved 3200 tweets per user, we're only taking the average over those\n",
332
+ "users_avg_retweets = viral_dataset.groupby(by='author_id').agg(mean_retweets=('retweet_count', 'mean'))\n",
333
+ "tweets_merged_avg_retweets = viral_dataset.merge(right=users_avg_retweets, left_on='author_id', right_index=True)\n",
334
+ "tweets_merged_avg_retweets['ratio_avg_retweets'] = tweets_merged_avg_retweets['retweet_count'] / tweets_merged_avg_retweets['mean_retweets']\n",
335
+ "tweets_merged_avg_retweets_sorted = tweets_merged_avg_retweets.sort_values(by='ratio_avg_retweets').reset_index()"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "execution_count": null,
341
+ "metadata": {},
342
+ "outputs": [],
343
+ "source": [
344
+ "tweets_plot_df = get_largest_n(tweets_merged_avg_retweets_sorted, by='ratio_avg_retweets')\n",
345
+ "\n",
346
+ "fig = px.scatter(tweets_plot_df, x=tweets_plot_df.index, y='ratio_avg_retweets', color='viral')\n",
347
+ "\n",
348
+ "fig.update_layout(title_text=\"Scatter plot of the tweets sorted by the ratio #retweets/(the mean user avg #retweets)\", xaxis_title=\"Index\", yaxis_title=\"ratio\")\n",
349
+ "\n",
350
+ "fig.show()"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "markdown",
355
+ "metadata": {},
356
+ "source": [
357
+ "**Finding**: Cleaner separation. Viral tweets, as expected, are on the other end of the spectrum. However, other tweets in the same range could qualify as viral as well. These tweets should be identified as viral by the Twitter model."
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "markdown",
362
+ "metadata": {},
363
+ "source": [
364
+ "### 1.2 Finding the right threshold for virality"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "markdown",
369
+ "metadata": {},
370
+ "source": [
371
+ "#### 1.2.0 - Relabel viral tweets in the viral dataset by correcting the initial virality threshold (ONLY IN OLD PAPER SUBMITTED BY STUDENT)"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "markdown",
376
+ "metadata": {},
377
+ "source": [
378
+ "Let's observe the retweet count of a user based on the tweet date."
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": null,
384
+ "metadata": {},
385
+ "outputs": [],
386
+ "source": [
387
+ "sample_user = viral_users.id[10]\n",
388
+ "author_tweets = viral_dataset[viral_dataset.author_id == sample_user]\n",
389
+ "fig = px.scatter(author_tweets, x='created_at', y='retweet_count', color='viral')\n",
390
+ "\n",
391
+ "fig.update_layout(title_text=\"Scatter plot of the retweet count wrt to the tweet date for a single user\")\n",
392
+ "\n",
393
+ "fig.show() "
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "markdown",
398
+ "metadata": {},
399
+ "source": [
400
+ "**Finding**: The above graph of a user's retweet count with respect to the tweet date shows that the viral tweets taken from the Twitter \"Viral Tweets\" topic page were captured at certain points in time. **Other tweets with higher retweet counts** may have been on that topic page at different points in time as well. In any case, they **should be qualified as viral all the same**."
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "markdown",
405
+ "metadata": {},
406
+ "source": [
407
+ "One quick fix for that is, for each user, to mark as viral all tweets whose retweet count is at least as high as the lowest retweet count among the viral tweets we scraped for that user. "
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "code",
412
+ "execution_count": null,
413
+ "metadata": {},
414
+ "outputs": [],
415
+ "source": [
416
+ "# Get the minimum retweet count out of the viral tweets for each user\n",
417
+ "min_retweet_count_by_user = viral_dataset[viral_dataset.viral].groupby(by='author_id')[['retweet_count']].min()\n",
418
+ "\n",
419
+ "# Set as viral any tweet that has a retweet count higher or equal to the user's minimum retweet count we just computed\n",
420
+ "viral_dataset_labeled = viral_dataset.merge(min_retweet_count_by_user, left_on='author_id', right_index=True, suffixes=(None, \"_user_viral_threshold\"))\n",
421
+ "viral_dataset_labeled['viral'] = viral_dataset_labeled['retweet_count'] >= viral_dataset_labeled['retweet_count_user_viral_threshold']"
422
+ ]
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "execution_count": null,
427
+ "metadata": {},
428
+ "outputs": [],
429
+ "source": [
430
+ "# Save this result \n",
431
+ "#viral_dataset_labeled.to_parquet(f'{PROCESSED_PATH_VIRAL}/all_tweets.parquet.gzip', compression='gzip')"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "execution_count": null,
437
+ "metadata": {},
438
+ "outputs": [],
439
+ "source": [
440
+ "display(f\"Number of identified viral tweets increased from {len(viral_tweets_ids)} to {len(viral_dataset_labeled[viral_dataset_labeled.viral])}\")"
441
+ ]
442
+ },
443
+ {
444
+ "cell_type": "markdown",
445
+ "metadata": {},
446
+ "source": [
447
+ "Another problem we're facing is that we're **missing historical data** on the number of followers of a user. So we cannot use the metric of:\n",
448
+ "$ \\frac{\\#retweets}{\\#followers}$ effectively. That's why we came up with the other metric: $\\frac{\\#retweets}{mean(\\#retweets)}$."
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "markdown",
453
+ "metadata": {},
454
+ "source": [
455
+ "#### 1.2.1 Applying the virality followers metric to both datasets"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": null,
461
+ "metadata": {},
462
+ "outputs": [],
463
+ "source": [
464
+ "# Applying the first metric on the covid dataset\n",
465
+ "covid_dataset_with_users['virality_followers'] = covid_dataset_with_users['retweet_count'] / covid_dataset_with_users['followers_count'].astype(\"float64\")\n",
466
+ "# Handle division by zero if user has 0 followers\n",
467
+ "covid_dataset_with_users['virality_followers'] = covid_dataset_with_users.virality_followers.replace({np.inf: 0.0})"
468
+ ]
469
+ },
470
+ {
471
+ "cell_type": "code",
472
+ "execution_count": null,
473
+ "metadata": {},
474
+ "outputs": [],
475
+ "source": [
476
+ "len(covid_dataset_with_users[(covid_dataset_with_users['virality_followers'] > 0.8)])"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "code",
481
+ "execution_count": null,
482
+ "metadata": {},
483
+ "outputs": [],
484
+ "source": [
485
+ "# Applying the first metric (retweets / followers) on the viral dataset\n",
486
+ "viral_dataset_with_users['virality_followers'] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['followers_count'].astype(\"float64\")\n",
487
+ "# Handle division by zero if user has 0 followers\n",
488
+ "viral_dataset_with_users['virality_followers'] = viral_dataset_with_users.virality_followers.replace({np.inf: 0.0})"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": null,
494
+ "metadata": {},
495
+ "outputs": [],
496
+ "source": [
497
+ "len(viral_dataset_with_users[(viral_dataset_with_users['virality_followers'] > 1)])"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "markdown",
502
+ "metadata": {},
503
+ "source": [
504
+ "#### 1.2.2 Applying the virality avg retweets metric to viral dataset "
505
+ ]
506
+ },
507
+ {
508
+ "cell_type": "code",
509
+ "execution_count": null,
510
+ "metadata": {},
511
+ "outputs": [],
512
+ "source": [
513
+ "viral_users_retweet_statistics = viral_dataset_with_users.groupby(by='author_id').retweet_count.agg(['min', 'mean', 'max'])\n",
514
+ "viral_users_retweet_statistics = viral_users_retweet_statistics.rename(columns={\"min\": \"min_user_retweets\", \"max\": \"max_user_retweets\", \"mean\": \"mean_user_retweets\"})"
515
+ ]
516
+ },
517
+ {
518
+ "cell_type": "code",
519
+ "execution_count": null,
520
+ "metadata": {},
521
+ "outputs": [],
522
+ "source": [
523
+ "viral_dataset_with_users = viral_dataset_with_users.merge(viral_users_retweet_statistics, on='author_id')"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": null,
529
+ "metadata": {},
530
+ "outputs": [],
531
+ "source": [
532
+ "# Applying the second metric (retweets / mean user retweets) on the viral dataset\n",
533
+ "viral_dataset_with_users['virality_avg_retweets'] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['mean_user_retweets'].astype(\"float64\")\n",
534
+ "# Handle division by zero if the user's mean retweet count is 0\n",
535
+ "viral_dataset_with_users['virality_avg_retweets'] = viral_dataset_with_users.virality_avg_retweets.replace({np.inf: 0.0})"
536
+ ]
537
+ },
538
+ {
539
+ "cell_type": "code",
540
+ "execution_count": null,
541
+ "metadata": {},
542
+ "outputs": [],
543
+ "source": [
544
+ "len(viral_dataset_with_users[(viral_dataset_with_users['virality_avg_retweets'] > 1)])"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "markdown",
549
+ "metadata": {},
550
+ "source": [
551
+ "#### 1.2.3 How many tweets are covered by metric 1?"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "code",
556
+ "execution_count": null,
557
+ "metadata": {},
558
+ "outputs": [],
559
+ "source": [
560
+ "temp = viral_dataset_with_users[viral_dataset_with_users.virality_followers > 0]\n",
561
+ "temp_2 = viral_dataset_with_users[viral_dataset_with_users.virality_avg_retweets > 0]\n",
562
+ "viral_temp = viral_dataset_with_users[viral_dataset_with_users.viral]"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": null,
568
+ "metadata": {},
569
+ "outputs": [],
570
+ "source": [
571
+ "fig = px.ecdf(viral_dataset_with_users[viral_dataset_with_users.viral], x='virality_followers')\n",
572
+ "\n",
573
+ "# TODO: percentage y axis\n",
574
+ "# TODO: Only take the scraped tweets\n",
575
+ "fig.update_layout(title_text=\"Percentage of viral tweets recognized by Metric 1: number of followers\", xaxis_title=\"Metric 1: virality_followers\", yaxis_title=\"Percentage\")\n",
576
+ "\n",
577
+ "fig.show()"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": null,
583
+ "metadata": {},
584
+ "outputs": [],
585
+ "source": [
586
+ "fig1 = sns.displot(temp, x='virality_followers', kind='ecdf')\n",
587
+ "\n",
588
+ "plt.xscale('log')\n",
589
+ "plt.title(\"Proportion of tweets labeled as viral as function of Metric 1: number of followers (logscale)\")"
590
+ ]
591
+ },
592
+ {
593
+ "cell_type": "code",
594
+ "execution_count": null,
595
+ "metadata": {},
596
+ "outputs": [],
597
+ "source": [
598
+ "temp_2 = viral_dataset_with_users[viral_dataset_with_users.virality_avg_retweets > 0]"
599
+ ]
600
+ },
601
+ {
602
+ "cell_type": "code",
603
+ "execution_count": null,
604
+ "metadata": {},
605
+ "outputs": [],
606
+ "source": [
607
+ "fig = px.ecdf(viral_temp, x='virality_avg_retweets')\n",
608
+ "\n",
609
+ "fig.update_layout(title_text=\"Percentage of viral tweets recognized by Metric 2 avg retweets\", xaxis_title=\"Metric 2: avg retweets\", yaxis_title=\"Percentage\")\n",
610
+ "\n",
611
+ "fig.show()"
612
+ ]
613
+ },
614
+ {
615
+ "cell_type": "code",
616
+ "execution_count": null,
617
+ "metadata": {},
618
+ "outputs": [],
619
+ "source": [
620
+ "fig = sns.displot(temp_2, x='virality_avg_retweets', kind='ecdf')\n",
621
+ "\n",
622
+ "plt.xscale('log')\n",
623
+ "plt.title(\"Proportion of tweets labeled as viral as function of Metric 2: avg retweets (logscale)\")"
624
+ ]
625
+ },
626
+ {
627
+ "cell_type": "markdown",
628
+ "metadata": {},
629
+ "source": [
630
+ "TODO: Plot the percentage of viral tweets labeled vs the # of new tweets labeled with respect to the varying threshold of the metric we use. "
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": null,
636
+ "metadata": {},
637
+ "outputs": [],
638
+ "source": [
639
+ "#viral_dataset_with_users = viral_dataset_with_users.groupby(by='virality_followers').count()\n",
640
+ "viral_dataset_with_users = pd.read_parquet(f\"{PROCESSED_PATH_VIRAL}/all_tweets.parquet.gzip\")\n",
641
+ "# Applying the first metric (retweet_count / followers_count) on the viral dataset\n",
642
+ "viral_dataset_with_users['virality_followers'] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['followers_count'].astype(\"float64\")\n",
643
+ "# Handle division by zero if user has 0 followers\n",
644
+ "viral_dataset_with_users['virality_followers'] = viral_dataset_with_users.virality_followers.replace({np.inf: 0.0})\n"
645
+ ]
646
+ },
647
+ {
648
+ "cell_type": "code",
649
+ "execution_count": null,
650
+ "metadata": {},
651
+ "outputs": [],
652
+ "source": [
653
+ "viral_dataset_with_users"
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "code",
658
+ "execution_count": null,
659
+ "metadata": {},
660
+ "outputs": [],
661
+ "source": [
662
+ "viral_dataset_with_users_truncated = viral_dataset_with_users[viral_dataset_with_users.virality_followers > 0.1]\n",
663
+ "#viral_dataset_with_users['viral_metric_1'] = viral_dataset_with_users['']\n",
664
+ "len(viral_dataset_with_users_truncated)"
665
+ ]
666
+ },
667
+ {
668
+ "cell_type": "code",
669
+ "execution_count": null,
670
+ "metadata": {},
671
+ "outputs": [],
672
+ "source": [
673
+ "ready_to_plot = viral_dataset_with_users_truncated.copy()\n",
674
+ "ready_to_plot['viral'] = ready_to_plot['viral'].replace({False: None})\n",
675
+ "ready_to_plot = ready_to_plot.groupby(by='virality_followers').count()[['text', 'viral']].cumsum().rename(columns={'text':'tweets'})"
676
+ ]
677
+ },
678
+ {
679
+ "cell_type": "code",
680
+ "execution_count": null,
681
+ "metadata": {},
682
+ "outputs": [],
683
+ "source": [
684
+ "fig = px.line(ready_to_plot, x='viral', y='tweets', hover_data=[ready_to_plot.index])#, log_y=True)\n",
685
+ "\n",
686
+ "fig.update_layout(title_text=\"Line plot of #viral tweets labeled as viral vs # new tweets labeled as viral by varying threshold of Metric 1 (#followers)\", xaxis_title=\"Number of viral tweets labeled as viral\", yaxis_title=\"Number of new tweets labeled as viral\")\n",
687
+ "fig.show()"
688
+ ]
689
+ },
690
+ {
691
+ "cell_type": "code",
692
+ "execution_count": null,
693
+ "metadata": {},
694
+ "outputs": [],
695
+ "source": [
696
+ "ready_to_plot = viral_dataset_with_users_truncated.copy()\n",
697
+ "ready_to_plot['viral'] = ready_to_plot['viral'].replace({False: None})\n",
698
+ "ready_to_plot = ready_to_plot.groupby(by='virality_followers').count()[['text', 'viral']].cumsum().rename(columns={'text':'tweets'})\n",
699
+ "ready_to_plot['tweets'] = len(viral_dataset_with_users) - ready_to_plot.tweets"
700
+ ]
701
+ },
702
+ {
703
+ "cell_type": "code",
704
+ "execution_count": null,
705
+ "metadata": {},
706
+ "outputs": [],
707
+ "source": [
708
+ "fig = px.line(ready_to_plot, x='viral', y='tweets', hover_data=[ready_to_plot.index])#, log_y=True)\n",
709
+ "\n",
710
+ "fig.update_layout(title_text=\"Line plot of #viral tweets labeled as viral vs # new tweets labeled as viral by varying threshold of Metric 1 (#followers)\", xaxis_title=\"Number of viral tweets labeled as viral\", yaxis_title=\"Number of new tweets labeled as viral\")\n",
711
+ "fig.show()"
712
+ ]
713
+ },
714
+ {
715
+ "cell_type": "code",
716
+ "execution_count": null,
717
+ "metadata": {},
718
+ "outputs": [],
719
+ "source": [
720
+ "'''\n",
721
+ "tempo3 = tempo2.copy()\n",
722
+ "tempo3['viral'] = tempo3['viral'].replace({False: None})\n",
723
+ "tempo3 = tempo3.groupby(by='virality_followers').count()[['text', 'viral']].rename(columns={'text':'tweets'})\n",
724
+ "tempo3['viral_cumsum'] = tempo3.viral.cumsum()\n",
725
+ "tempo3\n",
726
+ "'''"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "code",
731
+ "execution_count": null,
732
+ "metadata": {},
733
+ "outputs": [],
734
+ "source": [
735
+ "min_threshold = viral_dataset_with_users.virality_followers.min()\n",
736
+ "max_threshold = viral_dataset_with_users.virality_followers.max()\n",
737
+ "display(f\"sampling from {min_threshold} to {max_threshold}\")\n",
738
+ "thresholds_space = np.linspace(min_threshold, max_threshold, num=10000)\n",
739
+ "\n",
740
+ "number_of_viral_tweets = len(viral_dataset_with_users[viral_dataset_with_users.viral]) \n",
741
+ "\n",
742
+ "percentages_of_viral_covered = []\n",
743
+ "nb_of_tweets_labeled_as_viral = []\n",
744
+ "\n",
745
+ "for i in thresholds_space:\n",
746
+ " new_tweets_labeled = viral_dataset_with_users[viral_dataset_with_users.virality_followers >= i]\n",
747
+ " percentage_of_viral_covered = len(new_tweets_labeled[new_tweets_labeled.viral]) / number_of_viral_tweets\n",
748
+ " nb_of_tweets_labeled_as_viral.append(len(new_tweets_labeled))\n",
749
+ " percentages_of_viral_covered.append(percentage_of_viral_covered)"
750
+ ]
751
+ },
752
+ {
753
+ "cell_type": "code",
754
+ "execution_count": null,
755
+ "metadata": {},
756
+ "outputs": [],
757
+ "source": [
758
+ "result_to_plot = pd.DataFrame({'percentage_of_viral_covered':percentages_of_viral_covered, 'nb_of_tweets_labeled_as_viral':nb_of_tweets_labeled_as_viral, 'thresholds': thresholds_space})\n",
759
+ "\n",
760
+ "px.scatter(\n",
761
+ " result_to_plot,\n",
762
+ " x='percentage_of_viral_covered',\n",
763
+ " y='nb_of_tweets_labeled_as_viral', log_y=True, hover_name='thresholds')"
764
+ ]
765
+ },
766
+ {
767
+ "cell_type": "code",
768
+ "execution_count": null,
769
+ "metadata": {},
770
+ "outputs": [],
771
+ "source": [
772
+ "result_to_plot.to_csv('new_tweets_labeled_vs_percentage_of_viral.csv', index=False)"
773
+ ]
774
+ },
775
+ {
776
+ "cell_type": "markdown",
777
+ "metadata": {},
778
+ "source": [
779
+ "#### 1.2.4 Comparing several metrics wrt distributions of viral tweets covered"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": null,
785
+ "metadata": {},
786
+ "outputs": [],
787
+ "source": [
788
+ "def plot_distribution_for_metric(\n",
789
+ " df, metric='virality_followers', num_experiments=1000, generate_thresholds_from_viral_quantiles=True, min_threshold=None, max_threshold=None, remove_duplicates=True, output_filename=None):\n",
790
+ " viral_tweets = df[df.viral]\n",
791
+ " number_of_viral_tweets = len(viral_tweets)\n",
792
+ " \n",
793
+ " if not generate_thresholds_from_viral_quantiles: \n",
794
+ " # If not, generate a linear space of the thresholds between min and max of the metric values\n",
795
+ " if not min_threshold:\n",
796
+ " min_threshold = df[metric].min()\n",
797
+ " if not max_threshold:\n",
798
+ " max_threshold = df[metric].max()\n",
799
+ " display(f\"sampling from {min_threshold} to {max_threshold}\")\n",
800
+ " thresholds_space = np.linspace(min_threshold, max_threshold, num=num_experiments)\n",
801
+ " else:\n",
802
+ " # Take quantiles of metric for different percentages of viral tweets covered (from 0 to 100)\n",
803
+ " thresholds_space = viral_tweets[metric].quantile([i / 100 for i in range(101)]) \n",
804
+ " display(f\"sampling from {thresholds_space.min()} to {thresholds_space.max()}\")\n",
805
+ "\n",
806
+ " percentages_of_viral_covered = []\n",
807
+ " nb_of_tweets_labeled_as_viral = []\n",
808
+ "\n",
809
+ " for i in thresholds_space:\n",
810
+ " new_tweets_labeled = df[df[metric] >= i]\n",
811
+ " percentage_of_viral_covered = len(new_tweets_labeled[new_tweets_labeled.viral]) / number_of_viral_tweets\n",
812
+ " nb_of_tweets_labeled_as_viral.append(len(new_tweets_labeled))\n",
813
+ " percentages_of_viral_covered.append(percentage_of_viral_covered)\n",
814
+ " \n",
815
+ " results_to_plot = pd.DataFrame({\n",
816
+ " f'percentage_of_viral_covered_{metric}':percentages_of_viral_covered,\n",
817
+ " f'nb_of_tweets_labeled_as_viral_{metric}':nb_of_tweets_labeled_as_viral,\n",
818
+ " f'thresholds_{metric}': thresholds_space})\n",
819
+ "\n",
820
+ " #if remove_duplicates:\n",
821
+ " # results_to_plot = results_to_plot.sort_values(by='nb_of_tweets_labeled_as_viral').drop_duplicates(subset=['percentage_of_viral_covered'], keep='first')\n",
822
+ "\n",
823
+ " # Discard rows where 100% of viral tweets are covered\n",
824
+ " #results_to_plot = results_to_plot[results_to_plot.percentage_of_viral_covered < 1.0]\n",
825
+ " # TODO: take min of 100% coverage\n",
826
+ "\n",
827
+ " fig = px.scatter(\n",
828
+ " results_to_plot,\n",
829
+ " x=f'percentage_of_viral_covered_{metric}',\n",
830
+ " y=f'nb_of_tweets_labeled_as_viral_{metric}', hover_name=f'thresholds_{metric}')#log_y=True, trendline='ols' \n",
831
+ "\n",
832
+ " fig.update_layout(title_text=f\"Percentage of viral covered vs new tweets labeled as viral according to varying metric {metric}\")\n",
833
+ " fig.show()\n",
834
+ "\n",
835
+ " display(f\"Result length {len(results_to_plot)}\")\n",
836
+ " if not output_filename:\n",
837
+ " output_filename = metric\n",
838
+ " results_to_plot.to_csv(f'{output_filename}_viral_covered_vs_new_tweets_labeled.csv', index=False) \n",
839
+ " \n",
840
+ " return results_to_plot"
841
+ ]
842
+ },
843
+ {
844
+ "cell_type": "code",
845
+ "execution_count": null,
846
+ "metadata": {},
847
+ "outputs": [],
848
+ "source": [
849
+ "#viral_dataset_with_users = viral_dataset_with_users.groupby(by='virality_followers').count()\n",
850
+ "METRIC_1 = 'virality_followers'\n",
851
+ "viral_dataset_with_users = pd.read_parquet(f\"{PROCESSED_PATH_VIRAL}/all_tweets.parquet.gzip\")\n",
852
+ "# Applying the first metric (retweet_count / followers_count) on the viral dataset\n",
853
+ "viral_dataset_with_users[METRIC_1] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['followers_count'].astype(\"float64\")\n",
854
+ "# Handle division by zero if user has 0 followers\n",
855
+ "viral_dataset_with_users[METRIC_1] = viral_dataset_with_users[METRIC_1].replace({np.inf: 0.0})"
856
+ ]
857
+ },
858
+ {
859
+ "cell_type": "code",
860
+ "execution_count": null,
861
+ "metadata": {},
862
+ "outputs": [],
863
+ "source": [
864
+ "df_1 = plot_distribution_for_metric(viral_dataset_with_users, metric='virality_followers', num_experiments=10000)"
865
+ ]
866
+ },
867
+ {
868
+ "cell_type": "code",
869
+ "execution_count": null,
870
+ "metadata": {},
871
+ "outputs": [],
872
+ "source": [
873
+ "# Metric 2: retweet / user avg retweets\n",
874
+ "METRIC_2 = 'virality_avg_retweets'\n",
875
+ "viral_users_retweet_statistics = viral_dataset_with_users.groupby(by='author_id').retweet_count.agg(['min', 'mean', 'max', 'median'])\n",
876
+ "viral_users_retweet_statistics = viral_users_retweet_statistics.rename(columns={\n",
877
+ " \"min\": \"min_user_retweets\", \"max\": \"max_user_retweets\", \"mean\": \"mean_user_retweets\", \"median\": \"median_user_retweets\"})\n",
878
+ "\n",
879
+ "viral_dataset_with_users = viral_dataset_with_users.merge(viral_users_retweet_statistics, on='author_id')\n",
880
+ "\n",
881
+ "viral_dataset_with_users[METRIC_2] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['mean_user_retweets'].astype(\"float64\")\n",
882
+ "# Handle division by zero if the user's mean retweet count is 0\n",
883
+ "viral_dataset_with_users[METRIC_2] = viral_dataset_with_users[METRIC_2].replace({np.inf: 0.0})"
884
+ ]
885
+ },
886
+ {
887
+ "cell_type": "code",
888
+ "execution_count": null,
889
+ "metadata": {},
890
+ "outputs": [],
891
+ "source": [
892
+ "df_2 = plot_distribution_for_metric(viral_dataset_with_users, metric='virality_avg_retweets', num_experiments=10000)"
893
+ ]
894
+ },
895
+ {
896
+ "cell_type": "code",
897
+ "execution_count": null,
898
+ "metadata": {},
899
+ "outputs": [],
900
+ "source": [
901
+ "# Metric 3: Minimum retweet count (Hard threshold)\n",
902
+ "METRIC_3 = 'retweet_count'\n",
903
+ "\n",
904
+ "viral_tweets = viral_dataset_with_users[viral_dataset_with_users.viral]\n",
905
+ "min_viral_retweet_count = viral_tweets.retweet_count.min()\n",
906
+ "max_viral_retweet_count = viral_tweets.retweet_count.max()\n",
907
+ "\n",
908
+ "df_3 = plot_distribution_for_metric(\n",
909
+ " viral_dataset_with_users, metric=METRIC_3, num_experiments=10000,\n",
910
+ " min_threshold=min_viral_retweet_count, max_threshold=max_viral_retweet_count, generate_thresholds_from_viral_quantiles=False,\n",
911
+ " output_filename='hard_threshold')"
912
+ ]
913
+ },
914
+ {
915
+ "cell_type": "code",
916
+ "execution_count": null,
917
+ "metadata": {},
918
+ "outputs": [],
919
+ "source": [
920
+ "# Metric 4 from Maldonado paper 'Virality Prediction for News Tweets Using RoBERTa'\n",
921
+ "def roberta_paper_metric(x):\n",
922
+ " g = x['retweet_count'] + x['like_count']\n",
923
+ " h = x['followers_count'] - x['following_count']\n",
924
+ " A = 10\n",
925
+ "\n",
926
+ " r = max(x['retweet_count'], 1)\n",
927
+ " f = max(x['like_count'], 1)\n",
928
+ " w = max(x['followers_count'], 1)\n",
929
+ " d = max(x['following_count'], 1)\n",
930
+ " h = max(h, 1)\n",
931
+ "\n",
932
+ " num = g * d * (A * r + f)\n",
933
+ " denom = w * r * (A * d + h)\n",
934
+ " #if denom == 0:\n",
935
+ " # return 0\n",
936
+ " return num / denom"
937
+ ]
938
+ },
939
+ {
940
+ "cell_type": "code",
941
+ "execution_count": null,
942
+ "metadata": {},
943
+ "outputs": [],
944
+ "source": [
945
+ "METRIC_4 = 'roberta_paper_metric'\n",
946
+ "viral_dataset_with_users[METRIC_4] = viral_dataset_with_users.apply(lambda x: roberta_paper_metric(x), axis='columns')\n",
947
+ "\n",
948
+ "df_4 = plot_distribution_for_metric(\n",
949
+ " viral_dataset_with_users, metric=METRIC_4, num_experiments=100000)"
950
+ ]
951
+ },
952
+ {
953
+ "cell_type": "code",
954
+ "execution_count": null,
955
+ "metadata": {},
956
+ "outputs": [],
957
+ "source": [
958
+ "METRIC_5 = 'virality_retweet_percentile_per_user'\n",
959
+ "\n",
960
+ "# Take only tweets with positive retweet count, otherwise the quantiles will be very heavy-tailed\n",
961
+ "#tweets_with_retweets = viral_dataset_with_users[viral_dataset_with_users.retweet_count > 0]\n",
962
+ "\n",
963
+ "viral_tweets = viral_dataset_with_users[viral_dataset_with_users.viral]\n",
964
+ "percentiles = [i/100 for i in range(101)]\n",
965
+ "number_of_viral_tweets = len(viral_tweets)\n",
966
+ "\n",
967
+ "percentages_of_viral_covered = []\n",
968
+ "nb_of_tweets_labeled_as_viral = []\n",
969
+ "\n",
970
+ "for i in tqdm(percentiles):\n",
971
+ " temp = viral_dataset_with_users.groupby(by='author_id')[['retweet_count']].quantile(i).rename(columns={'retweet_count': f'percentile_{i}'})\n",
972
+ " temp = viral_dataset_with_users.merge(temp, on='author_id')\n",
973
+ "\n",
974
+ " new_tweets_labeled = temp[temp['retweet_count'] >= temp[f'percentile_{i}']]\n",
975
+ " percentage_of_viral_covered = len(new_tweets_labeled[new_tweets_labeled.viral]) / number_of_viral_tweets\n",
976
+ " nb_of_tweets_labeled_as_viral.append(len(new_tweets_labeled))\n",
977
+ " percentages_of_viral_covered.append(percentage_of_viral_covered)\n",
978
+ "\n",
979
+ "df_5 = pd.DataFrame({\n",
980
+ " f'percentage_of_viral_covered_{METRIC_5}':percentages_of_viral_covered,\n",
981
+ " f'nb_of_tweets_labeled_as_viral_{METRIC_5}':nb_of_tweets_labeled_as_viral,\n",
982
+ " f'thresholds_{METRIC_5}': percentiles})\n",
983
+ "\n",
984
+ "fig = px.scatter(\n",
985
+ " df_5,\n",
986
+ " x=f'percentage_of_viral_covered_{METRIC_5}',\n",
987
+ " y=f'nb_of_tweets_labeled_as_viral_{METRIC_5}', hover_name=f'thresholds_{METRIC_5}')#log_y=True, trendline='ols' \n",
988
+ "\n",
989
+ "fig.update_layout(title_text=f\"Percentage of viral covered vs new tweets labeled as viral according to varying metric {METRIC_5}\")\n",
990
+ "fig.show()\n",
991
+ "\n",
992
+ "display(f\"Result length {len(df_5)}\")\n",
993
+ "df_5.to_csv(f'{METRIC_5}_viral_covered_vs_new_tweets_labeled.csv', index=False)"
994
+ ]
995
+ },
996
+ {
997
+ "cell_type": "code",
998
+ "execution_count": null,
999
+ "metadata": {},
1000
+ "outputs": [],
1001
+ "source": [
1002
+ "# Metric 6: Median\n",
1003
+ "METRIC_6 = 'virality_median_retweets'\n",
1004
+ "\n",
1005
+ "positive_median_dataset = viral_dataset_with_users[viral_dataset_with_users['median_user_retweets'] > 0].copy()\n",
1006
+ "positive_median_dataset.loc[:, METRIC_6] = positive_median_dataset['retweet_count'] / positive_median_dataset['median_user_retweets'].astype(\"float64\")\n",
1007
+ "# Defensive cleanup: replace any inf/NaN ratios with 0 (rows with a median of 0 were already filtered out above)\n",
1008
+ "positive_median_dataset.loc[:, METRIC_6] = positive_median_dataset[METRIC_6].replace({np.inf: 0.0, np.nan:0.0})"
1009
+ ]
1010
+ },
1011
+ {
1012
+ "cell_type": "code",
1013
+ "execution_count": null,
1014
+ "metadata": {},
1015
+ "outputs": [],
1016
+ "source": [
1017
+ "df_6 = plot_distribution_for_metric(\n",
1018
+ " positive_median_dataset, metric=METRIC_6, num_experiments=10000, remove_duplicates=True)"
1019
+ ]
1020
+ },
1021
+ {
1022
+ "cell_type": "code",
1023
+ "execution_count": null,
1024
+ "metadata": {},
1025
+ "outputs": [],
1026
+ "source": [
1027
+ "# log(retweet_counts) / followers_count\n",
1028
+ "METRIC_7 = 'log_retweets_over_followers'\n",
1029
+ "\n",
1030
+ "positive_retweet_and_follower_count = viral_dataset_with_users[(viral_dataset_with_users.retweet_count > 0) & (viral_dataset_with_users.followers_count > 0)].copy()\n",
1031
+ "\n",
1032
+ "positive_retweet_and_follower_count.loc[:, METRIC_7] = (np.log(positive_retweet_and_follower_count['retweet_count']) / positive_retweet_and_follower_count['followers_count']).astype(\"float64\")\n",
1033
+ "positive_retweet_and_follower_count.loc[:, METRIC_7] = positive_retweet_and_follower_count[METRIC_7].replace({np.inf: 0.0, np.nan:0.0})\n",
1034
+ "\n",
1035
+ "df_7 = plot_distribution_for_metric(\n",
1036
+ " positive_retweet_and_follower_count, metric=METRIC_7, num_experiments=10000, remove_duplicates=True)"
1037
+ ]
1038
+ },
1039
+ {
1040
+ "cell_type": "code",
1041
+ "execution_count": null,
1042
+ "metadata": {},
1043
+ "outputs": [],
1044
+ "source": [
1045
+ "METRIC_8 = 'retweets_over_log_followers'\n",
1046
+ "\n",
1047
+ "positive_retweet_and_follower_count.loc[:, METRIC_8] = (positive_retweet_and_follower_count['retweet_count'] / np.log(positive_retweet_and_follower_count['followers_count'])).astype(\"float64\")\n",
1048
+ "positive_retweet_and_follower_count.loc[:, METRIC_8] = positive_retweet_and_follower_count[METRIC_8].replace({np.inf: 0.0, np.nan:0.0})\n",
1049
+ "\n",
1050
+ "df_8 = plot_distribution_for_metric(\n",
1051
+ " positive_retweet_and_follower_count, metric=METRIC_8, num_experiments=10000, remove_duplicates=True)"
1052
+ ]
1053
+ },
1054
+ {
1055
+ "cell_type": "code",
1056
+ "execution_count": null,
1057
+ "metadata": {},
1058
+ "outputs": [],
1059
+ "source": [
1060
+ "METRIC_9 = 'log_retweets_over_log_followers'\n",
1061
+ "\n",
1062
+ "positive_retweet_and_follower_count.loc[:, METRIC_9] = (np.log(positive_retweet_and_follower_count['retweet_count']) / np.log(positive_retweet_and_follower_count['followers_count'])).astype(\"float64\")\n",
1063
+ "positive_retweet_and_follower_count.loc[:, METRIC_9] = positive_retweet_and_follower_count[METRIC_9].replace({np.inf: 0.0, np.nan:0.0})\n",
1064
+ "\n",
1065
+ "df_9 = plot_distribution_for_metric(\n",
1066
+ " positive_retweet_and_follower_count, metric=METRIC_9, num_experiments=10000, remove_duplicates=True)"
1067
+ ]
1068
+ },
1069
+ {
1070
+ "cell_type": "code",
1071
+ "execution_count": null,
1072
+ "metadata": {},
1073
+ "outputs": [],
1074
+ "source": [
1075
+ "final_result = pd.concat([df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9], axis=1)\n",
1076
+ "final_result.to_csv('final_result_viral_coverage.csv')"
1077
+ ]
1078
+ },
1079
+ {
1080
+ "cell_type": "markdown",
1081
+ "metadata": {},
1082
+ "source": [
1083
+ "### 1.3 Viral Dataset Exploration: Comparison between viral and non viral tweets using other features "
1084
+ ]
1085
+ },
1086
+ {
1087
+ "cell_type": "code",
1088
+ "execution_count": null,
1089
+ "metadata": {},
1090
+ "outputs": [],
1091
+ "source": [
1092
+ "# TODO: Only take viral tweets from scraped. Since sentiment is already computed on the other dataset, we relabel dataset viral by checking if in scraped ids \n",
1093
+ "# (DONE)"
1094
+ ]
1095
+ },
1096
+ {
1097
+ "cell_type": "code",
1098
+ "execution_count": null,
1099
+ "metadata": {},
1100
+ "outputs": [],
1101
+ "source": [
1102
+ "viral_dataset_labeled = pd.read_parquet(f'{PROCESSED_PATH_VIRAL}/all_tweets.parquet.gzip')"
1103
+ ]
1104
+ },
1105
+ {
1106
+ "cell_type": "code",
1107
+ "execution_count": null,
1108
+ "metadata": {},
1109
+ "outputs": [],
1110
+ "source": [
1111
+ "display(f\"{len(viral_dataset_labeled[viral_dataset_labeled.viral])} viral tweets out of {len(viral_dataset_labeled)}\")"
1112
+ ]
1113
+ },
1114
+ {
1115
+ "cell_type": "markdown",
1116
+ "metadata": {},
1117
+ "source": [
1118
+ "#### 1.3.1 - Language"
1119
+ ]
1120
+ },
1121
+ {
1122
+ "cell_type": "code",
1123
+ "execution_count": null,
1124
+ "metadata": {},
1125
+ "outputs": [],
1126
+ "source": [
1127
+ "languages_aggregates = viral_dataset_labeled.groupby(by='lang', as_index=False)[['id']].count().rename(columns={'id': 'count'})\n",
1128
+ "languages_aggregates = languages_aggregates.sort_values(by='count', ascending=False)\n",
1129
+ "languages_aggregates.loc[languages_aggregates['count'] < 10000, 'lang'] = 'Other Languages'\n",
1130
+ "fig = px.pie(languages_aggregates, values='count', names='lang', title='Distribution of Tweets languages')\n",
1131
+ "\n",
1132
+ "fig.update_layout(\n",
1133
+ " autosize=False,\n",
1134
+ " width=500,\n",
1135
+ " height=500\n",
1136
+ ")"
1137
+ ]
1138
+ },
1139
+ {
1140
+ "cell_type": "code",
1141
+ "execution_count": null,
1142
+ "metadata": {},
1143
+ "outputs": [],
1144
+ "source": [
1145
+ "pd.crosstab(index = viral_dataset_labeled['lang'] == 'en', columns=viral_dataset_labeled['viral']) "
1146
+ ]
1147
+ },
1148
+ {
1149
+ "cell_type": "markdown",
1150
+ "metadata": {},
1151
+ "source": [
1152
+ "#### 1.3.2 - Media"
1153
+ ]
1154
+ },
1155
+ {
1156
+ "cell_type": "code",
1157
+ "execution_count": null,
1158
+ "metadata": {},
1159
+ "outputs": [],
1160
+ "source": [
1161
+ "# Has media\n",
1162
+ "labels = [\"Media\", \"No Media\"]\n",
1163
+ "viral_has_media = len(viral_dataset_labeled[(viral_dataset_labeled.viral == True) & (viral_dataset_labeled.has_media == True)])\n",
1164
+ "viral_no_media = len(viral_dataset_labeled[(viral_dataset_labeled.viral == True) & (viral_dataset_labeled.has_media == False)])\n",
1165
+ "normal_has_media = len(viral_dataset_labeled[(viral_dataset_labeled.viral == False) & (viral_dataset_labeled.has_media == True)])\n",
1166
+ "normal_no_media = len(viral_dataset_labeled[(viral_dataset_labeled.viral == False) & (viral_dataset_labeled.has_media == False)])\n",
1167
+ "\n",
1168
+ "\n",
1169
+ "# Create subplots: use 'domain' type for Pie subplot\n",
1170
+ "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
1171
+ "fig.add_trace(go.Pie(labels=labels, values=[viral_has_media, viral_no_media], name=\"Viral with Media\"),\n",
1172
+ " 1, 1)\n",
1173
+ "fig.add_trace(go.Pie(labels=labels, values=[normal_has_media, normal_no_media], name=\"Tweet with Media\"),\n",
1174
+ " 1, 2)\n",
1175
+ "\n",
1176
+ "# Use `hole` to create a donut-like pie chart\n",
1177
+ "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
1178
+ "\n",
1179
+ "fig.update_layout(\n",
1180
+ " width=1000,\n",
1181
+ " height=500,\n",
1182
+ " title_text=\"Percentage of tweets with some kind of media\",\n",
1183
+ " # Add annotations in the center of the donut pies.\n",
1184
+ " annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
1185
+ " dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
1186
+ "fig.show()"
1187
+ ]
1188
+ },
1189
+ {
1190
+ "cell_type": "markdown",
1191
+ "metadata": {},
1192
+ "source": [
1193
+ "Calculating the p-value between the target `viral` and `has_media`\n"
1194
+ ]
1195
+ },
1196
+ {
1197
+ "cell_type": "code",
1198
+ "execution_count": null,
1199
+ "metadata": {},
1200
+ "outputs": [],
1201
+ "source": [
1202
+ "from scipy.stats import chi2_contingency \n",
1203
+ "\n",
1204
+ "# Calculating the p-value\n",
1205
+ "contingency_media = pd.crosstab(index = viral_dataset_labeled['has_media'], columns=viral_dataset_labeled['viral']) \n",
1206
+ "display(contingency_media)\n",
1207
+ "# Display with percentages\n",
1208
+ "display(pd.crosstab(index = viral_dataset_labeled['has_media'], columns=viral_dataset_labeled['viral'], normalize='columns') )\n",
1209
+ "\n",
1210
+ "c, p, dof, expected = chi2_contingency(contingency_media) \n",
1211
+ "display(f'p-value {p}')\n",
1212
+ "c, p, dof, expected"
1213
+ ]
1214
+ },
1215
+ {
1216
+ "cell_type": "markdown",
1217
+ "metadata": {},
1218
+ "source": [
1219
+ "**Finding**: Viral tweets are more likely than non-viral tweets to have some kind of media (video, image, GIF, ...) embedded."
1220
+ ]
1221
+ },
1222
+ {
1223
+ "cell_type": "markdown",
1224
+ "metadata": {},
1225
+ "source": [
1226
+ "#### 1.3.2 - Context annotations (Topics)"
1227
+ ]
1228
+ },
1229
+ {
1230
+ "cell_type": "code",
1231
+ "execution_count": null,
1232
+ "metadata": {},
1233
+ "outputs": [],
1234
+ "source": [
1235
+ "viral_tweets_topic_domains = viral_dataset_labeled[viral_dataset_labeled.viral == True] \\\n",
1236
+ " .explode('topic_domains') \\\n",
1237
+ " .dropna(axis=0, subset=['topic_domains']) \\\n",
1238
+ " .topic_domains \n",
1239
+ "\n",
1240
+ "tweets_topic_domains = viral_dataset_labeled[viral_dataset_labeled.viral == False] \\\n",
1241
+ " .explode('topic_domains') \\\n",
1242
+ " .dropna(axis=0, subset=['topic_domains']) \\\n",
1243
+ " .topic_domains\n",
1244
+ "\n",
1245
+ "viral_topics_domains_sorted = viral_tweets_topic_domains.groupby(viral_tweets_topic_domains).count().sort_values(ascending=False)\n",
1246
+ "tweet_topics_domains_sorted = tweets_topic_domains.groupby(tweets_topic_domains).count().sort_values(ascending=False)"
1247
+ ]
1248
+ },
1249
+ {
1250
+ "cell_type": "code",
1251
+ "execution_count": null,
1252
+ "metadata": {},
1253
+ "outputs": [],
1254
+ "source": [
1255
+ "import pickle\n",
1256
+ "\n",
1257
+ "with open(f'{DATA_PATH}/topic_domains.pickle', 'rb') as handle:\n",
1258
+ " topic_domains = pickle.load(handle)\n",
1259
+ "\n",
1260
+ "top_10_viral_topic_domains = viral_topics_domains_sorted[:10]\n",
1261
+ "top_10_tweet_topic_domains = tweet_topics_domains_sorted[:10]\n",
1262
+ "\n",
1263
+ "display(f\"Top 10 topic domains in viral tweets: \\n {[topic_domains.get(x)['name'] for x in top_10_viral_topic_domains.index.values]}\")\n",
1264
+ "display(f\"Top 10 topic domains in general tweets: \\n {[topic_domains.get(x)['name'] for x in top_10_tweet_topic_domains.index.values]}\")"
1265
+ ]
1266
+ },
1267
+ {
1268
+ "cell_type": "code",
1269
+ "execution_count": null,
1270
+ "metadata": {},
1271
+ "outputs": [],
1272
+ "source": [
1273
+ "viral_labels = [topic_domains.get(x)['name'] for x in top_10_viral_topic_domains.index.values]\n",
1274
+ "non_viral_labels = [topic_domains.get(x)['name'] for x in top_10_tweet_topic_domains.index.values]\n",
1275
+ "\n",
1276
+ "# Create subplots: use 'domain' type for Pie subplot\n",
1277
+ "fig2 = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
1278
+ "fig2.add_trace(go.Pie(labels=viral_labels, values=top_10_viral_topic_domains.values, name=\"Viral Tweet Topic domain\"),\n",
1279
+ " 1, 1)\n",
1280
+ "fig2.add_trace(go.Pie(labels=non_viral_labels, values=top_10_tweet_topic_domains.values, name=\"Non-Viral Tweet Topic domain\"),\n",
1281
+ " 1, 2)\n",
1282
+ "\n",
1283
+ "# Use `hole` to create a donut-like pie chart\n",
1284
+ "fig2.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
1285
+ "\n",
1286
+ "fig2.update_layout(\n",
1287
+ " width=1000,\n",
1288
+ " height=500,\n",
1289
+ " title_text=\"Top 10 topic domains for viral vs non-viral tweets\",\n",
1290
+ " # Add annotations in the center of the donut pies.\n",
1291
+ " annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
1292
+ " dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
1293
+ "fig2.show()"
1294
+ ]
1295
+ },
1296
+ {
1297
+ "cell_type": "markdown",
1298
+ "metadata": {},
1299
+ "source": [
1300
+ "#### 1.3.3 - Tweet Length"
1301
+ ]
1302
+ },
1303
+ {
1304
+ "cell_type": "code",
1305
+ "execution_count": null,
1306
+ "metadata": {},
1307
+ "outputs": [],
1308
+ "source": [
1309
+ "viral_dataset_labeled.loc[:, 'tweet_length'] = viral_dataset_labeled.text.apply(len)"
1310
+ ]
1311
+ },
1312
+ {
1313
+ "cell_type": "code",
1314
+ "execution_count": null,
1315
+ "metadata": {},
1316
+ "outputs": [],
1317
+ "source": [
1318
+ "display(viral_dataset_labeled[['tweet_length', 'retweet_count']].corr())\n",
1319
+ "\n",
1320
+ "avg_tweet_length_viral = viral_dataset_labeled[viral_dataset_labeled.viral].tweet_length.mean()\n",
1321
+ "avg_tweet_length_non_viral = viral_dataset_labeled[~viral_dataset_labeled.viral].tweet_length.mean()\n",
1322
+ "\n",
1323
+ "display(f'viral avg tweet length: {avg_tweet_length_viral} \\n non-viral avg tweet length: {avg_tweet_length_non_viral}')"
1324
+ ]
1325
+ },
1326
+ {
1327
+ "cell_type": "markdown",
1328
+ "metadata": {},
1329
+ "source": [
1330
+ "Some tweets are replies to others so **mentions are automatically inserted at the beginning of the tweet**, but they do not count in the Twitter max character count, so we should discard them."
1331
+ ]
1332
+ },
1333
+ {
1334
+ "cell_type": "code",
1335
+ "execution_count": null,
1336
+ "metadata": {},
1337
+ "outputs": [],
1338
+ "source": [
1339
+ "viral_dataset_labeled.loc[:, \"text\"] = viral_dataset_labeled.text.apply(clear_reply_mentions)\n",
1340
+ "viral_dataset_labeled.loc[:, 'tweet_length'] = viral_dataset_labeled.text.apply(len)"
1341
+ ]
1342
+ },
1343
+ {
1344
+ "cell_type": "code",
1345
+ "execution_count": null,
1346
+ "metadata": {},
1347
+ "outputs": [],
1348
+ "source": [
1349
+ "display(viral_dataset_labeled[['tweet_length', 'retweet_count']].corr())\n",
1350
+ "\n",
1351
+ "avg_tweet_length_viral = viral_dataset_labeled[viral_dataset_labeled.viral].tweet_length.mean()\n",
1352
+ "avg_tweet_length_non_viral = viral_dataset_labeled[~viral_dataset_labeled.viral].tweet_length.mean()\n",
1353
+ "\n",
1354
+ "display(f'viral avg tweet length: {avg_tweet_length_viral} \\n non-viral avg tweet length: {avg_tweet_length_non_viral}')"
1355
+ ]
1356
+ },
1357
+ {
1358
+ "cell_type": "markdown",
1359
+ "metadata": {},
1360
+ "source": [
1361
+ "Calculating Welch's t-test (scipy `ttest_ind` with `equal_var=False`) for the continuous variable `tweet_length`"
1362
+ ]
1363
+ },
1364
+ {
1365
+ "cell_type": "code",
1366
+ "execution_count": null,
1367
+ "metadata": {},
1368
+ "outputs": [],
1369
+ "source": [
1370
+ "from scipy.stats import ttest_ind\n",
1371
+ "\n",
1372
+ "ttest_ind(viral_dataset_labeled[viral_dataset_labeled.viral].tweet_length, viral_dataset_labeled[~viral_dataset_labeled.viral].tweet_length, equal_var=False)"
1373
+ ]
1374
+ },
1375
+ {
1376
+ "cell_type": "markdown",
1377
+ "metadata": {},
1378
+ "source": [
1379
+ "#### 1.3.4 - Sentiment "
1380
+ ]
1381
+ },
1382
+ {
1383
+ "cell_type": "markdown",
1384
+ "metadata": {},
1385
+ "source": [
1386
+ "For the sentiment analysis, we used Hugging Face's [default sentiment analysis model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english?text=I+like+you.+I+love+you). We instantiate a Hugging Face pipeline using that default model and pass the tweets' text to it, which outputs a **label** (e.g. POSITIVE, NEGATIVE) alongside a **confidence score**. This will only be applied to English tweets.\n",
1387
+ "\n",
1388
+ "**NOTE**: Feel free to skip the following cells if you already have the processed data. Sentiment analysis takes some time (around 2 hours on the whole data). "
1389
+ ]
1390
+ },
1391
+ {
1392
+ "cell_type": "code",
1393
+ "execution_count": null,
1394
+ "metadata": {},
1395
+ "outputs": [],
1396
+ "source": [
1397
+ "from transformers import pipeline\n",
1398
+ "\n",
1399
+ "# device=0 means the pipeline will run on the CUDA device at index 0\n",
1400
+ "sentiment_classifier = pipeline(\"sentiment-analysis\", device=0)"
1401
+ ]
1402
+ },
1403
+ {
1404
+ "cell_type": "markdown",
1405
+ "metadata": {},
1406
+ "source": [
1407
+ "This will only be applied to **English tweets**. All the viral tweets we scraped are in English, so we won't be losing viral data when filtering."
1408
+ ]
1409
+ },
1410
+ {
1411
+ "cell_type": "code",
1412
+ "execution_count": null,
1413
+ "metadata": {},
1414
+ "outputs": [],
1415
+ "source": [
1416
+ "english_viral_dataset = viral_dataset_labeled[viral_dataset_labeled.lang == 'en']\n",
1417
+ "english_viral_dataset"
1418
+ ]
1419
+ },
1420
+ {
1421
+ "cell_type": "markdown",
1422
+ "metadata": {},
1423
+ "source": [
1424
+ "Here we use the pandas `apply` function with `result_type='expand'`, so that the sentiment label and score are output into separate columns."
1425
+ ]
1426
+ },
1427
+ {
1428
+ "cell_type": "code",
1429
+ "execution_count": null,
1430
+ "metadata": {},
1431
+ "outputs": [],
1432
+ "source": [
1433
+ "applied = english_viral_dataset.apply(lambda x: sentiment_classifier(x.text)[0], axis=1, result_type='expand')\n",
1434
+ "#pd.concat([small_test_set, applied], axis='columns')\n",
1435
+ "applied"
1436
+ ]
1437
+ },
1438
+ {
1439
+ "cell_type": "code",
1440
+ "execution_count": null,
1441
+ "metadata": {},
1442
+ "outputs": [],
1443
+ "source": [
1444
+ "sentiment_features = pd.concat([english_viral_dataset, applied], axis=1)\n",
1445
+ "sentiment_features"
1446
+ ]
1447
+ },
1448
+ {
1449
+ "cell_type": "code",
1450
+ "execution_count": null,
1451
+ "metadata": {},
1452
+ "outputs": [],
1453
+ "source": [
1454
+ "sentiment_features = sentiment_features.rename(columns={\"label\": \"sentiment\", \"score\": \"sentiment_score\"})"
1455
+ ]
1456
+ },
1457
+ {
1458
+ "cell_type": "code",
1459
+ "execution_count": null,
1460
+ "metadata": {},
1461
+ "outputs": [],
1462
+ "source": [
1463
+ "sentiment_features.to_parquet(f\"{PROCESSED_PATH_VIRAL}/all_english_tweets_with_users_with_sentiment.parquet.gzip\", index=False, compression=\"gzip\")"
1464
+ ]
1465
+ },
1466
+ {
1467
+ "cell_type": "markdown",
1468
+ "metadata": {},
1469
+ "source": [
1470
+ "Load the already-processed data"
1471
+ ]
1472
+ },
1473
+ {
1474
+ "cell_type": "code",
1475
+ "execution_count": null,
1476
+ "metadata": {},
1477
+ "outputs": [],
1478
+ "source": [
1479
+ "sentiment_features = pd.read_parquet(f\"{PROCESSED_PATH_VIRAL}/all_english_tweets_with_users_with_sentiment.parquet.gzip\")\n",
1480
+ "display(f\"{len(sentiment_features[sentiment_features.viral])} viral tweets out of {len(sentiment_features)}\")"
1481
+ ]
1482
+ },
1483
+ {
1484
+ "cell_type": "code",
1485
+ "execution_count": null,
1486
+ "metadata": {},
1487
+ "outputs": [],
1488
+ "source": [
1489
+ "# Tweets with sentiment scores over 70%\n",
1490
+ "display(f\"Tweets with sentiment analysis confidence scores above 0.7: {len(sentiment_features[sentiment_features.sentiment_score > 0.7])}\")\n",
1491
+ "display(f\"{len(sentiment_features[sentiment_features.sentiment == 'POSITIVE'])} positive tweets\")\n",
1492
+ "display(f\"{len(sentiment_features[sentiment_features.sentiment == 'NEGATIVE'])} negative tweets\")\n",
1493
+ "\n",
1494
+ "# Take an explicit copy so that later column assignments do not write into a slice of sentiment_features\n",
+ "confident_sentiment_tweets = sentiment_features[sentiment_features.sentiment_score > 0.7].copy()"
1495
+ ]
1496
+ },
1497
+ {
1498
+ "cell_type": "code",
1499
+ "execution_count": null,
1500
+ "metadata": {},
1501
+ "outputs": [],
1502
+ "source": [
1503
+ "# Optional (currently disabled): keep only retweeted tweets to filter out tweets with zero retweets, which have little utility.\n",
1504
+ "#retweeted_tweets = confident_sentiment_tweets[confident_sentiment_tweets.retweet_count > 0]\n",
1505
+ "\n",
1506
+ "labels = [\"Positive\", \"Negative\"]\n",
1507
+ "viral_positive = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == True) & (confident_sentiment_tweets.sentiment == 'POSITIVE')])\n",
1508
+ "viral_negative = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == True) & (confident_sentiment_tweets.sentiment == 'NEGATIVE')])\n",
1509
+ "normal_positive = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == False) & (confident_sentiment_tweets.sentiment == 'POSITIVE')])\n",
1510
+ "normal_negative = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == False) & (confident_sentiment_tweets.sentiment == 'NEGATIVE')])\n",
1511
+ "\n",
1512
+ "\n",
1513
+ "# Create subplots: use 'domain' type for Pie subplot\n",
1514
+ "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
1515
+ "fig.add_trace(go.Pie(labels=labels, values=[viral_positive, viral_negative], name=\"Positive Viral Tweets\"),\n",
1516
+ " 1, 1)\n",
1517
+ "fig.add_trace(go.Pie(labels=labels, values=[normal_positive, normal_negative], name=\"Positive Non-Viral Tweets\"),\n",
1518
+ " 1, 2)\n",
1519
+ "\n",
1520
+ "# Use `hole` to create a donut-like pie chart\n",
1521
+ "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
1522
+ "\n",
1523
+ "fig.update_layout(\n",
1524
+ " width=1000,\n",
1525
+ " height=500,\n",
1526
+ " title_text=\"Distribution of positive and negative sentiment in viral vs non-viral tweets\",\n",
1527
+ " # Add annotations in the center of the donut pies.\n",
1528
+ " annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
1529
+ " dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
1530
+ "fig.show()"
1531
+ ]
1532
+ },
1533
+ {
1534
+ "cell_type": "markdown",
1535
+ "metadata": {},
1536
+ "source": [
1537
+ "Calculating the p-value between the target `viral` and positive sentiment\n"
1538
+ ]
1539
+ },
1540
+ {
1541
+ "cell_type": "code",
1542
+ "execution_count": null,
1543
+ "metadata": {},
1544
+ "outputs": [],
1545
+ "source": [
1546
+ "from scipy.stats import chi2_contingency \n",
1547
+ "\n",
1548
+ "#confident_sentiment_tweets.loc[:, 'is_positive'] = confident_sentiment_tweets.sentiment == 'POSITIVE'\n",
1549
+ "\n",
1550
+ "# Calculating the p-value\n",
1551
+ "contingency_sentiment = pd.crosstab(index = confident_sentiment_tweets['sentiment'], columns=confident_sentiment_tweets['viral']) \n",
1552
+ "# Display with percentages\n",
1553
+ "contingency_sentiment_normalized_percentage = pd.crosstab(\n",
1554
+ " index = confident_sentiment_tweets['sentiment'], columns=confident_sentiment_tweets['viral'], normalize='columns') \n",
1555
+ "display(contingency_sentiment_normalized_percentage)\n",
1556
+ "\n",
1557
+ "c, p, dof, expected = chi2_contingency(contingency_sentiment) \n",
1558
+ "display(f'p-value {p}')\n",
1559
+ "c, p, dof, expected"
1560
+ ]
1561
+ },
1562
+ {
1563
+ "cell_type": "markdown",
1564
+ "metadata": {},
1565
+ "source": [
1566
+ "Calculating the p-value between the target `viral` and negative sentiment\n"
1567
+ ]
1568
+ },
1569
+ {
1570
+ "cell_type": "code",
1571
+ "execution_count": null,
1572
+ "metadata": {},
1573
+ "outputs": [],
1574
+ "source": [
1575
+ "from scipy.stats import chi2_contingency \n",
1576
+ "\n",
1577
+ "confident_sentiment_tweets.loc[:, 'is_negative'] = confident_sentiment_tweets.sentiment == 'NEGATIVE'\n",
1578
+ "\n",
1579
+ "# Calculating the p-value\n",
1580
+ "contingency_negative_sentiment = pd.crosstab(index = confident_sentiment_tweets['is_negative'], columns=confident_sentiment_tweets['viral']) \n",
1581
+ "# Display with percentages\n",
1582
+ "contingency_negative_sentiment_normalized_percentage = pd.crosstab(\n",
1583
+ " index = confident_sentiment_tweets['is_negative'], columns=confident_sentiment_tweets['viral'], normalize='columns') \n",
1584
+ "display(contingency_negative_sentiment_normalized_percentage)\n",
1585
+ "\n",
1586
+ "c, p, dof, expected = chi2_contingency(contingency_negative_sentiment) \n",
1587
+ "display(f'p-value {p}')\n",
1588
+ "c, p, dof, expected"
1589
+ ]
1590
+ },
1591
+ {
1592
+ "cell_type": "code",
1593
+ "execution_count": null,
1594
+ "metadata": {},
1595
+ "outputs": [],
1596
+ "source": [
1597
+ "'''\n",
1598
+ "import spacy\n",
1599
+ "import vaderSentiment\n",
1600
+ "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
1601
+ "\n",
1602
+ "nlp = spacy.load(\"en_core_web_sm\")\n",
1603
+ "\n",
1604
+ "spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS\n",
1605
+ "print('Number of stop words: %d' % len(spacy_stopwords))\n",
1606
+ "print('First ten stop words:',list(spacy_stopwords)[:10])\n",
1607
+ "'''"
1608
+ ]
1609
+ },
1610
+ {
1611
+ "cell_type": "code",
1612
+ "execution_count": null,
1613
+ "metadata": {},
1614
+ "outputs": [],
1615
+ "source": [
1616
+ "'''\n",
1617
+ "# Remove new lines \n",
1618
+ "remove_new_lines = lambda x: \" \".join(x.split())\n",
1619
+ "viral_dataset_labeled['processed_text'] = viral_dataset_labeled['text'].apply(remove_new_lines)\n",
1620
+ "\n",
1621
+ "\n",
1622
+ "english_tweets = viral_dataset_labeled[viral_dataset_labeled.lang == 'en']\n",
1623
+ "'''"
1624
+ ]
1625
+ },
1626
+ {
1627
+ "cell_type": "markdown",
1628
+ "metadata": {},
1629
+ "source": [
1630
+ "#### 1.3.5 - Number of hashtags "
1631
+ ]
1632
+ },
1633
+ {
1634
+ "cell_type": "code",
1635
+ "execution_count": null,
1636
+ "metadata": {},
1637
+ "outputs": [],
1638
+ "source": [
1639
+ "viral_dataset_labeled.loc[:, \"nb_of_hashtags\"] = viral_dataset_labeled.hashtags.apply(lambda x: len(x) if np.all(x) else 0)"
1640
+ ]
1641
+ },
1642
+ {
1643
+ "cell_type": "code",
1644
+ "execution_count": null,
1645
+ "metadata": {},
1646
+ "outputs": [],
1647
+ "source": [
1648
+ "labels = [\"Hashtags\", \"No Hashtags\"]\n",
1649
+ "viral_has_hashtags = len(viral_dataset_labeled[(viral_dataset_labeled.viral) & (viral_dataset_labeled.nb_of_hashtags >= 1)])\n",
1650
+ "viral_no_hashtags = len(viral_dataset_labeled[(viral_dataset_labeled.viral) & (viral_dataset_labeled.nb_of_hashtags == 0)])\n",
1651
+ "normal_has_hashtags = len(viral_dataset_labeled[(~viral_dataset_labeled.viral) & (viral_dataset_labeled.nb_of_hashtags >= 1)])\n",
1652
+ "normal_no_hashtags = len(viral_dataset_labeled[(~viral_dataset_labeled.viral) & (viral_dataset_labeled.nb_of_hashtags == 0)])\n",
1653
+ "\n",
1654
+ "\n",
1655
+ "# Create subplots: use 'domain' type for Pie subplot\n",
1656
+ "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
1657
+ "fig.add_trace(go.Pie(labels=labels, values=[viral_has_hashtags, viral_no_hashtags], name=\"Viral with Hashtags\"),\n",
1658
+ " 1, 1)\n",
1659
+ "fig.add_trace(go.Pie(labels=labels, values=[normal_has_hashtags, normal_no_hashtags], name=\"Tweet with No Hashtags\"),\n",
1660
+ " 1, 2)\n",
1661
+ "\n",
1662
+ "# Use `hole` to create a donut-like pie chart\n",
1663
+ "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
1664
+ "\n",
1665
+ "fig.update_layout(\n",
1666
+ " width=1000,\n",
1667
+ " height=500,\n",
1668
+ " title_text=\"Percentage of tweets with hashtags\",\n",
1669
+ " # Add annotations in the center of the donut pies.\n",
1670
+ " annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
1671
+ " dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
1672
+ "fig.show()"
1673
+ ]
1674
+ },
1675
+ {
1676
+ "cell_type": "markdown",
1677
+ "metadata": {},
1678
+ "source": [
1679
+ "Calculating the p-value between the target `viral` and `has_hashtags`\n"
1680
+ ]
1681
+ },
1682
+ {
1683
+ "cell_type": "code",
1684
+ "execution_count": null,
1685
+ "metadata": {},
1686
+ "outputs": [],
1687
+ "source": [
1688
+ "from scipy.stats import chi2_contingency \n",
1689
+ "\n",
1690
+ "viral_dataset_labeled['has_hashtags'] = viral_dataset_labeled.nb_of_hashtags >= 1\n",
1691
+ "\n",
1692
+ "# Calculating the p-value\n",
1693
+ "contingency_has_hashtags = pd.crosstab(index = viral_dataset_labeled['has_hashtags'], columns=viral_dataset_labeled['viral']) \n",
1694
+ "# Display with percentages\n",
1695
+ "contingency_has_hashtags_normalized_percentage = pd.crosstab(\n",
1696
+ " index = viral_dataset_labeled['has_hashtags'], columns=viral_dataset_labeled['viral'], normalize='columns') \n",
1697
+ "display(contingency_has_hashtags_normalized_percentage)\n",
1698
+ "\n",
1699
+ "c, p, dof, expected = chi2_contingency(contingency_has_hashtags) \n",
1700
+ "display(f'p-value {p}')\n",
1701
+ "c, p, dof, expected"
1702
+ ]
1703
+ },
1704
+ {
1705
+ "cell_type": "markdown",
1706
+ "metadata": {},
1707
+ "source": [
1708
+ "#### 1.3.6 - Verified account"
1709
+ ]
1710
+ },
1711
+ {
1712
+ "cell_type": "code",
1713
+ "execution_count": null,
1714
+ "metadata": {},
1715
+ "outputs": [],
1716
+ "source": [
1717
+ "# Verified account\n",
1718
+ "labels = [\"Verified\", \"Not verified\"]\n",
1719
+ "viral_is_verified = len(viral_dataset_labeled[(viral_dataset_labeled.viral) & (viral_dataset_labeled.verified)])\n",
1720
+ "viral_not_verified = len(viral_dataset_labeled[(viral_dataset_labeled.viral) & (~viral_dataset_labeled.verified)])\n",
1721
+ "normal_is_verified = len(viral_dataset_labeled[(~viral_dataset_labeled.viral) & (viral_dataset_labeled.verified)])\n",
1722
+ "normal_not_verified = len(viral_dataset_labeled[(~viral_dataset_labeled.viral) & (~viral_dataset_labeled.verified)])\n",
1723
+ "\n",
1724
+ "\n",
1725
+ "# Create subplots: use 'domain' type for Pie subplot\n",
1726
+ "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
1727
+ "fig.add_trace(go.Pie(labels=labels, values=[viral_is_verified, viral_not_verified], name=\"Viral with verified accounts\"),\n",
1728
+ " 1, 1)\n",
1729
+ "fig.add_trace(go.Pie(labels=labels, values=[normal_is_verified, normal_not_verified], name=\"Tweet with an unverified account\"),\n",
1730
+ " 1, 2)\n",
1731
+ "\n",
1732
+ "# Use `hole` to create a donut-like pie chart\n",
1733
+ "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
1734
+ "\n",
1735
+ "fig.update_layout(\n",
1736
+ " width=1000,\n",
1737
+ " height=500,\n",
1738
+ " title_text=\"Percentage of tweets from verified accounts\",\n",
1739
+ " # Add annotations in the center of the donut pies.\n",
1740
+ " annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
1741
+ " dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
1742
+ "fig.show()"
1743
+ ]
1744
+ },
1745
+ {
1746
+ "cell_type": "markdown",
1747
+ "metadata": {},
1748
+ "source": [
1749
+ "Calculating the p-value between the target `viral` and `is_verified`\n"
1750
+ ]
1751
+ },
1752
+ {
1753
+ "cell_type": "code",
1754
+ "execution_count": null,
1755
+ "metadata": {},
1756
+ "outputs": [],
1757
+ "source": [
1758
+ "from scipy.stats import chi2_contingency \n",
1759
+ "\n",
1760
+ "# Calculating the p-value\n",
1761
+ "contingency_verified = pd.crosstab(index = viral_dataset_labeled['verified'], columns=viral_dataset_labeled['viral']) \n",
1762
+ "# Display with percentages\n",
1763
+ "contingency_verified_normalized_percentage = pd.crosstab(\n",
1764
+ " index = viral_dataset_labeled['verified'], columns=viral_dataset_labeled['viral'], normalize='columns') \n",
1765
+ "display(contingency_verified_normalized_percentage)\n",
1766
+ "\n",
1767
+ "c, p, dof, expected = chi2_contingency(contingency_verified) \n",
1768
+ "display(f'p-value {p}')\n",
1769
+ "c, p, dof, expected"
1770
+ ]
1771
+ },
1772
+ {
1773
+ "cell_type": "markdown",
1774
+ "metadata": {},
1775
+ "source": [
1776
+ "#### 1.3.7 - Has mentions"
1777
+ ]
1778
+ },
1779
+ {
1780
+ "cell_type": "code",
1781
+ "execution_count": null,
1782
+ "metadata": {},
1783
+ "outputs": [],
1784
+ "source": [
1785
+ "viral_dataset_labeled.loc[:, \"nb_of_mentions\"] = viral_dataset_labeled.mentions.apply(lambda x: len(x) if np.all(x) else 0)"
1786
+ ]
1787
+ },
1788
+ {
1789
+ "cell_type": "code",
1790
+ "execution_count": null,
1791
+ "metadata": {},
1792
+ "outputs": [],
1793
+ "source": [
1794
+ "from scipy.stats import chi2_contingency \n",
1795
+ "\n",
1796
+ "# Calculating the p-value\n",
1797
+ "contingency_has_mentions = pd.crosstab(index = viral_dataset_labeled['nb_of_mentions'] > 0, columns=viral_dataset_labeled['viral']) \n",
1798
+ "display(contingency_has_mentions)\n",
1799
+ "# Display with percentages\n",
1800
+ "display(pd.crosstab(index = viral_dataset_labeled['nb_of_mentions'] > 0, columns=viral_dataset_labeled['viral'], normalize='columns') )\n",
1801
+ "\n",
1802
+ "c, p, dof, expected = chi2_contingency(contingency_has_mentions) \n",
1803
+ "display(f'p-value {p}')\n",
1804
+ "c, p, dof, expected"
1805
+ ]
1806
+ },
1807
+ {
1808
+ "cell_type": "markdown",
1809
+ "metadata": {},
1810
+ "source": [
1811
+ "#### 1.3.8 - Save result of preprocessing to disk"
1812
+ ]
1813
+ },
1814
+ {
1815
+ "cell_type": "code",
1816
+ "execution_count": null,
1817
+ "metadata": {},
1818
+ "outputs": [],
1819
+ "source": [
1820
+ "viral_dataset_labeled.to_parquet(f'{PROCESSED_PATH_VIRAL}/all_english_tweets_with_users_with_sentiment.parquet.gzip', index=False, compression=\"gzip\")"
1821
+ ]
1822
+ },
1823
+ {
1824
+ "cell_type": "markdown",
1825
+ "metadata": {},
1826
+ "source": []
1827
+ },
1828
+ {
1829
+ "cell_type": "code",
1830
+ "execution_count": null,
1831
+ "metadata": {},
1832
+ "outputs": [],
1833
+ "source": [
1834
+ "viral_dataset_labeled.columns\n"
1835
+ ]
1836
+ },
1837
+ {
1838
+ "cell_type": "markdown",
1839
+ "metadata": {},
1840
+ "source": [
1841
+ "### 1.4 - Covid dataset Exploration"
1842
+ ]
1843
+ },
1844
+ {
1845
+ "cell_type": "markdown",
1846
+ "metadata": {},
1847
+ "source": [
1848
+ "Here we concern ourselves only with original tweets (no retweets)."
1849
+ ]
1850
+ },
1851
+ {
1852
+ "cell_type": "code",
1853
+ "execution_count": null,
1854
+ "metadata": {},
1855
+ "outputs": [],
1856
+ "source": [
1857
+ "original_covid_tweets = pd.read_parquet(f\"{COVID_TWEETS_PATH}/all_original_tweets.parquet.gzip\")\n",
1858
+ "original_covid_tweets.loc[:, \"text\"] = original_covid_tweets.text.apply(clear_reply_mentions)\n",
1859
+ "\n",
1860
+ "covid_users = pd.read_parquet(f\"{COVID_TWEETS_PATH}/users.parquet.gzip\")\n",
1861
+ "\n",
1862
+ "display(\"--- COVID DATASET ---\")\n",
1863
+ "\n",
1864
+ "display(f\"{len(original_covid_tweets)} original (not retweeted) covid tweets collected\")\n",
1865
+ "display(f\"{len(original_covid_tweets.author_id.unique())} covid users collected\")\n",
1866
+ "\n",
1867
+ "original_covid_tweets"
1868
+ ]
1869
+ },
1870
+ {
1871
+ "cell_type": "code",
1872
+ "execution_count": null,
1873
+ "metadata": {},
1874
+ "outputs": [],
1875
+ "source": [
1876
+ "user_columns = ['author_id', 'followers_count', 'following_count', 'tweet_count', 'protected', 'verified', 'username']\n",
1877
+ "covid_dataset_with_users = original_covid_tweets.merge(covid_users.rename(columns={'id': 'author_id'})[user_columns], on='author_id')"
1878
+ ]
1879
+ },
1880
+ {
1881
+ "cell_type": "code",
1882
+ "execution_count": null,
1883
+ "metadata": {},
1884
+ "outputs": [],
1885
+ "source": [
1886
+ "# Applying the first metric on the covid dataset\n",
1887
+ "covid_dataset_with_users['virality_followers'] = covid_dataset_with_users['retweet_count'] / covid_dataset_with_users['followers_count'].astype(\"float64\")\n",
1888
+ "# Handle division by zero if user has 0 followers: n/0 yields inf and 0/0 yields NaN, both are mapped to 0.0\n",
1888
+ "covid_dataset_with_users['virality_followers'] = covid_dataset_with_users.virality_followers.replace([np.inf, np.nan], 0.0)"
1890
+ ]
1891
+ },
1892
+ {
1893
+ "cell_type": "code",
1894
+ "execution_count": null,
1895
+ "metadata": {},
1896
+ "outputs": [],
1897
+ "source": [
1898
+ "covid_dataset_with_users"
1899
+ ]
1900
+ },
1901
+ {
1902
+ "cell_type": "code",
1903
+ "execution_count": null,
1904
+ "metadata": {},
1905
+ "outputs": [],
1906
+ "source": [
1907
+ "px.histogram(covid_dataset_with_users, x='followers_count', y = 'virality_followers', log_y=True)"
1908
+ ]
1909
+ },
1910
+ {
1911
+ "cell_type": "code",
1912
+ "execution_count": null,
1913
+ "metadata": {},
1914
+ "outputs": [],
1915
+ "source": [
1916
+ "covid_dataset_with_users['viral'] = covid_dataset_with_users.virality_followers > 1\n",
1917
+ "covid_dataset_with_users[covid_dataset_with_users.viral]"
1918
+ ]
1919
+ },
1920
+ {
1921
+ "cell_type": "markdown",
1922
+ "metadata": {},
1923
+ "source": [
1924
+ "### 1.4.1 - Language"
1925
+ ]
1926
+ },
1927
+ {
1928
+ "cell_type": "code",
1929
+ "execution_count": null,
1930
+ "metadata": {},
1931
+ "outputs": [],
1932
+ "source": [
1933
+ "languages_aggregates = covid_dataset_with_users.groupby(by='lang', as_index=False)[['id']].count().rename(columns={'id': 'count'})\n",
1934
+ "languages_aggregates = languages_aggregates.sort_values(by='count', ascending=False)\n",
1935
+ "languages_aggregates.loc[languages_aggregates['count'] < 10000, 'lang'] = 'Other Languages'\n",
1936
+ "fig = px.pie(languages_aggregates, values='count', names='lang', title='Distribution of Tweets languages')\n",
1937
+ "\n",
1938
+ "fig.update_layout(\n",
1939
+ " autosize=False,\n",
1940
+ " width=500,\n",
1941
+ " height=500\n",
1942
+ ")"
1943
+ ]
1944
+ },
1945
+ {
1946
+ "cell_type": "code",
1947
+ "execution_count": null,
1948
+ "metadata": {},
1949
+ "outputs": [],
1950
+ "source": [
1951
+ "english_covid_tweets = covid_dataset_with_users[covid_dataset_with_users.lang == 'en']\n",
1952
+ "display(f\"{len(english_covid_tweets)} english covid tweets\")\n",
1953
+ "\n",
1954
+ "english_viral_covid_tweets = english_covid_tweets[english_covid_tweets.viral]\n",
1955
+ "display(f\"{len(english_viral_covid_tweets)} viral english covid tweets\")"
1956
+ ]
1957
+ },
1958
+ {
1959
+ "cell_type": "markdown",
1960
+ "metadata": {},
1961
+ "source": [
1962
+ "### 1.4.2 - Media"
1963
+ ]
1964
+ },
1965
+ {
1966
+ "cell_type": "code",
1967
+ "execution_count": null,
1968
+ "metadata": {},
1969
+ "outputs": [],
1970
+ "source": [
1971
+ "# Has media\n",
1972
+ "labels = [\"Media\", \"No Media\"]\n",
1973
+ "viral_has_media = len(covid_dataset_with_users[(covid_dataset_with_users.viral == True) & (covid_dataset_with_users.has_media == True)])\n",
1974
+ "viral_no_media = len(covid_dataset_with_users[(covid_dataset_with_users.viral == True) & (covid_dataset_with_users.has_media == False)])\n",
1975
+ "normal_has_media = len(covid_dataset_with_users[(covid_dataset_with_users.viral == False) & (covid_dataset_with_users.has_media == True)])\n",
1976
+ "normal_no_media = len(covid_dataset_with_users[(covid_dataset_with_users.viral == False) & (covid_dataset_with_users.has_media == False)])\n",
1977
+ "\n",
1978
+ "\n",
1979
+ "# Create subplots: use 'domain' type for Pie subplot\n",
1980
+ "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
1981
+ "fig.add_trace(go.Pie(labels=labels, values=[viral_has_media, viral_no_media], name=\"Viral with Media\"),\n",
1982
+ " 1, 1)\n",
1983
+ "fig.add_trace(go.Pie(labels=labels, values=[normal_has_media, normal_no_media], name=\"Tweet with Media\"),\n",
1984
+ " 1, 2)\n",
1985
+ "\n",
1986
+ "# Use `hole` to create a donut-like pie chart\n",
1987
+ "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
1988
+ "\n",
1989
+ "fig.update_layout(\n",
1990
+ " width=1000,\n",
1991
+ " height=500,\n",
1992
+ " title_text=\"Percentage of tweets with some kind of media\",\n",
1993
+ " # Add annotations in the center of the donut pies.\n",
1994
+ " annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
1995
+ " dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
1996
+ "fig.show()"
1997
+ ]
1998
+ },
1999
+ {
2000
+ "cell_type": "markdown",
2001
+ "metadata": {},
2002
+ "source": [
2003
+ "### 1.4.3 - Tweet Length"
2004
+ ]
2005
+ },
2006
+ {
2007
+ "cell_type": "code",
2008
+ "execution_count": null,
2009
+ "metadata": {},
2010
+ "outputs": [],
2011
+ "source": [
2012
+ "covid_dataset_with_users.loc[:, 'tweet_length'] = covid_dataset_with_users.text.apply(len)\n",
2013
+ "covid_dataset_with_users[['tweet_length', 'retweet_count']].corr()"
2014
+ ]
2015
+ },
2016
+ {
2017
+ "cell_type": "markdown",
2018
+ "metadata": {},
2019
+ "source": [
2020
+ "### 1.4.4 - Sentiment"
2021
+ ]
2022
+ },
2023
+ {
2024
+ "cell_type": "code",
2025
+ "execution_count": null,
2026
+ "metadata": {},
2027
+ "outputs": [],
2028
+ "source": [
2029
+ "from transformers import pipeline\n",
2030
+ "\n",
2031
+ "# Device = 0 means it will use the Cuda at index 0\n",
2032
+ "sentiment_classifier = pipeline(\"sentiment-analysis\", device=0)\n",
2033
+ "\n",
2034
+ "english_covid_dataset = covid_dataset_with_users[covid_dataset_with_users.lang == 'en']\n",
2035
+ "english_covid_dataset"
2036
+ ]
2037
+ },
2038
+ {
2039
+ "cell_type": "markdown",
2040
+ "metadata": {},
2041
+ "source": [
2042
+ "Here we compute the sentiments again, this time for the COVID dataset. Because this is slow, we have already computed them and saved the result to parquet. Feel free to skip the next 2 cells and load the saved file instead."
2043
+ ]
2044
+ },
2045
+ {
2046
+ "cell_type": "code",
2047
+ "execution_count": null,
2048
+ "metadata": {},
2049
+ "outputs": [],
2050
+ "source": [
2051
+ "applied = english_covid_dataset.apply(lambda x: sentiment_classifier(x.text)[0], axis=1, result_type='expand')\n",
2052
+ "#pd.concat([small_test_set, applied], axis='columns')\n",
2053
+ "applied"
2054
+ ]
2055
+ },
2056
+ {
2057
+ "cell_type": "code",
2058
+ "execution_count": null,
2059
+ "metadata": {},
2060
+ "outputs": [],
2061
+ "source": [
2062
+ "sentiment_features = pd.concat([english_covid_dataset, applied], axis=1)\n",
2063
+ "sentiment_features = sentiment_features.rename(columns={\"label\": \"sentiment\", \"score\": \"sentiment_score\"})"
2064
+ ]
2065
+ },
2066
+ {
2067
+ "cell_type": "code",
2068
+ "execution_count": null,
2069
+ "metadata": {},
2070
+ "outputs": [],
2071
+ "source": [
2072
+ "sentiment_features = pd.read_parquet(f\"{PROCESSED_PATH_COVID}/english_tweets_with_users_with_sentiment.parquet.gzip\")\n",
2073
+ "sentiment_features"
2074
+ ]
2075
+ },
2076
+ {
2077
+ "cell_type": "code",
2078
+ "execution_count": null,
2079
+ "metadata": {},
2080
+ "outputs": [],
2081
+ "source": [
2082
+ "# Tweets with sentiment scores over 70%\n",
2083
+ "display(f\"Tweets with sentiment analysis confidence scores above 0.7: {len(sentiment_features[sentiment_features.sentiment_score > 0.7])}\")\n",
2084
+ "display(f\"{len(sentiment_features[sentiment_features.sentiment == 'POSITIVE'])} positive tweets\")\n",
2085
+ "display(f\"{len(sentiment_features[sentiment_features.sentiment == 'NEGATIVE'])} negative tweets\")\n",
2086
+ "\n",
2087
+ "confident_sentiment_tweets = sentiment_features[sentiment_features.sentiment_score > 0.7]"
2088
+ ]
2089
+ },
2090
+ {
2091
+ "cell_type": "code",
2092
+ "execution_count": null,
2093
+ "metadata": {},
2094
+ "outputs": [],
2095
+ "source": [
2096
+ "# NOTE: no retweet filter is applied here; tweets with zero retweets are included in the counts below.\n",
2097
+ "labels = [\"Positive\", \"Negative\"]\n",
2098
+ "viral_positive = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == True) & (confident_sentiment_tweets.sentiment == 'POSITIVE')])\n",
2099
+ "viral_negative = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == True) & (confident_sentiment_tweets.sentiment == 'NEGATIVE')])\n",
2100
+ "normal_positive = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == False) & (confident_sentiment_tweets.sentiment == 'POSITIVE')])\n",
2101
+ "normal_negative = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == False) & (confident_sentiment_tweets.sentiment == 'NEGATIVE')])\n",
2102
+ "\n",
2103
+ "\n",
2104
+ "# Create subplots: use 'domain' type for Pie subplot\n",
2105
+ "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
2106
+ "fig.add_trace(go.Pie(labels=labels, values=[viral_positive, viral_negative], name=\"Positive Viral Tweets\"),\n",
2107
+ " 1, 1)\n",
2108
+ "fig.add_trace(go.Pie(labels=labels, values=[normal_positive, normal_negative], name=\"Positive Non-Viral Tweets\"),\n",
2109
+ " 1, 2)\n",
2110
+ "\n",
2111
+ "# Use `hole` to create a donut-like pie chart\n",
2112
+ "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
2113
+ "\n",
2114
+ "fig.update_layout(\n",
2115
+ " width=1000,\n",
2116
+ " height=500,\n",
2117
+ " title_text=\"Distribution of positive and negative sentiment in viral vs non-viral tweets\",\n",
2118
+ " # Add annotations in the center of the donut pies.\n",
2119
+ " annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
2120
+ " dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
2121
+ "fig.show()"
2122
+ ]
2123
+ },
2124
+ {
2125
+ "cell_type": "markdown",
2126
+ "metadata": {},
2127
+ "source": [
2128
+ "### 1.4.5 - Number of Hashtags"
2129
+ ]
2130
+ },
2131
+ {
2132
+ "cell_type": "code",
2133
+ "execution_count": null,
2134
+ "metadata": {},
2135
+ "outputs": [],
2136
+ "source": [
2137
+ "covid_dataset_with_users.loc[:, \"nb_of_hashtags\"] = covid_dataset_with_users.hashtags.apply(lambda x: len(x) if np.all(x) else 0)"
2138
+ ]
2139
+ },
2140
+ {
2141
+ "cell_type": "code",
2142
+ "execution_count": null,
2143
+ "metadata": {},
2144
+ "outputs": [],
2145
+ "source": [
2146
+ "labels = [\"Hashtags\", \"No Hashtags\"]\n",
2147
+ "viral_has_hashtags = len(covid_dataset_with_users[(covid_dataset_with_users.viral) & (covid_dataset_with_users.nb_of_hashtags >= 1)])\n",
2148
+ "viral_no_hashtags = len(covid_dataset_with_users[(covid_dataset_with_users.viral) & (covid_dataset_with_users.nb_of_hashtags == 0)])\n",
2149
+ "normal_has_hashtags = len(covid_dataset_with_users[(~covid_dataset_with_users.viral) & (covid_dataset_with_users.nb_of_hashtags >= 1)])\n",
2150
+ "normal_no_hashtags = len(covid_dataset_with_users[(~covid_dataset_with_users.viral) & (covid_dataset_with_users.nb_of_hashtags == 0)])\n",
2151
+ "\n",
2152
+ "\n",
2153
+ "# Create subplots: use 'domain' type for Pie subplot\n",
2154
+ "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
2155
+ "fig.add_trace(go.Pie(labels=labels, values=[viral_has_hashtags, viral_no_hashtags], name=\"Viral with Hashtags\"),\n",
2156
+ " 1, 1)\n",
2157
+ "fig.add_trace(go.Pie(labels=labels, values=[normal_has_hashtags, normal_no_hashtags], name=\"Tweet with No Hashtags\"),\n",
2158
+ " 1, 2)\n",
2159
+ "\n",
2160
+ "# Use `hole` to create a donut-like pie chart\n",
2161
+ "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
2162
+ "\n",
2163
+ "fig.update_layout(\n",
2164
+ " width=1000,\n",
2165
+ " height=500,\n",
2166
+ " title_text=\"Percentage of tweets with hashtags\",\n",
2167
+ " # Add annotations in the center of the donut pies.\n",
2168
+ " annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
2169
+ " dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
2170
+ "fig.show()"
2171
+ ]
2172
+ },
2173
+ {
2174
+ "cell_type": "markdown",
2175
+ "metadata": {},
2176
+ "source": [
2177
+ "#### 1.4.6 - Verified Account"
2178
+ ]
2179
+ },
2180
+ {
2181
+ "cell_type": "code",
2182
+ "execution_count": null,
2183
+ "metadata": {},
2184
+ "outputs": [],
2185
+ "source": [
2186
+ "# Verified account\n",
2187
+ "labels = [\"Verified\", \"Not verified\"]\n",
2188
+ "viral_is_verified = len(covid_dataset_with_users[(covid_dataset_with_users.viral) & (covid_dataset_with_users.verified)])\n",
2189
+ "viral_not_verified = len(covid_dataset_with_users[(covid_dataset_with_users.viral) & (~covid_dataset_with_users.verified)])\n",
2190
+ "normal_is_verified = len(covid_dataset_with_users[(~covid_dataset_with_users.viral) & (covid_dataset_with_users.verified)])\n",
2191
+ "normal_not_verified = len(covid_dataset_with_users[(~covid_dataset_with_users.viral) & (~covid_dataset_with_users.verified)])\n",
2192
+ "\n",
2193
+ "\n",
2194
+ "# Create subplots: use 'domain' type for Pie subplot\n",
2195
+ "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
2196
+ "fig.add_trace(go.Pie(labels=labels, values=[viral_is_verified, viral_not_verified], name=\"Viral with verified accounts\"),\n",
2197
+ " 1, 1)\n",
2198
+ "fig.add_trace(go.Pie(labels=labels, values=[normal_is_verified, normal_not_verified], name=\"Tweet with an unverified account\"),\n",
2199
+ " 1, 2)\n",
2200
+ "\n",
2201
+ "# Use `hole` to create a donut-like pie chart\n",
2202
+ "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
2203
+ "\n",
2204
+ "fig.update_layout(\n",
2205
+ " width=1000,\n",
2206
+ " height=500,\n",
2207
+ " title_text=\"Percentage of tweets from verified accounts\",\n",
2208
+ " # Add annotations in the center of the donut pies.\n",
2209
+ " annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
2210
+ " dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
2211
+ "fig.show()"
2212
+ ]
2213
+ },
2214
+ {
2215
+ "cell_type": "markdown",
2216
+ "metadata": {},
2217
+ "source": [
2218
+ "#### 1.4.7 - Save dataframe with analysis to disk"
2219
+ ]
2220
+ },
2221
+ {
2222
+ "cell_type": "code",
2223
+ "execution_count": null,
2224
+ "metadata": {},
2225
+ "outputs": [],
2226
+ "source": [
2227
+ "covid_dataset_with_users.to_parquet(f'{PROCESSED_PATH_COVID}/all_english_tweets_with_users_with_sentiment.parquet.gzip', index=False, compression=\"gzip\")"
2228
+ ]
2229
+ },
2230
+ {
2231
+ "cell_type": "markdown",
2232
+ "metadata": {},
2233
+ "source": [
2234
+ "Questions for TJ:\n",
2235
+ "\n",
2236
+ "Learn threshold? Use unsupervised learning (anomaly detection), x axis date y retweet count, isolation coordinate\n",
2237
+ "Ratio\n",
2238
+ "Try to come up with Different metrics (one cannot be used for second dataset)\n",
2239
+ "\n",
2240
+ "Preprocessing:\n",
2241
+ " - Remove tweets with no retweets or likes? NO\n",
2242
+ " - Define threshold using the metric? DONE (label above viral tweet)\n",
2243
+ " - Skewed distribution if we use only Twitter viral tweets (1000) DONE\n",
2244
+ "- Which features? (Any new ideas)\n",
2245
+ " - Topic\n",
2246
+ " - Hashtags relevant? (Most likely different from coronavirus and we already have topics).\n",
2247
+ " - Has media\n",
2248
+ " - Sentiment? [TODO]\n",
2249
+ " - Tweet length [TODO]\n",
2250
+ " - RETRIEVE USERS THAT LIKED OR RETWEETED USING API [TODO]\n",
2251
+ " - Word cloud of entities [TODO]\n",
2252
+ "- Check bigrams and trigrams distribution\n",
2253
+ "- Normalize features (like, retweets, reply etc...)? DEPENDS, Included in first model, will be removed from second model with covid set.\n",
2254
+ "- BertTweet [DO NOT REMOVE STOP WORDS FOR LANGUAGE MODELS, FOR ]\n",
2255
+ "- Next steps (now that data collection part is done and data analysis almost done)\n",
2256
+ " - Hydrate Covid dataset id\n",
2257
+ "- Viral generator (Trump generator)\n",
2258
+ "\n",
2259
+ "1st classifier: hashtags, twitter entities (context annotations, domain annotations, entities), mentions, domain of urls (youtube.com let’s say)\n",
2260
+ "2nd classifier: bag of words with tf-idf, remove stopwords and other entities that you used in the 1st classifier\n",
2261
+ "3rd: language model\n"
2262
+ ]
2263
+ },
2264
+ {
2265
+ "cell_type": "markdown",
2266
+ "metadata": {},
2267
+ "source": []
2268
+ },
2269
+ {
2270
+ "cell_type": "code",
2271
+ "execution_count": null,
2272
+ "metadata": {},
2273
+ "outputs": [],
2274
+ "source": []
2275
+ }
2276
+ ],
2277
+ "metadata": {
2278
+ "kernelspec": {
2279
+ "display_name": "Python 3 (ipykernel)",
2280
+ "language": "python",
2281
+ "name": "python3"
2282
+ },
2283
+ "language_info": {
2284
+ "codemirror_mode": {
2285
+ "name": "ipython",
2286
+ "version": 3
2287
+ },
2288
+ "file_extension": ".py",
2289
+ "mimetype": "text/x-python",
2290
+ "name": "python",
2291
+ "nbconvert_exporter": "python",
2292
+ "pygments_lexer": "ipython3",
2293
+ "version": "3.8.15"
2294
+ },
2295
+ "vscode": {
2296
+ "interpreter": {
2297
+ "hash": "71d2f77bccee14ca7852d7b7a1fa8ea4708b81087104d93973081337557f0ee6"
2298
+ }
2299
+ }
2300
+ },
2301
+ "nbformat": 4,
2302
+ "nbformat_minor": 4
2303
+ }
metric_analysis/viral_tweet_user_exploration.ipynb ADDED
@@ -0,0 +1,1208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Viral Tweets: User exploration"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "In this notebook, we will explore the users who have tweeted viral tweets. Namely, we will focus our analysis on the viral tweets from the user point of view. For example, we'll examine the popularity of the user vs the popularity of his tweets, the history of his tweets and analyze any flagrant changes in their features when they became viral, etc."
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "metadata": {},
20
+ "source": [
21
+ "## 0 - Setup"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 1,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "import pandas as pd\n",
31
+ "import seaborn as sns\n",
32
+ "import numpy as np\n",
33
+ "\n",
34
+ "import matplotlib.pyplot as plt\n",
35
+ "%matplotlib inline\n",
36
+ "\n",
37
+ "from tqdm import tqdm\n",
38
+ "\n",
39
+ "#pd.set_option('display.max_rows', None)\n",
40
+ "pd.set_option('display.max_columns', None)\n",
41
+ "\n",
42
+ "DATA_PATH = \"../../data\"\n",
43
+ "VIRAL_TWEETS_PATH = f\"{DATA_PATH}/viral_users\""
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 2,
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "from helper.twitter_client_wrapper import TwitterClientWrapper, EXPANSIONS, MEDIA_FIELDS, TWEET_FIELDS, USER_FIELDS\n",
53
+ "\n",
54
+ "twitter_client_wrapper = TwitterClientWrapper(\"../../api_key.yaml\", wait_on_rate_limit=False)"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "metadata": {},
60
+ "source": [
61
+ "## 1 - Retrieve the data from disk"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "metadata": {},
67
+ "source": [
68
+ "### 1.1 Retrieve the viral tweets data"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "markdown",
73
+ "metadata": {},
74
+ "source": [
75
+ "**Note**: You may notice that not all tweets have been retrieved, since some may have been deleted since we scraped them."
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "markdown",
80
+ "metadata": {},
81
+ "source": [
82
+ "**Note 2**: Also keep in mind that when retrieving users, the number of users may be smaller than the number of viral tweets, because a user may have two or more viral tweets in our sample."
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 3,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": [
91
+ "# dtypes={\"id\": str, \"author_id\": str, \"has_media\": bool, \"possibly_sensitive\": bool}\n",
92
+ "dtypes={\"id\": str, \"author_id\": str}"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 4,
98
+ "metadata": {},
99
+ "outputs": [
100
+ {
101
+ "name": "stderr",
102
+ "output_type": "stream",
103
+ "text": [
104
+ "C:\\Users\\steph\\AppData\\Local\\Temp\\ipykernel_18728\\1524257405.py:2: DtypeWarning: Columns (3,8,14,17,18,19,20,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n",
105
+ " viral_tweets_df = pd.read_csv(f\"{VIRAL_TWEETS_PATH}/all_tweets.csv\", dtype=dtypes, escapechar='\\\\', encoding='utf-8')\n"
106
+ ]
107
+ },
108
+ {
109
+ "data": {
110
+ "text/html": [
111
+ "<div>\n",
112
+ "<style scoped>\n",
113
+ " .dataframe tbody tr th:only-of-type {\n",
114
+ " vertical-align: middle;\n",
115
+ " }\n",
116
+ "\n",
117
+ " .dataframe tbody tr th {\n",
118
+ " vertical-align: top;\n",
119
+ " }\n",
120
+ "\n",
121
+ " .dataframe thead th {\n",
122
+ " text-align: right;\n",
123
+ " }\n",
124
+ "</style>\n",
125
+ "<table border=\"1\" class=\"dataframe\">\n",
126
+ " <thead>\n",
127
+ " <tr style=\"text-align: right;\">\n",
128
+ " <th></th>\n",
129
+ " <th>created_at</th>\n",
130
+ " <th>author_id</th>\n",
131
+ " <th>text</th>\n",
132
+ " <th>possibly_sensitive</th>\n",
133
+ " <th>edit_history_tweet_ids</th>\n",
134
+ " <th>lang</th>\n",
135
+ " <th>id</th>\n",
136
+ " <th>mentions</th>\n",
137
+ " <th>retweet_count</th>\n",
138
+ " <th>reply_count</th>\n",
139
+ " <th>like_count</th>\n",
140
+ " <th>quote_count</th>\n",
141
+ " <th>context_annotations</th>\n",
142
+ " <th>urls</th>\n",
143
+ " <th>has_media</th>\n",
144
+ " <th>annotations</th>\n",
145
+ " <th>hashtags</th>\n",
146
+ " <th>attachments.poll_ids</th>\n",
147
+ " <th>withheld.copyright</th>\n",
148
+ " <th>withheld.country_codes</th>\n",
149
+ " <th>withheld.scope</th>\n",
150
+ " <th>cashtags</th>\n",
151
+ " <th>geo.place_id</th>\n",
152
+ " <th>geo.coordinates.type</th>\n",
153
+ " <th>geo.coordinates.coordinates</th>\n",
154
+ " </tr>\n",
155
+ " </thead>\n",
156
+ " <tbody>\n",
157
+ " <tr>\n",
158
+ " <th>0</th>\n",
159
+ " <td>2022-10-31T03:21:11.000Z</td>\n",
160
+ " <td>1047733077898739712</td>\n",
161
+ " <td>@manjirosx you too jiro🫶🏽</td>\n",
162
+ " <td>False</td>\n",
163
+ " <td>['1586921195059834880']</td>\n",
164
+ " <td>en</td>\n",
165
+ " <td>1586921195059834880</td>\n",
166
+ " <td>[{'start': 0, 'end': 10, 'username': 'manjiros...</td>\n",
167
+ " <td>0.0</td>\n",
168
+ " <td>0.0</td>\n",
169
+ " <td>1.0</td>\n",
170
+ " <td>0.0</td>\n",
171
+ " <td>NaN</td>\n",
172
+ " <td>NaN</td>\n",
173
+ " <td>False</td>\n",
174
+ " <td>NaN</td>\n",
175
+ " <td>NaN</td>\n",
176
+ " <td>NaN</td>\n",
177
+ " <td>NaN</td>\n",
178
+ " <td>NaN</td>\n",
179
+ " <td>NaN</td>\n",
180
+ " <td>NaN</td>\n",
181
+ " <td>NaN</td>\n",
182
+ " <td>NaN</td>\n",
183
+ " <td>NaN</td>\n",
184
+ " </tr>\n",
185
+ " <tr>\n",
186
+ " <th>1</th>\n",
187
+ " <td>2022-10-31T03:13:57.000Z</td>\n",
188
+ " <td>1047733077898739712</td>\n",
189
+ " <td>@ilyicey u omd</td>\n",
190
+ " <td>False</td>\n",
191
+ " <td>['1586919376086704129']</td>\n",
192
+ " <td>nl</td>\n",
193
+ " <td>1586919376086704129</td>\n",
194
+ " <td>[{'start': 0, 'end': 8, 'username': 'ilyicey',...</td>\n",
195
+ " <td>0.0</td>\n",
196
+ " <td>0.0</td>\n",
197
+ " <td>0.0</td>\n",
198
+ " <td>0.0</td>\n",
199
+ " <td>NaN</td>\n",
200
+ " <td>NaN</td>\n",
201
+ " <td>False</td>\n",
202
+ " <td>NaN</td>\n",
203
+ " <td>NaN</td>\n",
204
+ " <td>NaN</td>\n",
205
+ " <td>NaN</td>\n",
206
+ " <td>NaN</td>\n",
207
+ " <td>NaN</td>\n",
208
+ " <td>NaN</td>\n",
209
+ " <td>NaN</td>\n",
210
+ " <td>NaN</td>\n",
211
+ " <td>NaN</td>\n",
212
+ " </tr>\n",
213
+ " <tr>\n",
214
+ " <th>2</th>\n",
215
+ " <td>2022-10-31T03:13:24.000Z</td>\n",
216
+ " <td>1047733077898739712</td>\n",
217
+ " <td>@ilyicey i’m fine</td>\n",
218
+ " <td>False</td>\n",
219
+ " <td>['1586919239243296768']</td>\n",
220
+ " <td>en</td>\n",
221
+ " <td>1586919239243296768</td>\n",
222
+ " <td>[{'start': 0, 'end': 8, 'username': 'ilyicey',...</td>\n",
223
+ " <td>1.0</td>\n",
224
+ " <td>1.0</td>\n",
225
+ " <td>2.0</td>\n",
226
+ " <td>0.0</td>\n",
227
+ " <td>NaN</td>\n",
228
+ " <td>NaN</td>\n",
229
+ " <td>False</td>\n",
230
+ " <td>NaN</td>\n",
231
+ " <td>NaN</td>\n",
232
+ " <td>NaN</td>\n",
233
+ " <td>NaN</td>\n",
234
+ " <td>NaN</td>\n",
235
+ " <td>NaN</td>\n",
236
+ " <td>NaN</td>\n",
237
+ " <td>NaN</td>\n",
238
+ " <td>NaN</td>\n",
239
+ " <td>NaN</td>\n",
240
+ " </tr>\n",
241
+ " <tr>\n",
242
+ " <th>3</th>\n",
243
+ " <td>2022-10-30T22:49:53.000Z</td>\n",
244
+ " <td>1047733077898739712</td>\n",
245
+ " <td>@imVolo_ I’ll unfollow rn</td>\n",
246
+ " <td>False</td>\n",
247
+ " <td>['1586852923706732544']</td>\n",
248
+ " <td>en</td>\n",
249
+ " <td>1586852923706732544</td>\n",
250
+ " <td>[{'start': 0, 'end': 8, 'username': 'imVolo_',...</td>\n",
251
+ " <td>0.0</td>\n",
252
+ " <td>0.0</td>\n",
253
+ " <td>3.0</td>\n",
254
+ " <td>0.0</td>\n",
255
+ " <td>NaN</td>\n",
256
+ " <td>NaN</td>\n",
257
+ " <td>False</td>\n",
258
+ " <td>NaN</td>\n",
259
+ " <td>NaN</td>\n",
260
+ " <td>NaN</td>\n",
261
+ " <td>NaN</td>\n",
262
+ " <td>NaN</td>\n",
263
+ " <td>NaN</td>\n",
264
+ " <td>NaN</td>\n",
265
+ " <td>NaN</td>\n",
266
+ " <td>NaN</td>\n",
267
+ " <td>NaN</td>\n",
268
+ " </tr>\n",
269
+ " <tr>\n",
270
+ " <th>4</th>\n",
271
+ " <td>2022-10-30T22:45:33.000Z</td>\n",
272
+ " <td>1047733077898739712</td>\n",
273
+ " <td>“what do you want to be for halloween?” his li...</td>\n",
274
+ " <td>False</td>\n",
275
+ " <td>['1586851830767591424']</td>\n",
276
+ " <td>en</td>\n",
277
+ " <td>1586851830767591424</td>\n",
278
+ " <td>NaN</td>\n",
279
+ " <td>611.0</td>\n",
280
+ " <td>19.0</td>\n",
281
+ " <td>4132.0</td>\n",
282
+ " <td>55.0</td>\n",
283
+ " <td>[{'domain': {'id': '29', 'name': 'Events [Enti...</td>\n",
284
+ " <td>NaN</td>\n",
285
+ " <td>False</td>\n",
286
+ " <td>NaN</td>\n",
287
+ " <td>NaN</td>\n",
288
+ " <td>NaN</td>\n",
289
+ " <td>NaN</td>\n",
290
+ " <td>NaN</td>\n",
291
+ " <td>NaN</td>\n",
292
+ " <td>NaN</td>\n",
293
+ " <td>NaN</td>\n",
294
+ " <td>NaN</td>\n",
295
+ " <td>NaN</td>\n",
296
+ " </tr>\n",
297
+ " </tbody>\n",
298
+ "</table>\n",
299
+ "</div>"
300
+ ],
301
+ "text/plain": [
302
+ " created_at author_id \\\n",
303
+ "0 2022-10-31T03:21:11.000Z 1047733077898739712 \n",
304
+ "1 2022-10-31T03:13:57.000Z 1047733077898739712 \n",
305
+ "2 2022-10-31T03:13:24.000Z 1047733077898739712 \n",
306
+ "3 2022-10-30T22:49:53.000Z 1047733077898739712 \n",
307
+ "4 2022-10-30T22:45:33.000Z 1047733077898739712 \n",
308
+ "\n",
309
+ " text possibly_sensitive \\\n",
310
+ "0 @manjirosx you too jiro🫶🏽 False \n",
311
+ "1 @ilyicey u omd False \n",
312
+ "2 @ilyicey i’m fine False \n",
313
+ "3 @imVolo_ I’ll unfollow rn False \n",
314
+ "4 “what do you want to be for halloween?” his li... False \n",
315
+ "\n",
316
+ " edit_history_tweet_ids lang id \\\n",
317
+ "0 ['1586921195059834880'] en 1586921195059834880 \n",
318
+ "1 ['1586919376086704129'] nl 1586919376086704129 \n",
319
+ "2 ['1586919239243296768'] en 1586919239243296768 \n",
320
+ "3 ['1586852923706732544'] en 1586852923706732544 \n",
321
+ "4 ['1586851830767591424'] en 1586851830767591424 \n",
322
+ "\n",
323
+ " mentions retweet_count \\\n",
324
+ "0 [{'start': 0, 'end': 10, 'username': 'manjiros... 0.0 \n",
325
+ "1 [{'start': 0, 'end': 8, 'username': 'ilyicey',... 0.0 \n",
326
+ "2 [{'start': 0, 'end': 8, 'username': 'ilyicey',... 1.0 \n",
327
+ "3 [{'start': 0, 'end': 8, 'username': 'imVolo_',... 0.0 \n",
328
+ "4 NaN 611.0 \n",
329
+ "\n",
330
+ " reply_count like_count quote_count \\\n",
331
+ "0 0.0 1.0 0.0 \n",
332
+ "1 0.0 0.0 0.0 \n",
333
+ "2 1.0 2.0 0.0 \n",
334
+ "3 0.0 3.0 0.0 \n",
335
+ "4 19.0 4132.0 55.0 \n",
336
+ "\n",
337
+ " context_annotations urls has_media \\\n",
338
+ "0 NaN NaN False \n",
339
+ "1 NaN NaN False \n",
340
+ "2 NaN NaN False \n",
341
+ "3 NaN NaN False \n",
342
+ "4 [{'domain': {'id': '29', 'name': 'Events [Enti... NaN False \n",
343
+ "\n",
344
+ " annotations hashtags attachments.poll_ids withheld.copyright \\\n",
345
+ "0 NaN NaN NaN NaN \n",
346
+ "1 NaN NaN NaN NaN \n",
347
+ "2 NaN NaN NaN NaN \n",
348
+ "3 NaN NaN NaN NaN \n",
349
+ "4 NaN NaN NaN NaN \n",
350
+ "\n",
351
+ " withheld.country_codes withheld.scope cashtags geo.place_id \\\n",
352
+ "0 NaN NaN NaN NaN \n",
353
+ "1 NaN NaN NaN NaN \n",
354
+ "2 NaN NaN NaN NaN \n",
355
+ "3 NaN NaN NaN NaN \n",
356
+ "4 NaN NaN NaN NaN \n",
357
+ "\n",
358
+ " geo.coordinates.type geo.coordinates.coordinates \n",
359
+ "0 NaN NaN \n",
360
+ "1 NaN NaN \n",
361
+ "2 NaN NaN \n",
362
+ "3 NaN NaN \n",
363
+ "4 NaN NaN "
364
+ ]
365
+ },
366
+ "execution_count": 4,
367
+ "metadata": {},
368
+ "output_type": "execute_result"
369
+ }
370
+ ],
371
+ "source": [
372
+ "# Import tweets first\n",
373
+ "viral_tweets_df = pd.read_csv(f\"{VIRAL_TWEETS_PATH}/all_tweets.csv\", dtype=dtypes, escapechar='\\\\', encoding='utf-8')\n",
374
+ "# viral_tweets_df = pd.read_csv(f\"{VIRAL_TWEETS_PATH}/all_tweets.csv\", dtype=dtypes)\n",
375
+ "viral_tweets_df.head()"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 5,
381
+ "metadata": {},
382
+ "outputs": [
383
+ {
384
+ "data": {
385
+ "text/plain": [
386
+ "'RT @strbrkrr: apple be like \"high volume may damage your ears...\" ok… i don’t care'"
387
+ ]
388
+ },
389
+ "execution_count": 5,
390
+ "metadata": {},
391
+ "output_type": "execute_result"
392
+ }
393
+ ],
394
+ "source": [
395
+ "viral_tweets_df[~viral_tweets_df.annotations.isna()].text.iloc[10]"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "markdown",
400
+ "metadata": {},
401
+ "source": [
402
+ "### 1.2 - Retrieve viral tweets users"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "markdown",
407
+ "metadata": {},
408
+ "source": [
409
+ "We start by retrieving the viral tweets users. Users are **included as expansions** when retrieving the tweets, conveniently so. For each user, we retrieve this user's history and information."
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "execution_count": null,
415
+ "metadata": {},
416
+ "outputs": [],
417
+ "source": [
418
+ "# Retrieve the users. The user data is included in the 'includes' field, which we get if we request any expansions\n",
419
+ "users_df = pd.read_csv(f\"{VIRAL_TWEETS_PATH}/users.csv\", dtype={\"id\": str, \"pinned_tweet_id\": str}, escapechar=\"\\\\\")\n",
420
+ "users_df"
421
+ ]
422
+ },
423
+ {
424
+ "cell_type": "code",
425
+ "execution_count": null,
426
+ "metadata": {},
427
+ "outputs": [],
428
+ "source": [
429
+ "'''\n",
430
+ "id object\n",
431
+ "edit_history_tweet_ids object\n",
432
+ "author_id object\n",
433
+ "created_at object\n",
434
+ "possibly_sensitive bool\n",
435
+ "text object\n",
436
+ "retweet_count int64\n",
437
+ "reply_count int64\n",
438
+ "like_count int64\n",
439
+ "quote_count int64\n",
440
+ "has_media bool\n",
441
+ "urls object\n",
442
+ "context_annotations object\n",
443
+ "annotations object\n",
444
+ "hashtags object\n",
445
+ "geo.place_id object\n",
446
+ "mentions object\n",
447
+ "dtype: object\n",
448
+ "'''\n",
449
+ "viral_tweets_df.dtypes"
450
+ ]
451
+ },
452
+ {
453
+ "cell_type": "markdown",
454
+ "metadata": {},
455
+ "source": [
456
+ "## 2 - Analysis of single user"
457
+ ]
458
+ },
459
+ {
460
+ "cell_type": "markdown",
461
+ "metadata": {},
462
+ "source": [
463
+ "Let's observe the tweets of a single user who has tweeted viral tweets. We'll try to conduct some analysis on their features to try and see what changed in the tweets of the user over time, and how they reflect the changes in the behaviour of the user."
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "code",
468
+ "execution_count": null,
469
+ "metadata": {},
470
+ "outputs": [],
471
+ "source": [
472
+ "# Take first user\n",
473
+ "user_id = users_df.iloc[0].id"
474
+ ]
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "execution_count": null,
479
+ "metadata": {},
480
+ "outputs": [],
481
+ "source": [
482
+ "user_tweets = viral_tweets_df[viral_tweets_df.author_id == user_id]\n",
483
+ "user_tweets['created_at'] = pd.to_datetime(user_tweets.created_at)\n",
484
+ "user_tweets.head()"
485
+ ]
486
+ },
487
+ {
488
+ "cell_type": "code",
489
+ "execution_count": null,
490
+ "metadata": {},
491
+ "outputs": [],
492
+ "source": [
493
+ "fig, ax = plt.subplots(1, 2, figsize=(10,5))\n",
494
+ "\n",
495
+ "ax[0].set_title(\"Retweet Count vs Tweet Date\")\n",
496
+ "sns.lineplot(user_tweets, x='created_at', y='retweet_count', ax=ax[0])\n",
497
+ "\n",
498
+ "ax[1].set_title(\"Like Count vs Tweet Date\")\n",
499
+ "sns.lineplot(user_tweets, x='created_at', y='like_count', ax=ax[1])\n",
500
+ "\n",
501
+ "plt.tight_layout()"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": null,
507
+ "metadata": {},
508
+ "outputs": [],
509
+ "source": [
510
+ "fig, ax = plt.subplots(1, 2, figsize=(10,5))\n",
511
+ "\n",
512
+ "user_tweets['tweet_length'] = user_tweets['text'].apply(len)\n",
513
+ "\n",
514
+ "ax[0].set_title(\"Retweet Count vs Tweet Length\")\n",
515
+ "sns.lineplot(user_tweets, x='tweet_length', y='retweet_count', ax=ax[0])\n",
516
+ "\n",
517
+ "ax[1].set_title(\"Like Count vs Tweet Length\")\n",
518
+ "sns.lineplot(user_tweets, x='tweet_length', y='like_count', ax=ax[1])\n",
519
+ "\n",
520
+ "plt.tight_layout()"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": null,
526
+ "metadata": {},
527
+ "outputs": [],
528
+ "source": [
529
+ "# Has media\n",
530
+ "sns.jointplot(user_tweets, x='has_media', y='retweet_count')\n",
531
+ "\n",
532
+ "plt.suptitle(\"# Retweets vs Tweet has media\")\n",
533
+ "plt.tight_layout()"
534
+ ]
535
+ },
536
+ {
537
+ "cell_type": "code",
538
+ "execution_count": null,
539
+ "metadata": {},
540
+ "outputs": [],
541
+ "source": [
542
+ "sns.pairplot(user_tweets[['tweet_length', 'has_media', 'retweet_count', 'like_count']])"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": null,
548
+ "metadata": {},
549
+ "outputs": [],
550
+ "source": [
551
+ "fig, ax = plt.subplots(2, 2, figsize=(10,5))\n",
552
+ "\n",
553
+ "user_tweets['tweet_length'] = user_tweets['text'].apply(len)\n",
554
+ "\n",
555
+ "ax[0][0].set_title(\"Retweet Count vs Date\")\n",
556
+ "sns.lineplot(user_tweets, x='created_at', y='retweet_count', ax=ax[0][0])\n",
557
+ "\n",
558
+ "ax[0][1].set_title(\"Like Count vs Date\")\n",
559
+ "sns.lineplot(user_tweets, x='created_at', y='like_count', ax=ax[0][1])\n",
560
+ "\n",
561
+ "ax[1][0].set_title(\"Has Media vs Date\")\n",
562
+ "sns.scatterplot(user_tweets, x='created_at', y='has_media', ax=ax[1][0])\n",
563
+ "\n",
564
+ "ax[1][1].set_title(\"Tweet Length vs Date\")\n",
565
+ "sns.scatterplot(user_tweets, x='created_at', y='tweet_length', ax=ax[1][1])\n",
566
+ "\n",
567
+ "plt.tight_layout()"
568
+ ]
569
+ },
570
+ {
571
+ "cell_type": "code",
572
+ "execution_count": null,
573
+ "metadata": {},
574
+ "outputs": [],
575
+ "source": [
576
+ "### TODO: Analyze the change in tweet features depending on date (one row depending on date, other depending on retweet count to reflect the evolution)\n",
577
+ "### TODO: Concentration on topics [group by topics for a sample user]"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "markdown",
582
+ "metadata": {},
583
+ "source": [
584
+ "## 3 - Aggregate Analysis of all viral users tweets"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "markdown",
589
+ "metadata": {},
590
+ "source": [
591
+ "#### 3.0 - How many tweets per user retrieved"
592
+ ]
593
+ },
594
+ {
595
+ "cell_type": "code",
596
+ "execution_count": null,
597
+ "metadata": {},
598
+ "outputs": [],
599
+ "source": [
600
+ "tweets_per_user = viral_tweets_df.groupby(by='author_id').size().reset_index(name='count')\n",
601
+ "tweets_per_user.sort_values(by='count')"
602
+ ]
603
+ },
604
+ {
605
+ "cell_type": "code",
606
+ "execution_count": null,
607
+ "metadata": {},
608
+ "outputs": [],
609
+ "source": [
610
+ "tweets_per_user.hist(column='count', bins=10)\n",
611
+ "plt.title(\"Histogram of distribution of number of tweets retrieved per user\")"
612
+ ]
613
+ },
614
+ {
615
+ "cell_type": "markdown",
616
+ "metadata": {},
617
+ "source": [
618
+ "#### 3.1 - Retweet count vs like count"
619
+ ]
620
+ },
621
+ {
622
+ "cell_type": "markdown",
623
+ "metadata": {},
624
+ "source": [
625
+ "In order to come up with a metric for the **virality** of the tweet, we need to know which features we will use to determine this metric. *retweet_count* and *like_count* will surely be among those features selected. Let's see how the two correlate."
626
+ ]
627
+ },
628
+ {
629
+ "cell_type": "markdown",
630
+ "metadata": {},
631
+ "source": [
632
+ "**NOTE**: \"The retweet will not show the likes and replies, only retweet count. You need to get the counts from the original tweet, which would be referenced in referenced_tweets and included in includes.tweets part of the response.\" - Twitter Community"
633
+ ]
634
+ },
635
+ {
636
+ "cell_type": "code",
637
+ "execution_count": null,
638
+ "metadata": {},
639
+ "outputs": [],
640
+ "source": [
641
+ "# Remove all tweets that might be retweets of others\n",
642
+ "retweeted = viral_tweets_df.retweet_count !=0\n",
643
+ "liked = viral_tweets_df.like_count !=0\n",
644
+ "original_tweets_df = viral_tweets_df[retweeted & liked]\n",
645
+ "\n",
646
+ "# Remove NA in retweet and like count\n",
647
+ "original_tweets_df = original_tweets_df.dropna(axis=0, subset=['retweet_count', 'like_count'])\n",
648
+ "\n",
649
+ "sns.scatterplot(data=original_tweets_df, x='retweet_count', y='like_count')"
650
+ ]
651
+ },
652
+ {
653
+ "cell_type": "markdown",
654
+ "metadata": {},
655
+ "source": [
656
+ "**Finding**: We can see a more or less linear correlation, especially for lower counts."
657
+ ]
658
+ },
659
+ {
660
+ "cell_type": "markdown",
661
+ "metadata": {},
662
+ "source": [
663
+ "#### 3.2 - (# Retweets / # followers ) ratio \n"
664
+ ]
665
+ },
666
+ {
667
+ "cell_type": "markdown",
668
+ "metadata": {},
669
+ "source": [
670
+ "Here a viable metric for a viral tweet can be the ratio between the retweets (or like) count over the followers count of the user. The idea here is that a user who doesn't have many followers, but has tweeted tweets that have garnered a lot of retweets or likes, can most definitely be considered \"viral\". On the other hand, a user who has many followers can have a standard high # retweets and those cannot be considered viral all the time."
671
+ ]
672
+ },
673
+ {
674
+ "cell_type": "markdown",
675
+ "metadata": {},
676
+ "source": [
677
+ "**Note**: Also note that historical data for the evolution of the # of followers of a user are not easily available and are not provided by the Twitter API. So these calculated ratios do not reflect the actual ratio when the tweet has been tweeted by a user, since by then he may have gained a lot of followers."
678
+ ]
679
+ },
680
+ {
681
+ "cell_type": "code",
682
+ "execution_count": null,
683
+ "metadata": {},
684
+ "outputs": [],
685
+ "source": [
686
+ "viral_tweets_df_subset = original_tweets_df[['id', 'author_id', 'retweet_count', 'like_count']]\n",
687
+ "\n",
688
+ "# Remove NA in follower count\n",
689
+ "users_df_subset = users_df.dropna(axis=0, subset=['followers_count'])\n",
690
+ "\n",
691
+ "# Merge both on author id\n",
692
+ "tweets_users_merged_df = viral_tweets_df_subset.merge(\n",
693
+ " right=users_df_subset[['id', 'followers_count']].set_index('id'), left_on='author_id', right_on='id')"
694
+ ]
695
+ },
696
+ {
697
+ "cell_type": "code",
698
+ "execution_count": null,
699
+ "metadata": {},
700
+ "outputs": [],
701
+ "source": [
702
+ "tweets_users_merged_df['retweets_followers_ratio'] = tweets_users_merged_df['retweet_count'] / tweets_users_merged_df['followers_count']\n",
703
+ "tweets_users_merged_df.sort_values(by='retweets_followers_ratio')"
704
+ ]
705
+ },
706
+ {
707
+ "cell_type": "code",
708
+ "execution_count": null,
709
+ "metadata": {},
710
+ "outputs": [],
711
+ "source": [
712
+ "import plotly.express as px\n",
713
+ "\n",
714
+ "df_ratios_bigger_than_1 = tweets_users_merged_df[tweets_users_merged_df.retweets_followers_ratio > 1.0]\n",
715
+ "fig = px.histogram(\n",
716
+ " df_ratios_bigger_than_1,\n",
717
+ " x=\"retweets_followers_ratio\",\n",
718
+ " nbins=10,\n",
719
+ " log_y=True)\n",
720
+ "\n",
721
+ "fig.update_layout(\n",
722
+ " title={\n",
723
+ " 'text': \"Histogram of the distribution of the retweets/followers ratio > 1\",\n",
724
+ " 'y':0.9,\n",
725
+ " 'x':0.5,\n",
726
+ " 'xanchor': 'center',\n",
727
+ " 'yanchor': 'top'})\n",
728
+ "\n",
729
+ "\n",
730
+ "fig.show()"
731
+ ]
732
+ },
733
+ {
734
+ "cell_type": "markdown",
735
+ "metadata": {},
736
+ "source": [
737
+ "The histogram is not very clear, since we have rare events where the tweets garnered so much popularity with respect to the popularity of the user. Those we can definitely consider as viral. Maybe we can try K-means to better identify these outliers."
738
+ ]
739
+ },
740
+ {
741
+ "cell_type": "code",
742
+ "execution_count": null,
743
+ "metadata": {},
744
+ "outputs": [],
745
+ "source": [
746
+ "from sklearn.cluster import KMeans\n",
747
+ "\n",
748
+ "n_clusters = 3\n",
749
+ "X = np.array(df_ratios_bigger_than_1[['retweet_count', 'followers_count']])\n",
750
+ "#X = np.vstack((df_ratios_bigger_than_1.retweet_count.to_numpy(), df_ratios_bigger_than_1.followers_count.to_numpy()))\n",
751
+ "#X = df_ratios_bigger_than_1.retweets_followers_ratio.to_numpy().reshape(-1, 1)\n",
752
+ "ratio_kmeans = KMeans(n_clusters=n_clusters, random_state=123).fit(X)\n",
753
+ "\n",
754
+ "#np.vstack((X[:, 0], X[:, 1], ratio_kmeans.labels_)).reshape(-1, 3)\n",
755
+ "#px.scatter(ratio_kmeans, x=)\n",
756
+ "'''\n",
757
+ "plt.title(f'K-Means clustering of #retweets/#followers ratio with k={n_clusters}')\n",
758
+ "plt.xlabel('Retweets')\n",
759
+ "plt.ylabel('Followers')\n",
760
+ "plt.scatter(X[:, 0], X[:, 1], c=ratio_kmeans.labels_)\n",
761
+ "'''"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "code",
766
+ "execution_count": null,
767
+ "metadata": {},
768
+ "outputs": [],
769
+ "source": [
770
+ "kmeans_results_df = pd.DataFrame(X, columns=['retweet_count', 'follower_count']) \n",
771
+ "kmeans_results_df['label'] = ratio_kmeans.labels_"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "code",
776
+ "execution_count": null,
777
+ "metadata": {},
778
+ "outputs": [],
779
+ "source": [
780
+ "px.scatter(kmeans_results_df, x='follower_count', y='retweet_count', color='label')\n"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "markdown",
785
+ "metadata": {},
786
+ "source": [
787
+ "#### 3.3 - Metric (# Retweets / avg #retweets of a user)"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "code",
792
+ "execution_count": null,
793
+ "metadata": {},
794
+ "outputs": [],
795
+ "source": [
796
+ "# avg_nb_retweets_per_user = viral_tweets_df_subset.groupby(by='author_id').agg({'retweet_count': ['min', 'mean', 'max'], 'like_count': ['min', 'mean', 'max']})\n",
797
+ "avg_nb_retweets_per_user = viral_tweets_df_subset.groupby(by='author_id').retweet_count.agg(['min', 'mean', 'max'])\n",
798
+ "avg_nb_retweets_per_user"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": null,
804
+ "metadata": {},
805
+ "outputs": [],
806
+ "source": [
807
+ "ratio_retweet_avg_retweets_df = viral_tweets_df_subset.merge(avg_nb_retweets_per_user, on='author_id')\n",
808
+ "ratio_retweet_avg_retweets_df['per_user_performance'] = ratio_retweet_avg_retweets_df['retweet_count'] / ratio_retweet_avg_retweets_df['mean']\n",
809
+ "ratio_retweet_avg_retweets_df"
810
+ ]
811
+ },
812
+ {
813
+ "cell_type": "code",
814
+ "execution_count": null,
815
+ "metadata": {},
816
+ "outputs": [],
817
+ "source": [
818
+ "bigger_than_mean = ratio_retweet_avg_retweets_df[ratio_retweet_avg_retweets_df.per_user_performance > 1]\n",
819
+ "hist = px.histogram(bigger_than_mean, x='per_user_performance', log_y=True)\n",
820
+ "\n",
821
+ "hist.update_layout(title_text=\"Distribution of tweet performance wrt avg #retweets per user\", xaxis_title=\"Tweet performance\", yaxis_title=\"log count\")"
822
+ ]
823
+ },
824
+ {
825
+ "cell_type": "markdown",
826
+ "metadata": {},
827
+ "source": [
828
+ "**Finding**: We established another metric by which we can judge the virality of a tweet, namely the number of retweets vs the average number of retweets per user. We can set a threshold (e.g. > 2) to decide whether a tweet is viral or not. We can also conduct further analysis over those tweets to determine what sets them apart from the others."
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "markdown",
833
+ "metadata": {},
834
+ "source": [
835
+ "#### 3.4 - Tweet Topic (context annotations)"
836
+ ]
837
+ },
838
+ {
839
+ "cell_type": "markdown",
840
+ "metadata": {},
841
+ "source": [
842
+ "What topics are available? Context annotations are Twitter's version of analyzing the topic of a tweet. They are defined as a context **domain** and **entity**. The domain is like a general topic and entity is like a subtopic or a specific topic within the general domain."
843
+ ]
844
+ },
845
+ {
846
+ "cell_type": "code",
847
+ "execution_count": null,
848
+ "metadata": {},
849
+ "outputs": [],
850
+ "source": [
851
+ "import json \n",
852
+ "\n",
853
+ "tweets_with_topics = original_tweets_df.dropna(axis=0, subset='context_annotations')\n",
854
+ "\n",
855
+ "def topic_to_json(x):\n",
856
+ " try:\n",
857
+ " return json.loads(x.replace('\\'', '\"'))\n",
858
+ " except json.JSONDecodeError:\n",
859
+ " print(\"Nope\")\n",
860
+ " return []"
861
+ ]
862
+ },
863
+ {
864
+ "cell_type": "markdown",
865
+ "metadata": {},
866
+ "source": [
867
+ "TODO tomorrow:\n",
868
+ "- Try sample and make it work with context annotations.\n",
869
+ "- Check if has media is not null\n",
870
+ "- hashtags extract tags\n",
871
+ "- Extract context annotations\n",
872
+ "- Use Celia Bearer Token"
873
+ ]
874
+ },
875
+ {
876
+ "cell_type": "code",
877
+ "execution_count": null,
878
+ "metadata": {},
879
+ "outputs": [],
880
+ "source": [
881
+ "from tweepy import Paginator, TooManyRequests\n",
882
+ "client = twitter_client_wrapper.client\n",
883
+ "#tweet_data = twitter_client_wrapper.client.get_users_tweets(id='1584975692126900225', expansions=EXPANSIONS, user_fields=USER_FIELDS, tweet_fields=TWEET_FIELDS, media_fields=MEDIA_FIELDS, exclude='retweets')\n",
884
+ "\n",
885
+ "viral_users_tweets = []\n",
886
+ "# Number of users processed so far\n",
887
+ "try:\n",
888
+ " for tweet in Paginator(client.get_users_tweets, id='1482846121517096961', tweet_fields=TWEET_FIELDS, exclude=\"retweets\").flatten(limit=20):\n",
889
+ " viral_users_tweets.append(tweet.data)\n",
890
+ "except TooManyRequests:\n",
891
+ " print(\"Hit Rate Limit\")\n"
892
+ ]
893
+ },
894
+ {
895
+ "cell_type": "code",
896
+ "execution_count": null,
897
+ "metadata": {},
898
+ "outputs": [],
899
+ "source": [
900
+ "domains = {}\n",
901
+ "entities = {}\n",
902
+ "for tweet in viral_users_tweets:\n",
903
+ " context_annotations = tweet.get('context_annotations', [])\n",
904
+ " tweet_topic_domains = dict([(topic['domain']['id'], topic['domain']) for topic in context_annotations])\n",
905
+ " domains.update(tweet_topic_domains)\n",
906
+ " tweet_topic_entities = dict([(topic['entity']['id'], topic['entity']) for topic in context_annotations])\n",
907
+ " entities.update(tweet_topic_entities)\n",
908
+ " tweet['topic_domain'] = list(tweet_topic_domains.keys())\n",
909
+ " tweet['topic_entity'] = list(tweet_topic_entities.keys())\n",
910
+ " tweet.pop('context_annotations', None)"
911
+ ]
912
+ },
913
+ {
914
+ "cell_type": "code",
915
+ "execution_count": null,
916
+ "metadata": {},
917
+ "outputs": [],
918
+ "source": [
919
+ "import pickle\n",
920
+ "\n",
921
+ "with open('topic_domains.pickle', 'wb') as handle:\n",
922
+ " pickle.dump(entities, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
923
+ "\n",
924
+ "with open('topic_domains.pickle', 'rb') as handle:\n",
925
+ " b = pickle.load(handle)\n",
926
+ "\n",
927
+ "b"
928
+ ]
929
+ },
930
+ {
931
+ "cell_type": "code",
932
+ "execution_count": null,
933
+ "metadata": {},
934
+ "outputs": [],
935
+ "source": [
936
+ "try:\n",
937
+ " with open('topic_domains.pickle', 'rb') as handle:\n",
938
+ " topic_domains = pickle.load(handle)\n",
939
+ "except FileNotFoundError:\n",
940
+ " topic_domains = {}\n",
941
+ "\n",
942
+ "topic_domains"
943
+ ]
944
+ },
945
+ {
946
+ "cell_type": "code",
947
+ "execution_count": null,
948
+ "metadata": {},
949
+ "outputs": [],
950
+ "source": [
951
+ "temp = pd.json_normalize(viral_users_tweets)\n",
952
+ "#temp[temp.context_annotations.notna()]\n",
953
+ "temp"
954
+ ]
955
+ },
956
+ {
957
+ "cell_type": "code",
958
+ "execution_count": null,
959
+ "metadata": {},
960
+ "outputs": [],
961
+ "source": [
962
+ "domains"
963
+ ]
964
+ },
965
+ {
966
+ "cell_type": "code",
967
+ "execution_count": null,
968
+ "metadata": {},
969
+ "outputs": [],
970
+ "source": [
971
+ "s = pd.Series([b[item]['name'] for items in temp.topic_domain.values for item in items])\n",
972
+ "s.groupby(s).count().sort_values()"
973
+ ]
974
+ },
975
+ {
976
+ "cell_type": "code",
977
+ "execution_count": null,
978
+ "metadata": {},
979
+ "outputs": [],
980
+ "source": [
981
+ "viral_users_tweets_2 = []\n",
982
+ "# Number of users processed so far\n",
983
+ "try:\n",
984
+ " for tweet in Paginator(client.get_users_tweets, id='848263392943058944', tweet_fields=TWEET_FIELDS, exclude=\"retweets\").flatten(limit=100):\n",
985
+ " viral_users_tweets_2.append(tweet.data)\n",
986
+ "except TooManyRequests:\n",
987
+ " print(\"Hit Rate Limit\")"
988
+ ]
989
+ },
990
+ {
991
+ "cell_type": "code",
992
+ "execution_count": null,
993
+ "metadata": {},
994
+ "outputs": [],
995
+ "source": [
996
+ "domains = {}\n",
997
+ "entities = {}\n",
998
+ "for tweet in viral_users_tweets_2:\n",
999
+ " context_annotations = tweet.get('context_annotations', [])\n",
1000
+ " tweet_topic_domains = dict([(topic['domain']['id'], topic['domain']) for topic in context_annotations])\n",
1001
+ " domains.update(tweet_topic_domains)\n",
1002
+ " tweet_topic_entities = dict([(topic['entity']['id'], topic['entity']) for topic in context_annotations])\n",
1003
+ " entities.update(tweet_topic_entities)\n",
1004
+ " tweet['topic_domain'] = list(tweet_topic_domains.keys()) if len(tweet_topic_domains.keys()) > 0 else pd.NA\n",
1005
+ " tweet['topic_entity'] = list(tweet_topic_entities.keys()) if len(tweet_topic_entities.keys()) > 0 else pd.NA\n",
1006
+ " #tweet.pop('context_annotations', None)"
1007
+ ]
1008
+ },
1009
+ {
1010
+ "cell_type": "code",
1011
+ "execution_count": null,
1012
+ "metadata": {},
1013
+ "outputs": [],
1014
+ "source": [
1015
+ "temp2_df = pd.json_normalize(viral_users_tweets_2)\n",
1016
+ "first_context = temp2_df[~temp2_df.topic_domain.isna()].topic_domain.iloc[2]"
1017
+ ]
1018
+ },
1019
+ {
1020
+ "cell_type": "code",
1021
+ "execution_count": null,
1022
+ "metadata": {},
1023
+ "outputs": [],
1024
+ "source": [
1025
+ "temp2_df[~temp2_df['entities.hashtags'].isna()]"
1026
+ ]
1027
+ },
1028
+ {
1029
+ "cell_type": "code",
1030
+ "execution_count": null,
1031
+ "metadata": {},
1032
+ "outputs": [],
1033
+ "source": [
1034
+ "temp2_df.to_csv(\"temp.csv\", index=False)"
1035
+ ]
1036
+ },
1037
+ {
1038
+ "cell_type": "code",
1039
+ "execution_count": null,
1040
+ "metadata": {},
1041
+ "outputs": [],
1042
+ "source": [
1043
+ "import ast\n",
1044
+ "\n",
1045
+ "temp2_read = pd.read_csv('temp.csv', converters={'context_annotations': lambda x: eval(x) if (x and len(x) > 0) else np.nan})\n",
1046
+ "first_context = temp2_read[~temp2_read.context_annotations.isna()].context_annotations.iloc[2]\n",
1047
+ "first_context"
1048
+ ]
1049
+ },
1050
+ {
1051
+ "cell_type": "code",
1052
+ "execution_count": null,
1053
+ "metadata": {},
1054
+ "outputs": [],
1055
+ "source": [
1056
+ "eval(first_context)"
1057
+ ]
1058
+ },
1059
+ {
1060
+ "cell_type": "code",
1061
+ "execution_count": null,
1062
+ "metadata": {},
1063
+ "outputs": [],
1064
+ "source": [
1065
+ "def format_context_annotations(context_annotations):\n",
1066
+ " if (pd.isna(context_annotations)):\n",
1067
+ " return []\n",
1068
+ " else:\n",
1069
+ " return json.loads(context_annotations)\n",
1070
+ "\n",
1071
+ "temp2_df.context_annotations.apply(format_context_annotations)"
1072
+ ]
1073
+ },
1074
+ {
1075
+ "cell_type": "code",
1076
+ "execution_count": null,
1077
+ "metadata": {},
1078
+ "outputs": [],
1079
+ "source": [
1080
+ "pd.DataFrame(viral_users_tweets_2, columns=TWEET_FIELDS).to_csv('temp_2.csv', index=False)"
1081
+ ]
1082
+ },
1083
+ {
1084
+ "cell_type": "code",
1085
+ "execution_count": null,
1086
+ "metadata": {},
1087
+ "outputs": [],
1088
+ "source": [
1089
+ "#tweet_data = twitter_client_wrapper.client.get_tweet(id='1584975692126900225', expansions=EXPANSIONS, user_fields=USER_FIELDS, tweet_fields=TWEET_FIELDS, media_fields=MEDIA_FIELDS)\n",
1090
+ "bytes(tweets_with_topics.iloc[1000].context_annotations, encoding='utf-8').decode('unicode_escape')"
1091
+ ]
1092
+ },
1093
+ {
1094
+ "cell_type": "code",
1095
+ "execution_count": 6,
1096
+ "metadata": {},
1097
+ "outputs": [
1098
+ {
1099
+ "data": {
1100
+ "text/plain": [
1101
+ "'46'"
1102
+ ]
1103
+ },
1104
+ "execution_count": 6,
1105
+ "metadata": {},
1106
+ "output_type": "execute_result"
1107
+ }
1108
+ ],
1109
+ "source": [
1110
+ "dtypes={\"id\": str, \"author_id\": str, \"has_media\": bool, \"possibly_sensitive\": bool, \"has_hashtags\": bool}\n",
1111
+ "temp3 = pd.read_csv(\"145371604-to-146944733.csv\", dtype=dtypes)\n",
1112
+ "d = temp3[~temp3.topic_domains.isna()].topic_domains.iloc[0]\n",
1113
+ "eval(d)[0]"
1114
+ ]
1115
+ },
1116
+ {
1117
+ "cell_type": "markdown",
1118
+ "metadata": {},
1119
+ "source": [
1120
+ "#### 3.5 - Tweet Sentiment"
1121
+ ]
1122
+ },
1123
+ {
1124
+ "cell_type": "markdown",
1125
+ "metadata": {},
1126
+ "source": []
1127
+ },
1128
+ {
1129
+ "cell_type": "markdown",
1130
+ "metadata": {},
1131
+ "source": [
1132
+ "#### 3.6 - Possibly sensitive"
1133
+ ]
1134
+ },
1135
+ {
1136
+ "cell_type": "markdown",
1137
+ "metadata": {},
1138
+ "source": []
1139
+ },
1140
+ {
1141
+ "cell_type": "markdown",
1142
+ "metadata": {},
1143
+ "source": [
1144
+ "#### 3.7 - Hashtags"
1145
+ ]
1146
+ },
1147
+ {
1148
+ "cell_type": "code",
1149
+ "execution_count": null,
1150
+ "metadata": {},
1151
+ "outputs": [],
1152
+ "source": [
1153
+ "# TODO: has hashtags (using entities.hashtags)"
1154
+ ]
1155
+ },
1156
+ {
1157
+ "cell_type": "markdown",
1158
+ "metadata": {},
1159
+ "source": [
1160
+ "#### 3.8 - Text preprocessing"
1161
+ ]
1162
+ },
1163
+ {
1164
+ "cell_type": "code",
1165
+ "execution_count": null,
1166
+ "metadata": {},
1167
+ "outputs": [],
1168
+ "source": []
1169
+ },
1170
+ {
1171
+ "cell_type": "markdown",
1172
+ "metadata": {},
1173
+ "source": [
1174
+ "TODO:\n",
1175
+ "- Sort by tweet date (check popularity)\n",
1176
+ "- Use Twitter lists to try and find\n",
1177
+ "- Check if reply or retweet"
1178
+ ]
1179
+ }
1180
+ ],
1181
+ "metadata": {
1182
+ "kernelspec": {
1183
+ "display_name": "Python 3.8.11 ('ada')",
1184
+ "language": "python",
1185
+ "name": "python3"
1186
+ },
1187
+ "language_info": {
1188
+ "codemirror_mode": {
1189
+ "name": "ipython",
1190
+ "version": 3
1191
+ },
1192
+ "file_extension": ".py",
1193
+ "mimetype": "text/x-python",
1194
+ "name": "python",
1195
+ "nbconvert_exporter": "python",
1196
+ "pygments_lexer": "ipython3",
1197
+ "version": "3.8.13"
1198
+ },
1199
+ "orig_nbformat": 4,
1200
+ "vscode": {
1201
+ "interpreter": {
1202
+ "hash": "71d2f77bccee14ca7852d7b7a1fa8ea4708b81087104d93973081337557f0ee6"
1203
+ }
1204
+ }
1205
+ },
1206
+ "nbformat": 4,
1207
+ "nbformat_minor": 2
1208
+ }
othercode/collect_users_tweets.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from tweepy import Paginator, TooManyRequests
3
+ import os
4
+ import pandas as pd
5
+ import pickle
6
+ from tqdm import tqdm
7
+ import yaml
8
+
9
+ import boto3
10
+
11
+ from helper.twitter_client_wrapper import (
12
+ TWEET_FIELDS,
13
+ format_tweets_df, format_context_annotations,
14
+ load_topic_domains, load_topic_entities, TwitterClientWrapper
15
+ )
16
+
17
+ USER_IDS_PATH = "users_ids.csv"
18
+
19
def run(twitter_client, directory, users_ids, tweets_per_user=20000, push_to_remote=True):
    """Collect tweets for each user in ``users_ids`` and persist what was gathered.

    Tweets are fetched user by user through a tweepy ``Paginator`` (retweets
    excluded) until every user is done or the API raises ``TooManyRequests``.
    Whatever was collected is then written out, so the script can be re-run
    to continue with the remaining users.

    Params:
        twitter_client: client object exposing ``get_users_tweets``
        directory: path prefix (concatenated as-is) for every file read or written
        users_ids: table with an ``id`` column listing the users still to process
        tweets_per_user: upper bound on the tweets fetched per user
        push_to_remote: if true, upload the resulting parquet file to S3
    """
    domains_pickle = f'{directory}topic_domains.pickle'
    entities_pickle = f'{directory}topic_entities.pickle'
    topic_domains = load_topic_domains(domains_pickle)
    topic_entities = load_topic_entities(entities_pickle)

    # Tweets accumulated across all users handled in this run
    viral_users_tweets = []
    # Count of users whose timeline was fetched completely
    users_processed = 0
    # The id of the last user reached is appended once we know where we stopped
    filename = f"tweets/{users_ids.id[0]}-to-"

    try:
        for user_id in tqdm(users_ids.id):
            timeline = Paginator(
                twitter_client.get_users_tweets,
                id=user_id,
                tweet_fields=TWEET_FIELDS,
                exclude="retweets",
            ).flatten(limit=tweets_per_user)
            for tweet in timeline:
                formatted = format_context_annotations(tweet.data)
                processed_tweet, tweet_topic_domains, tweet_topic_entities = formatted
                viral_users_tweets.append(processed_tweet)
                topic_domains.update(tweet_topic_domains)
                topic_entities.update(tweet_topic_entities)
            # Only incremented after the whole timeline was read: a user that is
            # interrupted by the rate limit stays in the pending list.
            users_processed += 1
    except TooManyRequests:
        # Reached API limit
        print("Hit Rate Limit")
    finally:
        if viral_users_tweets:
            # Dump everything to parquet; the file name records the last user reached
            filename += f"{user_id}.parquet.gzip"
            filepath = directory + filename
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            tweets_df = format_tweets_df(viral_users_tweets)
            tweets_df.to_parquet(filepath, compression="gzip", index=False)

            # Persist the topics encountered so far
            with open(domains_pickle, 'wb') as handle:
                pickle.dump(topic_domains, handle, protocol=pickle.HIGHEST_PROTOCOL)

            with open(entities_pickle, 'wb') as handle:
                pickle.dump(topic_entities, handle, protocol=pickle.HIGHEST_PROTOCOL)

            # Keep only the users that still have to be processed
            remaining_users = users_ids[users_processed:]
            remaining_users.to_csv(f"{directory}{USER_IDS_PATH}", index=False)

            if push_to_remote:
                bucket_name = "semester-project-twitter-storage"
                # Upload to S3
                boto3.resource("s3").Bucket(bucket_name).upload_file(filepath, filename)
        else:
            print("Finished processing users")
68
+
69
def main():
    """Configure paths and credentials, then collect tweets for the pending users.

    The hard-coded ``LOCAL`` switch selects between a local setup (bearer token
    read from ``api_key.yaml``, no upload) and the remote setup (bearer token
    read from the ``BearerToken`` environment variable, upload enabled).
    Nothing is done when the pending users file has no rows.
    """
    # TODO: Change depending on whether you're executing this script locally or on a remote server (possibly with s3 access)
    LOCAL = False
    TWEETS_PER_USER = 4000

    if not LOCAL:
        directory = "/home/ubuntu/tweet/"
        bearer_token = os.environ["BearerToken"]
        push_to_remote = True
    else:
        directory = ""
        with open("api_key.yaml", 'rt') as file:
            secret = yaml.safe_load(file)
        bearer_token = secret['Bearer Token']
        push_to_remote = False

    # Authenticate to Twitter; rate-limit errors are raised rather than waited out
    client = TwitterClientWrapper(bearer_token, wait_on_rate_limit=False).client

    # Ids are read as strings so that they are never parsed as numbers
    users_ids = pd.read_csv(f"{directory}{USER_IDS_PATH}", dtype={"id": str})

    if len(users_ids) > 0:
        run(
            client,
            directory,
            users_ids=users_ids,
            tweets_per_user=TWEETS_PER_USER,
            push_to_remote=push_to_remote,
        )
93
+
94
+ if __name__ == "__main__":
95
+ main()
othercode/hydrate_tweets.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from tweepy import TooManyRequests
3
+ import os
4
+ import pandas as pd
5
+ import pickle
6
+ import yaml
7
+ import boto3
8
+
9
+ from helper.twitter_client_wrapper import (
10
+ format_tweets_df, format_users_df, format_context_annotations,
11
+ load_topic_domains, load_topic_entities, TwitterClientWrapper
12
+ )
13
+
14
+ COVID_IDS_PATH = "covid_ids.parquet.gzip"
15
+ STEP_SIZE = 100
16
+
17
def run(twitter_client, directory, covid_tweets_ids, gather_retweets=True, push_to_remote=True):
    """Hydrate tweets by id in batches of ``STEP_SIZE`` and persist the results.

    Batches are requested until every id was sent or the API raises
    ``TooManyRequests``. Whatever was collected is then written to parquet
    (tweets, included users and, optionally, the ids of retweeted tweets),
    the topic pickles are refreshed, and the pending-ids file is rewritten
    with the ids that still have to be requested.

    Params:
        twitter_client: object exposing ``retrieve_tweets_by_ids(ids=...)``
        directory: path prefix (concatenated as-is) for every file read or written
        covid_tweets_ids: list of tweet ids still to hydrate
        gather_retweets: if true, results go under ``covid/`` and the ids of the
            tweets referenced as ``retweeted`` are saved as well; otherwise
            results go under ``covid_retweets/``
        push_to_remote: if true, upload the written parquet files to S3
    """
    topic_domains = load_topic_domains(f'{directory}topic_domains.pickle')
    topic_entities = load_topic_entities(f'{directory}topic_entities.pickle')

    # List where we accumulate the tweets retrieved so far
    collected_tweets = []
    # List where we accumulate the users retrieved so far
    collected_users = []

    covid_filepath = "covid" if gather_retweets else "covid_retweets"
    tweet_filepath_temp = f"{covid_filepath}/tweets/"
    user_filepath_temp = f"{covid_filepath}/users/"
    retweet_filepath_temp = f"{covid_filepath}/retweets/"

    total_ids = len(covid_tweets_ids)
    # Number of ids whose batch was requested successfully. Tracking this
    # explicitly (instead of reusing the loop index afterwards) makes sure the
    # last completed batch is not written back to the pending-ids file.
    ids_processed = 0

    try:
        # Stepping over the start offsets yields exactly ceil(total / STEP_SIZE)
        # batches, so no request is ever made with an empty id list.
        for start in range(0, total_ids, STEP_SIZE):
            batch_ids = covid_tweets_ids[start:start + STEP_SIZE]
            tweets = twitter_client.retrieve_tweets_by_ids(ids=batch_ids)
            collected_users += tweets.includes.get('users', [])
            # `data` may be None when none of the requested tweets is available
            for tweet in tweets.data or []:
                processed_tweet, tweet_topic_domains, tweet_topic_entities = format_context_annotations(tweet.data)
                collected_tweets.append(processed_tweet)
                topic_domains.update(tweet_topic_domains)
                topic_entities.update(tweet_topic_entities)
            ids_processed = start + len(batch_ids)
    except TooManyRequests:
        # Reached API limit
        print(f"Hit Rate Limit, processed {ids_processed}")
        print(f'tweets left: {total_ids - ids_processed}')
    finally:
        # Dump all to parquet and keep track of where we stopped.
        if collected_tweets:
            # File name records the first and last tweet id of this run
            first_processed_tweet_id = collected_tweets[0]['id']
            last_processed_tweet_id = collected_tweets[-1]['id']
            tweet_filename = f"{first_processed_tweet_id}-to-{last_processed_tweet_id}.parquet.gzip"
            tweet_filepath = directory + tweet_filepath_temp + tweet_filename
            os.makedirs(os.path.dirname(tweet_filepath), exist_ok=True)
            format_tweets_df(collected_tweets).to_parquet(tweet_filepath, compression="gzip", index=False)

            user_filepath = directory + user_filepath_temp + tweet_filename
            os.makedirs(os.path.dirname(user_filepath), exist_ok=True)
            format_users_df([user.data for user in collected_users]).to_parquet(user_filepath, compression="gzip", index=False)

            if gather_retweets:
                # Ids of every tweet referenced with type 'retweeted', de-duplicated.
                # Sorted into a list: a set has no order and is not a suitable
                # DataFrame input.
                referenced_tweets_ids = sorted({
                    referenced_tweet['id']
                    for tweet in collected_tweets
                    if tweet.get('referenced_tweets')
                    for referenced_tweet in tweet['referenced_tweets']
                    if referenced_tweet['type'] == 'retweeted'
                })
                retweet_filepath = directory + retweet_filepath_temp + tweet_filename
                os.makedirs(os.path.dirname(retweet_filepath), exist_ok=True)
                pd.DataFrame(referenced_tweets_ids, columns=['id']).to_parquet(retweet_filepath, compression="gzip", index=False)

            # Save the topics encountered so far as pickle file
            with open(f'{directory}topic_domains.pickle', 'wb') as handle:
                pickle.dump(topic_domains, handle, protocol=pickle.HIGHEST_PROTOCOL)

            with open(f'{directory}topic_entities.pickle', 'wb') as handle:
                pickle.dump(topic_entities, handle, protocol=pickle.HIGHEST_PROTOCOL)

            # Keep only the ids that were not requested yet (empty once all are done)
            pd.DataFrame(covid_tweets_ids[ids_processed:], columns=['id']).to_parquet(f"{directory}{COVID_IDS_PATH}", index=False)

            if push_to_remote:
                s3 = boto3.resource("s3")
                bucket_name = "semester-project-twitter-storage"
                # Upload to S3
                bucket = s3.Bucket(bucket_name)
                bucket.upload_file(tweet_filepath, f"{tweet_filepath_temp}{tweet_filename}")
                bucket.upload_file(user_filepath, f"{user_filepath_temp}{tweet_filename}")
                if gather_retweets:
                    bucket.upload_file(retweet_filepath, f"{retweet_filepath_temp}{tweet_filename}")
        else:
            print("Finished processing users")
102
+
103
def main():
    """Configure paths and credentials, then hydrate the pending tweet ids.

    The hard-coded ``LOCAL`` switch selects between a local setup (bearer token
    read from ``api_key.yaml``, no upload) and the remote setup (bearer token
    read from the ``BearerToken`` environment variable, upload enabled).
    Nothing is done when the pending-ids file has no rows.
    """
    # TODO: Change depending on whether you're executing this script locally or on a remote server (possibly with s3 access)
    LOCAL = False

    if not LOCAL:
        directory = "/home/ubuntu/covid_tweets/"
        bearer_token = os.environ["BearerToken"]
        push_to_remote = True
    else:
        directory = ""
        with open("api_key.yaml", 'rt') as file:
            secret = yaml.safe_load(file)
        bearer_token = secret['Bearer Token']
        push_to_remote = False

    # Authenticate to Twitter; rate-limit errors are raised rather than waited out
    client_wrapper = TwitterClientWrapper(bearer_token, wait_on_rate_limit=False)

    pending_ids = pd.read_parquet(f"{directory}{COVID_IDS_PATH}")
    covid_ids = list(pending_ids.id)

    if len(covid_ids) > 0:
        run(
            client_wrapper,
            directory,
            covid_tweets_ids=covid_ids,
            gather_retweets=False,
            push_to_remote=push_to_remote,
        )
125
+
126
+ if __name__ == "__main__":
127
+ main()
othercode/text_preprocessing.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import html
2
+
3
def clear_reply_mentions(tweet):
    '''Strip the run of user mentions that opens a reply.

    Only the mentions before the first non-mention token are dropped; mentions
    appearing later in the text are kept. Returns an empty string when every
    token is a mention.

    Example: @user1 @user2 okay @user3 -> okay @user3
    '''
    # Splitting on single spaces is enough here, no need for an nltk tokenizer
    words = tweet.split(" ")
    for position, word in enumerate(words):
        if not word.startswith("@"):
            return " ".join(words[position:])
    return ""
14
+
15
+ from emoji import demojize, is_emoji
16
+ from nltk.tokenize import TweetTokenizer
17
+
18
+ tweet_tokenizer = TweetTokenizer()
19
+
20
def normalizeToken(token, emojis_found=None, replace_user_mentions=True, replace_urls=True, demojize_emojis=True):
    '''Normalize a single token of a tokenized tweet.

    Params:
        token: the token to normalize
        emojis_found: optional list; every single-character emoji token met is
            appended to it. Defaults to a fresh list on each call (a shared
            mutable default would keep growing across unrelated calls).
        replace_user_mentions: If true, tokens starting with "@" become @USER
        replace_urls: If true, tokens starting with "http" or "www"
            (case-insensitive) become HTTPURL
        demojize_emojis: If true, emojis are replaced by the output of demojize

    Returns:
        The normalized token
    '''
    if emojis_found is None:
        emojis_found = []

    if replace_user_mentions and token.startswith("@"):
        return "@USER"
    if replace_urls and token.lower().startswith(("http", "www")):
        return "HTTPURL"
    if len(token) == 1 and is_emoji(token):
        emojis_found.append(token)
        return demojize(token) if demojize_emojis else token
    # Map typographic punctuation to its ASCII equivalent
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    return token
39
+
40
+
41
def normalizeTweet(tweet, tokenizer=tweet_tokenizer, replace_user_mentions=True, replace_urls=True, demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Tokenize a tweet, normalize every token and re-join the result.

    Params:
        tweet: the text to normalize
        tokenizer: object with a tokenize(text) method returning the tokens
        replace_user_mentions: forwarded to normalizeToken
        replace_urls: forwarded to normalizeToken
        demojize_emojis: forwarded to normalizeToken
        bert_tweet_specific_processing: If true, apply the extra string
            rewrites on contractions and "a.m."/"p.m." listed below

    Returns:
        A tuple (normalized text with whitespace collapsed to single spaces,
        list of the emojis recorded by normalizeToken)
    '''
    emojis_found = []
    # Use ASCII apostrophes/ellipses before tokenizing
    ascii_punctuation_tweet = tweet.replace("’", "'").replace("…", "...")

    normalized_tokens = []
    for token in tokenizer.tokenize(ascii_punctuation_tweet):
        normalized_tokens.append(
            normalizeToken(
                token,
                emojis_found=emojis_found,
                replace_user_mentions=replace_user_mentions,
                replace_urls=replace_urls,
                demojize_emojis=demojize_emojis,
            )
        )
    normTweet = " ".join(normalized_tokens)

    if bert_tweet_specific_processing:
        # Applied one after the other; the order matters because later
        # patterns match the output of earlier ones.
        rewrites = (
            ("cannot ", "can not "),
            ("n't ", " n't "),
            ("n 't ", " n't "),
            ("ca n't", "can't"),
            ("ai n't", "ain't"),
            ("'m ", " 'm "),
            ("'re ", " 're "),
            ("'s ", " 's "),
            ("'ll ", " 'll "),
            ("'d ", " 'd "),
            ("'ve ", " 've "),
            (" p . m .", " p.m."),
            (" p . m ", " p.m "),
            (" a . m .", " a.m."),
            (" a . m ", " a.m "),
        )
        for old, new in rewrites:
            normTweet = normTweet.replace(old, new)

    return " ".join(normTweet.split()), emojis_found
73
+
74
+
75
def clean_tweet(tweet, clear_html_chars=True, replace_user_mentions=True, replace_urls=True,
                demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Clean a raw tweet text through a configurable sequence of steps.

    Params:
        tweet: the tweet to clean
        clear_html_chars: If true, will unescape any special html entities found in the tweet
        replace_user_mentions: If true, will replace any user mention with the token @USER
        replace_urls: If true, will replace any urls with the token HTTPURL
        demojize_emojis: If true, will demojize emojis
        bert_tweet_specific_processing: If true, will do some additional preprocessing for the BertTweet model

    Returns:
        The cleaned tweet
    '''
    # Leading mentions are inserted automatically by Twitter when replying to a
    # tweet. They do not count in the character count of a tweet and may make
    # the tweet length go way overboard, so they are dropped first.
    text = clear_reply_mentions(tweet)

    # Carriage returns and line feeds are removed (not replaced by a space)
    text = text.replace('\r', '').replace('\n', '')

    # Turn html entities back into the characters they stand for
    if clear_html_chars:
        text = html.unescape(text)

    # Remaining preprocessing (emojis, urls, mentions, etc..); the list of
    # emojis that comes back with the text is not needed here
    normalized_tweet, _ = normalizeTweet(
        text,
        replace_user_mentions=replace_user_mentions,
        replace_urls=replace_urls,
        demojize_emojis=demojize_emojis,
        bert_tweet_specific_processing=bert_tweet_specific_processing,
    )

    # TODO: process emoticons? e.g. :)

    return normalized_tweet