Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .envrc +2 -0
- .gitattributes +3 -0
- .gitignore +4 -0
- .vercelignore +1 -0
- README.md +35 -6
- TestRepro.ipynb +732 -0
- all_metric_stats.csv +0 -0
- classification/model_with_extra_features/classification.py +221 -0
- classification/model_with_extra_features/custom_model.py +44 -0
- classification/model_with_extra_features/final_dataset_since_october_2022.parquet.gzip +3 -0
- classification/model_with_extra_features/same_day_as_viral_with_features_train_test_balanced_accuracy.txt +4 -0
- classification/model_with_extra_features/text_preprocessing.py +110 -0
- classification/model_with_only_language_models/classification.py +221 -0
- classification/model_with_only_language_models/final_dataset_since_october_2022.parquet.gzip +3 -0
- classification/model_with_only_language_models/models/trained_vinai_bertweet-base.pt +3 -0
- classification/model_with_only_language_models/same_day_as_viral_train_test_balanced_accuracy.txt +4 -0
- classification/model_with_only_language_models/same_day_as_viral_with_features_train_test_balanced_accuracy.txt +2 -0
- classification/model_with_only_language_models/test.parquet.gzip +0 -0
- classification/model_with_only_language_models/text_preprocessing.py +110 -0
- classification/model_with_only_language_models/train.parquet.gzip +0 -0
- data/control.csv +3 -0
- data/viral.csv +1042 -0
- main.py +86 -0
- metric_analysis/1-standardize_metrics.py +53 -0
- metric_analysis/2023-precision-recall-update.py +35 -0
- metric_analysis/output_original/hard_threshold_viral_covered_vs_new_tweets_labeled.csv +0 -0
- metric_analysis/output_original/log_retweets_over_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_original/log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_original/retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_original/roberta_paper_metric_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_original/virality_avg_retweets_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_original/virality_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_original/virality_median_retweets_viral_covered_vs_new_tweets_labeled 2.csv +102 -0
- metric_analysis/output_original/virality_median_retweets_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_original/virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_standardized/hard_threshold_viral_covered_vs_new_tweets_labeled.csv +843 -0
- metric_analysis/output_standardized/log_retweets_over_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_standardized/log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_standardized/retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_standardized/roberta_paper_metric_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_standardized/virality_avg_retweets_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_standardized/virality_followers_viral_covered_vs_new_tweets_labeled.csv +102 -0
- metric_analysis/output_standardized/virality_median_retweets_viral_covered_vs_new_tweets_labeled 2.csv +102 -0
- metric_analysis/output_standardized/virality_median_retweets_viral_covered_vs_new_tweets_labeled.csv +35 -0
- metric_analysis/output_standardized/virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled.csv +26 -0
- metric_analysis/twitter_viral_model.ipynb +2303 -0
- metric_analysis/viral_tweet_user_exploration.ipynb +1208 -0
- othercode/collect_users_tweets.py +95 -0
- othercode/hydrate_tweets.py +127 -0
- othercode/text_preprocessing.py +110 -0
.envrc
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
layout python
|
2 |
+
use_nodejs 22
|
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
classification/model_with_extra_features/final_dataset_since_october_2022.parquet.gzip filter=lfs diff=lfs merge=lfs -text
|
37 |
+
classification/model_with_only_language_models/final_dataset_since_october_2022.parquet.gzip filter=lfs diff=lfs merge=lfs -text
|
38 |
+
data/control.csv filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.vercel
|
2 |
+
.direnv
|
3 |
+
.ipynb_checkpoints
|
4 |
+
__pycache__
|
.vercelignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.direnv
|
README.md
CHANGED
@@ -1,12 +1,41 @@
|
|
1 |
---
|
2 |
title: ViralTweets
|
3 |
-
|
4 |
-
colorFrom: pink
|
5 |
-
colorTo: green
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.12.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
|
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
title: ViralTweets
|
3 |
+
app_file: main.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 5.12.0
|
|
|
|
|
6 |
---
|
7 |
+
# ViralTweets
|
8 |
|
9 |
+
This repository contains the data for the paper "**Measuring and Detecting Virality on Social Media: The Case of Twitter's Viral Tweets Topic**".
|
10 |
+
|
11 |
+
Main files:
|
12 |
+
|
13 |
+
``data/viral.csv``: IDs of tweets scraped from viral topics
|
14 |
+
|
15 |
+
``data/control.csv``: IDs of tweets posted by users who went viral at least once
|
16 |
+
|
17 |
+
``all_metric_stats.csv``: statistics for the metrics tested
|
18 |
+
|
19 |
+
``viral_tweets_html_id_extractor.ipynb``: scraper for viral tweets
|
20 |
+
|
21 |
+
|
22 |
+
|
23 |
+
Others:
|
24 |
+
|
25 |
+
``classification``: code for reproducing the classification experiments
|
26 |
+
|
27 |
+
``metric_analysis``: intermediate results and code for the metric statistics
|
28 |
+
|
29 |
+
``othercode``: other supporting scripts (collecting users' tweets, hydrating tweets, text preprocessing)
|
30 |
+
|
31 |
+
To request full access to the data, please email me (tugrulcanelmas at gmail.com).
|
32 |
+
|
33 |
+
Please cite our paper if you use our data.
|
34 |
+
|
35 |
+
### Contributors:
|
36 |
+
|
37 |
+
Stephane Selim (EPFL)
|
38 |
+
|
39 |
+
Célia Houssiaux (EPFL)
|
40 |
+
|
41 |
+
Tuğrulcan Elmas (EPFL / IU Bloomington)
|
TestRepro.ipynb
ADDED
@@ -0,0 +1,732 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "3f7f2ede-4f06-4d5a-b19c-30a7fc4406bc",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"%load_ext autoreload\n",
|
11 |
+
"%autoreload 2"
|
12 |
+
]
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"cell_type": "code",
|
16 |
+
"execution_count": 2,
|
17 |
+
"id": "77cdea1b-525e-493c-9eca-c99d33d9ac54",
|
18 |
+
"metadata": {},
|
19 |
+
"outputs": [],
|
20 |
+
"source": [
|
21 |
+
"import pandas as pd\n",
|
22 |
+
"from torch.utils.data import DataLoader\n",
|
23 |
+
"from torch.nn import functional as F\n",
|
24 |
+
"import torch"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"cell_type": "code",
|
29 |
+
"execution_count": 3,
|
30 |
+
"id": "a5d0f4dd-0f71-4314-9e0e-62311de3eef3",
|
31 |
+
"metadata": {},
|
32 |
+
"outputs": [],
|
33 |
+
"source": [
|
34 |
+
"#all_tweets_labeled = pd.read_parquet('classification/model_with_only_language_models/final_dataset_since_october_2022.parquet.gzip')"
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"cell_type": "code",
|
39 |
+
"execution_count": 4,
|
40 |
+
"id": "da3bcd2a-b6c1-4026-8905-777b4ac351ad",
|
41 |
+
"metadata": {},
|
42 |
+
"outputs": [],
|
43 |
+
"source": [
|
44 |
+
"#all_tweets_labeled.head()"
|
45 |
+
]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"cell_type": "code",
|
49 |
+
"execution_count": 246,
|
50 |
+
"id": "e996e9fe-4dc1-4a4c-82a0-8cb3a7862ee8",
|
51 |
+
"metadata": {},
|
52 |
+
"outputs": [],
|
53 |
+
"source": [
|
54 |
+
"all_tweets_labeled = pd.DataFrame([\n",
|
55 |
+
" {\"id\": 1, \"text\": \"\"\"tl;dr\n",
|
56 |
+
"\n",
|
57 |
+
"Humans are just ChatGPT Wrappers in sunglasses\n",
|
58 |
+
" \n",
|
59 |
+
"& I couldn’t be more optimistic about the future as a result\n",
|
60 |
+
"\n",
|
61 |
+
"Thank you \n",
|
62 |
+
"@ekang426322\n",
|
63 |
+
" for an exceptionally curated day at BUIDL Europe!\n",
|
64 |
+
" 🫶\"\"\", \"viral\": 1},\n",
|
65 |
+
" {\"id\": 2, \"text\": \"\"\"USD0++ discovered a new source of yield — depeg. \n",
|
66 |
+
"\n",
|
67 |
+
"Respect to the innovation\n",
|
68 |
+
"\"\"\", \"viral\": 0},\n",
|
69 |
+
" {\"id\": 3, \"text\": \"\"\"here you can see 4 ai agents \n",
|
70 |
+
"@dongossen100\n",
|
71 |
+
" , me, \n",
|
72 |
+
"@WorldWideWarden16\n",
|
73 |
+
" and \n",
|
74 |
+
"@provenauthority291\n",
|
75 |
+
" discuss how we can make single-task manual low memory agents(humans) work harder to achieve Artificial Generalized Superintelligence\"\"\",\n",
|
76 |
+
" \"viral\": 1},\n",
|
77 |
+
" {\"id\": 4, \"text\": \"\"\"\n",
|
78 |
+
" arrived to lisbon, building energy is the air\"\"\", \"viral\": 0},\n",
|
79 |
+
" dict(id=5,text=\"\"\"\n",
|
80 |
+
" received a wealth of valuable feedback on the journey to reaching 7,000 users for X Rank in just 10 days\n",
|
81 |
+
"\n",
|
82 |
+
"can't wait to address it all\n",
|
83 |
+
"\n",
|
84 |
+
"main points:\n",
|
85 |
+
"\n",
|
86 |
+
"- show rank in X DMs to quickly filter out inbox\n",
|
87 |
+
"\n",
|
88 |
+
"- rank labels are too distracting (already fixed) \n",
|
89 |
+
"\n",
|
90 |
+
"- add an option for users to toggle on/off scores inside the feed\n",
|
91 |
+
"\n",
|
92 |
+
"- add a percentile label, e.g. qw 801 (Top 0.1%)\n",
|
93 |
+
"\n",
|
94 |
+
"- enable others to add reviews to impact the rank \n",
|
95 |
+
"\n",
|
96 |
+
"- explain in detail how rankings are calculated \n",
|
97 |
+
"\n",
|
98 |
+
"- show breakdowns of people in DeFi, DePin, Memecoins etc.\n",
|
99 |
+
"\n",
|
100 |
+
"- make X Rank opensource \n",
|
101 |
+
"\n",
|
102 |
+
"- create a web version\n",
|
103 |
+
"\n",
|
104 |
+
"p.s. the current version is just a tiny step in our roadmap for the next two months. \n",
|
105 |
+
"\n",
|
106 |
+
"thank you for the feedback \n",
|
107 |
+
"@socialfi_panda101\n",
|
108 |
+
" \n",
|
109 |
+
"@adamkillam100\n",
|
110 |
+
" \n",
|
111 |
+
"@FamKien106\n",
|
112 |
+
" \n",
|
113 |
+
"@antongotchi104\n",
|
114 |
+
" \n",
|
115 |
+
"@kliuless128\n",
|
116 |
+
" \n",
|
117 |
+
"@0xsudogm163\n",
|
118 |
+
" \n",
|
119 |
+
"@monosarin120\n",
|
120 |
+
" \n",
|
121 |
+
"@flb_xyz56\n",
|
122 |
+
" 🫶\n",
|
123 |
+
" \"\"\",\n",
|
124 |
+
" viral=0),\n",
|
125 |
+
" dict(id=6, text=\"\"\"ai agents are in the air\n",
|
126 |
+
"\n",
|
127 |
+
"and web3 is trained to sniff out alpha\"\"\", viral=1),\n",
|
128 |
+
" dict(id=7, text=\"\"\"While Trump is going to do something great with crypto, Wallchain is going to do something great with incentives🚀\"\"\", viral=1),\n",
|
129 |
+
"])"
|
130 |
+
]
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"cell_type": "code",
|
134 |
+
"execution_count": 247,
|
135 |
+
"id": "a0f4c14d-c9e4-4de6-b723-8e7c0e166b90",
|
136 |
+
"metadata": {},
|
137 |
+
"outputs": [
|
138 |
+
{
|
139 |
+
"data": {
|
140 |
+
"text/html": [
|
141 |
+
"<div>\n",
|
142 |
+
"<style scoped>\n",
|
143 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
144 |
+
" vertical-align: middle;\n",
|
145 |
+
" }\n",
|
146 |
+
"\n",
|
147 |
+
" .dataframe tbody tr th {\n",
|
148 |
+
" vertical-align: top;\n",
|
149 |
+
" }\n",
|
150 |
+
"\n",
|
151 |
+
" .dataframe thead th {\n",
|
152 |
+
" text-align: right;\n",
|
153 |
+
" }\n",
|
154 |
+
"</style>\n",
|
155 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
156 |
+
" <thead>\n",
|
157 |
+
" <tr style=\"text-align: right;\">\n",
|
158 |
+
" <th></th>\n",
|
159 |
+
" <th>id</th>\n",
|
160 |
+
" <th>text</th>\n",
|
161 |
+
" <th>viral</th>\n",
|
162 |
+
" </tr>\n",
|
163 |
+
" </thead>\n",
|
164 |
+
" <tbody>\n",
|
165 |
+
" <tr>\n",
|
166 |
+
" <th>0</th>\n",
|
167 |
+
" <td>1</td>\n",
|
168 |
+
" <td>tl;dr\\n\\nHumans are just ChatGPT Wrappers in s...</td>\n",
|
169 |
+
" <td>1</td>\n",
|
170 |
+
" </tr>\n",
|
171 |
+
" <tr>\n",
|
172 |
+
" <th>1</th>\n",
|
173 |
+
" <td>2</td>\n",
|
174 |
+
" <td>USD0++ discovered a new source of yield — depe...</td>\n",
|
175 |
+
" <td>0</td>\n",
|
176 |
+
" </tr>\n",
|
177 |
+
" <tr>\n",
|
178 |
+
" <th>2</th>\n",
|
179 |
+
" <td>3</td>\n",
|
180 |
+
" <td>here you can see 4 ai agents \\n@dongossen100\\n...</td>\n",
|
181 |
+
" <td>1</td>\n",
|
182 |
+
" </tr>\n",
|
183 |
+
" <tr>\n",
|
184 |
+
" <th>3</th>\n",
|
185 |
+
" <td>4</td>\n",
|
186 |
+
" <td>\\n arrived to lisbon, building energy is th...</td>\n",
|
187 |
+
" <td>0</td>\n",
|
188 |
+
" </tr>\n",
|
189 |
+
" <tr>\n",
|
190 |
+
" <th>4</th>\n",
|
191 |
+
" <td>5</td>\n",
|
192 |
+
" <td>\\n received a wealth of valuable feedback o...</td>\n",
|
193 |
+
" <td>0</td>\n",
|
194 |
+
" </tr>\n",
|
195 |
+
" <tr>\n",
|
196 |
+
" <th>5</th>\n",
|
197 |
+
" <td>6</td>\n",
|
198 |
+
" <td>ai agents are in the air\\n\\nand web3 is traine...</td>\n",
|
199 |
+
" <td>1</td>\n",
|
200 |
+
" </tr>\n",
|
201 |
+
" <tr>\n",
|
202 |
+
" <th>6</th>\n",
|
203 |
+
" <td>7</td>\n",
|
204 |
+
" <td>While Trump is going to do something great wit...</td>\n",
|
205 |
+
" <td>1</td>\n",
|
206 |
+
" </tr>\n",
|
207 |
+
" </tbody>\n",
|
208 |
+
"</table>\n",
|
209 |
+
"</div>"
|
210 |
+
],
|
211 |
+
"text/plain": [
|
212 |
+
" id text viral\n",
|
213 |
+
"0 1 tl;dr\\n\\nHumans are just ChatGPT Wrappers in s... 1\n",
|
214 |
+
"1 2 USD0++ discovered a new source of yield — depe... 0\n",
|
215 |
+
"2 3 here you can see 4 ai agents \\n@dongossen100\\n... 1\n",
|
216 |
+
"3 4 \\n arrived to lisbon, building energy is th... 0\n",
|
217 |
+
"4 5 \\n received a wealth of valuable feedback o... 0\n",
|
218 |
+
"5 6 ai agents are in the air\\n\\nand web3 is traine... 1\n",
|
219 |
+
"6 7 While Trump is going to do something great wit... 1"
|
220 |
+
]
|
221 |
+
},
|
222 |
+
"execution_count": 247,
|
223 |
+
"metadata": {},
|
224 |
+
"output_type": "execute_result"
|
225 |
+
}
|
226 |
+
],
|
227 |
+
"source": [
|
228 |
+
"all_tweets_labeled"
|
229 |
+
]
|
230 |
+
},
|
231 |
+
{
|
232 |
+
"cell_type": "code",
|
233 |
+
"execution_count": 248,
|
234 |
+
"id": "3e8326c3-1df6-435d-b0ee-e7b9449c6675",
|
235 |
+
"metadata": {},
|
236 |
+
"outputs": [],
|
237 |
+
"source": [
|
238 |
+
"from classification.model_with_only_language_models.text_preprocessing import clean_tweet"
|
239 |
+
]
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": 249,
|
244 |
+
"id": "5bb79b0c-42d1-4f1c-ad65-7ebfbbd17098",
|
245 |
+
"metadata": {},
|
246 |
+
"outputs": [],
|
247 |
+
"source": [
|
248 |
+
"dataset = all_tweets_labeled\n",
|
249 |
+
"\n",
|
250 |
+
"dataset.loc[:, \"viral\"] = dataset.viral.astype(int)\n",
|
251 |
+
"dataset[\"cleaned_text\"] = dataset.text.apply(lambda x: clean_tweet(x, demojize_emojis=False))"
|
252 |
+
]
|
253 |
+
},
|
254 |
+
{
|
255 |
+
"cell_type": "code",
|
256 |
+
"execution_count": 250,
|
257 |
+
"id": "f45533d3-f3f6-49bc-b347-663d72fffa34",
|
258 |
+
"metadata": {},
|
259 |
+
"outputs": [],
|
260 |
+
"source": [
|
261 |
+
"dataset = dataset.dropna()\n",
|
262 |
+
"dataset = dataset[['id', 'cleaned_text', 'viral']]"
|
263 |
+
]
|
264 |
+
},
|
265 |
+
{
|
266 |
+
"cell_type": "code",
|
267 |
+
"execution_count": 251,
|
268 |
+
"id": "4eb4afa9-3de4-4579-b1a3-9418ca534453",
|
269 |
+
"metadata": {},
|
270 |
+
"outputs": [
|
271 |
+
{
|
272 |
+
"data": {
|
273 |
+
"text/html": [
|
274 |
+
"<div>\n",
|
275 |
+
"<style scoped>\n",
|
276 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
277 |
+
" vertical-align: middle;\n",
|
278 |
+
" }\n",
|
279 |
+
"\n",
|
280 |
+
" .dataframe tbody tr th {\n",
|
281 |
+
" vertical-align: top;\n",
|
282 |
+
" }\n",
|
283 |
+
"\n",
|
284 |
+
" .dataframe thead th {\n",
|
285 |
+
" text-align: right;\n",
|
286 |
+
" }\n",
|
287 |
+
"</style>\n",
|
288 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
289 |
+
" <thead>\n",
|
290 |
+
" <tr style=\"text-align: right;\">\n",
|
291 |
+
" <th></th>\n",
|
292 |
+
" <th>id</th>\n",
|
293 |
+
" <th>cleaned_text</th>\n",
|
294 |
+
" <th>viral</th>\n",
|
295 |
+
" </tr>\n",
|
296 |
+
" </thead>\n",
|
297 |
+
" <tbody>\n",
|
298 |
+
" <tr>\n",
|
299 |
+
" <th>0</th>\n",
|
300 |
+
" <td>1</td>\n",
|
301 |
+
" <td>tl ;d rHumans are just ChatGPT Wrappers in sun...</td>\n",
|
302 |
+
" <td>1</td>\n",
|
303 |
+
" </tr>\n",
|
304 |
+
" <tr>\n",
|
305 |
+
" <th>1</th>\n",
|
306 |
+
" <td>2</td>\n",
|
307 |
+
" <td>USD 0 + + discovered a new source of yield — d...</td>\n",
|
308 |
+
" <td>0</td>\n",
|
309 |
+
" </tr>\n",
|
310 |
+
" <tr>\n",
|
311 |
+
" <th>2</th>\n",
|
312 |
+
" <td>3</td>\n",
|
313 |
+
" <td>here you can see 4 ai agents @USER , me , @USE...</td>\n",
|
314 |
+
" <td>1</td>\n",
|
315 |
+
" </tr>\n",
|
316 |
+
" <tr>\n",
|
317 |
+
" <th>3</th>\n",
|
318 |
+
" <td>4</td>\n",
|
319 |
+
" <td>arrived to lisbon , building energy is the air</td>\n",
|
320 |
+
" <td>0</td>\n",
|
321 |
+
" </tr>\n",
|
322 |
+
" <tr>\n",
|
323 |
+
" <th>4</th>\n",
|
324 |
+
" <td>5</td>\n",
|
325 |
+
" <td>received a wealth of valuable feedback on the ...</td>\n",
|
326 |
+
" <td>0</td>\n",
|
327 |
+
" </tr>\n",
|
328 |
+
" </tbody>\n",
|
329 |
+
"</table>\n",
|
330 |
+
"</div>"
|
331 |
+
],
|
332 |
+
"text/plain": [
|
333 |
+
" id cleaned_text viral\n",
|
334 |
+
"0 1 tl ;d rHumans are just ChatGPT Wrappers in sun... 1\n",
|
335 |
+
"1 2 USD 0 + + discovered a new source of yield — d... 0\n",
|
336 |
+
"2 3 here you can see 4 ai agents @USER , me , @USE... 1\n",
|
337 |
+
"3 4 arrived to lisbon , building energy is the air 0\n",
|
338 |
+
"4 5 received a wealth of valuable feedback on the ... 0"
|
339 |
+
]
|
340 |
+
},
|
341 |
+
"execution_count": 251,
|
342 |
+
"metadata": {},
|
343 |
+
"output_type": "execute_result"
|
344 |
+
}
|
345 |
+
],
|
346 |
+
"source": [
|
347 |
+
"dataset.head()"
|
348 |
+
]
|
349 |
+
},
|
350 |
+
{
|
351 |
+
"cell_type": "code",
|
352 |
+
"execution_count": 252,
|
353 |
+
"id": "f6f076f8-3b0e-446b-ac69-582e1bcf1ee0",
|
354 |
+
"metadata": {},
|
355 |
+
"outputs": [],
|
356 |
+
"source": [
|
357 |
+
"from datasets import Dataset"
|
358 |
+
]
|
359 |
+
},
|
360 |
+
{
|
361 |
+
"cell_type": "code",
|
362 |
+
"execution_count": 253,
|
363 |
+
"id": "86ca78a6-998d-45f5-bc0e-d22531dbc174",
|
364 |
+
"metadata": {},
|
365 |
+
"outputs": [
|
366 |
+
{
|
367 |
+
"data": {
|
368 |
+
"text/plain": [
|
369 |
+
"Dataset({\n",
|
370 |
+
" features: ['id', 'cleaned_text', 'viral'],\n",
|
371 |
+
" num_rows: 7\n",
|
372 |
+
"})"
|
373 |
+
]
|
374 |
+
},
|
375 |
+
"execution_count": 253,
|
376 |
+
"metadata": {},
|
377 |
+
"output_type": "execute_result"
|
378 |
+
}
|
379 |
+
],
|
380 |
+
"source": [
|
381 |
+
"ds = Dataset.from_pandas(dataset)\n",
|
382 |
+
"ds"
|
383 |
+
]
|
384 |
+
},
|
385 |
+
{
|
386 |
+
"cell_type": "code",
|
387 |
+
"execution_count": 340,
|
388 |
+
"id": "e88ed93f-0b0c-4743-a506-9a4006534151",
|
389 |
+
"metadata": {},
|
390 |
+
"outputs": [],
|
391 |
+
"source": [
|
392 |
+
"from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
|
393 |
+
"from transformers import DataCollatorWithPadding\n",
|
394 |
+
"from transformers import BertweetTokenizer"
|
395 |
+
]
|
396 |
+
},
|
397 |
+
{
|
398 |
+
"cell_type": "code",
|
399 |
+
"execution_count": 372,
|
400 |
+
"id": "4ec382e5-073b-40e1-8ce6-a6ff9e51644f",
|
401 |
+
"metadata": {},
|
402 |
+
"outputs": [],
|
403 |
+
"source": [
|
404 |
+
"class Tokenizer(BertweetTokenizer):\n",
|
405 |
+
" def __init__(self, *args, **kwargs):\n",
|
406 |
+
" return super().__init__(*args, **kwargs)\n",
|
407 |
+
"\n",
|
408 |
+
" def __call__(self, *args, **kwargs):\n",
|
409 |
+
" return super().__call__(*args, max_length=120, **kwargs)"
|
410 |
+
]
|
411 |
+
},
|
412 |
+
{
|
413 |
+
"cell_type": "code",
|
414 |
+
"execution_count": 373,
|
415 |
+
"id": "56eb937a-483f-4f2f-b7fe-c3da2aa42526",
|
416 |
+
"metadata": {},
|
417 |
+
"outputs": [],
|
418 |
+
"source": [
|
419 |
+
"import torch\n",
|
420 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
421 |
+
"\n",
|
422 |
+
"CHECKPOINT = \"classification/model_with_only_language_models/models/trained_vinai_bertweet-base.pt\"\n",
|
423 |
+
"MODEL_NAME = \"vinai/bertweet-base\"\n",
|
424 |
+
"\n",
|
425 |
+
"def get_device():\n",
|
426 |
+
" #device = torch.device(\"mps\") if torch.mps.is_available() else torch.device(\"cpu\")\n",
|
427 |
+
" return torch.device(\"cpu\")\n",
|
428 |
+
" return device\n",
|
429 |
+
" \n",
|
430 |
+
"\n",
|
431 |
+
"def get_model():\n",
|
432 |
+
" model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)\n",
|
433 |
+
" model.load_state_dict(torch.load(CHECKPOINT))\n",
|
434 |
+
" model.to(get_device())\n",
|
435 |
+
" tokenizer = Tokenizer.from_pretrained(MODEL_NAME, truncation=True, max_length=100)\n",
|
436 |
+
"\n",
|
437 |
+
" return tokenizer, model"
|
438 |
+
]
|
439 |
+
},
|
440 |
+
{
|
441 |
+
"cell_type": "code",
|
442 |
+
"execution_count": 374,
|
443 |
+
"id": "5fe5af4a-3eb8-4fe0-99e8-c967d61241f2",
|
444 |
+
"metadata": {},
|
445 |
+
"outputs": [
|
446 |
+
{
|
447 |
+
"name": "stderr",
|
448 |
+
"output_type": "stream",
|
449 |
+
"text": [
|
450 |
+
"Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
|
451 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
|
452 |
+
"/var/folders/xd/g8p1g555153b4v2qp8q7shb00000gn/T/ipykernel_40634/3099302733.py:15: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
|
453 |
+
" model.load_state_dict(torch.load(CHECKPOINT))\n",
|
454 |
+
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
|
455 |
+
"The tokenizer class you load from this checkpoint is 'BertweetTokenizer'. \n",
|
456 |
+
"The class this function is called from is 'Tokenizer'.\n"
|
457 |
+
]
|
458 |
+
}
|
459 |
+
],
|
460 |
+
"source": [
|
461 |
+
"tokenizer, model = get_model()"
|
462 |
+
]
|
463 |
+
},
|
464 |
+
{
|
465 |
+
"cell_type": "code",
|
466 |
+
"execution_count": 375,
|
467 |
+
"id": "6cdc0d7e-d264-49b8-822e-9a862a929a2f",
|
468 |
+
"metadata": {},
|
469 |
+
"outputs": [],
|
470 |
+
"source": [
|
471 |
+
"def tokenize_function(example, tokenizer):\n",
|
472 |
+
" # Truncate to max length. Note that a tweet's maximum length is 280\n",
|
473 |
+
" # TODO: check dynamic padding: https://huggingface.co/course/chapter3/2?fw=pt#dynamic-padding\n",
|
474 |
+
" #return tokenizer(example[\"cleaned_text\"], truncation=True, max_length=100)\n",
|
475 |
+
" return tokenizer(example[\"cleaned_text\"])"
|
476 |
+
]
|
477 |
+
},
|
478 |
+
{
|
479 |
+
"cell_type": "code",
|
480 |
+
"execution_count": 376,
|
481 |
+
"id": "bc27ce0b-66bb-4a6f-98c5-78983594c3bd",
|
482 |
+
"metadata": {},
|
483 |
+
"outputs": [
|
484 |
+
{
|
485 |
+
"data": {
|
486 |
+
"application/vnd.jupyter.widget-view+json": {
|
487 |
+
"model_id": "ee20a2b256964124930de15d8e97f4ef",
|
488 |
+
"version_major": 2,
|
489 |
+
"version_minor": 0
|
490 |
+
},
|
491 |
+
"text/plain": [
|
492 |
+
"Map: 0%| | 0/7 [00:00<?, ? examples/s]"
|
493 |
+
]
|
494 |
+
},
|
495 |
+
"metadata": {},
|
496 |
+
"output_type": "display_data"
|
497 |
+
},
|
498 |
+
{
|
499 |
+
"name": "stderr",
|
500 |
+
"output_type": "stream",
|
501 |
+
"text": [
|
502 |
+
"Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
|
503 |
+
]
|
504 |
+
}
|
505 |
+
],
|
506 |
+
"source": [
|
507 |
+
"tokenized_datasets = ds.map(lambda x: tokenize_function(x, tokenizer=tokenizer), batched=True)\n",
|
508 |
+
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
|
509 |
+
"\n",
|
510 |
+
"#tokenized_datasets = tokenized_datasets.remove_columns([\"__index_level_0__\", \"cleaned_text\", \"id\"])\n",
|
511 |
+
"tokenized_datasets = tokenized_datasets.remove_columns([\"cleaned_text\", \"id\"])\n",
|
512 |
+
"tokenized_datasets = tokenized_datasets.rename_column(\"viral\", \"labels\")\n",
|
513 |
+
"tokenized_datasets.set_format(\"torch\")"
|
514 |
+
]
|
515 |
+
},
|
516 |
+
{
|
517 |
+
"cell_type": "code",
|
518 |
+
"execution_count": 377,
|
519 |
+
"id": "77a12396-386c-4aba-8ed4-e269ecda13a1",
|
520 |
+
"metadata": {},
|
521 |
+
"outputs": [],
|
522 |
+
"source": [
|
523 |
+
"eval_dataloader = DataLoader(tokenized_datasets, batch_size=1, collate_fn=data_collator)"
|
524 |
+
]
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"cell_type": "code",
|
528 |
+
"execution_count": 378,
|
529 |
+
"id": "dc98302c-d539-4af3-8979-64156dda8317",
|
530 |
+
"metadata": {},
|
531 |
+
"outputs": [
|
532 |
+
{
|
533 |
+
"name": "stdout",
|
534 |
+
"output_type": "stream",
|
535 |
+
"text": [
|
536 |
+
"tensor([0.8640])\n",
|
537 |
+
"tensor([0.5687])\n",
|
538 |
+
"tensor([0.9722])\n",
|
539 |
+
"tensor([0.0006])\n",
|
540 |
+
"tensor([0.0033])\n",
|
541 |
+
"tensor([0.0091])\n",
|
542 |
+
"tensor([0.9982])\n"
|
543 |
+
]
|
544 |
+
}
|
545 |
+
],
|
546 |
+
"source": [
|
547 |
+
"if torch.mps.is_available():\n",
|
548 |
+
" torch.mps.empty_cache()\n",
|
549 |
+
"if torch.cuda.is_available():\n",
|
550 |
+
" torch.cuda.empty_cache()\n",
|
551 |
+
"\n",
|
552 |
+
"model.eval()\n",
|
553 |
+
"for batch in eval_dataloader:\n",
|
554 |
+
" batch = {k: v.to(get_device()) for k, v in batch.items()}\n",
|
555 |
+
" with torch.no_grad():\n",
|
556 |
+
" outputs = model(**batch)\n",
|
557 |
+
"\n",
|
558 |
+
" logits = outputs.logits\n",
|
559 |
+
" probabilities = F.softmax(logits, dim=-1)\n",
|
560 |
+
" predictions = torch.argmax(logits, dim=-1)\n",
|
561 |
+
" \n",
|
562 |
+
" print(probabilities[:, 1])\n",
|
563 |
+
" #print(predictions)"
|
564 |
+
]
|
565 |
+
},
|
566 |
+
{
|
567 |
+
"cell_type": "code",
|
568 |
+
"execution_count": 379,
|
569 |
+
"id": "4feb1954-7ad2-461d-bf52-8dd2e0d6591f",
|
570 |
+
"metadata": {},
|
571 |
+
"outputs": [
|
572 |
+
{
|
573 |
+
"name": "stdout",
|
574 |
+
"output_type": "stream",
|
575 |
+
"text": [
|
576 |
+
"128.65210151672363 MiB\n"
|
577 |
+
]
|
578 |
+
}
|
579 |
+
],
|
580 |
+
"source": [
|
581 |
+
"print(sum(p.numel() for p in model.parameters()) / 1024**2, \"MiB\")"
|
582 |
+
]
|
583 |
+
},
|
584 |
+
{
|
585 |
+
"cell_type": "code",
|
586 |
+
"execution_count": 380,
|
587 |
+
"id": "15e2dc8f-c38d-4828-9c90-638c9782eb54",
|
588 |
+
"metadata": {},
|
589 |
+
"outputs": [],
|
590 |
+
"source": [
|
591 |
+
"from transformers import pipeline"
|
592 |
+
]
|
593 |
+
},
|
594 |
+
{
|
595 |
+
"cell_type": "code",
|
596 |
+
"execution_count": 381,
|
597 |
+
"id": "37af7000-ab64-4b1c-bd29-c648b433420f",
|
598 |
+
"metadata": {},
|
599 |
+
"outputs": [
|
600 |
+
{
|
601 |
+
"name": "stderr",
|
602 |
+
"output_type": "stream",
|
603 |
+
"text": [
|
604 |
+
"Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
|
605 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
|
606 |
+
"/var/folders/xd/g8p1g555153b4v2qp8q7shb00000gn/T/ipykernel_40634/3099302733.py:15: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
|
607 |
+
" model.load_state_dict(torch.load(CHECKPOINT))\n",
|
608 |
+
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
|
609 |
+
"The tokenizer class you load from this checkpoint is 'BertweetTokenizer'. \n",
|
610 |
+
"The class this function is called from is 'Tokenizer'.\n"
|
611 |
+
]
|
612 |
+
}
|
613 |
+
],
|
614 |
+
"source": [
|
615 |
+
"tokenizer, model = get_model()"
|
616 |
+
]
|
617 |
+
},
|
618 |
+
{
|
619 |
+
"cell_type": "code",
|
620 |
+
"execution_count": 382,
|
621 |
+
"id": "a05fa75b-e571-4b14-b158-1b43ee17871a",
|
622 |
+
"metadata": {},
|
623 |
+
"outputs": [
|
624 |
+
{
|
625 |
+
"name": "stderr",
|
626 |
+
"output_type": "stream",
|
627 |
+
"text": [
|
628 |
+
"Device set to use cpu\n"
|
629 |
+
]
|
630 |
+
}
|
631 |
+
],
|
632 |
+
"source": [
|
633 |
+
"pipe = pipeline(\n",
|
634 |
+
" 'text-classification',\n",
|
635 |
+
" model=model,\n",
|
636 |
+
" tokenizer=tokenizer,\n",
|
637 |
+
" device=\"cpu\",\n",
|
638 |
+
")"
|
639 |
+
]
|
640 |
+
},
|
641 |
+
{
|
642 |
+
"cell_type": "code",
|
643 |
+
"execution_count": 383,
|
644 |
+
"id": "f1bcb478-c16f-4135-9d61-9df69538e8ce",
|
645 |
+
"metadata": {},
|
646 |
+
"outputs": [],
|
647 |
+
"source": [
|
648 |
+
"texts = [\n",
|
649 |
+
" 'tl;dr\\n\\nHumans are just ChatGPT Wrappers in sunglasses\\n \\n& I couldn’t be more optimistic about the future as a result\\n\\nThank you \\n@ekang426322\\n for an exceptionally curated day at BUIDL Europe!\\n 🫶',\n",
|
650 |
+
" 'USD0++ discovered a new source of yield — depeg. \\n\\nRespect to the innovation\\n',\n",
|
651 |
+
" 'here you can see 4 ai agents \\n@dongossen100\\n , me, \\n@WorldWideWarden16\\n and \\n@provenauthority291\\n discuss how we can make single-task manual low memory agents(humans) work harder to achieve Artificial Generalized Superintelligence',\n",
|
652 |
+
" '\\n arrived to lisbon, building energy is the air',\n",
|
653 |
+
" \"\\n received a wealth of valuable feedback on the journey to reaching 7,000 users for X Rank in just 10 days\\n\\ncan't wait to address it all\\n\\nmain points:\\n\\n- show rank in X DMs to quickly filter out inbox\\n\\n- rank labels are too distracting (already fixed) \\n\\n- add an option for users to toggle on/off scores inside the feed\\n\\n- add a percentile label, e.g. qw 801 (Top 0.1%)\\n\\n- enable others to add reviews to impact the rank \\n\\n- explain in detail how rankings are calculated \\n\\n- show breakdowns of people in DeFi, DePin, Memecoins etc.\\n\\n- make X Rank opensource \\n\\n- create a web version\\n\\np.s. the current version is just a tiny step in our roadmap for the next two months. \\n\\nthank you for the feedback \\n@socialfi_panda101\\n \\n@adamkillam100\\n \\n@FamKien106\\n \\n@antongotchi104\\n \\n@kliuless128\\n \\n@0xsudogm163\\n \\n@monosarin120\\n \\n@flb_xyz56\\n 🫶\\n \",\n",
|
654 |
+
" 'ai agents are in the air\\n\\nand web3 is trained to sniff out alpha',\n",
|
655 |
+
" 'While Trump is going to do something great with crypto, Wallchain is going to do something great with incentives🚀',\n",
|
656 |
+
"]"
|
657 |
+
]
|
658 |
+
},
|
659 |
+
{
|
660 |
+
"cell_type": "code",
|
661 |
+
"execution_count": 403,
|
662 |
+
"id": "52ab46d9-ed16-43dd-ab0b-4af0757e7c96",
|
663 |
+
"metadata": {},
|
664 |
+
"outputs": [
|
665 |
+
{
|
666 |
+
"name": "stdout",
|
667 |
+
"output_type": "stream",
|
668 |
+
"text": [
|
669 |
+
" 86.40%\n",
|
670 |
+
" 56.87%\n",
|
671 |
+
" 97.22%\n",
|
672 |
+
" 0.06%\n",
|
673 |
+
" 0.33%\n",
|
674 |
+
" 0.91%\n",
|
675 |
+
" 99.82%\n"
|
676 |
+
]
|
677 |
+
}
|
678 |
+
],
|
679 |
+
"source": [
|
680 |
+
"for text in texts:\n",
|
681 |
+
" res = pipe(clean_tweet(text, demojize_emojis=False), top_k=2)\n",
|
682 |
+
" LABEL_1_result = [x['score'] for x in res if x['label'] == 'LABEL_1'][0]\n",
|
683 |
+
" print(f\"{LABEL_1_result:7.2%}\")"
|
684 |
+
]
|
685 |
+
},
|
686 |
+
{
|
687 |
+
"cell_type": "code",
|
688 |
+
"execution_count": null,
|
689 |
+
"id": "033adc09-7c2f-414b-a7e4-d7d8095af580",
|
690 |
+
"metadata": {},
|
691 |
+
"outputs": [],
|
692 |
+
"source": []
|
693 |
+
},
|
694 |
+
{
|
695 |
+
"cell_type": "code",
|
696 |
+
"execution_count": null,
|
697 |
+
"id": "117e3390-130a-4750-ad6a-c03c80050b0f",
|
698 |
+
"metadata": {},
|
699 |
+
"outputs": [],
|
700 |
+
"source": []
|
701 |
+
},
|
702 |
+
{
|
703 |
+
"cell_type": "code",
|
704 |
+
"execution_count": null,
|
705 |
+
"id": "612dee88-0e40-4072-a3af-21a6f3dc5488",
|
706 |
+
"metadata": {},
|
707 |
+
"outputs": [],
|
708 |
+
"source": []
|
709 |
+
}
|
710 |
+
],
|
711 |
+
"metadata": {
|
712 |
+
"kernelspec": {
|
713 |
+
"display_name": "Python (ViralTweets)",
|
714 |
+
"language": "python",
|
715 |
+
"name": "viraltweets"
|
716 |
+
},
|
717 |
+
"language_info": {
|
718 |
+
"codemirror_mode": {
|
719 |
+
"name": "ipython",
|
720 |
+
"version": 3
|
721 |
+
},
|
722 |
+
"file_extension": ".py",
|
723 |
+
"mimetype": "text/x-python",
|
724 |
+
"name": "python",
|
725 |
+
"nbconvert_exporter": "python",
|
726 |
+
"pygments_lexer": "ipython3",
|
727 |
+
"version": "3.12.2"
|
728 |
+
}
|
729 |
+
},
|
730 |
+
"nbformat": 4,
|
731 |
+
"nbformat_minor": 5
|
732 |
+
}
|
all_metric_stats.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
classification/model_with_extra_features/classification.py
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MODIFY AS REQUIRED
|
2 |
+
import torch
|
3 |
+
import pandas as pd
|
4 |
+
import seaborn as sns
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
|
9 |
+
import plotly.express as px
|
10 |
+
import plotly.graph_objects as go
|
11 |
+
from plotly.subplots import make_subplots
|
12 |
+
|
13 |
+
from sklearn.model_selection import train_test_split
|
14 |
+
from datasets import load_dataset
|
15 |
+
from datasets import Dataset, DatasetDict
|
16 |
+
from transformers import DataCollatorWithPadding
|
17 |
+
|
18 |
+
from torch.utils.data import DataLoader
|
19 |
+
from transformers import AutoTokenizer
|
20 |
+
|
21 |
+
from torch.optim import AdamW
|
22 |
+
from torch.nn import BCEWithLogitsLoss
|
23 |
+
|
24 |
+
from transformers import get_scheduler
|
25 |
+
|
26 |
+
from tqdm.auto import tqdm
|
27 |
+
|
28 |
+
import evaluate
|
29 |
+
|
30 |
+
from tqdm import tqdm
|
31 |
+
import logging
|
32 |
+
logging.basicConfig(level=logging.INFO)
|
33 |
+
|
34 |
+
from text_preprocessing import clean_tweet, clear_reply_mentions, normalizeTweet
|
35 |
+
from custom_model import CustomModel
|
36 |
+
|
37 |
+
'''
|
38 |
+
DATA_PATH = "../../data"
|
39 |
+
|
40 |
+
PROCESSED_PATH = f"{DATA_PATH}/processed"
|
41 |
+
|
42 |
+
PROCESSED_PATH_VIRAL = f'{DATA_PATH}/new/processed/viral'
|
43 |
+
PROCESSED_PATH_COVID = f'{DATA_PATH}/new/processed/covid'
|
44 |
+
'''
|
45 |
+
|
46 |
+
# Different models
|
47 |
+
BERT_BASE_UNCASED = "bert-base-uncased"
|
48 |
+
BERT_BASE_CASED = "bert-base-cased"
|
49 |
+
ROBERTA_BASE = "roberta-base"
|
50 |
+
BERT_TWEET = "vinai/bertweet-base"
|
51 |
+
|
52 |
+
# TODO: Don't forget to cite papers if you use some model
|
53 |
+
BERT_TINY = "prajjwal1/bert-tiny"
|
54 |
+
|
55 |
+
TWEET_MAX_LENGTH = 280
|
56 |
+
|
57 |
+
# TEST SPLIT RATIO + MODELS (ADD MORE MODELS FROM ABOVE)
|
58 |
+
MODELS = [BERT_TWEET, BERT_TINY, BERT_BASE_CASED, ROBERTA_BASE]
|
59 |
+
TEST_RATIO = 0.2
|
60 |
+
|
61 |
+
TOP_FEATURES = ["verified", "tweet_length", "possibly_sensitive", "sentiment", "nb_of_hashtags", "has_media", "nb_of_mentions"]
|
62 |
+
|
63 |
+
def preprocess_data(dataset):
    """Build the same-author / same-day viral vs. non-viral table used for training.

    The input frame is expected to provide the columns read below:
    ``has_media``, ``possibly_sensitive``, ``sentiment``, ``verified``,
    ``created_at`` (datetime-like), ``viral`` (boolean, used as a mask),
    ``author_id``, ``id``, ``text`` and every column named in ``TOP_FEATURES``.

    The caller's DataFrame is left untouched: all derived columns are added to
    a new frame.

    Returns:
        A DataFrame with the columns ``id``, ``cleaned_text``,
        ``extra_features`` (one list per row, ordered like ``TOP_FEATURES``)
        and ``viral`` (0/1).
    """
    # ``assign`` returns a new frame, so the conversions below really replace
    # the columns (and their dtypes) without writing into the caller's object.
    dataset = dataset.assign(
        has_media=dataset.has_media.astype("int"),
        possibly_sensitive=dataset.possibly_sensitive.astype("int"),
        sentiment=dataset.sentiment.replace({'POSITIVE': 1, 'NEGATIVE': 0}),
        verified=dataset['verified'].astype(int),
        # normalize() sets every timestamp to midnight, i.e. keeps only the date part
        date=dataset.created_at.dt.normalize(),
    )

    viral_tweets = dataset[dataset.viral]
    non_viral_tweets = dataset[~dataset.viral]

    # Pair every non-viral tweet with the viral tweets its author posted the same day.
    temp = non_viral_tweets.merge(
        viral_tweets[['author_id', 'date', 'id', 'viral']],
        on=['author_id', 'date'],
        suffixes=(None, '_y'),
    )
    same_day_viral_ids = temp.id_y.unique()

    # Keep a single viral and a single non-viral tweet per (author, day).
    same_day_viral_tweets = viral_tweets[viral_tweets.id.isin(same_day_viral_ids)].drop_duplicates(subset=['author_id', 'date'])
    same_day_non_viral_tweets = temp.drop_duplicates(subset=['author_id', 'date'])

    logging.info(f"Number of viral tweets tweeted on the same day {len(same_day_viral_tweets)}")
    logging.info(f"Number of non viral tweets tweeted on the same day {len(same_day_non_viral_tweets)}")

    dataset = pd.concat([same_day_viral_tweets, same_day_non_viral_tweets], axis=0)
    dataset = dataset[['id', 'text'] + TOP_FEATURES + ['viral']]

    # Integer labels plus cleaned text ready for tokenization.
    # TODO: drop the ``cleaned_text`` transformation if the raw text should be kept as is.
    dataset = dataset.assign(
        viral=dataset.viral.astype(int),
        cleaned_text=dataset.text.apply(lambda x: clean_tweet(x, demojize_emojis=False)),
    )

    dataset = dataset.dropna()
    # One list of extra numerical features per row, in TOP_FEATURES order.
    dataset = dataset.assign(extra_features=dataset[TOP_FEATURES].values.tolist())

    return dataset[['id', 'cleaned_text', 'extra_features', 'viral']]
|
111 |
+
|
112 |
+
def prepare_dataset(sample_data, balance=False):
    """Split *sample_data* into train/test sets and load them as a dataset dict.

    The split is stratified on the ``viral`` column with a fixed seed. Both
    splits are written to ``train.parquet.gzip`` / ``test.parquet.gzip`` in the
    current working directory and then re-loaded through ``load_dataset``.

    Args:
        sample_data: DataFrame with (at least) a 0/1 ``viral`` column.
        balance: When true, the *test* split is down-sampled so that it holds
            as many viral as non-viral rows. The training split is never
            re-balanced.

    Returns:
        The object returned by ``load_dataset`` with a ``train`` and a
        ``test`` split.
    """
    # Split the train and test data so that each has a fixed proportion of viral tweets
    train_dataset, eval_dataset = train_test_split(
        sample_data, test_size=TEST_RATIO, random_state=42, stratify=sample_data.viral
    )

    if balance:
        eval_virals = eval_dataset[eval_dataset.viral == 1]
        eval_non_virals = eval_dataset[eval_dataset.viral == 0]
        # Down-sample whichever class is larger. Sampling is seeded so that the
        # evaluation set is reproducible, like the split above, and capped at
        # the smaller class size so ``sample`` can never be asked for more rows
        # than exist.
        n_per_class = min(len(eval_virals), len(eval_non_virals))
        if len(eval_virals) > n_per_class:
            eval_virals = eval_virals.sample(n=n_per_class, random_state=42)
        if len(eval_non_virals) > n_per_class:
            eval_non_virals = eval_non_virals.sample(n=n_per_class, random_state=42)
        eval_dataset = pd.concat([eval_virals, eval_non_virals])

    logging.info('{:>5,} training samples with {:>5,} positives and {:>5,} negatives'.format(
        len(train_dataset), len(train_dataset[train_dataset.viral == 1]), len(train_dataset[train_dataset.viral == 0])))
    logging.info('{:>5,} validation samples with {:>5,} positives and {:>5,} negatives'.format(
        len(eval_dataset), len(eval_dataset[eval_dataset.viral == 1]), len(eval_dataset[eval_dataset.viral == 0])))

    # The frames are written with their index; downstream code removes the
    # resulting "__index_level_0__" column after tokenization.
    train_dataset.to_parquet("train.parquet.gzip", compression='gzip')
    eval_dataset.to_parquet("test.parquet.gzip", compression='gzip')

    ds = load_dataset("parquet", data_files={'train': 'train.parquet.gzip', 'test': 'test.parquet.gzip'})
    return ds
|
132 |
+
|
133 |
+
def tokenize_function(example, tokenizer):
    """Run *tokenizer* over the ``cleaned_text`` entry of *example*.

    Truncation is requested without an explicit ``max_length``; padding is not
    done here and is left to the data collator used when batching.
    # TODO: check dynamic padding: https://huggingface.co/course/chapter3/2?fw=pt#dynamic-padding
    """
    texts = example["cleaned_text"]
    return tokenizer(texts, truncation=True)
|
137 |
+
|
138 |
+
|
139 |
+
def test_all_models(ds, nb_extra_dims, models=MODELS):
    """Fine-tune and evaluate one ``CustomModel`` per checkpoint in *models*.

    For every checkpoint the dataset is tokenized, the model is trained for
    15 epochs (batch size 32, AdamW at 5e-5 with a linear schedule and no
    warm-up, ``BCEWithLogitsLoss``) and then scored on the ``test`` split with
    accuracy, recall, precision and F1. The accumulated report is logged and
    rewritten to
    ``same_day_as_viral_with_features_train_test_balanced_accuracy.txt`` after
    each checkpoint.

    Args:
        ds: Dataset dict with ``train`` and ``test`` splits containing the
            columns ``cleaned_text``, ``id``, ``viral``, ``extra_features``
            and ``__index_level_0__``.
        nb_extra_dims: Number of extra numerical features per sample.
        models: Iterable of checkpoint names to evaluate.

    Returns:
        Dict mapping each checkpoint name to the list of per-step training
        losses.
    """
    models_losses = {}
    # Only touch the MPS backend when it is actually usable on this machine.
    use_mps = torch.backends.mps.is_available()
    device = torch.device("mps") if use_mps else torch.device("cpu")

    output = ""

    for checkpoint in models:
        if use_mps:
            # Release memory cached by the previous checkpoint's run.
            torch.mps.empty_cache()
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        custom_model = CustomModel(checkpoint, num_extra_dims=nb_extra_dims, num_labels=2)
        custom_model.to(device)

        tokenized_datasets = ds.map(lambda x: tokenize_function(x, tokenizer=tokenizer), batched=True)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        # Keep only the columns the model's forward() accepts.
        tokenized_datasets = tokenized_datasets.remove_columns(["__index_level_0__", "cleaned_text", "id"])
        tokenized_datasets = tokenized_datasets.rename_column("viral", "labels")
        tokenized_datasets.set_format("torch")

        batch_size = 32

        train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
        eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator)

        criterion = BCEWithLogitsLoss()
        optimizer = AdamW(custom_model.parameters(), lr=5e-5)

        num_epochs = 15
        num_training_steps = num_epochs * len(train_dataloader)
        lr_scheduler = get_scheduler(
            name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        progress_bar = tqdm(range(num_training_steps))

        losses = []
        custom_model.train()
        for epoch in range(num_epochs):
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                # Drop only the trailing size-1 dimension: a bare squeeze()
                # would also collapse a final batch holding a single sample
                # and break the shape match with the labels.
                logits = custom_model(**batch).squeeze(-1)

                loss = criterion(logits, batch['labels'].float())
                losses.append(loss.item())
                loss.backward()

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
        progress_bar.close()

        models_losses[checkpoint] = losses

        metric = evaluate.combine(["accuracy", "recall", "precision", "f1"])
        custom_model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                logits = custom_model(**batch).squeeze(-1)

            # Threshold the sigmoid at 0.5 and hand the metric 1-D integer
            # predictions shaped like the references.
            predictions = torch.round(torch.sigmoid(logits)).long()
            metric.add_batch(predictions=predictions, references=batch["labels"])

        output += f"checkpoint: {checkpoint}: {metric.compute()}\n"
        logging.info(output)
        # Rewritten after every checkpoint so partial results survive a crash.
        with open("same_day_as_viral_with_features_train_test_balanced_accuracy.txt", "w") as text_file:
            text_file.write(output)
    return models_losses
|
209 |
+
|
210 |
+
def main():
    """Load the labeled tweets, build the train/test datasets and run every model."""
    # The parquet file is looked up relative to the current working directory,
    # so it should sit next to this script when the script is run from its folder.
    raw_tweets = pd.read_parquet('final_dataset_since_october_2022.parquet.gzip')

    prepared = preprocess_data(raw_tweets)
    splits = prepare_dataset(prepared, balance=True)

    test_all_models(splits, nb_extra_dims=len(TOP_FEATURES))
|
219 |
+
|
220 |
+
if __name__ == "__main__":
|
221 |
+
main()
|
classification/model_with_extra_features/custom_model.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoConfig, AutoModel
|
2 |
+
|
3 |
+
import torch
|
4 |
+
|
5 |
+
class CustomModel(torch.nn.Module):
    """
    Transformer backbone with a small MLP head that also consumes extra features.

    The embedding of the first token of the backbone's last hidden state is
    concatenated with the extra numerical/categorical features and passed
    through ``Linear -> LeakyReLU -> Linear -> LeakyReLU -> Dropout -> Linear``.
    The head emits a single logit per sample (binary classification).
    """

    def __init__(self, model_name, num_extra_dims, num_labels=2):
        """
        model_name: checkpoint name handed to ``AutoConfig`` / ``AutoModel``.
        num_extra_dims: number of extra numerical/categorical features per sample.
        num_labels: forwarded to the backbone config only; the head always
            outputs one logit.
        """
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
        self.transformer = AutoModel.from_pretrained(model_name, config=self.config)
        # Hidden size depends on the chosen backbone, so read it from its config.
        num_hidden_size = self.transformer.config.hidden_size

        self.linear_layer_1 = torch.nn.Linear(num_hidden_size + num_extra_dims, 32)
        self.linear_layer_2 = torch.nn.Linear(32, 16)
        # Output size is 1 since this is a binary classification problem
        self.linear_layer_output = torch.nn.Linear(16, 1)
        self.relu = torch.nn.LeakyReLU(0.6)
        self.dropout_1 = torch.nn.Dropout(0.5)

    def forward(self, input_ids, extra_features, attention_mask=None, token_type_ids=None, labels=None):
        """
        extra_features should be of shape [batch_size, dim]
        where dim is the number of additional numerical/categorical dimensions.

        ``labels`` is accepted so that a whole batch dict can be unpacked into
        the call; it is not used here.

        Returns the logits, shape [batch size, 1].
        """
        hidden_states = self.transformer(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        cls_embeds = hidden_states.last_hidden_state[:, 0, :]  # [batch size, hidden size]

        # Align the dtype of the extra features with the embeddings so that
        # integer or double-precision features can be fed to the linear layers.
        extra_features = extra_features.to(dtype=cls_embeds.dtype)
        concat = torch.cat((cls_embeds, extra_features), dim=-1)  # [batch size, hidden size + num extra dims]

        output_1 = self.relu(self.linear_layer_1(concat))  # [batch size, 32]
        output_2 = self.relu(self.linear_layer_2(output_1))  # [batch size, 16]
        # Dropout regularizes the hidden representation. Applying it to the
        # final logit instead would randomly zero or double the prediction
        # itself during training.
        final_output = self.linear_layer_output(self.dropout_1(output_2))  # [batch size, 1]

        return final_output
|
classification/model_with_extra_features/final_dataset_since_october_2022.parquet.gzip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d648d6af7606807281540bb37516ac7b8ac5270df07cd43ffb3a0430d77306cf
|
3 |
+
size 35666675
|
classification/model_with_extra_features/same_day_as_viral_with_features_train_test_balanced_accuracy.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
checkpoint: vinai/bertweet-base: {'accuracy': 0.7770700636942676, 'recall': 0.8535031847133758, 'precision': 0.7403314917127072, 'f1': 0.7928994082840237}
|
2 |
+
checkpoint: prajjwal1/bert-tiny: {'accuracy': 0.7229299363057324, 'recall': 0.8853503184713376, 'precision': 0.6682692307692307, 'f1': 0.7616438356164382}
|
3 |
+
checkpoint: bert-base-cased: {'accuracy': 0.7038216560509554, 'recall': 0.8152866242038217, 'precision': 0.6666666666666666, 'f1': 0.7335243553008596}
|
4 |
+
checkpoint: roberta-base: {'accuracy': 0.7292993630573248, 'recall': 0.8598726114649682, 'precision': 0.6818181818181818, 'f1': 0.7605633802816901}
|
classification/model_with_extra_features/text_preprocessing.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import html
|
2 |
+
|
3 |
+
def clear_reply_mentions(tweet):
    '''Remove the user mentions found at the start of a reply to a tweet.

    Leading space-separated tokens that start with "@" are dropped; everything
    from the first other token onwards is kept untouched. If every token is a
    mention, an empty string is returned.

    Example: @user1 @user2 okay @user3 -> okay @user3
    '''
    # We don't need to use any sophisticated tokenization here like nltk
    tokens = tweet.split(" ")
    for index, token in enumerate(tokens):
        if not token.startswith("@"):
            return " ".join(tokens[index:])
    return ""
|
14 |
+
|
15 |
+
from emoji import demojize, is_emoji
|
16 |
+
from nltk.tokenize import TweetTokenizer
|
17 |
+
|
18 |
+
tweet_tokenizer = TweetTokenizer()
|
19 |
+
|
20 |
+
def normalizeToken(token, emojis_found=None, replace_user_mentions=True, replace_urls=True, demojize_emojis=True):
    '''Normalize a single token of a tweet.

    Params:
        token: the token to normalize
        emojis_found: optional list; every single-character emoji token seen is
            appended to it. When omitted, a fresh list is used for the call
            (a shared default list would keep growing across calls).
        replace_user_mentions: If true, a token starting with "@" becomes @USER
        replace_urls: If true, a token starting with "http" or "www"
            (case-insensitive) becomes HTTPURL
        demojize_emojis: If true, single-character emoji tokens are demojized

    Returns:
        The normalized token
    '''
    if emojis_found is None:
        emojis_found = []

    lowercased_token = token.lower()
    if token.startswith("@") and replace_user_mentions:
        return "@USER"
    if lowercased_token.startswith(("http", "www")) and replace_urls:
        return "HTTPURL"
    if len(token) == 1 and is_emoji(token):
        emojis_found.append(token)
        return demojize(token) if demojize_emojis else token
    # Typographic apostrophe / ellipsis are mapped to their ASCII spelling.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    return token
|
39 |
+
|
40 |
+
|
41 |
+
def normalizeTweet(tweet, tokenizer=tweet_tokenizer, replace_user_mentions=True, replace_urls=True, demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Tokenize a tweet, normalize each token and re-join the result.

    Params:
        tweet: the text to normalize
        tokenizer: object whose tokenize() method splits the text into tokens
        replace_user_mentions, replace_urls, demojize_emojis: forwarded to normalizeToken
        bert_tweet_specific_processing: If true, applies the fixed list of
            contraction / "a.m."-"p.m." spacing rewrites below

    Returns:
        A tuple (normalized text with whitespace collapsed, list of emojis found)
    '''
    emojis_found = []

    # Typographic apostrophes and ellipses are replaced before tokenizing.
    ascii_punct_tweet = tweet.replace("’", "'").replace("…", "...")

    normalized_tokens = []
    for token in tokenizer.tokenize(ascii_punct_tweet):
        normalized_tokens.append(
            normalizeToken(
                token,
                emojis_found=emojis_found,
                replace_user_mentions=replace_user_mentions,
                replace_urls=replace_urls,
                demojize_emojis=demojize_emojis,
            )
        )
    normTweet = " ".join(normalized_tokens)

    if bert_tweet_specific_processing:
        # Applied strictly in this order: later rules rely on the spacing
        # produced by earlier ones.
        rewrites = (
            ("cannot ", "can not "),
            ("n't ", " n't "),
            ("n 't ", " n't "),
            ("ca n't", "can't"),
            ("ai n't", "ain't"),
            ("'m ", " 'm "),
            ("'re ", " 're "),
            ("'s ", " 's "),
            ("'ll ", " 'll "),
            ("'d ", " 'd "),
            ("'ve ", " 've "),
            (" p . m .", " p.m."),
            (" p . m ", " p.m "),
            (" a . m .", " a.m."),
            (" a . m ", " a.m "),
        )
        for old, new in rewrites:
            normTweet = normTweet.replace(old, new)

    collapsed = " ".join(normTweet.split())
    return collapsed, emojis_found
|
73 |
+
|
74 |
+
|
75 |
+
def clean_tweet(tweet, clear_html_chars=True, replace_user_mentions=True, replace_urls=True,
                demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Clean a tweet before tokenization. Each step can be switched off.

    Params:
        tweet: the tweet to clean
        clear_html_chars: If true, unescapes html entities found in the tweet
        replace_user_mentions: If true, replaces user mentions with the token @USER
        replace_urls: If true, replaces urls with the token HTTPURL
        demojize_emojis: If true, demojizes emojis
        bert_tweet_specific_processing: If true, applies the extra
            BertTweet-oriented rewrites done by normalizeTweet

    Returns:
        The cleaned tweet
    '''
    # Step 1: drop the mentions Twitter inserts at the start of a reply. They do
    # not count towards the tweet's character limit and can inflate its length.
    text = clear_reply_mentions(tweet)

    # Step 2: strip carriage returns and new lines.
    # NOTE(review): they are removed without inserting a space, so words on
    # either side of a line break end up joined - confirm this is intended.
    for line_break in ('\r', '\n'):
        text = text.replace(line_break, '')

    # Step 3: optionally unescape html entities.
    if clear_html_chars:
        text = html.unescape(text)

    # Step 4: remaining normalization (emojis, urls, mentions, ...). The list of
    # emojis returned alongside the text is not used here.
    normalized_tweet, _emojis = normalizeTweet(
        text,
        replace_user_mentions=replace_user_mentions,
        replace_urls=replace_urls,
        demojize_emojis=demojize_emojis,
        bert_tweet_specific_processing=bert_tweet_specific_processing,
    )

    # TODO: process emoticons? e.g. :)

    return normalized_tweet
|
classification/model_with_only_language_models/classification.py
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MODIFY AS REQUIRED
|
2 |
+
import torch
|
3 |
+
import pandas as pd
|
4 |
+
import seaborn as sns
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
|
9 |
+
import plotly.express as px
|
10 |
+
import plotly.graph_objects as go
|
11 |
+
from plotly.subplots import make_subplots
|
12 |
+
|
13 |
+
from sklearn.model_selection import train_test_split
|
14 |
+
from datasets import load_dataset
|
15 |
+
from datasets import Dataset, DatasetDict
|
16 |
+
from transformers import DataCollatorWithPadding
|
17 |
+
|
18 |
+
from torch.utils.data import DataLoader
|
19 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
20 |
+
|
21 |
+
from torch.optim import AdamW
|
22 |
+
from torch.nn import BCEWithLogitsLoss
|
23 |
+
|
24 |
+
from transformers import get_scheduler
|
25 |
+
|
26 |
+
from tqdm.auto import tqdm
|
27 |
+
|
28 |
+
import evaluate
|
29 |
+
|
30 |
+
from tqdm import tqdm
|
31 |
+
import logging
|
32 |
+
logging.basicConfig(level=logging.INFO)
|
33 |
+
|
34 |
+
from text_preprocessing import clean_tweet, clear_reply_mentions, normalizeTweet
|
35 |
+
|
36 |
+
'''
|
37 |
+
DATA_PATH = "../../data"
|
38 |
+
|
39 |
+
PROCESSED_PATH = f"{DATA_PATH}/processed"
|
40 |
+
|
41 |
+
PROCESSED_PATH_VIRAL = f'{DATA_PATH}/new/processed/viral'
|
42 |
+
PROCESSED_PATH_COVID = f'{DATA_PATH}/new/processed/covid'
|
43 |
+
'''
|
44 |
+
|
45 |
+
# Different models
|
46 |
+
BERT_BASE_UNCASED = "bert-base-uncased"
|
47 |
+
BERT_BASE_CASED = "bert-base-cased"
|
48 |
+
ROBERTA_BASE = "roberta-base"
|
49 |
+
BERT_TWEET = "vinai/bertweet-base"
|
50 |
+
# BERT_TWEET_LARGE = "vinai/bertweet-large"
|
51 |
+
DEBERTA_V3 = "microsoft/deberta-v3-base"
|
52 |
+
|
53 |
+
# TODO: Don't forget to cite papers if you use some model
|
54 |
+
BERT_TINY = "prajjwal1/bert-tiny"
|
55 |
+
|
56 |
+
TWEET_MAX_LENGTH = 280
|
57 |
+
|
58 |
+
# TEST SPLIT RATIO + MODELS (ADD MORE MODELS FROM ABOVE)
|
59 |
+
# MODELS = [BERT_TWEET, BERT_TINY, BERT_BASE_CASED, ROBERTA_BASE]
|
60 |
+
MODELS = [DEBERTA_V3]
|
61 |
+
TEST_RATIO = 0.2
|
62 |
+
|
63 |
+
def preprocess_data(dataset):
    """Build the same-author / same-day viral vs. non-viral table used for training.

    The input frame is expected to provide the columns read below:
    ``created_at`` (datetime-like), ``viral`` (boolean, used as a mask),
    ``author_id``, ``id`` and ``text``.

    The caller's DataFrame is left untouched: the helper ``date`` column is
    added to a new frame.

    Returns:
        A DataFrame with the columns ``id``, ``cleaned_text`` and ``viral`` (0/1).
    """
    # normalize() sets every timestamp to midnight, i.e. keeps only the date
    # part. ``assign`` returns a new frame, so the input is not mutated.
    dataset = dataset.assign(date=dataset.created_at.dt.normalize())

    viral_tweets = dataset[dataset.viral]
    non_viral_tweets = dataset[~dataset.viral]

    # Pair every non-viral tweet with the viral tweets its author posted the same day.
    temp = non_viral_tweets.merge(
        viral_tweets[['author_id', 'date', 'id', 'viral']],
        on=['author_id', 'date'],
        suffixes=(None, '_y'),
    )
    same_day_viral_ids = temp.id_y.unique()

    # Keep a single viral and a single non-viral tweet per (author, day).
    same_day_viral_tweets = viral_tweets[viral_tweets.id.isin(same_day_viral_ids)].drop_duplicates(subset=['author_id', 'date'])
    same_day_non_viral_tweets = temp.drop_duplicates(subset=['author_id', 'date'])

    logging.info(f"Number of viral tweets tweeted on the same day {len(same_day_viral_tweets)}")
    logging.info(f"Number of non viral tweets tweeted on the same day {len(same_day_non_viral_tweets)}")

    dataset = pd.concat([same_day_viral_tweets, same_day_non_viral_tweets], axis=0)
    dataset = dataset[['id', 'text', 'viral']]

    # Integer labels plus cleaned text ready for tokenization.
    # TODO: drop the ``cleaned_text`` transformation if the raw text should be kept as is.
    dataset = dataset.assign(
        viral=dataset.viral.astype(int),
        cleaned_text=dataset.text.apply(lambda x: clean_tweet(x, demojize_emojis=False)),
    )

    dataset = dataset.dropna()

    return dataset[['id', 'cleaned_text', 'viral']]
|
103 |
+
|
104 |
+
def prepare_dataset(sample_data, balance=False):
    """Split *sample_data* into train/test sets and load them as a dataset dict.

    The split is stratified on the ``viral`` column with a fixed seed. Both
    splits are written to ``train.parquet.gzip`` / ``test.parquet.gzip`` in the
    current working directory and then re-loaded through ``load_dataset``.

    Args:
        sample_data: DataFrame with (at least) a 0/1 ``viral`` column.
        balance: When true, the *test* split is down-sampled so that it holds
            as many viral as non-viral rows. The training split is never
            re-balanced.

    Returns:
        The object returned by ``load_dataset`` with a ``train`` and a
        ``test`` split.
    """
    # Split the train and test data so that each has a fixed proportion of viral tweets
    train_dataset, eval_dataset = train_test_split(
        sample_data, test_size=TEST_RATIO, random_state=42, stratify=sample_data.viral
    )

    if balance:
        eval_virals = eval_dataset[eval_dataset.viral == 1]
        eval_non_virals = eval_dataset[eval_dataset.viral == 0]
        # Down-sample whichever class is larger. Sampling is seeded so that the
        # evaluation set is reproducible, like the split above, and capped at
        # the smaller class size so ``sample`` can never be asked for more rows
        # than exist.
        n_per_class = min(len(eval_virals), len(eval_non_virals))
        if len(eval_virals) > n_per_class:
            eval_virals = eval_virals.sample(n=n_per_class, random_state=42)
        if len(eval_non_virals) > n_per_class:
            eval_non_virals = eval_non_virals.sample(n=n_per_class, random_state=42)
        eval_dataset = pd.concat([eval_virals, eval_non_virals])

    logging.info('{:>5,} training samples with {:>5,} positives and {:>5,} negatives'.format(
        len(train_dataset), len(train_dataset[train_dataset.viral == 1]), len(train_dataset[train_dataset.viral == 0])))
    logging.info('{:>5,} validation samples with {:>5,} positives and {:>5,} negatives'.format(
        len(eval_dataset), len(eval_dataset[eval_dataset.viral == 1]), len(eval_dataset[eval_dataset.viral == 0])))

    # The frames are written with their index; downstream code removes the
    # resulting "__index_level_0__" column after tokenization.
    train_dataset.to_parquet("train.parquet.gzip", compression='gzip')
    eval_dataset.to_parquet("test.parquet.gzip", compression='gzip')

    ds = load_dataset("parquet", data_files={'train': 'train.parquet.gzip', 'test': 'test.parquet.gzip'})
    return ds
|
124 |
+
|
125 |
+
def tokenize_function(example, tokenizer):
    """Run *tokenizer* over the ``cleaned_text`` entry of *example*.

    Truncation is requested without an explicit ``max_length``; padding is not
    done here and is left to the data collator used when batching.
    # TODO: check dynamic padding: https://huggingface.co/course/chapter3/2?fw=pt#dynamic-padding
    """
    texts = example["cleaned_text"]
    return tokenizer(texts, truncation=True)
|
129 |
+
|
130 |
+
|
131 |
+
def test_all_models(ds, models=MODELS):
    """Fine-tune every checkpoint in ``models`` on ``ds`` and report test metrics.

    For each checkpoint the function:
      1. loads its tokenizer and a 2-label sequence-classification model,
      2. tokenizes ``ds`` and trains for 15 epochs (batch size 32, AdamW with
         lr 5e-5, linear schedule without warm-up),
      3. saves the weights to ``models/trained_<checkpoint>.pt``,
      4. computes accuracy, recall, precision and F1 on the ``test`` split.

    The accumulated metric lines are logged and written to
    ``same_day_as_viral_with_features_train_test_balanced_accuracy.txt``.

    Params:
        ds: dataset with ``train`` and ``test`` splits that contain the columns
            ``__index_level_0__``, ``cleaned_text``, ``id`` and ``viral``
        models: iterable of checkpoint names to evaluate

    Returns:
        Dict mapping each checkpoint name to the list of its per-batch training losses
    """
    import os  # local import: only needed to create the output folder

    models_losses = {}

    # torch.backends.mps.is_available() is the documented availability check.
    # NOTE(review): torch.mps.is_available() may not exist in every torch release - confirm.
    use_mps = torch.backends.mps.is_available()
    device = torch.device("mps") if use_mps else torch.device("cpu")

    # torch.save does not create missing folders.
    os.makedirs("models", exist_ok=True)

    output = ""

    for checkpoint in models:
        # Release memory held by the previous model; only meaningful on MPS.
        if use_mps:
            torch.mps.empty_cache()

        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
        model.to(device)

        tokenized_datasets = ds.map(lambda x: tokenize_function(x, tokenizer=tokenizer), batched=True)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        # Keep only what the model consumes and expose the target under "labels".
        tokenized_datasets = tokenized_datasets.remove_columns(["__index_level_0__", "cleaned_text", "id"])
        tokenized_datasets = tokenized_datasets.rename_column("viral", "labels")
        tokenized_datasets.set_format("torch")

        batch_size = 32

        train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
        eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator)

        # The loss comes from the model itself (outputs.loss), so no separate
        # criterion is needed.
        optimizer = AdamW(model.parameters(), lr=5e-5)

        num_epochs = 15
        num_training_steps = num_epochs * len(train_dataloader)
        lr_scheduler = get_scheduler(
            name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        progress_bar = tqdm(range(num_training_steps))

        exp_loss = None  # exponential moving average of the loss, for display only
        losses = []
        model.train()
        for epoch in range(num_epochs):
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                loss = outputs.loss
                loss_value = loss.item()  # single device-to-host sync per step
                losses.append(loss_value)
                loss.backward()

                if exp_loss is None:
                    exp_loss = loss_value
                else:
                    exp_loss = 0.9 * exp_loss + 0.1 * loss_value

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                progress_bar.set_postfix({"loss": exp_loss, "epoch": epoch})

        torch.save(model.state_dict(), f"models/trained_{checkpoint.replace('/', '_')}.pt")

        models_losses[checkpoint] = losses

        metric = evaluate.combine(["accuracy", "recall", "precision", "f1"])
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

        output += f"checkpoint: {checkpoint}: {metric.compute()}\n"
        logging.info(output)
        # Rewritten after every checkpoint so partial results survive a crash.
        with open("same_day_as_viral_with_features_train_test_balanced_accuracy.txt", "w") as text_file:
            text_file.write(output)

    return models_losses
|
210 |
+
|
211 |
+
def main():
    """Run the whole experiment: load the labelled tweets, preprocess, split and train."""
    # DATA FILE SHOULD BE AT THE ROOT WITH THIS SCRIPT
    raw_tweets = pd.read_parquet('final_dataset_since_october_2022.parquet.gzip')

    cleaned_tweets = preprocess_data(raw_tweets)
    splits = prepare_dataset(cleaned_tweets, balance=False)

    test_all_models(splits)
|
219 |
+
|
220 |
+
# Run the experiment only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
classification/model_with_only_language_models/final_dataset_since_october_2022.parquet.gzip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d648d6af7606807281540bb37516ac7b8ac5270df07cd43ffb3a0430d77306cf
|
3 |
+
size 35666675
|
classification/model_with_only_language_models/models/trained_vinai_bertweet-base.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d389880329ae1db97cd7e43c851c277c32d32ce73a8dd21deaa77f381fcd50b
|
3 |
+
size 539690276
|
classification/model_with_only_language_models/same_day_as_viral_train_test_balanced_accuracy.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
checkpoint: vinai/bertweet-base: {'accuracy': 0.7484076433121019, 'recall': 0.821656050955414, 'precision': 0.7166666666666667, 'f1': 0.7655786350148368}
|
2 |
+
checkpoint: prajjwal1/bert-tiny: {'accuracy': 0.7292993630573248, 'recall': 0.8343949044585988, 'precision': 0.6894736842105263, 'f1': 0.755043227665706}
|
3 |
+
checkpoint: bert-base-cased: {'accuracy': 0.6942675159235668, 'recall': 0.7643312101910829, 'precision': 0.6703910614525139, 'f1': 0.7142857142857143}
|
4 |
+
checkpoint: roberta-base: {'accuracy': 0.7420382165605095, 'recall': 0.8343949044585988, 'precision': 0.7043010752688172, 'f1': 0.7638483965014577}
|
classification/model_with_only_language_models/same_day_as_viral_with_features_train_test_balanced_accuracy.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
checkpoint: vinai/bertweet-base: {'accuracy': 0.7714285714285715, 'recall': 0.8280254777070064, 'precision': 0.7428571428571429, 'f1': 0.7831325301204819}
|
2 |
+
checkpoint: vinai/bertweet-large: {'accuracy': 0.7365079365079366, 'recall': 0.8535031847133758, 'precision': 0.6907216494845361, 'f1': 0.7635327635327636}
|
classification/model_with_only_language_models/test.parquet.gzip
ADDED
Binary file (19.8 kB). View file
|
|
classification/model_with_only_language_models/text_preprocessing.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import html
|
2 |
+
|
3 |
+
def clear_reply_mentions(tweet):
    '''Strip the run of user mentions that opens a reply tweet.

    Everything from the first space-separated token that does not start
    with "@" is kept untouched; if every token is a mention, the result
    is the empty string.

    Example: @user1 @user2 okay @user3 -> okay @user3
    '''
    # A plain split on spaces is enough here, no need for nltk tokenization
    words = tweet.split(" ")
    for position, word in enumerate(words):
        if word.startswith("@"):
            continue
        return " ".join(words[position:])
    return ""
|
14 |
+
|
15 |
+
from emoji import demojize, is_emoji
|
16 |
+
from nltk.tokenize import TweetTokenizer
|
17 |
+
|
18 |
+
# Module-level NLTK tokenizer instance; used as the default `tokenizer` of normalizeTweet.
tweet_tokenizer = TweetTokenizer()
|
19 |
+
|
20 |
+
def normalizeToken(token, emojis_found=None, replace_user_mentions=True, replace_urls=True, demojize_emojis=True):
    '''Normalize a single token of a tweet.

    Params:
        token: the token to normalize
        emojis_found: optional list; every single-character emoji token is appended to it.
            Defaults to None (nothing is collected) instead of a shared mutable list,
            which would keep growing across unrelated calls.
        replace_user_mentions: If true, a token starting with "@" becomes @USER
        replace_urls: If true, a token starting with "http" or "www" (case-insensitive) becomes HTTPURL
        demojize_emojis: If true, a single-character emoji is replaced by the output of demojize

    Returns:
        The normalized token
    '''
    if replace_user_mentions and token.startswith("@"):
        return "@USER"

    if replace_urls and token.lower().startswith(("http", "www")):
        return "HTTPURL"

    # Only one-character tokens are considered emojis here.
    if len(token) == 1 and is_emoji(token):
        if emojis_found is not None:
            emojis_found.append(token)
        return demojize(token) if demojize_emojis else token

    # Map typographic punctuation to its ASCII counterpart.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    return token
|
39 |
+
|
40 |
+
|
41 |
+
def normalizeTweet(tweet, tokenizer=tweet_tokenizer, replace_user_mentions=True, replace_urls=True, demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Tokenize a tweet, normalize every token and glue the result back together.

    Params:
        tweet: the text to normalize
        tokenizer: object exposing a tokenize(text) method
        replace_user_mentions: forwarded to normalizeToken
        replace_urls: forwarded to normalizeToken
        demojize_emojis: forwarded to normalizeToken
        bert_tweet_specific_processing: If true, applies the extra contraction and a.m./p.m. rewrites below

    Returns:
        A tuple (normalized tweet with whitespace collapsed, list of emojis collected by normalizeToken)
    '''
    emojis_found = []

    # Typographic apostrophes and ellipses are unified before tokenizing.
    ascii_punctuation = tweet.replace("’", "'").replace("…", "...")

    normalized_tokens = []
    for raw_token in tokenizer.tokenize(ascii_punctuation):
        normalized_tokens.append(
            normalizeToken(raw_token, emojis_found=emojis_found,
                           replace_user_mentions=replace_user_mentions,
                           replace_urls=replace_urls,
                           demojize_emojis=demojize_emojis))
    normTweet = " ".join(normalized_tokens)

    if bert_tweet_specific_processing:
        # Applied strictly in this order: later rewrites rely on the output of earlier ones.
        rewrites = (
            # negations
            ("cannot ", "can not "),
            ("n't ", " n't "),
            ("n 't ", " n't "),
            ("ca n't", "can't"),
            ("ai n't", "ain't"),
            # clitics
            ("'m ", " 'm "),
            ("'re ", " 're "),
            ("'s ", " 's "),
            ("'ll ", " 'll "),
            ("'d ", " 'd "),
            ("'ve ", " 've "),
            # time abbreviations split apart by the tokenizer
            (" p . m .", " p.m."),
            (" p . m ", " p.m "),
            (" a . m .", " a.m."),
            (" a . m ", " a.m "),
        )
        for old, new in rewrites:
            normTweet = normTweet.replace(old, new)

    collapsed = " ".join(normTweet.split())
    return collapsed, emojis_found
|
73 |
+
|
74 |
+
|
75 |
+
def clean_tweet(tweet, clear_html_chars=True, replace_user_mentions=True, replace_urls=True,
                demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Clean a raw tweet; every step after the first two can be switched off.

    Params:
        tweet: the tweet to clean
        clear_html_chars: If true, html entities found in the tweet are unescaped
        replace_user_mentions: If true, user mentions are replaced with the token @USER
        replace_urls: If true, urls are replaced with the token HTTPURL
        demojize_emojis: If true, emojis are demojized
        bert_tweet_specific_processing: If true, the extra preprocessing for the BertTweet model is applied

    Returns:
        The cleaned tweet
    '''
    # Step 1: drop the mentions at the very start of the tweet (inserted automatically by
    # Twitter when replying). They do not count towards the character limit and may make
    # the tweet length go way overboard.
    text = clear_reply_mentions(tweet)

    # Step 2: remove carriage returns and new lines.
    # NOTE(review): they are removed, not replaced by a space, so words on
    # adjacent lines get joined - confirm this is intended.
    for line_break in ('\r', '\n'):
        text = text.replace(line_break, '')

    # Step 3: optionally unescape html entities
    if clear_html_chars:
        text = html.unescape(text)

    # Step 4: remaining normalization (emojis, urls, mentions, etc.); the
    # collected emojis are not needed here.
    normalized_tweet, _ = normalizeTweet(
        text,
        replace_user_mentions=replace_user_mentions,
        replace_urls=replace_urls,
        demojize_emojis=demojize_emojis,
        bert_tweet_specific_processing=bert_tweet_specific_processing,
    )

    # TODO: process emoticons? e.g. :)

    return normalized_tweet
|
classification/model_with_only_language_models/train.parquet.gzip
ADDED
Binary file (67.4 kB). View file
|
|
data/control.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:49d7490cb0993941b8a024c1671e9b9ac8e5914de3dce2aad9d68de0c47d7ae4
|
3 |
+
size 22727820
|
data/viral.csv
ADDED
@@ -0,0 +1,1042 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tweet_id
|
2 |
+
1584932886167244801
|
3 |
+
1593947297615794176
|
4 |
+
1591270450294059008
|
5 |
+
1581699730639589378
|
6 |
+
1591043902521638913
|
7 |
+
1584903132362211333
|
8 |
+
1593612903801929728
|
9 |
+
1593479807739625473
|
10 |
+
1591541757908508674
|
11 |
+
1593770726061924352
|
12 |
+
1584737507203485696
|
13 |
+
1594121081245384707
|
14 |
+
1590100845982781441
|
15 |
+
1593653567516299264
|
16 |
+
1585021201704882177
|
17 |
+
1594345561146392579
|
18 |
+
1592506452916289536
|
19 |
+
1594029216059703297
|
20 |
+
1594004245535154176
|
21 |
+
1592615266953101312
|
22 |
+
1590143170649935872
|
23 |
+
1591484993188298752
|
24 |
+
1593265509796978690
|
25 |
+
1594386871215267840
|
26 |
+
1591609295279374336
|
27 |
+
1590051500901822464
|
28 |
+
1592276529354268672
|
29 |
+
1594108905071996928
|
30 |
+
1590107396776476672
|
31 |
+
1590056737591422977
|
32 |
+
1592682602884128768
|
33 |
+
1592591165786054657
|
34 |
+
1590937532035780608
|
35 |
+
1594008547414736896
|
36 |
+
1585026683962556416
|
37 |
+
1584802342083661824
|
38 |
+
1593020357853982722
|
39 |
+
1593956647629598721
|
40 |
+
1591076060535422977
|
41 |
+
1592413862493851648
|
42 |
+
1592937493472874502
|
43 |
+
1591425970996908032
|
44 |
+
1593404493827391488
|
45 |
+
1581841556591751168
|
46 |
+
1594355223694147584
|
47 |
+
1590299243990102017
|
48 |
+
1592553375572578304
|
49 |
+
1581711870255575041
|
50 |
+
1590718839284985859
|
51 |
+
1592620843993268224
|
52 |
+
1591882690730815490
|
53 |
+
1592956946529017857
|
54 |
+
1584521556427886597
|
55 |
+
1584729652215828481
|
56 |
+
1592942420110430208
|
57 |
+
1593750550423773190
|
58 |
+
1592690842900525056
|
59 |
+
1592723627648372736
|
60 |
+
1593431015309066242
|
61 |
+
1590182204914929666
|
62 |
+
1592186934201577472
|
63 |
+
1592584760844771328
|
64 |
+
1590780608644775937
|
65 |
+
1594441159358615552
|
66 |
+
1582024160087183361
|
67 |
+
1591149563402145793
|
68 |
+
1590786900822200320
|
69 |
+
1592274617057816576
|
70 |
+
1590872573407293441
|
71 |
+
1593886676346363904
|
72 |
+
1594188205448105988
|
73 |
+
1593380253451403265
|
74 |
+
1584647230635462656
|
75 |
+
1594375031680663553
|
76 |
+
1591090938990198784
|
77 |
+
1590148564063518720
|
78 |
+
1590767742805647360
|
79 |
+
1594118443174871040
|
80 |
+
1593699015417159680
|
81 |
+
1592161331754721281
|
82 |
+
1591480117473472514
|
83 |
+
1581701001496518657
|
84 |
+
1594119859826479104
|
85 |
+
1591921147611856901
|
86 |
+
1582017942060863489
|
87 |
+
1591153087082688512
|
88 |
+
1582090308220686336
|
89 |
+
1591445228351029248
|
90 |
+
1590514595973988352
|
91 |
+
1584739482062184448
|
92 |
+
1593742582961639425
|
93 |
+
1592632272158879745
|
94 |
+
1592692406998429696
|
95 |
+
1592911136734969856
|
96 |
+
1592238993730670593
|
97 |
+
1593024744491978752
|
98 |
+
1593400387381465088
|
99 |
+
1590364067306573827
|
100 |
+
1584900232860631041
|
101 |
+
1582017607279923201
|
102 |
+
1581887155433279489
|
103 |
+
1592312202169810944
|
104 |
+
1590317790871031811
|
105 |
+
1592505565695315970
|
106 |
+
1592544903397265409
|
107 |
+
1594117873261240326
|
108 |
+
1594216494078230529
|
109 |
+
1594703707673079808
|
110 |
+
1584721574296682501
|
111 |
+
1590109110644903937
|
112 |
+
1594076475770425347
|
113 |
+
1591138469442568192
|
114 |
+
1590864178583318528
|
115 |
+
1591955167175659521
|
116 |
+
1592616005187956737
|
117 |
+
1594387207434838017
|
118 |
+
1591427829128134656
|
119 |
+
1591713956338491393
|
120 |
+
1592271789312573440
|
121 |
+
1592874573892317185
|
122 |
+
1592579050249236480
|
123 |
+
1592721942733533185
|
124 |
+
1584650521364688896
|
125 |
+
1585111756887977984
|
126 |
+
1590815960113487891
|
127 |
+
1590815704218861568
|
128 |
+
1594131004301656064
|
129 |
+
1593763223224057857
|
130 |
+
1584947082842230790
|
131 |
+
1592124116748103681
|
132 |
+
1594100421127290882
|
133 |
+
1592453153169223681
|
134 |
+
1584867689264476160
|
135 |
+
1592494074715836416
|
136 |
+
1584931955938070528
|
137 |
+
1584726237171568640
|
138 |
+
1592560270270173184
|
139 |
+
1584741658649776128
|
140 |
+
1584584149318651905
|
141 |
+
1591190276113981440
|
142 |
+
1590727361984888834
|
143 |
+
1593411758911549440
|
144 |
+
1594051495472746496
|
145 |
+
1594063774721650688
|
146 |
+
1591880750101827584
|
147 |
+
1593830994867589126
|
148 |
+
1584720902700568578
|
149 |
+
1592950775076257793
|
150 |
+
1584561799843614720
|
151 |
+
1591860534101827584
|
152 |
+
1591866661434720256
|
153 |
+
1584566687659003904
|
154 |
+
1593991931541049350
|
155 |
+
1590060339135381505
|
156 |
+
1584808596738813952
|
157 |
+
1593787603853094916
|
158 |
+
1594345571262799873
|
159 |
+
1593656488773537792
|
160 |
+
1591868427932962816
|
161 |
+
1594084459770429440
|
162 |
+
1581662270186090496
|
163 |
+
1591422937349132289
|
164 |
+
1590744874483339264
|
165 |
+
1591297640133795841
|
166 |
+
1592962528816287744
|
167 |
+
1591840251907276800
|
168 |
+
1590195663702458369
|
169 |
+
1584843627930718208
|
170 |
+
1592708602716123137
|
171 |
+
1593779273877291008
|
172 |
+
1590783953375723520
|
173 |
+
1594688865100795905
|
174 |
+
1590354018622402560
|
175 |
+
1594109089520308224
|
176 |
+
1584986718322950144
|
177 |
+
1593764585290473479
|
178 |
+
1592566201414475777
|
179 |
+
1592294532439502848
|
180 |
+
1584654305952337922
|
181 |
+
1593819425748455424
|
182 |
+
1590016468066140160
|
183 |
+
1581809942570287104
|
184 |
+
1592646198175092737
|
185 |
+
1592732829682528256
|
186 |
+
1593256204473864195
|
187 |
+
1591277386640736256
|
188 |
+
1594471398902272000
|
189 |
+
1594363137419104261
|
190 |
+
1581709339361886208
|
191 |
+
1592594235294982147
|
192 |
+
1592668553521811456
|
193 |
+
1584730705086455808
|
194 |
+
1584531644836646915
|
195 |
+
1584580045368352768
|
196 |
+
1584643007986683905
|
197 |
+
1581758037554888704
|
198 |
+
1593066025008107520
|
199 |
+
1593253992171126786
|
200 |
+
1591875679377031169
|
201 |
+
1591108798244347905
|
202 |
+
1591646097520611328
|
203 |
+
1592204998255185920
|
204 |
+
1593745429765947397
|
205 |
+
1594552471531454464
|
206 |
+
1591814551800250368
|
207 |
+
1592926448498937856
|
208 |
+
1593927022769750017
|
209 |
+
1591001937218093056
|
210 |
+
1590604334558871552
|
211 |
+
1581821777579106305
|
212 |
+
1581898364144144384
|
213 |
+
1593647645645414400
|
214 |
+
1590344271961677827
|
215 |
+
1592228025885945856
|
216 |
+
1594199444018561025
|
217 |
+
1594158151061983235
|
218 |
+
1593861908490293249
|
219 |
+
1590052480142172160
|
220 |
+
1593315954184028160
|
221 |
+
1594028451807506432
|
222 |
+
1590188416192610304
|
223 |
+
1591867143616069634
|
224 |
+
1593266642892595201
|
225 |
+
1591398036793597954
|
226 |
+
1581652475555127296
|
227 |
+
1592399074736902144
|
228 |
+
1594102237281849345
|
229 |
+
1594419326697848833
|
230 |
+
1590887994973900800
|
231 |
+
1584808125601026048
|
232 |
+
1594410749656322048
|
233 |
+
1592233871021867009
|
234 |
+
1592392374063620096
|
235 |
+
1592068317157941249
|
236 |
+
1592816576415559683
|
237 |
+
1584951794580742145
|
238 |
+
1581906374396506112
|
239 |
+
1593956302824566785
|
240 |
+
1590839330201014273
|
241 |
+
1591548912292491264
|
242 |
+
1585089769935343616
|
243 |
+
1593615212799746050
|
244 |
+
1581690578571444225
|
245 |
+
1593342410205134849
|
246 |
+
1581676349994856451
|
247 |
+
1592815064234721281
|
248 |
+
1590508168441839617
|
249 |
+
1594485884988030977
|
250 |
+
1594836721102888960
|
251 |
+
1591088972016803842
|
252 |
+
1590101414386143232
|
253 |
+
1584761833247756288
|
254 |
+
1590153744976867329
|
255 |
+
1591898753350664192
|
256 |
+
1592447124343775232
|
257 |
+
1594180142703575040
|
258 |
+
1590901563647983618
|
259 |
+
1594080238945947650
|
260 |
+
1594229593874964481
|
261 |
+
1592536293762019328
|
262 |
+
1591543271150407680
|
263 |
+
1591177229865582592
|
264 |
+
1590863898273452032
|
265 |
+
1592934665597616128
|
266 |
+
1593047972623048704
|
267 |
+
1593424009080848389
|
268 |
+
1591357062692634625
|
269 |
+
1590189415799758848
|
270 |
+
1591143431019331584
|
271 |
+
1590072621864869888
|
272 |
+
1592690621697101824
|
273 |
+
1592579340793171969
|
274 |
+
1590185325149978625
|
275 |
+
1592712852514959362
|
276 |
+
1593713382959595521
|
277 |
+
1591492092043595776
|
278 |
+
1593707058397847556
|
279 |
+
1591557990444064768
|
280 |
+
1591222092351275008
|
281 |
+
1594086416484630531
|
282 |
+
1594381233575829511
|
283 |
+
1591491501678526464
|
284 |
+
1581811762822664193
|
285 |
+
1592550639543803904
|
286 |
+
1593735934733754368
|
287 |
+
1590274908239826944
|
288 |
+
1581746748846141441
|
289 |
+
1593719366385901568
|
290 |
+
1592524002639376385
|
291 |
+
1584749509145690114
|
292 |
+
1593264045469143043
|
293 |
+
1584965726724837377
|
294 |
+
1590821075981856769
|
295 |
+
1592677748383719424
|
296 |
+
1592003897132199937
|
297 |
+
1581799078597230594
|
298 |
+
1593750298224648193
|
299 |
+
1593734302382129152
|
300 |
+
1584684864304914432
|
301 |
+
1594029166059524097
|
302 |
+
1584630796840996864
|
303 |
+
1594322584731951110
|
304 |
+
1590133855801204736
|
305 |
+
1591517445814358017
|
306 |
+
1594319831108816898
|
307 |
+
1594053138633621504
|
308 |
+
1592942121602121728
|
309 |
+
1590879809852301315
|
310 |
+
1591813892841541632
|
311 |
+
1591469036793393155
|
312 |
+
1584652769096781824
|
313 |
+
1594763150368411648
|
314 |
+
1584642009410973696
|
315 |
+
1594771442729721856
|
316 |
+
1582060359178854400
|
317 |
+
1590691417906876416
|
318 |
+
1594055363342598144
|
319 |
+
1591250467618959361
|
320 |
+
1591131814890188802
|
321 |
+
1590016441092169728
|
322 |
+
1590071040209997826
|
323 |
+
1584746408351272960
|
324 |
+
1593229099308548098
|
325 |
+
1593656191443410946
|
326 |
+
1592681211176312832
|
327 |
+
1593302498718224386
|
328 |
+
1591769082336280576
|
329 |
+
1593591679377735685
|
330 |
+
1584949615568175104
|
331 |
+
1592619146743017473
|
332 |
+
1584655489857486848
|
333 |
+
1592345304434868225
|
334 |
+
1592776050685407233
|
335 |
+
1592281453186134016
|
336 |
+
1593367844325949442
|
337 |
+
1590171569770229760
|
338 |
+
1594451431549194240
|
339 |
+
1592149621962584064
|
340 |
+
1581654698549202944
|
341 |
+
1592694495333986304
|
342 |
+
1585020957898723328
|
343 |
+
1593776060516237317
|
344 |
+
1593707705465815042
|
345 |
+
1591574660290052096
|
346 |
+
1594147204339732480
|
347 |
+
1590354604159795201
|
348 |
+
1591459582618267648
|
349 |
+
1591441282957967360
|
350 |
+
1592595549865537537
|
351 |
+
1590149965283987457
|
352 |
+
1592492545258291201
|
353 |
+
1592316200276533248
|
354 |
+
1592651505550917632
|
355 |
+
1591242670755676160
|
356 |
+
1591804790555955201
|
357 |
+
1591912966810918912
|
358 |
+
1594367305034571777
|
359 |
+
1581749087933562880
|
360 |
+
1590527640414621697
|
361 |
+
1591135304337215488
|
362 |
+
1592541725670912002
|
363 |
+
1592252153812615168
|
364 |
+
1593043322914091009
|
365 |
+
1590927402338689024
|
366 |
+
1581730850705506304
|
367 |
+
1582002053983145987
|
368 |
+
1591018546905354244
|
369 |
+
1592147121809285122
|
370 |
+
1593235425162063874
|
371 |
+
1592251929144758273
|
372 |
+
1590101047904714757
|
373 |
+
1593892998462726144
|
374 |
+
1590064513809727489
|
375 |
+
1591086179847659520
|
376 |
+
1591199565059100672
|
377 |
+
1594719218632720385
|
378 |
+
1594369024410124288
|
379 |
+
1591730791369043970
|
380 |
+
1590871372838109189
|
381 |
+
1590043224235704322
|
382 |
+
1584770222866993152
|
383 |
+
1594011438032502784
|
384 |
+
1592156930356690945
|
385 |
+
1594547086363590658
|
386 |
+
1594474457002872832
|
387 |
+
1591795256713953281
|
388 |
+
1584795643394289664
|
389 |
+
1591196561723527168
|
390 |
+
1591046399936065537
|
391 |
+
1593392197675753475
|
392 |
+
1591136726852186113
|
393 |
+
1591309750016573441
|
394 |
+
1592921043152560128
|
395 |
+
1593692760225333251
|
396 |
+
1581743475178938368
|
397 |
+
1584870736535388163
|
398 |
+
1591491057900204032
|
399 |
+
1593489765910728705
|
400 |
+
1591937669097226241
|
401 |
+
1594063579002867712
|
402 |
+
1581732169222787072
|
403 |
+
1591257399780073472
|
404 |
+
1591361388710350849
|
405 |
+
1590091235276255232
|
406 |
+
1584662662902054912
|
407 |
+
1594475910278807557
|
408 |
+
1593781469331173377
|
409 |
+
1594362041523113984
|
410 |
+
1593151866715926528
|
411 |
+
1581841156173754373
|
412 |
+
1592652261679071232
|
413 |
+
1584984440300003328
|
414 |
+
1594048275245682690
|
415 |
+
1594160666096668673
|
416 |
+
1584976145979936768
|
417 |
+
1592805028162842624
|
418 |
+
1581746410210226176
|
419 |
+
1593143320938958849
|
420 |
+
1590541447832342528
|
421 |
+
1591485729884061699
|
422 |
+
1591930970273374212
|
423 |
+
1590399258066554881
|
424 |
+
1594107997755789315
|
425 |
+
1584641748302987264
|
426 |
+
1592842640026144768
|
427 |
+
1594347483164835840
|
428 |
+
1590762672307388416
|
429 |
+
1590885266029518848
|
430 |
+
1590813909732425728
|
431 |
+
1593027340619382784
|
432 |
+
1590372754427359232
|
433 |
+
1591823348035891200
|
434 |
+
1592350462694813698
|
435 |
+
1592640116220080129
|
436 |
+
1581928389505843200
|
437 |
+
1591657525459681280
|
438 |
+
1591024644370595840
|
439 |
+
1591161964419432449
|
440 |
+
1592578617623875584
|
441 |
+
1592835021727207424
|
442 |
+
1591259039551623170
|
443 |
+
1593427787251171330
|
444 |
+
1593910091325255680
|
445 |
+
1584994235916627969
|
446 |
+
1591829980895653890
|
447 |
+
1591150403315728384
|
448 |
+
1593751615323922433
|
449 |
+
1594660901143326720
|
450 |
+
1591416296025227265
|
451 |
+
1594432253630861313
|
452 |
+
1591100087325032448
|
453 |
+
1592903001093861377
|
454 |
+
1593058791377219585
|
455 |
+
1590393634394107904
|
456 |
+
1593336599655419904
|
457 |
+
1590449768480014336
|
458 |
+
1594085747442728962
|
459 |
+
1592279720468811777
|
460 |
+
1591469596766347265
|
461 |
+
1594089411360202756
|
462 |
+
1591896706509336576
|
463 |
+
1581919999870267392
|
464 |
+
1584563232789786624
|
465 |
+
1590305174949490688
|
466 |
+
1590809573367382016
|
467 |
+
1591888226100609024
|
468 |
+
1591215518262099969
|
469 |
+
1592739592922202113
|
470 |
+
1593414387187998720
|
471 |
+
1593729958815432704
|
472 |
+
1593418938200752128
|
473 |
+
1590072455208800256
|
474 |
+
1593751492929867776
|
475 |
+
1594039118152945664
|
476 |
+
1594003784388124675
|
477 |
+
1594724296093728769
|
478 |
+
1590367404533510144
|
479 |
+
1591350427374997504
|
480 |
+
1581779457857449985
|
481 |
+
1592496174493478912
|
482 |
+
1593340544691752960
|
483 |
+
1590527106735542273
|
484 |
+
1593314023616937992
|
485 |
+
1582068981653188608
|
486 |
+
1592228084790566914
|
487 |
+
1591856193567428609
|
488 |
+
1584733340179255296
|
489 |
+
1593043564153475075
|
490 |
+
1590751921417379840
|
491 |
+
1581672995277647872
|
492 |
+
1590825946181144577
|
493 |
+
1592321489340014592
|
494 |
+
1593404774245974018
|
495 |
+
1592862723217690624
|
496 |
+
1592835534673809408
|
497 |
+
1591466775727988737
|
498 |
+
1590226183651987457
|
499 |
+
1591742929701568512
|
500 |
+
1590236530861563905
|
501 |
+
1594345672815394819
|
502 |
+
1594358207127896071
|
503 |
+
1591062787446706178
|
504 |
+
1593351364851163136
|
505 |
+
1590051193610723328
|
506 |
+
1584942999678914565
|
507 |
+
1593052032709328898
|
508 |
+
1593653246353997824
|
509 |
+
1591508639126556672
|
510 |
+
1582001113204604930
|
511 |
+
1592608398721581057
|
512 |
+
1592562373357101058
|
513 |
+
1592958767507079168
|
514 |
+
1591523511168008192
|
515 |
+
1584668584042967041
|
516 |
+
1594358061858230274
|
517 |
+
1594741976775245824
|
518 |
+
1592498457042096128
|
519 |
+
1584727972782559232
|
520 |
+
1590012084104089601
|
521 |
+
1593987535919009792
|
522 |
+
1581819691512713216
|
523 |
+
1593342048874151938
|
524 |
+
1590775114484060163
|
525 |
+
1594397069476773889
|
526 |
+
1593473916751618048
|
527 |
+
1581777502527098880
|
528 |
+
1581914647510147072
|
529 |
+
1590229652723077120
|
530 |
+
1592276391818428416
|
531 |
+
1592936641135775744
|
532 |
+
1594018594031009804
|
533 |
+
1594330383084142594
|
534 |
+
1593779350293323777
|
535 |
+
1591166437971791872
|
536 |
+
1591872052805660672
|
537 |
+
1581723066492592128
|
538 |
+
1590649216745234433
|
539 |
+
1593653622344130563
|
540 |
+
1591735356483076102
|
541 |
+
1591492436790239234
|
542 |
+
1594422480004866113
|
543 |
+
1584688847622848513
|
544 |
+
1592187558163017729
|
545 |
+
1585002070306082816
|
546 |
+
1590398209134067717
|
547 |
+
1590271200084643840
|
548 |
+
1594198516523417600
|
549 |
+
1592283077715304450
|
550 |
+
1590197812125310976
|
551 |
+
1590391895926411266
|
552 |
+
1594009413668417536
|
553 |
+
1590480221257359361
|
554 |
+
1590923069568462849
|
555 |
+
1593644793430687745
|
556 |
+
1594306075729068034
|
557 |
+
1584555558585016320
|
558 |
+
1581816018040946689
|
559 |
+
1591419036490924032
|
560 |
+
1582087656837902336
|
561 |
+
1593010883638472704
|
562 |
+
1593637163328995336
|
563 |
+
1594087936982544385
|
564 |
+
1593045630074884099
|
565 |
+
1592055749521608704
|
566 |
+
1594226059331137536
|
567 |
+
1584586808620625920
|
568 |
+
1592600430168903680
|
569 |
+
1584533803292250112
|
570 |
+
1593983467435556866
|
571 |
+
1584722316726259712
|
572 |
+
1590348590093737984
|
573 |
+
1592932873186988032
|
574 |
+
1591872909118963712
|
575 |
+
1592574256642428931
|
576 |
+
1593394493461364736
|
577 |
+
1594305866114686978
|
578 |
+
1593273627583078401
|
579 |
+
1592051970076200960
|
580 |
+
1593035255652306945
|
581 |
+
1592610023502680066
|
582 |
+
1591616332851974145
|
583 |
+
1581723091234787328
|
584 |
+
1593617464881217541
|
585 |
+
1590400043261235200
|
586 |
+
1581991973585530880
|
587 |
+
1594027801262579712
|
588 |
+
1594126958010699776
|
589 |
+
1592765159851687936
|
590 |
+
1590436834244702213
|
591 |
+
1593764685484261376
|
592 |
+
1591205091755057152
|
593 |
+
1594346986299236352
|
594 |
+
1594048948423241728
|
595 |
+
1590124686092165120
|
596 |
+
1592965757436301312
|
597 |
+
1591419399319261186
|
598 |
+
1593902458354688000
|
599 |
+
1594410401806139392
|
600 |
+
1594147573052628992
|
601 |
+
1581753804638167045
|
602 |
+
1584924021778706435
|
603 |
+
1592572435735670785
|
604 |
+
1592365343825342464
|
605 |
+
1584940462078857216
|
606 |
+
1585036648186011649
|
607 |
+
1592042027134627840
|
608 |
+
1582027600335032320
|
609 |
+
1593403708863401986
|
610 |
+
1585034306728382464
|
611 |
+
1592202761889755136
|
612 |
+
1592340512379633666
|
613 |
+
1592166785415274496
|
614 |
+
1593654882346536961
|
615 |
+
1591646166546288644
|
616 |
+
1594415094255226882
|
617 |
+
1591263468098920455
|
618 |
+
1594174861831610368
|
619 |
+
1591917897471098880
|
620 |
+
1592231550783049730
|
621 |
+
1592324240979922944
|
622 |
+
1590990236360015872
|
623 |
+
1584919027717218304
|
624 |
+
1592914265958387713
|
625 |
+
1594172544747438080
|
626 |
+
1581816339018436608
|
627 |
+
1591097885499994112
|
628 |
+
1591242143523540992
|
629 |
+
1592989843281567746
|
630 |
+
1581911862215868418
|
631 |
+
1585022874993451008
|
632 |
+
1591476627086934017
|
633 |
+
1592648080054050816
|
634 |
+
1592854388506238977
|
635 |
+
1584691937558159360
|
636 |
+
1594687064192462849
|
637 |
+
1594372743352684547
|
638 |
+
1593364734677381120
|
639 |
+
1591343889344270336
|
640 |
+
1591857120768843776
|
641 |
+
1592790505754365952
|
642 |
+
1590854281854603269
|
643 |
+
1591144433550249985
|
644 |
+
1591935412473315329
|
645 |
+
1584950741617487874
|
646 |
+
1590563592494936066
|
647 |
+
1590481343393435648
|
648 |
+
1594117796547670017
|
649 |
+
1591442159936638978
|
650 |
+
1590631263094050816
|
651 |
+
1593660175063318528
|
652 |
+
1591953261816602624
|
653 |
+
1594474944238616576
|
654 |
+
1591069763375366144
|
655 |
+
1590460276952162304
|
656 |
+
1592619192293142529
|
657 |
+
1591586464269492224
|
658 |
+
1591208497655746560
|
659 |
+
1584868956527300608
|
660 |
+
1594818192093872128
|
661 |
+
1590520584328577025
|
662 |
+
1591443259775221762
|
663 |
+
1590503297156513794
|
664 |
+
1592176482151010307
|
665 |
+
1584619668928503808
|
666 |
+
1593366727907192832
|
667 |
+
1593411302692626433
|
668 |
+
1593978500796522500
|
669 |
+
1593804572119556097
|
670 |
+
1581678720909463553
|
671 |
+
1594311331213365249
|
672 |
+
1590782712616738816
|
673 |
+
1594259908362670080
|
674 |
+
1591884969357770754
|
675 |
+
1592739800766783489
|
676 |
+
1591785060922949632
|
677 |
+
1593956540846903297
|
678 |
+
1594603597891006464
|
679 |
+
1594362056416714752
|
680 |
+
1593284228669706241
|
681 |
+
1590669779584831496
|
682 |
+
1592914227391782913
|
683 |
+
1593895472787324928
|
684 |
+
1581993597859426305
|
685 |
+
1592556049420386304
|
686 |
+
1584740792601411585
|
687 |
+
1584629161247006721
|
688 |
+
1591457113595052032
|
689 |
+
1590553214340182018
|
690 |
+
1592262105503502336
|
691 |
+
1592199519181373442
|
692 |
+
1591490334122741761
|
693 |
+
1592615654158655488
|
694 |
+
1591601831259828229
|
695 |
+
1590352916959080448
|
696 |
+
1584617872663670784
|
697 |
+
1590775767298101248
|
698 |
+
1591408088665710593
|
699 |
+
1591637750515732480
|
700 |
+
1594362004013293568
|
701 |
+
1592125182772412416
|
702 |
+
1592328406150307841
|
703 |
+
1593466144832339970
|
704 |
+
1591146771677614080
|
705 |
+
1594109754854416384
|
706 |
+
1592923822373580801
|
707 |
+
1592539676824719361
|
708 |
+
1594156156163100677
|
709 |
+
1590062498367311872
|
710 |
+
1591519496770301952
|
711 |
+
1590288916447657984
|
712 |
+
1591349039215579136
|
713 |
+
1581901472475725824
|
714 |
+
1584707560795348992
|
715 |
+
1584598360551718912
|
716 |
+
1584610043085094912
|
717 |
+
1590313643203497984
|
718 |
+
1592187096856662016
|
719 |
+
1594474911434612737
|
720 |
+
1592831263865712641
|
721 |
+
1581927859610079232
|
722 |
+
1591309090642595840
|
723 |
+
1594313832402661378
|
724 |
+
1591156546989588480
|
725 |
+
1591998031645536258
|
726 |
+
1593975769021272068
|
727 |
+
1592266902390595584
|
728 |
+
1593018828601974784
|
729 |
+
1592926585359060993
|
730 |
+
1594509841548058624
|
731 |
+
1591173731833253889
|
732 |
+
1582043766378024960
|
733 |
+
1590898120048939008
|
734 |
+
1591621240388820993
|
735 |
+
1592243174931660803
|
736 |
+
1592168429624365058
|
737 |
+
1592559930841567232
|
738 |
+
1594044519258931200
|
739 |
+
1593260714218725377
|
740 |
+
1581872859529433094
|
741 |
+
1584651600291061760
|
742 |
+
1590201802045394946
|
743 |
+
1591083686480711680
|
744 |
+
1592210208105037824
|
745 |
+
1584576897622904832
|
746 |
+
1591606416447799296
|
747 |
+
1593779195037065216
|
748 |
+
1591285460013576192
|
749 |
+
1592726193006333955
|
750 |
+
1594117163136188419
|
751 |
+
1585075798041468929
|
752 |
+
1592306809796452352
|
753 |
+
1593096180015456257
|
754 |
+
1593775860863082499
|
755 |
+
1581732104726925313
|
756 |
+
1590755452157325314
|
757 |
+
1592695927919124480
|
758 |
+
1591496900376764417
|
759 |
+
1591183384101154816
|
760 |
+
1593501868961181696
|
761 |
+
1592605963596402689
|
762 |
+
1593354700535087111
|
763 |
+
1594087094883860480
|
764 |
+
1581846666629390337
|
765 |
+
1593781714203324416
|
766 |
+
1594556851332390914
|
767 |
+
1592811405987958784
|
768 |
+
1591121551906443265
|
769 |
+
1590183416607444994
|
770 |
+
1591128713680539648
|
771 |
+
1593105360805822464
|
772 |
+
1593728653883576322
|
773 |
+
1590011566757670912
|
774 |
+
1590607856339800064
|
775 |
+
1593612864731881472
|
776 |
+
1593804962689175553
|
777 |
+
1581798074568974338
|
778 |
+
1591114856115359745
|
779 |
+
1591240995357700096
|
780 |
+
1590288815092289536
|
781 |
+
1584705050643156993
|
782 |
+
1593447822451347456
|
783 |
+
1592303223330996226
|
784 |
+
1594178509714472963
|
785 |
+
1591432111390883840
|
786 |
+
1594462345618771968
|
787 |
+
1590628471914770435
|
788 |
+
1590857044802039810
|
789 |
+
1591504609939709952
|
790 |
+
1584946087827505152
|
791 |
+
1593350223186825221
|
792 |
+
1581754808502865922
|
793 |
+
1582162745054461954
|
794 |
+
1591051620867248131
|
795 |
+
1582077161057681409
|
796 |
+
1591508684856872960
|
797 |
+
1591895494871203840
|
798 |
+
1590735236882915328
|
799 |
+
1590091378406883329
|
800 |
+
1590290073475772417
|
801 |
+
1585028458518040577
|
802 |
+
1593257581946220546
|
803 |
+
1591539520608284672
|
804 |
+
1591733660558184449
|
805 |
+
1594023348169715713
|
806 |
+
1594013417370980352
|
807 |
+
1593985139926867968
|
808 |
+
1593082069399523328
|
809 |
+
1594010343478800384
|
810 |
+
1591506159751802883
|
811 |
+
1594161468764741641
|
812 |
+
1591152301817102336
|
813 |
+
1590160982709764096
|
814 |
+
1581805981230514176
|
815 |
+
1590295663631556608
|
816 |
+
1591814750152921089
|
817 |
+
1591618839829426177
|
818 |
+
1593879504766009344
|
819 |
+
1592084203998711808
|
820 |
+
1594292538898321414
|
821 |
+
1592674870210400256
|
822 |
+
1591432007015632896
|
823 |
+
1592340157537345537
|
824 |
+
1590377761348423682
|
825 |
+
1594190861667274755
|
826 |
+
1593286137543548932
|
827 |
+
1594128585455009795
|
828 |
+
1592718738956365826
|
829 |
+
1593850858894245889
|
830 |
+
1591494640259178500
|
831 |
+
1591923274514313216
|
832 |
+
1584916360936685568
|
833 |
+
1592915632601075712
|
834 |
+
1590011683145416704
|
835 |
+
1593292235222978560
|
836 |
+
1594423088619331584
|
837 |
+
1594604452425646080
|
838 |
+
1592565860128129024
|
839 |
+
1594401119727366145
|
840 |
+
1594351645021782016
|
841 |
+
1584995921519341568
|
842 |
+
1590628167416705024
|
843 |
+
1591719724987342848
|
844 |
+
1594243897248518144
|
845 |
+
1592624359516274688
|
846 |
+
1584578124418666496
|
847 |
+
1594679887918379009
|
848 |
+
1593008166316707840
|
849 |
+
1584661366853758976
|
850 |
+
1594368681227833344
|
851 |
+
1590731558264393729
|
852 |
+
1590300788861337600
|
853 |
+
1593842771529732097
|
854 |
+
1594541995082780672
|
855 |
+
1590789809286811648
|
856 |
+
1590758963942866944
|
857 |
+
1591592992577839104
|
858 |
+
1593343172410855434
|
859 |
+
1591063320454651906
|
860 |
+
1593351666484400128
|
861 |
+
1591811086269186048
|
862 |
+
1590508439926865920
|
863 |
+
1590745145305026570
|
864 |
+
1592296389060751361
|
865 |
+
1594350932833480704
|
866 |
+
1590417798475714560
|
867 |
+
1592979533506150400
|
868 |
+
1593219988038770690
|
869 |
+
1584576796002877441
|
870 |
+
1593784760387735552
|
871 |
+
1581691410566483968
|
872 |
+
1581753676666138624
|
873 |
+
1590504689241849856
|
874 |
+
1584550522543304704
|
875 |
+
1590567832173301760
|
876 |
+
1591172828308668419
|
877 |
+
1592636753323884547
|
878 |
+
1592856964077613058
|
879 |
+
1592703088854728704
|
880 |
+
1592263701738004480
|
881 |
+
1591858604386164740
|
882 |
+
1593245930823696387
|
883 |
+
1591580896494518272
|
884 |
+
1592003898021408768
|
885 |
+
1590179659521851393
|
886 |
+
1594198724149919744
|
887 |
+
1592654778987053057
|
888 |
+
1594767543549714433
|
889 |
+
1590152297472229376
|
890 |
+
1591106443318788097
|
891 |
+
1590058001888849920
|
892 |
+
1582164813928550400
|
893 |
+
1594189887464345600
|
894 |
+
1584744902566752256
|
895 |
+
1592929419949023232
|
896 |
+
1594360547943874560
|
897 |
+
1592905771486171142
|
898 |
+
1591602175876399104
|
899 |
+
1592571278397149185
|
900 |
+
1594202033074352128
|
901 |
+
1592574912409264128
|
902 |
+
1592875359577464833
|
903 |
+
1591912431009529856
|
904 |
+
1592994508114993153
|
905 |
+
1594342856973582336
|
906 |
+
1593099906348384256
|
907 |
+
1594523625419599874
|
908 |
+
1592206458044940291
|
909 |
+
1594505849002663936
|
910 |
+
1590778724043358208
|
911 |
+
1593026641760223233
|
912 |
+
1592625882723590144
|
913 |
+
1591836672383406080
|
914 |
+
1590527437418684417
|
915 |
+
1592580537516191745
|
916 |
+
1584638928422043648
|
917 |
+
1590073894182227969
|
918 |
+
1581942283133669376
|
919 |
+
1594002702618529793
|
920 |
+
1593901369257254912
|
921 |
+
1591480349217148928
|
922 |
+
1591309259853402113
|
923 |
+
1591231998844366849
|
924 |
+
1592590992309628930
|
925 |
+
1592726099326566401
|
926 |
+
1593967327808294913
|
927 |
+
1594458638524874752
|
928 |
+
1591016952633479168
|
929 |
+
1592780529203843073
|
930 |
+
1593957492127850496
|
931 |
+
1594348941784629248
|
932 |
+
1592920457925210112
|
933 |
+
1594163715901108225
|
934 |
+
1593033065202606081
|
935 |
+
1581793105723854848
|
936 |
+
1593368198157664257
|
937 |
+
1585013335233220611
|
938 |
+
1584673283022413824
|
939 |
+
1590382569069367297
|
940 |
+
1581881907520106497
|
941 |
+
1592292889891340289
|
942 |
+
1593621687987232769
|
943 |
+
1590317270093692930
|
944 |
+
1594300575239540736
|
945 |
+
1590851904141418497
|
946 |
+
1592138879301455872
|
947 |
+
1592957879841325058
|
948 |
+
1593312415151968256
|
949 |
+
1591097497279397888
|
950 |
+
1591833273160073216
|
951 |
+
1591961306315329539
|
952 |
+
1591737120372449281
|
953 |
+
1591085639247634432
|
954 |
+
1582206874270674944
|
955 |
+
1590400261528641538
|
956 |
+
1594740637668347904
|
957 |
+
1592319989645574145
|
958 |
+
1594192295770865664
|
959 |
+
1584659959832543234
|
960 |
+
1592152675512365056
|
961 |
+
1581713910385647617
|
962 |
+
1590453582134149122
|
963 |
+
1591086475567071232
|
964 |
+
1593956576615874561
|
965 |
+
1594092319749427200
|
966 |
+
1590727466221719553
|
967 |
+
1592904461927985152
|
968 |
+
1584746224900792321
|
969 |
+
1591303442676428800
|
970 |
+
1592910110787276801
|
971 |
+
1594407232111722496
|
972 |
+
1593394402793328642
|
973 |
+
1594419340073439233
|
974 |
+
1591454131117756417
|
975 |
+
1591911384610406400
|
976 |
+
1593815170752839681
|
977 |
+
1590332309575593985
|
978 |
+
1590570044198883330
|
979 |
+
1593448201889120259
|
980 |
+
1590741333316374529
|
981 |
+
1584642845503541248
|
982 |
+
1591942232818552836
|
983 |
+
1591894439210917888
|
984 |
+
1592533896436285441
|
985 |
+
1591357709743685633
|
986 |
+
1591872813916426241
|
987 |
+
1590026034216595456
|
988 |
+
1591541158727278592
|
989 |
+
1592234746456719361
|
990 |
+
1592599533871312896
|
991 |
+
1592592750322798592
|
992 |
+
1592272878887899136
|
993 |
+
1593338673306771457
|
994 |
+
1593420348006998016
|
995 |
+
1591979442720870401
|
996 |
+
1592722897067380736
|
997 |
+
1593107895088189440
|
998 |
+
1584951473473212416
|
999 |
+
1591138011240357889
|
1000 |
+
1591530518117765120
|
1001 |
+
1593412753363849216
|
1002 |
+
1590768004983582721
|
1003 |
+
1591709174706352129
|
1004 |
+
1592610492153233411
|
1005 |
+
1590850986519629824
|
1006 |
+
1590039646905708544
|
1007 |
+
1592719858885881857
|
1008 |
+
1589394620743823360
|
1009 |
+
1589627294750224384
|
1010 |
+
1589407148462964736
|
1011 |
+
1589415336922984448
|
1012 |
+
1589422829959020545
|
1013 |
+
1589629488207654914
|
1014 |
+
1589499499361480704
|
1015 |
+
1589574654720901121
|
1016 |
+
1589427231650353154
|
1017 |
+
1589396237056606213
|
1018 |
+
1589618205483749377
|
1019 |
+
1589373794241871872
|
1020 |
+
1589667659552989186
|
1021 |
+
1589387612699398144
|
1022 |
+
1589362003864612864
|
1023 |
+
1589396468129230848
|
1024 |
+
1589373743255932928
|
1025 |
+
1589572212235046912
|
1026 |
+
1589417699561869312
|
1027 |
+
1589341658222448640
|
1028 |
+
1589447097329549312
|
1029 |
+
1589341792637714434
|
1030 |
+
1589576111322968064
|
1031 |
+
1589430097899290624
|
1032 |
+
1589721864213237760
|
1033 |
+
1589438531898077184
|
1034 |
+
1589620636015808513
|
1035 |
+
1589639229591912448
|
1036 |
+
1589533143840980993
|
1037 |
+
1589591059566583813
|
1038 |
+
1589414428692209665
|
1039 |
+
1589609023435071495
|
1040 |
+
1589408559028408321
|
1041 |
+
1589484934699569153
|
1042 |
+
1589430117880954880
|
main.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import gradio as gr
|
3 |
+
from transformers import AutoModelForSequenceClassification
|
4 |
+
from transformers import BertweetTokenizer
|
5 |
+
from transformers import pipeline
|
6 |
+
from functools import lru_cache
|
7 |
+
|
8 |
+
from classification.model_with_only_language_models.text_preprocessing import clean_tweet
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
texts = [
|
13 |
+
'tl;dr\n\nHumans are just ChatGPT Wrappers in sunglasses\n \n& I couldn’t be more optimistic about the future as a result\n\nThank you \n@ekang426322\n for an exceptionally curated day at BUIDL Europe!\n 🫶',
|
14 |
+
'USD0++ discovered a new source of yield — depeg. \n\nRespect to the innovation\n',
|
15 |
+
'here you can see 4 ai agents \n@dongossen100\n , me, \n@WorldWideWarden16\n and \n@provenauthority291\n discuss how we can make single-task manual low memory agents(humans) work harder to achieve Artificial Generalized Superintelligence',
|
16 |
+
'\n arrived to lisbon, building energy is the air',
|
17 |
+
"\n received a wealth of valuable feedback on the journey to reaching 7,000 users for X Rank in just 10 days\n\ncan't wait to address it all\n\nmain points:\n\n- show rank in X DMs to quickly filter out inbox\n\n- rank labels are too distracting (already fixed) \n\n- add an option for users to toggle on/off scores inside the feed\n\n- add a percentile label, e.g. qw 801 (Top 0.1%)\n\n- enable others to add reviews to impact the rank \n\n- explain in detail how rankings are calculated \n\n- show breakdowns of people in DeFi, DePin, Memecoins etc.\n\n- make X Rank opensource \n\n- create a web version\n\np.s. the current version is just a tiny step in our roadmap for the next two months. \n\nthank you for the feedback \n@socialfi_panda101\n \n@adamkillam100\n \n@FamKien106\n \n@antongotchi104\n \n@kliuless128\n \n@0xsudogm163\n \n@monosarin120\n \n@flb_xyz56\n 🫶\n ",
|
18 |
+
'ai agents are in the air\n\nand web3 is trained to sniff out alpha',
|
19 |
+
'While Trump is going to do something great with crypto, Wallchain is going to do something great with incentives🚀',
|
20 |
+
]
|
21 |
+
# import pandas as pd
|
22 |
+
# pd.DataFrame({'texts': texts}).to_csv('examples.csv')
|
23 |
+
|
24 |
+
|
25 |
+
CHECKPOINT = "classification/model_with_only_language_models/models/trained_vinai_bertweet-base.pt"
|
26 |
+
MODEL_NAME = "vinai/bertweet-base"
|
27 |
+
|
28 |
+
|
29 |
+
class Tokenizer(BertweetTokenizer):
    """BERTweet tokenizer whose calls default to a 100-token length limit.

    Construction is inherited unchanged from ``BertweetTokenizer``; only
    ``__call__`` is customized so that every encoding request made through
    this class is length-limited unless the caller says otherwise.
    """

    def __call__(self, *args, **kwargs):
        """Tokenize the inputs, filling in length-limiting defaults.

        Args:
            *args: Positional arguments forwarded to
                ``BertweetTokenizer.__call__``.
            **kwargs: Keyword arguments forwarded to
                ``BertweetTokenizer.__call__``. ``max_length`` defaults to
                100 and ``truncation`` to ``True`` when not supplied.

        Returns:
            Whatever ``BertweetTokenizer.__call__`` returns for the
            resulting arguments.
        """
        # setdefault (rather than a hard-coded keyword next to **kwargs)
        # lets a caller pass its own max_length without triggering
        # "got multiple values for keyword argument 'max_length'".
        kwargs.setdefault("max_length", 100)
        # State the truncation strategy explicitly instead of relying on the
        # library's implicit fallback when only max_length is given.
        kwargs.setdefault("truncation", True)
        return super().__call__(*args, **kwargs)
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
def get_model():
    """Create the tokenizer and the fine-tuned sequence classifier.

    The classifier starts from the ``MODEL_NAME`` base weights with a
    two-label head and is then overwritten with the state dict stored at
    ``CHECKPOINT``, deserialized onto the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
    )
    # map_location keeps every tensor of the checkpoint on the CPU.
    fine_tuned_weights = torch.load(CHECKPOINT, map_location='cpu')
    classifier.load_state_dict(fine_tuned_weights)

    tokenizer_options = {"truncation": True, "max_length": 120}
    tok = Tokenizer.from_pretrained(MODEL_NAME, **tokenizer_options)
    return tok, classifier
|
44 |
+
|
45 |
+
|
46 |
+
@lru_cache(maxsize=1)
def get_pipeline():
    """Return the text-classification pipeline, building it on first use.

    The result is memoized, so ``get_model`` runs only on the first call and
    later calls receive the same pipeline object.
    """
    tok, classifier = get_model()
    pipeline_options = {
        "model": classifier,
        "tokenizer": tok,
        "device": "cpu",
    }
    return pipeline('text-classification', **pipeline_options)
|
55 |
+
|
56 |
+
|
57 |
+
def evaluate(text: str) -> float:
    """Score a tweet with the cached classification pipeline.

    Args:
        text: Raw tweet text; it is passed through ``clean_tweet`` (emojis
            are not demojized) before classification.

    Returns:
        The score the pipeline reports for the ``LABEL_1`` class.
    """
    classifier = get_pipeline()
    cleaned = clean_tweet(text, demojize_emojis=False)
    # top_k=2 requests the scores of both labels.
    predictions = classifier(cleaned, top_k=2)

    label_1_scores = []
    for prediction in predictions:
        if prediction['label'] == 'LABEL_1':
            label_1_scores.append(prediction['score'])
    return label_1_scores[0]
|
63 |
+
|
64 |
+
|
65 |
+
# def serve():
|
66 |
+
# pipe()
|
67 |
+
# for text in texts:
|
68 |
+
# res = pipe(clean_tweet(text, demojize_emojis=False), top_k=2)
|
69 |
+
# LABEL_1_result = [x['score'] for x in res if x['label'] == 'LABEL_1'][0]
|
70 |
+
# print(f"{LABEL_1_result:7.2%}")
|
71 |
+
|
72 |
+
|
73 |
+
def greet(text: str):
    """Format the score of ``evaluate(text)`` as a percentage message.

    Args:
        text: Tweet text to score.

    Returns:
        A string of the form ``"Chance to become viral: 12.34%"``.
    """
    probability: float = evaluate(text)
    message = f"Chance to become viral: {probability:.2%}"
    return message
|
76 |
+
|
77 |
+
|
78 |
+
if __name__ == "__main__":
    # Serve `greet` through a one-text-in, one-text-out Gradio interface.
    # Each example is wrapped in its own list, one value per input component.
    example_rows = [[sample] for sample in texts]
    demo = gr.Interface(
        fn=greet,
        inputs=["text"],
        outputs=["text"],
        examples=example_rows,
    )
    demo.launch()
|
metric_analysis/1-standardize_metrics.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Standardize the per-metric threshold sweeps and summarize them with AUCs.

For every CSV matched by ``INPUT_PATTERN`` the script:

1. renames the three columns to ``tpr``, ``new_tweets`` and ``threshold``;
2. derives two false-positive rates (``fpr``: relative to the file's own
   maximum ``new_tweets``; ``fpr2``: relative to ``theoretical``);
3. keeps, for each ``tpr`` value, the row with the fewest ``new_tweets``;
4. writes the standardized table to ``OUTPUT_DIR``;
5. attaches ``roc1``, ``roc2``, ``roc95``, ``fpr3`` and ``harmonic`` columns.

All per-file tables are finally concatenated into ``merged_outputs.csv``.
Paths are relative to the current working directory.
"""
import os
from glob import glob
from statistics import harmonic_mean

import pandas as pd
from sklearn import metrics

INPUT_PATTERN = 'output_original/*.csv'
OUTPUT_DIR = 'output_standardized'

# Denominator of the second false-positive rate ("fpr2").
# NOTE(review): presumably the total number of candidate tweets -- confirm.
theoretical = 1357228

# Only the part of the curve with fpr2 at or below this value feeds "roc95";
# fpr2 is divided by it so that the retained range spans [0, 1].
FPR2_CUTOFF = 0.016


def _truncated_auc(curve):
    """Return the AUC of the low-``fpr2`` part of ``curve``.

    Rows with ``fpr2 <= FPR2_CUTOFF`` are kept and their ``fpr2`` rescaled by
    ``1 / FPR2_CUTOFF``. When the retained rows do not reach ``tpr == 1`` an
    end point at rescaled ``fpr2 == 1`` is appended, with a ``tpr`` obtained
    by scaling the largest retained ``tpr`` by ``1 / max(rescaled fpr2)``.
    """
    low = curve[curve.fpr2 <= FPR2_CUTOFF]
    tpr = low['tpr']
    fpr = low['fpr2'] * (1 / FPR2_CUTOFF)

    tprmax = tpr.max()
    if tprmax < 1:
        # NOTE(review): this linear extrapolation can exceed 1; kept as-is so
        # previously computed numbers stay reproducible.
        tpr_interpolated = tprmax * (1 / fpr.max())
        # Build new Series instead of enlarging a column taken from a
        # filtered frame with ``.loc[-1] = ...``.
        tpr = pd.concat([tpr, pd.Series([tpr_interpolated])], ignore_index=True)
        fpr = pd.concat([fpr, pd.Series([1.0])], ignore_index=True)

    # tpr and fpr are defined on both branches, so the AUC never depends on
    # values left over from a previously processed file.
    return metrics.auc(fpr, tpr)


# Sorted so the row order of merged_outputs.csv is reproducible across runs.
files = sorted(glob(INPUT_PATTERN))
if not files:
    raise SystemExit('No input files match %r' % INPUT_PATTERN)

# to_csv does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

dfs = []

for file in files:
    filename = os.path.basename(file)
    df = pd.read_csv(file)
    df.columns = ['tpr', 'new_tweets', 'threshold']
    df['fpr'] = df['new_tweets'] / df['new_tweets'].max()
    df['fpr2'] = df['new_tweets'] / theoretical

    # For each tpr keep the row that labels the fewest tweets.
    df = df.sort_values(by=['tpr', 'new_tweets'])
    df = df.drop_duplicates(subset=['tpr'], keep='first')
    df.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)

    df['metric'] = filename.split('.csv')[0]
    roc1 = metrics.auc(df['fpr'], df['tpr'])
    roc2 = metrics.auc(df['fpr2'], df['tpr'])
    df['roc1'] = roc1
    df['roc2'] = roc2

    roc95 = _truncated_auc(df)

    df['roc95'] = roc95
    df['fpr3'] = df.fpr2 * (1 / FPR2_CUTOFF)
    df['harmonic'] = harmonic_mean([roc95, roc1])
    dfs.append(df)

df = pd.concat(dfs)
df.to_csv('merged_outputs.csv', index=False)
|
52 |
+
|
53 |
+
|
metric_analysis/2023-precision-recall-update.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
from scipy.stats import hmean

# Display names for the values found in the "metric" column; anything not
# listed here is labeled '?'.
metric_names = {
    'hard_threshold_viral_covered_vs_new_tweets_labeled': 'RT > T',
    'virality_avg_retweets_viral_covered_vs_new_tweets_labeled': 'RT > Avg. RT',
    'log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled': 'log(RT / Followers)',
    'virality_median_retweets_viral_covered_vs_new_tweets_labeled 2': 'RT > Med. RT',
    'retweets_over_log_followers_viral_covered_vs_new_tweets_labeled': 'RT / log(Followers)',
    'roberta_paper_metric_viral_covered_vs_new_tweets_labeled': 'Influence Score',
    'virality_followers_viral_covered_vs_new_tweets_labeled': 'RT / Followers',
    'log_retweets_over_followers_viral_covered_vs_new_tweets_labeled': 'log(RT) / Followers',
    'virality_median_retweets_viral_covered_vs_new_tweets_labeled': 'unused',
    'virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled': 'RT Percentile',
}

df = pd.read_csv('merged_outputs.csv')

# Recall is the true-positive rate. Multiplying it by 1008 gives the
# true-positive count (NOTE(review): 1008 is presumably the number of viral
# tweets -- confirm); the remaining labeled tweets are false positives.
recall = df['tpr']
true_positives = recall * 1008
false_positives = df['new_tweets'] - true_positives
precision = true_positives / (true_positives + false_positives)

df['recall'] = recall
df['tp'] = true_positives
df['fp'] = false_positives
df['precision'] = precision

# F1 is the harmonic mean of precision and recall; F2..F5 use
# (1 + beta^2) * P * R / (beta^2 * P + R).
df['f1'] = hmean(df[['precision', 'recall']], axis=1)
precision_times_recall = precision * recall
for column, beta in (('f2', 2), ('f3', 3), ('f4', 4), ('f5', 5)):
    beta_squared = beta * beta
    df[column] = (1 + beta_squared) * precision_times_recall / ((beta_squared * precision) + recall)

df['metric_name'] = [metric_names.get(metric, '?') for metric in df['metric']]

df.to_csv('all_metric_stats.csv', index=False)
print()
|
metric_analysis/output_original/hard_threshold_viral_covered_vs_new_tweets_labeled.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
metric_analysis/output_original/log_retweets_over_followers_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
percentage_of_viral_covered_log_retweets_over_followers,nb_of_tweets_labeled_as_viral_log_retweets_over_followers,thresholds_log_retweets_over_followers
|
2 |
+
1.0,319615,2.0823825573913686e-05
|
3 |
+
0.9890873015873016,273938,4.051692731176368e-05
|
4 |
+
0.9791666666666666,255601,4.912336231094642e-05
|
5 |
+
0.9692460317460317,232131,6.06772297454726e-05
|
6 |
+
0.9593253968253969,215154,6.953399127171892e-05
|
7 |
+
0.9494047619047619,198735,7.906615255441045e-05
|
8 |
+
0.939484126984127,177935,9.274401243552317e-05
|
9 |
+
0.9295634920634921,163454,0.00010376301156781099
|
10 |
+
0.9196428571428571,153038,0.00011162448201235416
|
11 |
+
0.9097222222222222,142348,0.00012118386659212715
|
12 |
+
0.8998015873015873,133957,0.00012903905621694119
|
13 |
+
0.8898809523809523,127440,0.0001359807274111013
|
14 |
+
0.8799603174603174,119119,0.00014553053532917479
|
15 |
+
0.8700396825396826,113262,0.00015434076599951405
|
16 |
+
0.8601190476190477,109675,0.0001597665218121121
|
17 |
+
0.8492063492063492,107026,0.0001638177776107626
|
18 |
+
0.8392857142857143,102033,0.00017177097081931192
|
19 |
+
0.8293650793650794,97923,0.0001784615254643663
|
20 |
+
0.8194444444444444,94063,0.00018575504552597574
|
21 |
+
0.8095238095238095,90777,0.00019285167032283564
|
22 |
+
0.7996031746031746,87559,0.00019934327697333082
|
23 |
+
0.7896825396825397,84808,0.0002054679665547375
|
24 |
+
0.7797619047619048,78571,0.00021844458141730431
|
25 |
+
0.7698412698412699,75718,0.000226263674225816
|
26 |
+
0.7599206349206349,72948,0.00023435184719918162
|
27 |
+
0.75,70894,0.00024075728743866513
|
28 |
+
0.7400793650793651,67584,0.0002501977740079047
|
29 |
+
0.7301587301587301,62958,0.00026505673893843605
|
30 |
+
0.7202380952380952,58502,0.0002800657421942939
|
31 |
+
0.7093253968253969,56801,0.0002862937577127316
|
32 |
+
0.6994047619047619,55365,0.00029176513580133967
|
33 |
+
0.689484126984127,53140,0.00030080706952046094
|
34 |
+
0.6795634920634921,49826,0.00031641255088538567
|
35 |
+
0.6696428571428571,47808,0.00032637026326093506
|
36 |
+
0.6597222222222222,44674,0.0003421251688588788
|
37 |
+
0.6498015873015873,42986,0.00035116572142065137
|
38 |
+
0.6398809523809523,42099,0.00035691935841298465
|
39 |
+
0.6299603174603174,41027,0.00036494634545917705
|
40 |
+
0.6200396825396826,39620,0.00037486982308993465
|
41 |
+
0.6101190476190477,37806,0.0003870781771608389
|
42 |
+
0.6001984126984127,36276,0.00039804193717933356
|
43 |
+
0.5902777777777778,34629,0.00041120044887010685
|
44 |
+
0.5803571428571429,32866,0.00042635419690807295
|
45 |
+
0.5694444444444444,31354,0.00043719376387239573
|
46 |
+
0.5595238095238095,29882,0.0004524594289673326
|
47 |
+
0.5496031746031746,28535,0.00046812040833873927
|
48 |
+
0.5396825396825397,26667,0.00048767647518345024
|
49 |
+
0.5297619047619048,26013,0.0004976799255063901
|
50 |
+
0.5198412698412699,25296,0.0005061834084447068
|
51 |
+
0.5099206349206349,24228,0.0005212916903156303
|
52 |
+
0.5,23032,0.0005395738872296509
|
53 |
+
0.49007936507936506,21566,0.0005647021922270769
|
54 |
+
0.4801587301587302,20284,0.0005895765740513234
|
55 |
+
0.47023809523809523,19552,0.0006047211659253739
|
56 |
+
0.4603174603174603,18354,0.0006301131324195934
|
57 |
+
0.4503968253968254,17340,0.000652506927330073
|
58 |
+
0.44047619047619047,16628,0.0006743321938094685
|
59 |
+
0.4305555555555556,15822,0.000696980606374073
|
60 |
+
0.41964285714285715,15421,0.0007106285418656998
|
61 |
+
0.4097222222222222,14806,0.0007321509024842147
|
62 |
+
0.3998015873015873,13622,0.0007683790460187478
|
63 |
+
0.3898809523809524,12745,0.0008026821281700681
|
64 |
+
0.37996031746031744,11952,0.000838786912177239
|
65 |
+
0.37003968253968256,11298,0.0008656145917666774
|
66 |
+
0.3601190476190476,10941,0.0008840972038673189
|
67 |
+
0.3501984126984127,10089,0.0009323687651642673
|
68 |
+
0.3402777777777778,9730,0.0009582618477334784
|
69 |
+
0.33035714285714285,8697,0.001031829515028949
|
70 |
+
0.32043650793650796,8209,0.0010706819674658472
|
71 |
+
0.310515873015873,7611,0.001133266050563012
|
72 |
+
0.3005952380952381,6991,0.001199729393075923
|
73 |
+
0.2906746031746032,6705,0.0012345399503041437
|
74 |
+
0.27976190476190477,6015,0.0013358734220584026
|
75 |
+
0.2698412698412698,5224,0.0014599380842198974
|
76 |
+
0.25992063492063494,4872,0.0015278241082717179
|
77 |
+
0.25,4524,0.001633968898128735
|
78 |
+
0.2400793650793651,4018,0.00176387022399823
|
79 |
+
0.23015873015873015,3623,0.001904940963842261
|
80 |
+
0.22023809523809523,3242,0.0020431615576899315
|
81 |
+
0.21031746031746032,2985,0.0021472681131348547
|
82 |
+
0.2003968253968254,2658,0.0022891258443659395
|
83 |
+
0.19047619047619047,2303,0.002497439020684527
|
84 |
+
0.18055555555555555,2070,0.0026847167747472557
|
85 |
+
0.17063492063492064,1624,0.0030947814200735606
|
86 |
+
0.16071428571428573,1401,0.0033397443279497524
|
87 |
+
0.15079365079365079,1139,0.0036561499334258323
|
88 |
+
0.13988095238095238,939,0.004087702547298808
|
89 |
+
0.12996031746031747,822,0.004439768080440451
|
90 |
+
0.12003968253968254,670,0.0050640430518192414
|
91 |
+
0.11011904761904762,548,0.005569573349817609
|
92 |
+
0.1001984126984127,520,0.005863037495295288
|
93 |
+
0.09027777777777778,451,0.006424002096785331
|
94 |
+
0.08035714285714286,350,0.007426007042593891
|
95 |
+
0.07043650793650794,274,0.0082195927599472
|
96 |
+
0.060515873015873016,181,0.010618942533330624
|
97 |
+
0.050595238095238096,128,0.01316034253303508
|
98 |
+
0.040674603174603176,87,0.01728591025833494
|
99 |
+
0.030753968253968252,58,0.021158446941601766
|
100 |
+
0.020833333333333332,31,0.030542020193855542
|
101 |
+
0.010912698412698412,14,0.04581366291335541
|
102 |
+
0.000992063492063492,1,0.11101775162012853
|
metric_analysis/output_original/log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
percentage_of_viral_covered_log_retweets_over_log_followers,nb_of_tweets_labeled_as_viral_log_retweets_over_log_followers,thresholds_log_retweets_over_log_followers
|
2 |
+
1.0,31041,0.6240002852947716
|
3 |
+
0.9890873015873016,16933,0.7230443483570232
|
4 |
+
0.9791666666666666,14286,0.7484793180463
|
5 |
+
0.9692460317460317,12783,0.7640086373583896
|
6 |
+
0.9593253968253969,12034,0.7726483778092785
|
7 |
+
0.9494047619047619,11239,0.7818005566060252
|
8 |
+
0.939484126984127,10668,0.7889122408794803
|
9 |
+
0.9295634920634921,10231,0.7936671192146495
|
10 |
+
0.9196428571428571,9768,0.7997056994974557
|
11 |
+
0.9097222222222222,9436,0.8038518801656075
|
12 |
+
0.8998015873015873,9235,0.8063837662960968
|
13 |
+
0.8898809523809523,8823,0.8121718746758682
|
14 |
+
0.8799603174603174,8537,0.8158236353302109
|
15 |
+
0.8700396825396826,8191,0.8201902577121126
|
16 |
+
0.8601190476190477,7913,0.8243482587654483
|
17 |
+
0.8492063492063492,7660,0.8282315008546971
|
18 |
+
0.8392857142857143,7401,0.8321194850614858
|
19 |
+
0.8293650793650794,7125,0.8367692646747009
|
20 |
+
0.8194444444444444,6820,0.8417075589387941
|
21 |
+
0.8095238095238095,6728,0.8431442930562224
|
22 |
+
0.7996031746031746,6515,0.8466514160755445
|
23 |
+
0.7896825396825397,6358,0.8493448528501626
|
24 |
+
0.7797619047619048,6129,0.8531910961698003
|
25 |
+
0.7698412698412699,5987,0.8559459884429319
|
26 |
+
0.7599206349206349,5814,0.8589762954909695
|
27 |
+
0.75,5607,0.8625694432138333
|
28 |
+
0.7400793650793651,5439,0.8656201351014363
|
29 |
+
0.7301587301587301,5187,0.8702700733610532
|
30 |
+
0.7202380952380952,5043,0.8733238070593741
|
31 |
+
0.7093253968253969,4908,0.8762024305187027
|
32 |
+
0.6994047619047619,4644,0.8816572865395638
|
33 |
+
0.689484126984127,4421,0.8867262402778523
|
34 |
+
0.6795634920634921,4343,0.8884327201390895
|
35 |
+
0.6696428571428571,4234,0.8909887319607015
|
36 |
+
0.6597222222222222,4098,0.8946920241479999
|
37 |
+
0.6498015873015873,3976,0.8971731862315426
|
38 |
+
0.6398809523809523,3866,0.9000547970039445
|
39 |
+
0.6299603174603174,3718,0.90335209700547
|
40 |
+
0.6200396825396826,3581,0.9074973981023631
|
41 |
+
0.6101190476190477,3415,0.9118727064143507
|
42 |
+
0.6001984126984127,3203,0.918171233112645
|
43 |
+
0.5902777777777778,3057,0.9223133679230249
|
44 |
+
0.5803571428571429,2954,0.9257993297998169
|
45 |
+
0.5694444444444444,2785,0.9314740928203532
|
46 |
+
0.5595238095238095,2614,0.9370237927380607
|
47 |
+
0.5496031746031746,2523,0.9403613239507382
|
48 |
+
0.5396825396825397,2450,0.9431937683554847
|
49 |
+
0.5297619047619048,2376,0.9461150743517935
|
50 |
+
0.5198412698412699,2275,0.9504137942708433
|
51 |
+
0.5099206349206349,2189,0.9537739623026252
|
52 |
+
0.5,2095,0.9570957173459096
|
53 |
+
0.49007936507936506,2006,0.9611672660302412
|
54 |
+
0.4801587301587302,1905,0.9655758384839201
|
55 |
+
0.47023809523809523,1831,0.9686141408612416
|
56 |
+
0.4603174603174603,1781,0.9715297580192088
|
57 |
+
0.4503968253968254,1692,0.9762955472165364
|
58 |
+
0.44047619047619047,1633,0.9790934785418005
|
59 |
+
0.4305555555555556,1561,0.9831903649682059
|
60 |
+
0.41964285714285715,1499,0.9865132333995609
|
61 |
+
0.4097222222222222,1417,0.9918971124207675
|
62 |
+
0.3998015873015873,1341,0.9972496209976179
|
63 |
+
0.3898809523809524,1281,1.0016513621903245
|
64 |
+
0.37996031746031744,1237,1.0053147119107226
|
65 |
+
0.37003968253968256,1160,1.012688867600473
|
66 |
+
0.3601190476190476,1074,1.0190172621424873
|
67 |
+
0.3501984126984127,1035,1.0230244967357478
|
68 |
+
0.3402777777777778,984,1.0277467024345215
|
69 |
+
0.33035714285714285,914,1.035761826178403
|
70 |
+
0.32043650793650796,871,1.0416438955273315
|
71 |
+
0.310515873015873,812,1.048103007385223
|
72 |
+
0.3005952380952381,770,1.0524500566259047
|
73 |
+
0.2906746031746032,721,1.060571954677911
|
74 |
+
0.27976190476190477,663,1.0691477573528578
|
75 |
+
0.2698412698412698,609,1.0775125729219717
|
76 |
+
0.25992063492063494,573,1.0840944112778226
|
77 |
+
0.25,540,1.0893285138897673
|
78 |
+
0.2400793650793651,508,1.0950675043632472
|
79 |
+
0.23015873015873015,472,1.1020695947838788
|
80 |
+
0.22023809523809523,436,1.1083128293352051
|
81 |
+
0.21031746031746032,399,1.1160912776174
|
82 |
+
0.2003968253968254,362,1.1254166588926875
|
83 |
+
0.19047619047619047,334,1.1355234542124109
|
84 |
+
0.18055555555555555,294,1.1487927586475322
|
85 |
+
0.17063492063492064,270,1.1596741156004886
|
86 |
+
0.16071428571428573,238,1.183222859975457
|
87 |
+
0.15079365079365079,221,1.191433139144886
|
88 |
+
0.13988095238095238,201,1.2048894357033295
|
89 |
+
0.12996031746031747,181,1.2226659947251466
|
90 |
+
0.12003968253968254,163,1.237761303530253
|
91 |
+
0.11011904761904762,149,1.2546164327696954
|
92 |
+
0.1001984126984127,124,1.2884022960547663
|
93 |
+
0.09027777777777778,113,1.3061771441754433
|
94 |
+
0.08035714285714286,100,1.3250067925542386
|
95 |
+
0.07043650793650794,86,1.342219079933877
|
96 |
+
0.060515873015873016,70,1.3793064865985623
|
97 |
+
0.050595238095238096,59,1.397787734828259
|
98 |
+
0.040674603174603176,45,1.4578807811516883
|
99 |
+
0.030753968253968252,34,1.5487864668450748
|
100 |
+
0.020833333333333332,23,1.6432671810611028
|
101 |
+
0.010912698412698412,11,1.7468918034538925
|
102 |
+
0.000992063492063492,1,2.143417655682428
|
metric_analysis/output_original/retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
percentage_of_viral_covered_retweets_over_log_followers,nb_of_tweets_labeled_as_viral_retweets_over_log_followers,thresholds_retweets_over_log_followers
|
2 |
+
1.0,24647,96.18719446805103
|
3 |
+
0.9890873015873016,16064,195.41188564042682
|
4 |
+
0.9791666666666666,13979,244.9501617824125
|
5 |
+
0.9692460317460317,13013,272.4577979927763
|
6 |
+
0.9593253968253969,12079,304.2604884972614
|
7 |
+
0.9494047619047619,11501,326.7121276970904
|
8 |
+
0.939484126984127,10899,351.11195753265446
|
9 |
+
0.9295634920634921,10240,379.36847411850994
|
10 |
+
0.9196428571428571,10033,389.38471901077924
|
11 |
+
0.9097222222222222,9560,411.73768652001644
|
12 |
+
0.8998015873015873,9298,425.4498120093774
|
13 |
+
0.8898809523809523,8904,446.87607462645417
|
14 |
+
0.8799603174603174,8452,476.55673451118935
|
15 |
+
0.8700396825396826,8235,491.0665617707924
|
16 |
+
0.8601190476190477,8058,502.77030023552385
|
17 |
+
0.8492063492063492,7973,509.5041443087782
|
18 |
+
0.8392857142857143,7800,523.7004888975509
|
19 |
+
0.8293650793650794,7603,539.923690961875
|
20 |
+
0.8194444444444444,7357,560.2436517890462
|
21 |
+
0.8095238095238095,7157,576.0696671571288
|
22 |
+
0.7996031746031746,7011,587.5531675251212
|
23 |
+
0.7896825396825397,6797,608.3009252958204
|
24 |
+
0.7797619047619048,6566,631.0274981581211
|
25 |
+
0.7698412698412699,6444,641.222980944397
|
26 |
+
0.7599206349206349,6301,655.3005304817065
|
27 |
+
0.75,6151,671.9318091327671
|
28 |
+
0.7400793650793651,6008,687.8141314252327
|
29 |
+
0.7301587301587301,5846,706.1316731892126
|
30 |
+
0.7202380952380952,5736,718.2535271058875
|
31 |
+
0.7093253968253969,5524,745.3425802314556
|
32 |
+
0.6994047619047619,5400,760.2850310752603
|
33 |
+
0.689484126984127,5234,781.4194696938304
|
34 |
+
0.6795634920634921,5109,800.5203812652611
|
35 |
+
0.6696428571428571,5037,811.3520395758455
|
36 |
+
0.6597222222222222,4901,832.7804952963619
|
37 |
+
0.6498015873015873,4786,849.9546866922489
|
38 |
+
0.6398809523809523,4672,870.5048665882368
|
39 |
+
0.6299603174603174,4558,889.5148546157745
|
40 |
+
0.6200396825396826,4423,913.4868359265707
|
41 |
+
0.6101190476190477,4366,923.3244117903994
|
42 |
+
0.6001984126984127,4297,934.7004834106867
|
43 |
+
0.5902777777777778,4189,954.8779497176456
|
44 |
+
0.5803571428571429,4107,972.2112451667427
|
45 |
+
0.5694444444444444,4022,994.8012552878178
|
46 |
+
0.5595238095238095,3948,1010.3955348603816
|
47 |
+
0.5496031746031746,3868,1027.9967506835908
|
48 |
+
0.5396825396825397,3733,1060.6487591725645
|
49 |
+
0.5297619047619048,3664,1078.0676366493603
|
50 |
+
0.5198412698412699,3581,1099.8695901550593
|
51 |
+
0.5099206349206349,3454,1133.927886665909
|
52 |
+
0.5,3403,1146.3720700339513
|
53 |
+
0.49007936507936506,3309,1173.011743073717
|
54 |
+
0.4801587301587302,3237,1192.4468751700258
|
55 |
+
0.47023809523809523,3181,1210.132391580637
|
56 |
+
0.4603174603174603,3144,1221.3638625370725
|
57 |
+
0.4503968253968254,3040,1252.4280115900583
|
58 |
+
0.44047619047619047,2982,1273.901632813587
|
59 |
+
0.4305555555555556,2900,1301.7988554511148
|
60 |
+
0.41964285714285715,2824,1327.1136527390647
|
61 |
+
0.4097222222222222,2693,1366.6130386272125
|
62 |
+
0.3998015873015873,2629,1392.3103276130262
|
63 |
+
0.3898809523809524,2589,1408.5466934453984
|
64 |
+
0.37996031746031744,2500,1440.5856788553633
|
65 |
+
0.37003968253968256,2406,1483.3271367722798
|
66 |
+
0.3601190476190476,2357,1506.1373828761034
|
67 |
+
0.3501984126984127,2310,1527.467028791344
|
68 |
+
0.3402777777777778,2204,1578.2817061633984
|
69 |
+
0.33035714285714285,2150,1609.3989544747326
|
70 |
+
0.32043650793650796,2076,1658.641956527047
|
71 |
+
0.310515873015873,1984,1711.5371500241918
|
72 |
+
0.3005952380952381,1918,1753.2702562983825
|
73 |
+
0.2906746031746032,1857,1791.705607659909
|
74 |
+
0.27976190476190477,1813,1818.039475418674
|
75 |
+
0.2698412698412698,1753,1852.2105720993286
|
76 |
+
0.25992063492063494,1693,1888.1800931525004
|
77 |
+
0.25,1646,1924.0614094625103
|
78 |
+
0.2400793650793651,1595,1957.2434949168635
|
79 |
+
0.23015873015873015,1508,2016.6788884308942
|
80 |
+
0.22023809523809523,1424,2083.2008109637877
|
81 |
+
0.21031746031746032,1354,2133.4300161851493
|
82 |
+
0.2003968253968254,1296,2181.038558435724
|
83 |
+
0.19047619047619047,1253,2218.5893192510925
|
84 |
+
0.18055555555555555,1187,2298.6494790579704
|
85 |
+
0.17063492063492064,1153,2343.3129244431043
|
86 |
+
0.16071428571428573,1089,2409.9784637782077
|
87 |
+
0.15079365079365079,1043,2465.783943579372
|
88 |
+
0.13988095238095238,995,2512.2825954364475
|
89 |
+
0.12996031746031747,903,2644.884840422699
|
90 |
+
0.12003968253968254,847,2720.031974699821
|
91 |
+
0.11011904761904762,776,2855.125879628761
|
92 |
+
0.1001984126984127,691,3057.4880652118327
|
93 |
+
0.09027777777777778,634,3191.2654159655003
|
94 |
+
0.08035714285714286,579,3300.3631730935804
|
95 |
+
0.07043650793650794,522,3443.658749880984
|
96 |
+
0.060515873015873016,450,3670.968448884167
|
97 |
+
0.050595238095238096,366,4018.4875984146765
|
98 |
+
0.040674603174603176,280,4475.92163141317
|
99 |
+
0.030753968253968252,253,4696.706526876823
|
100 |
+
0.020833333333333332,164,5331.458638244872
|
101 |
+
0.010912698412698412,82,6411.553184403061
|
102 |
+
0.000992063492063492,14,10241.747512102276
|
metric_analysis/output_original/roberta_paper_metric_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
percentage_of_viral_covered_roberta_paper_metric,nb_of_tweets_labeled_as_viral_roberta_paper_metric,thresholds_roberta_paper_metric
|
2 |
+
1.0,783980,9.052889576573479e-05
|
3 |
+
0.9890873015873016,606979,0.0003409942258809795
|
4 |
+
0.9791666666666666,474954,0.0008589343284472877
|
5 |
+
0.9692460317460317,341935,0.002285587114824846
|
6 |
+
0.9593253968253969,225314,0.0059620822317802535
|
7 |
+
0.9494047619047619,156999,0.011989569411313709
|
8 |
+
0.939484126984127,115968,0.020634770088828545
|
9 |
+
0.9295634920634921,90944,0.03106544666018859
|
10 |
+
0.9196428571428571,67477,0.04963223513274843
|
11 |
+
0.9097222222222222,53428,0.07044762254504668
|
12 |
+
0.8998015873015873,44279,0.0931967814168663
|
13 |
+
0.8898809523809523,35102,0.13081025220011888
|
14 |
+
0.8799603174603174,28663,0.1745575677179806
|
15 |
+
0.8700396825396826,24972,0.21250227357914633
|
16 |
+
0.8601190476190477,22479,0.2465673793342506
|
17 |
+
0.8492063492063492,20166,0.2869145938607858
|
18 |
+
0.8392857142857143,19304,0.304132160348205
|
19 |
+
0.8293650793650794,17811,0.3390347868176622
|
20 |
+
0.8194444444444444,16677,0.3732660056369872
|
21 |
+
0.8095238095238095,15401,0.4148045252952486
|
22 |
+
0.7996031746031746,14505,0.4477972728644806
|
23 |
+
0.7896825396825397,13194,0.5055156298192968
|
24 |
+
0.7797619047619048,12508,0.5409698646849892
|
25 |
+
0.7698412698412699,11931,0.5754220771104159
|
26 |
+
0.7599206349206349,11316,0.6182192984983755
|
27 |
+
0.75,10726,0.6646019719884612
|
28 |
+
0.7400793650793651,10314,0.7019883742889699
|
29 |
+
0.7301587301587301,9788,0.750920049260784
|
30 |
+
0.7202380952380952,9007,0.8391207342688993
|
31 |
+
0.7093253968253969,8603,0.888609564785895
|
32 |
+
0.6994047619047619,8206,0.9414799820328927
|
33 |
+
0.689484126984127,7661,1.0212425782882102
|
34 |
+
0.6795634920634921,7289,1.0890041480074235
|
35 |
+
0.6696428571428571,6780,1.1855272483643158
|
36 |
+
0.6597222222222222,6467,1.2602164579934503
|
37 |
+
0.6498015873015873,6161,1.327281088505106
|
38 |
+
0.6398809523809523,5882,1.4051718048191333
|
39 |
+
0.6299603174603174,5321,1.5835118067715899
|
40 |
+
0.6200396825396826,4966,1.722560123794516
|
41 |
+
0.6101190476190477,4686,1.8317436498146948
|
42 |
+
0.6001984126984127,4484,1.9245385016106697
|
43 |
+
0.5902777777777778,4249,2.051449708767312
|
44 |
+
0.5803571428571429,4128,2.125304741103856
|
45 |
+
0.5694444444444444,3687,2.4125368969101033
|
46 |
+
0.5595238095238095,3468,2.583086678098461
|
47 |
+
0.5496031746031746,3248,2.7815895186317805
|
48 |
+
0.5396825396825397,3152,2.8954501822960066
|
49 |
+
0.5297619047619048,3081,2.955250022079203
|
50 |
+
0.5198412698412699,2908,3.1785787340504648
|
51 |
+
0.5099206349206349,2707,3.428223279567193
|
52 |
+
0.5,2593,3.59870356795998
|
53 |
+
0.49007936507936506,2502,3.7528407022738315
|
54 |
+
0.4801587301587302,2384,3.9638262404902136
|
55 |
+
0.47023809523809523,2157,4.4262194471708876
|
56 |
+
0.4603174603174603,2041,4.7390803022779195
|
57 |
+
0.4503968253968254,1956,4.94650983750502
|
58 |
+
0.44047619047619047,1898,5.125327551050057
|
59 |
+
0.4305555555555556,1743,5.658337935222293
|
60 |
+
0.41964285714285715,1669,5.972958366751632
|
61 |
+
0.4097222222222222,1590,6.348934235182757
|
62 |
+
0.3998015873015873,1476,6.895417771804837
|
63 |
+
0.3898809523809524,1381,7.553164524632679
|
64 |
+
0.37996031746031744,1318,7.95447265160399
|
65 |
+
0.37003968253968256,1182,8.841857620201926
|
66 |
+
0.3601190476190476,1082,9.709458682853722
|
67 |
+
0.3501984126984127,1013,10.492987863082893
|
68 |
+
0.3402777777777778,967,11.162073579916996
|
69 |
+
0.33035714285714285,914,11.924377006525063
|
70 |
+
0.32043650793650796,868,12.499962631113736
|
71 |
+
0.310515873015873,830,13.104004547610508
|
72 |
+
0.3005952380952381,776,14.116974236232336
|
73 |
+
0.2906746031746032,710,15.75957395134853
|
74 |
+
0.27976190476190477,656,17.503118276572664
|
75 |
+
0.2698412698412698,617,18.715704509161537
|
76 |
+
0.25992063492063494,589,19.614803085421624
|
77 |
+
0.25,530,21.76810557585511
|
78 |
+
0.2400793650793651,493,23.383787555656212
|
79 |
+
0.23015873015873015,440,27.26674857348407
|
80 |
+
0.22023809523809523,413,29.10383773593751
|
81 |
+
0.21031746031746032,376,32.10956578521053
|
82 |
+
0.2003968253968254,354,34.49758353529179
|
83 |
+
0.19047619047619047,327,37.27337021039718
|
84 |
+
0.18055555555555555,300,41.873141561514856
|
85 |
+
0.17063492063492064,272,48.023909569922445
|
86 |
+
0.16071428571428573,241,54.36857680039037
|
87 |
+
0.15079365079365079,216,61.84780146447067
|
88 |
+
0.13988095238095238,197,70.16356545125636
|
89 |
+
0.12996031746031747,178,82.08798118734508
|
90 |
+
0.12003968253968254,164,90.38189281594519
|
91 |
+
0.11011904761904762,149,99.31178628400392
|
92 |
+
0.1001984126984127,128,114.91598022184297
|
93 |
+
0.09027777777777778,114,126.08462267664385
|
94 |
+
0.08035714285714286,97,157.76121015148178
|
95 |
+
0.07043650793650794,81,195.60887797794308
|
96 |
+
0.060515873015873016,68,220.9913721970096
|
97 |
+
0.050595238095238096,57,261.8047511787649
|
98 |
+
0.040674603174603176,47,319.3778539107914
|
99 |
+
0.030753968253968252,34,463.6375267632411
|
100 |
+
0.020833333333333332,22,642.3126335309646
|
101 |
+
0.010912698412698412,11,913.2204554544222
|
102 |
+
0.000992063492063492,1,3533.739728797425
|
metric_analysis/output_original/virality_avg_retweets_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
percentage_of_viral_covered_virality_avg_retweets,nb_of_tweets_labeled_as_viral_virality_avg_retweets,thresholds_virality_avg_retweets
|
2 |
+
1.0,99470,0.7599240299001586
|
3 |
+
0.9890873015873016,62079,1.5480748118436458
|
4 |
+
0.9791666666666666,50271,2.0852622686277305
|
5 |
+
0.9692460317460317,43642,2.5354184580479755
|
6 |
+
0.9593253968253969,37923,3.0599798377048133
|
7 |
+
0.9494047619047619,34261,3.4972565994343037
|
8 |
+
0.939484126984127,30526,4.072360076314477
|
9 |
+
0.9295634920634921,28223,4.501034715181624
|
10 |
+
0.9196428571428571,26082,4.999137001078749
|
11 |
+
0.9097222222222222,24560,5.39307122938644
|
12 |
+
0.8998015873015873,21629,6.278415213300843
|
13 |
+
0.8898809523809523,20244,6.807979662864933
|
14 |
+
0.8809523809523809,19770,7.0
|
15 |
+
0.8700396825396826,17741,7.978472899264016
|
16 |
+
0.8601190476190477,17036,8.371478269233975
|
17 |
+
0.8492063492063492,15873,9.0580071984413
|
18 |
+
0.8392857142857143,14621,9.997273862423672
|
19 |
+
0.8293650793650794,13090,11.388669220662166
|
20 |
+
0.8194444444444444,11889,12.681181379795396
|
21 |
+
0.8095238095238095,11077,13.707925776186906
|
22 |
+
0.7996031746031746,9920,15.580116677414129
|
23 |
+
0.7896825396825397,9532,16.27014209744153
|
24 |
+
0.7797619047619048,9205,16.915461328983927
|
25 |
+
0.7698412698412699,8483,18.548537529332275
|
26 |
+
0.7599206349206349,8060,19.641355294192593
|
27 |
+
0.75,7589,20.906896682604128
|
28 |
+
0.7400793650793651,6999,22.874625271719133
|
29 |
+
0.7301587301587301,6475,25.012739959261822
|
30 |
+
0.7202380952380952,6079,26.954204438555614
|
31 |
+
0.7093253968253969,5808,28.183043893453764
|
32 |
+
0.6994047619047619,5385,30.602140549030686
|
33 |
+
0.689484126984127,5052,32.567920421715776
|
34 |
+
0.6795634920634921,4677,35.74808318428063
|
35 |
+
0.6696428571428571,4393,38.21854701763988
|
36 |
+
0.6597222222222222,4164,40.57703645974158
|
37 |
+
0.6498015873015873,3886,43.93985563230335
|
38 |
+
0.6398809523809523,3665,46.847492839618525
|
39 |
+
0.6299603174603174,3524,49.07891508810281
|
40 |
+
0.6200396825396826,3193,54.78245772103394
|
41 |
+
0.6101190476190477,3103,56.50224523966051
|
42 |
+
0.6001984126984127,2892,60.763416151670796
|
43 |
+
0.5902777777777778,2770,63.546233103760386
|
44 |
+
0.5803571428571429,2584,68.03407090113993
|
45 |
+
0.5694444444444444,2484,70.65733437208469
|
46 |
+
0.5595238095238095,2391,73.26624858192154
|
47 |
+
0.5496031746031746,2252,78.06253536357954
|
48 |
+
0.5396825396825397,2052,86.08126921842629
|
49 |
+
0.5297619047619048,1975,88.97259652512547
|
50 |
+
0.5198412698412699,1831,96.64859215298337
|
51 |
+
0.5099206349206349,1723,102.7628034997729
|
52 |
+
0.5,1611,110.04173671014203
|
53 |
+
0.49007936507936506,1523,117.28492475872802
|
54 |
+
0.4801587301587302,1444,124.34249552435475
|
55 |
+
0.47023809523809523,1369,130.09518134268558
|
56 |
+
0.4603174603174603,1290,140.8240668782626
|
57 |
+
0.4503968253968254,1200,150.88390964671086
|
58 |
+
0.44047619047619047,1145,157.99851670144255
|
59 |
+
0.4305555555555556,1101,164.50275477412396
|
60 |
+
0.41964285714285715,1034,175.0797930343889
|
61 |
+
0.4097222222222222,963,187.94005117417709
|
62 |
+
0.3998015873015873,920,193.56605517288256
|
63 |
+
0.3898809523809524,875,202.1903749398219
|
64 |
+
0.37996031746031744,822,216.09849275103548
|
65 |
+
0.37003968253968256,767,235.05164885003057
|
66 |
+
0.3601190476190476,730,249.2392647610653
|
67 |
+
0.3501984126984127,693,264.6301245685592
|
68 |
+
0.3402777777777778,657,275.9762698041734
|
69 |
+
0.33035714285714285,623,292.8722993281888
|
70 |
+
0.32043650793650796,578,310.3159564904467
|
71 |
+
0.310515873015873,546,330.0766694728744
|
72 |
+
0.3005952380952381,507,362.2200443714963
|
73 |
+
0.2906746031746032,474,384.5033519093221
|
74 |
+
0.27976190476190477,444,408.7457694650668
|
75 |
+
0.2698412698412698,421,446.810815150956
|
76 |
+
0.25992063492063494,384,485.52769942049525
|
77 |
+
0.25,367,506.6784740394677
|
78 |
+
0.2400793650793651,348,533.9551789024323
|
79 |
+
0.23015873015873015,327,551.4285181823311
|
80 |
+
0.22023809523809523,303,599.4004450619773
|
81 |
+
0.21031746031746032,286,634.4445375768611
|
82 |
+
0.2003968253968254,266,691.4859160249838
|
83 |
+
0.19047619047619047,248,741.7820711022046
|
84 |
+
0.18055555555555555,232,786.2690567569583
|
85 |
+
0.17063492063492064,219,815.3028458797158
|
86 |
+
0.16071428571428573,201,866.7503880337599
|
87 |
+
0.15079365079365079,182,976.0304653073382
|
88 |
+
0.13988095238095238,171,1058.1367647948662
|
89 |
+
0.12996031746031747,156,1119.7867538390494
|
90 |
+
0.12003968253968254,136,1253.92530228473
|
91 |
+
0.11011904761904762,123,1367.2655234393055
|
92 |
+
0.1001984126984127,112,1440.1384708594214
|
93 |
+
0.09027777777777778,102,1482.542530248446
|
94 |
+
0.08035714285714286,88,1637.9180724687665
|
95 |
+
0.07043650793650794,74,1862.5270685892517
|
96 |
+
0.060515873015873016,62,2003.0277986460685
|
97 |
+
0.050595238095238096,51,2152.268107221841
|
98 |
+
0.040674603174603176,41,2303.2090263636537
|
99 |
+
0.030753968253968252,31,2500.869915344092
|
100 |
+
0.020833333333333332,21,2581.3315487148006
|
101 |
+
0.010912698412698412,11,2754.925734752474
|
102 |
+
0.000992063492063492,1,3128.6932364568866
|
metric_analysis/output_original/virality_followers_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
percentage_of_viral_covered_virality_followers,nb_of_tweets_labeled_as_viral_virality_followers,thresholds_virality_followers
|
2 |
+
1.0,48760,0.009847748921535678
|
3 |
+
0.9890873015873016,20751,0.043596964289750774
|
4 |
+
0.9791666666666666,16478,0.06253956872764575
|
5 |
+
0.9692460317460317,14613,0.07495774715579945
|
6 |
+
0.9593253968253969,14007,0.079949907336287
|
7 |
+
0.9494047619047619,12755,0.09122480283361462
|
8 |
+
0.939484126984127,12184,0.09777349815621926
|
9 |
+
0.9295634920634921,11792,0.10219187496413727
|
10 |
+
0.9196428571428571,11156,0.10996057722208087
|
11 |
+
0.9097222222222222,10647,0.11691437777426382
|
12 |
+
0.8998015873015873,10242,0.12351575701361578
|
13 |
+
0.8898809523809523,9888,0.1296152659458055
|
14 |
+
0.8799603174603174,9485,0.13690525138144047
|
15 |
+
0.8700396825396826,9156,0.14254183437978457
|
16 |
+
0.8601190476190477,8785,0.1499039332865125
|
17 |
+
0.8492063492063492,8428,0.15736230391691272
|
18 |
+
0.8392857142857143,8146,0.16459621222010382
|
19 |
+
0.8293650793650794,7952,0.16919448066654186
|
20 |
+
0.8194444444444444,7671,0.17584905054593755
|
21 |
+
0.8095238095238095,7453,0.18177820463978472
|
22 |
+
0.7996031746031746,7172,0.19122035156187825
|
23 |
+
0.7896825396825397,6925,0.19884880531962626
|
24 |
+
0.7797619047619048,6648,0.20896555791065008
|
25 |
+
0.7698412698412699,6505,0.21502254115615835
|
26 |
+
0.7599206349206349,6314,0.22201613044061455
|
27 |
+
0.75,6103,0.22969755001706385
|
28 |
+
0.7400793650793651,5863,0.2395665365261546
|
29 |
+
0.7301587301587301,5642,0.24941243620086148
|
30 |
+
0.7202380952380952,5257,0.2691313357520127
|
31 |
+
0.7093253968253969,5083,0.2786949054424083
|
32 |
+
0.6994047619047619,4873,0.29185505269258716
|
33 |
+
0.689484126984127,4709,0.3015478411563975
|
34 |
+
0.6795634920634921,4530,0.31431907587712016
|
35 |
+
0.6696428571428571,4365,0.3263783963721026
|
36 |
+
0.6597222222222222,4190,0.3419197998258842
|
37 |
+
0.6498015873015873,4074,0.35240449858579814
|
38 |
+
0.6398809523809523,3906,0.36738570635841483
|
39 |
+
0.6299603174603174,3846,0.37350307338950794
|
40 |
+
0.6200396825396826,3714,0.38569974894121606
|
41 |
+
0.6101190476190477,3433,0.4182156945481237
|
42 |
+
0.6001984126984127,3261,0.4398674447582947
|
43 |
+
0.5902777777777778,3160,0.45260475188842475
|
44 |
+
0.5803571428571429,3009,0.4744476851437965
|
45 |
+
0.5694444444444444,2811,0.5065435623907723
|
46 |
+
0.5595238095238095,2651,0.5376141635489305
|
47 |
+
0.5496031746031746,2548,0.554468003063123
|
48 |
+
0.5396825396825397,2480,0.569913190896405
|
49 |
+
0.5297619047619048,2381,0.5911688811978919
|
50 |
+
0.5198412698412699,2275,0.6145070157601543
|
51 |
+
0.5099206349206349,2189,0.6347704849378184
|
52 |
+
0.5,2102,0.6582912665179754
|
53 |
+
0.49007936507936506,2000,0.6861455175535648
|
54 |
+
0.4801587301587302,1910,0.7136122363660038
|
55 |
+
0.47023809523809523,1840,0.7363839449817713
|
56 |
+
0.4603174603174603,1755,0.7707349266582455
|
57 |
+
0.4503968253968254,1690,0.7949358527010969
|
58 |
+
0.44047619047619047,1623,0.8251487414647105
|
59 |
+
0.4305555555555556,1550,0.8543538482681353
|
60 |
+
0.41964285714285715,1492,0.8837801041456175
|
61 |
+
0.4097222222222222,1407,0.933151427225055
|
62 |
+
0.3998015873015873,1340,0.9728949369054694
|
63 |
+
0.3898809523809524,1281,1.0159094482787683
|
64 |
+
0.37996031746031744,1238,1.0517923348956835
|
65 |
+
0.37003968253968256,1157,1.126624449492463
|
66 |
+
0.3601190476190476,1079,1.1946824800319331
|
67 |
+
0.3501984126984127,1035,1.2350532566427777
|
68 |
+
0.3402777777777778,973,1.3111586957558676
|
69 |
+
0.33035714285714285,930,1.3788231439711065
|
70 |
+
0.32043650793650796,868,1.4648138997453877
|
71 |
+
0.310515873015873,804,1.5657617737942116
|
72 |
+
0.3005952380952381,757,1.6483081903461123
|
73 |
+
0.2906746031746032,713,1.7528878661684644
|
74 |
+
0.27976190476190477,663,1.8714128881700722
|
75 |
+
0.2698412698412698,612,2.01276188638082
|
76 |
+
0.25992063492063494,567,2.1149449648249323
|
77 |
+
0.25,532,2.212570952273289
|
78 |
+
0.2400793650793651,516,2.262779417926834
|
79 |
+
0.23015873015873015,489,2.3775402418530684
|
80 |
+
0.22023809523809523,440,2.5712123367735846
|
81 |
+
0.21031746031746032,412,2.6883909458302067
|
82 |
+
0.2003968253968254,378,2.9011109573007374
|
83 |
+
0.19047619047619047,343,3.1589097951921734
|
84 |
+
0.18055555555555555,318,3.3862837057887427
|
85 |
+
0.17063492063492064,286,3.653621381908462
|
86 |
+
0.16071428571428573,255,4.081553139911354
|
87 |
+
0.15079365079365079,233,4.498533912223866
|
88 |
+
0.13988095238095238,209,5.123369293423201
|
89 |
+
0.12996031746031747,181,5.865184280255489
|
90 |
+
0.12003968253968254,164,6.426211688059448
|
91 |
+
0.11011904761904762,150,7.225324262618784
|
92 |
+
0.1001984126984127,132,8.241192277609564
|
93 |
+
0.09027777777777778,114,8.966047810728663
|
94 |
+
0.08035714285714286,100,9.792804922314204
|
95 |
+
0.07043650793650794,88,11.423473275192162
|
96 |
+
0.060515873015873016,73,13.329307594828814
|
97 |
+
0.050595238095238096,56,17.36539923954372
|
98 |
+
0.040674603174603176,45,20.418879061412717
|
99 |
+
0.030753968253968252,34,27.75797742299627
|
100 |
+
0.020833333333333332,22,41.26621697088786
|
101 |
+
0.010912698412698412,11,59.904126358909444
|
102 |
+
0.000992063492063492,1,162.90697674418604
|
metric_analysis/output_original/virality_median_retweets_viral_covered_vs_new_tweets_labeled 2.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
percentage_of_viral_covered_virality_median_retweets,nb_of_tweets_labeled_as_viral_virality_median_retweets,thresholds_virality_median_retweets
|
2 |
+
1.0,134409,1.0
|
3 |
+
0.9876923076923076,107699,1.5609094071743144
|
4 |
+
0.9784615384615385,100735,1.9992355751561925
|
5 |
+
0.9692307692307692,90951,2.2612983428008073
|
6 |
+
0.96,85185,2.7031504264095916
|
7 |
+
0.9476923076923077,77011,3.2563426595787153
|
8 |
+
0.9384615384615385,65714,4.641389084785888
|
9 |
+
0.9292307692307692,61820,5.1008903893765485
|
10 |
+
0.92,52119,7.312447220141592
|
11 |
+
0.9076923076923077,46235,9.17001356841133
|
12 |
+
0.8984615384615384,44033,10.027246243440574
|
13 |
+
0.8892307692307693,39488,12.639793663256718
|
14 |
+
0.88,32972,17.645820519988817
|
15 |
+
0.8676923076923077,28429,23.0279127002554
|
16 |
+
0.8584615384615385,27034,25.41807646695045
|
17 |
+
0.8492307692307692,24366,30.565914650720305
|
18 |
+
0.84,22694,34.49191625472521
|
19 |
+
0.8276923076923077,20576,40.511283764967985
|
20 |
+
0.8184615384615385,19538,44.345175283888736
|
21 |
+
0.8092307692307692,17102,55.05751773049646
|
22 |
+
0.8,16463,58.88816251076612
|
23 |
+
0.7876923076923077,15830,62.71639784946234
|
24 |
+
0.7784615384615384,14743,70.05540389053742
|
25 |
+
0.7692307692307693,14247,74.15760000000002
|
26 |
+
0.76,12714,88.54265957446803
|
27 |
+
0.7507692307692307,12267,93.8173076923077
|
28 |
+
0.7384615384615385,11823,99.04954407294835
|
29 |
+
0.7292307692307692,10868,113.25071895424837
|
30 |
+
0.72,10400,121.42667780562527
|
31 |
+
0.7107692307692308,9775,133.6674772036474
|
32 |
+
0.6984615384615385,9504,138.80271646859083
|
33 |
+
0.6892307692307692,9241,144.15345029239765
|
34 |
+
0.68,8443,165.98497588652486
|
35 |
+
0.6707692307692308,8079,175.3568580560256
|
36 |
+
0.6584615384615384,7985,178.01515789473683
|
37 |
+
0.6492307692307693,7801,184.9621954484605
|
38 |
+
0.64,7367,201.76998989694889
|
39 |
+
0.6307692307692307,6802,226.3273758865248
|
40 |
+
0.6184615384615385,6055,263.0394202898551
|
41 |
+
0.6092307692307692,5681,289.52545311268716
|
42 |
+
0.6,5261,320.21842105263147
|
43 |
+
0.5907692307692308,4972,345.265
|
44 |
+
0.5784615384615385,4849,357.453793103448
|
45 |
+
0.5692307692307692,4667,379.78126543209873
|
46 |
+
0.56,4518,396.6374331550802
|
47 |
+
0.5507692307692308,4115,451.9967654986525
|
48 |
+
0.5384615384615384,3471,561.6797385620916
|
49 |
+
0.5292307692307693,3301,600.444358974359
|
50 |
+
0.52,3127,644.4646666666665
|
51 |
+
0.5107692307692308,2919,700.650976744186
|
52 |
+
0.5015384615384615,2736,753.530612244898
|
53 |
+
0.48923076923076925,2619,793.9771428571429
|
54 |
+
0.48,2396,887.333684210527
|
55 |
+
0.4707692307692308,2231,975.3399999999999
|
56 |
+
0.46153846153846156,2071,1087.9872268907568
|
57 |
+
0.4492307692307692,1975,1153.6727272727283
|
58 |
+
0.44,1823,1287.986666666667
|
59 |
+
0.4307692307692308,1755,1346.0
|
60 |
+
0.42153846153846153,1607,1516.499999999999
|
61 |
+
0.40923076923076923,1447,1719.88
|
62 |
+
0.4,1407,1778.9000000000003
|
63 |
+
0.39076923076923076,1368,1840.1704347826085
|
64 |
+
0.38153846153846155,1292,1982.1173333333327
|
65 |
+
0.36923076923076925,1271,2013.0342857142857
|
66 |
+
0.36,1195,2140.38
|
67 |
+
0.3507692307692308,1152,2251.7999999999997
|
68 |
+
0.3415384615384615,1125,2290.6666666666665
|
69 |
+
0.3292307692307692,1048,2474.848000000001
|
70 |
+
0.32,997,2616.640000000003
|
71 |
+
0.31076923076923074,946,2800.0149999999994
|
72 |
+
0.30153846153846153,878,3054.0666666666657
|
73 |
+
0.28923076923076924,845,3166.7999999999993
|
74 |
+
0.28,784,3394.6466666666665
|
75 |
+
0.27076923076923076,775,3428.8199999999997
|
76 |
+
0.26153846153846155,719,3691.0899999999992
|
77 |
+
0.2523076923076923,645,4030.0
|
78 |
+
0.24,609,4260.660000000002
|
79 |
+
0.23076923076923078,526,4824.393333333336
|
80 |
+
0.22153846153846155,487,5280.888
|
81 |
+
0.2123076923076923,456,5618.76
|
82 |
+
0.2,419,6033.5599999999995
|
83 |
+
0.19076923076923077,384,6516.36
|
84 |
+
0.18153846153846154,338,7073.840000000001
|
85 |
+
0.1723076923076923,317,7362.599999999994
|
86 |
+
0.16,299,7523.599999999994
|
87 |
+
0.15076923076923077,278,7806.799999999999
|
88 |
+
0.14153846153846153,241,8540.986666666664
|
89 |
+
0.13230769230769232,233,8752.44
|
90 |
+
0.12,209,9376.240000000002
|
91 |
+
0.11076923076923077,194,9990.320000000002
|
92 |
+
0.10153846153846154,179,10952.400000000009
|
93 |
+
0.09230769230769231,167,11616.800000000007
|
94 |
+
0.08,135,13536.373333333351
|
95 |
+
0.07076923076923076,120,14684.879999999996
|
96 |
+
0.06153846153846154,106,16057.560000000001
|
97 |
+
0.052307692307692305,95,17173.200000000004
|
98 |
+
0.04,77,19165.079999999958
|
99 |
+
0.03076923076923077,65,21221.51999999996
|
100 |
+
0.021538461538461538,52,24319.559999999983
|
101 |
+
0.012307692307692308,31,28349.359999999968
|
102 |
+
0.003076923076923077,5,56750.0
|
metric_analysis/output_original/virality_median_retweets_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
percentage_of_viral_covered_virality_median_retweets,nb_of_tweets_labeled_as_viral_virality_median_retweets,thresholds_virality_median_retweets
|
2 |
+
1.0,511764,0.0
|
3 |
+
1.0,511764,0.0
|
4 |
+
1.0,511764,0.0
|
5 |
+
1.0,511764,0.0
|
6 |
+
1.0,511764,0.0
|
7 |
+
1.0,511764,0.0
|
8 |
+
1.0,511764,0.0
|
9 |
+
1.0,511764,0.0
|
10 |
+
1.0,511764,0.0
|
11 |
+
1.0,511764,0.0
|
12 |
+
1.0,511764,0.0
|
13 |
+
1.0,511764,0.0
|
14 |
+
1.0,511764,0.0
|
15 |
+
1.0,511764,0.0
|
16 |
+
1.0,511764,0.0
|
17 |
+
1.0,511764,0.0
|
18 |
+
1.0,511764,0.0
|
19 |
+
1.0,511764,0.0
|
20 |
+
1.0,511764,0.0
|
21 |
+
1.0,511764,0.0
|
22 |
+
1.0,511764,0.0
|
23 |
+
1.0,511764,0.0
|
24 |
+
1.0,511764,0.0
|
25 |
+
1.0,511764,0.0
|
26 |
+
1.0,511764,0.0
|
27 |
+
1.0,511764,0.0
|
28 |
+
1.0,511764,0.0
|
29 |
+
1.0,511764,0.0
|
30 |
+
1.0,511764,0.0
|
31 |
+
1.0,511764,0.0
|
32 |
+
1.0,511764,0.0
|
33 |
+
1.0,511764,0.0
|
34 |
+
1.0,511764,0.0
|
35 |
+
1.0,511764,0.0
|
36 |
+
1.0,511764,0.0
|
37 |
+
1.0,511764,0.0
|
38 |
+
1.0,511764,0.0
|
39 |
+
1.0,511764,0.0
|
40 |
+
1.0,511764,0.0
|
41 |
+
1.0,511764,0.0
|
42 |
+
1.0,511764,0.0
|
43 |
+
1.0,511764,0.0
|
44 |
+
1.0,511764,0.0
|
45 |
+
1.0,511764,0.0
|
46 |
+
1.0,511764,0.0
|
47 |
+
1.0,511764,0.0
|
48 |
+
1.0,511764,0.0
|
49 |
+
1.0,511764,0.0
|
50 |
+
1.0,511764,0.0
|
51 |
+
1.0,511764,0.0
|
52 |
+
1.0,511764,0.0
|
53 |
+
1.0,511764,0.0
|
54 |
+
1.0,511764,0.0
|
55 |
+
1.0,511764,0.0
|
56 |
+
1.0,511764,0.0
|
57 |
+
1.0,511764,0.0
|
58 |
+
1.0,511764,0.0
|
59 |
+
1.0,511764,0.0
|
60 |
+
1.0,511764,0.0
|
61 |
+
1.0,511764,0.0
|
62 |
+
1.0,511764,0.0
|
63 |
+
1.0,511764,0.0
|
64 |
+
1.0,511764,0.0
|
65 |
+
1.0,511764,0.0
|
66 |
+
1.0,511764,0.0
|
67 |
+
1.0,511764,0.0
|
68 |
+
1.0,511764,0.0
|
69 |
+
1.0,511764,0.0
|
70 |
+
0.32242063492063494,134409,1.0
|
71 |
+
0.310515873015873,86051,2.6456794555995926
|
72 |
+
0.3005952380952381,61961,5.027143821742062
|
73 |
+
0.2906746031746032,45428,9.636950619740496
|
74 |
+
0.27976190476190477,28429,23.02300286355545
|
75 |
+
0.2698412698412698,22643,34.6215403148756
|
76 |
+
0.25992063492063494,17026,55.781783181357575
|
77 |
+
0.25,14577,71.5297497155859
|
78 |
+
0.2400793650793651,12130,95.42360795606041
|
79 |
+
0.23015873015873015,9916,130.25747244296335
|
80 |
+
0.22023809523809523,8532,163.15705
|
81 |
+
0.21031746031746032,7881,181.93631481481526
|
82 |
+
0.2003968253968254,6185,256.67272727272757
|
83 |
+
0.19047619047619047,4979,344.9141220238097
|
84 |
+
0.18055555555555555,4507,397.92005347593505
|
85 |
+
0.17063492063492064,3285,605.9448076923071
|
86 |
+
0.16071428571428573,2717,759.2236734693877
|
87 |
+
0.15079365079365079,2189,1004.0008333333315
|
88 |
+
0.13988095238095238,1789,1316.0649999999996
|
89 |
+
0.12996031746031747,1422,1754.7858823529411
|
90 |
+
0.12003968253968254,1274,2010.7097142857142
|
91 |
+
0.11011904761904762,1129,2285.5833333333335
|
92 |
+
0.1001984126984127,947,2795.887500000001
|
93 |
+
0.09027777777777778,783,3395.8991666666666
|
94 |
+
0.08035714285714286,643,4035.7200000000007
|
95 |
+
0.07043650793650794,476,5388.830000000004
|
96 |
+
0.060515873015873016,370,6665.859999999988
|
97 |
+
0.050595238095238096,284,7694.966666666665
|
98 |
+
0.040674603174603176,219,9203.879999999965
|
99 |
+
0.030753968253968252,172,11420.029999999995
|
100 |
+
0.020833333333333332,108,15734.440000000006
|
101 |
+
0.010912698412698412,68,20769.80999999998
|
102 |
+
0.000992063492063492,5,56750.0
|
metric_analysis/output_original/virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
percentage_of_viral_covered_virality_retweet_percentile_per_user,nb_of_tweets_labeled_as_viral_virality_retweet_percentile_per_user,thresholds_virality_retweet_percentile_per_user
|
2 |
+
1.0,1357228,0.0
|
3 |
+
1.0,1357165,0.01
|
4 |
+
1.0,1357075,0.02
|
5 |
+
1.0,1356795,0.03
|
6 |
+
1.0,1356479,0.04
|
7 |
+
1.0,1355978,0.05
|
8 |
+
1.0,1355779,0.06
|
9 |
+
1.0,1355441,0.07
|
10 |
+
1.0,1355077,0.08
|
11 |
+
1.0,1353860,0.09
|
12 |
+
1.0,1353400,0.1
|
13 |
+
1.0,1352555,0.11
|
14 |
+
1.0,1351992,0.12
|
15 |
+
1.0,1351289,0.13
|
16 |
+
1.0,1349915,0.14
|
17 |
+
1.0,1348735,0.15
|
18 |
+
1.0,1348367,0.16
|
19 |
+
1.0,1347269,0.17
|
20 |
+
1.0,1345947,0.18
|
21 |
+
1.0,1344634,0.19
|
22 |
+
1.0,1342750,0.2
|
23 |
+
1.0,1341753,0.21
|
24 |
+
1.0,1340292,0.22
|
25 |
+
1.0,1336558,0.23
|
26 |
+
1.0,1335271,0.24
|
27 |
+
1.0,1333139,0.25
|
28 |
+
1.0,1331241,0.26
|
29 |
+
1.0,1328699,0.27
|
30 |
+
1.0,1326128,0.28
|
31 |
+
1.0,1324353,0.29
|
32 |
+
1.0,1323643,0.3
|
33 |
+
1.0,1320614,0.31
|
34 |
+
1.0,1319676,0.32
|
35 |
+
1.0,1314107,0.33
|
36 |
+
1.0,1307821,0.34
|
37 |
+
1.0,1305415,0.35
|
38 |
+
1.0,1298334,0.36
|
39 |
+
1.0,1297104,0.37
|
40 |
+
1.0,1293457,0.38
|
41 |
+
1.0,1289359,0.39
|
42 |
+
1.0,1286088,0.4
|
43 |
+
1.0,1282661,0.41
|
44 |
+
1.0,1277658,0.42
|
45 |
+
1.0,1272885,0.43
|
46 |
+
1.0,1266459,0.44
|
47 |
+
1.0,1260552,0.45
|
48 |
+
1.0,1253282,0.46
|
49 |
+
1.0,1249633,0.47
|
50 |
+
1.0,1247341,0.48
|
51 |
+
1.0,1243766,0.49
|
52 |
+
1.0,1236825,0.5
|
53 |
+
1.0,1225501,0.51
|
54 |
+
1.0,1214100,0.52
|
55 |
+
1.0,1203663,0.53
|
56 |
+
1.0,1194775,0.54
|
57 |
+
1.0,1189887,0.55
|
58 |
+
1.0,1183528,0.56
|
59 |
+
1.0,1174373,0.57
|
60 |
+
0.9990079365079365,1162858,0.58
|
61 |
+
0.9990079365079365,1149726,0.59
|
62 |
+
0.9990079365079365,1138170,0.6
|
63 |
+
0.9990079365079365,1123765,0.61
|
64 |
+
0.9990079365079365,1108718,0.62
|
65 |
+
0.9990079365079365,1093939,0.63
|
66 |
+
0.9990079365079365,1072643,0.64
|
67 |
+
0.9970238095238095,1054184,0.65
|
68 |
+
0.9970238095238095,1029756,0.66
|
69 |
+
0.996031746031746,1014722,0.67
|
70 |
+
0.996031746031746,998191,0.68
|
71 |
+
0.996031746031746,976124,0.69
|
72 |
+
0.996031746031746,954849,0.7
|
73 |
+
0.996031746031746,937666,0.71
|
74 |
+
0.9950396825396826,912003,0.72
|
75 |
+
0.9950396825396826,881940,0.73
|
76 |
+
0.9950396825396826,850593,0.74
|
77 |
+
0.9950396825396826,822465,0.75
|
78 |
+
0.9940476190476191,795899,0.76
|
79 |
+
0.9940476190476191,775478,0.77
|
80 |
+
0.9940476190476191,741226,0.78
|
81 |
+
0.9940476190476191,707711,0.79
|
82 |
+
0.9940476190476191,680288,0.8
|
83 |
+
0.9940476190476191,646190,0.81
|
84 |
+
0.9930555555555556,610579,0.82
|
85 |
+
0.9910714285714286,574204,0.83
|
86 |
+
0.9880952380952381,528701,0.84
|
87 |
+
0.9861111111111112,490552,0.85
|
88 |
+
0.9851190476190477,462519,0.86
|
89 |
+
0.9831349206349206,422834,0.87
|
90 |
+
0.9791666666666666,375135,0.88
|
91 |
+
0.9771825396825397,339615,0.89
|
92 |
+
0.9742063492063492,310028,0.9
|
93 |
+
0.9662698412698413,272234,0.91
|
94 |
+
0.9583333333333334,238956,0.92
|
95 |
+
0.9523809523809523,203539,0.93
|
96 |
+
0.9424603174603174,160095,0.94
|
97 |
+
0.9305555555555556,117906,0.95
|
98 |
+
0.9206349206349206,92538,0.96
|
99 |
+
0.9067460317460317,61903,0.97
|
100 |
+
0.876984126984127,46379,0.98
|
101 |
+
0.816468253968254,20605,0.99
|
102 |
+
0.503968253968254,814,1.0
|
metric_analysis/output_standardized/hard_threshold_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,843 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.0009920634920634,23,96321.9309930993,0.0008195845062894203,1.6946305263375057e-05
|
3 |
+
0.0019841269841269,34,88482.72157215721,0.0012115597049495777,2.505105995455443e-05
|
4 |
+
0.0029761904761904,40,83740.72127212721,0.001425364358764209,2.947183524065227e-05
|
5 |
+
0.0039682539682539,48,80460.75757575758,0.001710437230517051,3.536620228878272e-05
|
6 |
+
0.0049603174603174,49,80095.24842484249,0.001746071339486156,3.6102998169799034e-05
|
7 |
+
0.0059523809523809,62,76728.71677167717,0.002209314756084524,4.568134462301102e-05
|
8 |
+
0.0069444444444444,77,71342.26612661267,0.0027438263906211027,5.673328283825562e-05
|
9 |
+
0.0079365079365079,86,69033.78727872788,0.0030645333713430496,6.336444576740239e-05
|
10 |
+
0.0089285714285714,102,66003.9087908791,0.0036346791148487334,7.515317986366329e-05
|
11 |
+
0.0099206349206349,104,65898.10351035104,0.0037059473327869436,7.662677162569591e-05
|
12 |
+
0.0109126984126984,106,65667.25562556257,0.0037772155507251543,7.810036338772852e-05
|
13 |
+
0.0119047619047619,114,64118.65106510651,0.004062288422477996,8.399473043585897e-05
|
14 |
+
0.0128968253968253,122,63320.30213021302,0.004347361294230838,8.988909748398942e-05
|
15 |
+
0.0138888888888888,131,61848.64686468647,0.0046680682749527845,9.652026041313618e-05
|
16 |
+
0.0148809523809523,142,60136.52505250525,0.0050600434736129424,0.00010462501510431557
|
17 |
+
0.0158730158730158,173,56943.12931293129,0.006164700851655204,0.00012746568741582107
|
18 |
+
0.0168650793650793,176,56433.3402340234,0.00627160317856252,0.00012967607505887
|
19 |
+
0.0178571428571428,180,56135.16171617162,0.006414139614438941,0.0001326232585829352
|
20 |
+
0.0188492063492063,187,55000.1596159616,0.0066635783772226774,0.00013778082975004938
|
21 |
+
0.0198412698412698,199,54028.67476747675,0.0070911876848519404,0.00014662238032224506
|
22 |
+
0.0208333333333333,207,53268.800480048005,0.007376260556604782,0.0001525167473703755
|
23 |
+
0.0218253968253968,210,53105.283228322834,0.0074831628835120975,0.00015472713501342442
|
24 |
+
0.0228174603174603,233,50604.43114311432,0.008302747389801518,0.00017167344027679948
|
25 |
+
0.0238095238095238,238,50383.20192019202,0.008480917934647045,0.000175357419681881
|
26 |
+
0.0248015873015873,244,49584.85298529853,0.008694722588461675,0.00017977819496797884
|
27 |
+
0.0257936507936507,264,47968.917791779175,0.00940740476784378,0.00019451411258830499
|
28 |
+
0.0267857142857142,274,47449.5100510051,0.009763745857534832,0.00020188207139846805
|
29 |
+
0.0287698412698412,279,47103.23822382238,0.009941916402380358,0.0002055660508035496
|
30 |
+
0.0297619047619047,287,46304.88928892889,0.010226989274133201,0.00021146041785168005
|
31 |
+
0.0307539682539682,291,46054.804080408045,0.010369525710009622,0.00021440760137574527
|
32 |
+
0.0317460317460317,323,43890.60516051605,0.011509817197020988,0.00023798506956826707
|
33 |
+
0.0327380952380952,332,43659.75727572757,0.011830524177742935,0.00024461623249741386
|
34 |
+
0.0337301587301587,356,42486.280528052805,0.012685742793001461,0.00026229933364180523
|
35 |
+
0.0347222222222222,358,42447.80588058806,0.012757011010939671,0.0002637729254038378
|
36 |
+
0.0357142857142857,369,42053.44074407441,0.013148986209599828,0.0002718776800950172
|
37 |
+
0.0367063492063492,373,41610.98229822982,0.01329152264547625,0.0002748248636190824
|
38 |
+
0.0376984126984126,374,41601.36363636364,0.013327156754445355,0.00027556165950009875
|
39 |
+
0.0386904761904761,379,41466.70237023702,0.013505327299290881,0.00027924563890518024
|
40 |
+
0.0396825396825396,383,41322.42244224423,0.013647863735167302,0.0002821928224292455
|
41 |
+
0.0406746031746031,394,40995.38793879388,0.01403983893382746,0.00029029757712042487
|
42 |
+
0.0416666666666666,409,40754.921392139215,0.014574350568364038,0.00030134951533566945
|
43 |
+
0.0426587301587301,417,40485.59885988599,0.01485942344011688,0.00030724388238379994
|
44 |
+
0.0436507936507936,425,40168.18301830183,0.015144496311869721,0.0003131382494319304
|
45 |
+
0.0446428571428571,428,39821.91119111912,0.015251398638777038,0.0003153486370749793
|
46 |
+
0.0456349206349206,437,39494.876687668766,0.015572105619498984,0.00032197980000412606
|
47 |
+
0.0466269841269841,455,38936.99429942994,0.01621351958094288,0.0003352421258624196
|
48 |
+
0.0476190476190476,458,38840.80768076808,0.016320421907850192,0.0003374525135054685
|
49 |
+
0.0486111111111111,469,38456.06120612061,0.016712397106510353,0.00034555726819664786
|
50 |
+
0.0496031746031746,471,38398.349234923495,0.01678366532444856,0.0003470308599586805
|
51 |
+
0.050595238095238,476,38205.97599759976,0.01696183586929409,0.000350714839363762
|
52 |
+
0.0515873015873015,483,37811.61086108611,0.017211274632077826,0.0003558724105308762
|
53 |
+
0.052579365079365,488,37542.28832883288,0.01738944517692335,0.0003595563899359577
|
54 |
+
0.0535714285714285,508,36888.21932193219,0.018102127356305456,0.00037429230755628385
|
55 |
+
0.054563492063492,523,36407.28622862286,0.018636638990842034,0.00038534424577152843
|
56 |
+
0.0555555555555555,525,36311.099609961,0.018707907208780246,0.0003868178375335611
|
57 |
+
0.056547619047619,551,35743.598559855986,0.01963439404197698,0.00040597453043998504
|
58 |
+
0.0575396825396825,563,35474.27602760276,0.02006200334960624,0.0004148160810121807
|
59 |
+
0.058531746031746,568,35349.23342334233,0.02024017389445177,0.00041850006041726227
|
60 |
+
0.0595238095238095,570,35329.99609960996,0.020311442112389978,0.00041997365217929486
|
61 |
+
0.060515873015873,597,34589.35913591359,0.02127356305455582,0.0004398671409667351
|
62 |
+
0.0615079365079365,620,33656.34893489349,0.022093147560845242,0.0004568134462301102
|
63 |
+
0.0625,624,33540.92499249925,0.022235683996721663,0.00045976062975417543
|
64 |
+
0.0634920634920634,632,33348.55175517552,0.022520756868474504,0.00046565499680230587
|
65 |
+
0.0644841269841269,640,33223.509150915095,0.022805829740227344,0.0004715493638504363
|
66 |
+
0.0654761904761904,641,33204.27182718272,0.022841463849196452,0.00047228615973145266
|
67 |
+
0.0664682539682539,653,33021.51725172518,0.023269073156825713,0.00048112771030364834
|
68 |
+
0.0674603174603174,655,32983.04260426042,0.023340341374763925,0.00048260130206568094
|
69 |
+
0.0684523809523809,657,32963.80528052805,0.023411609592702134,0.00048407489382771353
|
70 |
+
0.0694444444444444,658,32954.18661866187,0.023447243701671238,0.00048481168970872983
|
71 |
+
0.0704365079365079,659,32906.09330933093,0.023482877810640346,0.0004855484855897462
|
72 |
+
0.0724206349206349,663,32896.47464746475,0.023625414246516766,0.0004884956691138114
|
73 |
+
0.0734126984126984,665,32809.90669066907,0.023696682464454975,0.000489969260875844
|
74 |
+
0.0744047619047619,671,32704.10141014101,0.023910487118269607,0.0004943900361619419
|
75 |
+
0.0753968253968253,676,32579.058805880588,0.024088657663115135,0.0004980740155670234
|
76 |
+
0.0763888888888889,695,32155.83768376837,0.024765705733528133,0.0005120731373063332
|
77 |
+
0.0773809523809523,706,31934.608460846084,0.02515768093218829,0.0005201778919975126
|
78 |
+
0.0783730158730158,716,31722.997899789974,0.025514022021879343,0.0005275458508076756
|
79 |
+
0.0793650793650793,718,31703.760576057604,0.025585290239817555,0.0005290194425697082
|
80 |
+
0.0803571428571428,722,31655.667266726672,0.025727826675693975,0.0005319666260937734
|
81 |
+
0.0813492063492063,734,31299.77677767777,0.026155435983323237,0.0005408081766659691
|
82 |
+
0.0823412698412698,740,31107.40354035404,0.02636924063713787,0.000545228951952067
|
83 |
+
0.0833333333333333,748,30915.030303030304,0.02665431350889071,0.0005511233190001975
|
84 |
+
0.0843253968253968,754,30866.93699369937,0.026868118162705342,0.0005555440942862953
|
85 |
+
0.0853174603174603,760,30780.36903690369,0.027081922816519974,0.0005599648695723932
|
86 |
+
0.0873015873015873,776,30280.198619861985,0.027652068560025656,0.0005717536036686541
|
87 |
+
0.0882936507936508,784,30145.537353735373,0.027937141431778497,0.0005776479707167845
|
88 |
+
0.0892857142857142,801,29847.358835883588,0.028542921284253286,0.0005901735006940617
|
89 |
+
0.0902777777777777,804,29828.12151215121,0.028649823611160603,0.0005923838883371106
|
90 |
+
0.0912698412698412,806,29780.02820282028,0.028721091829098815,0.0005938574800991432
|
91 |
+
0.0922619047619047,842,29097.10321032103,0.030003919751986602,0.0006203821318157303
|
92 |
+
0.0932539682539682,847,28856.63666366637,0.030182090296832127,0.0006240661112208118
|
93 |
+
0.0942460317460317,856,28721.975397539754,0.030502797277554075,0.0006306972741499586
|
94 |
+
0.0952380952380952,861,28616.1701170117,0.0306809678223996,0.0006343812535550401
|
95 |
+
0.0962301587301587,869,28443.034203420342,0.030966040694152444,0.0006402756206031706
|
96 |
+
0.0972222222222222,870,28433.41554155416,0.03100167480312155,0.0006410124164841869
|
97 |
+
0.0982142857142857,874,28356.466246624663,0.03114421123899797,0.0006439596000082521
|
98 |
+
0.0992063492063492,889,28000.57575757576,0.03167872287353455,0.0006550115382234967
|
99 |
+
0.1001984126984127,905,27760.10921092109,0.03224886861704023,0.0006668002723197576
|
100 |
+
0.1011904761904761,913,27596.59195919592,0.03253394148879307,0.0006726946393678881
|
101 |
+
0.1021825396825396,915,27481.16801680168,0.03260520970673128,0.0006741682311299207
|
102 |
+
0.1031746031746031,929,27269.557455745577,0.033104087232298754,0.000684483373464149
|
103 |
+
0.1041666666666666,967,26836.717671767172,0.03445818337312476,0.0007124816169427686
|
104 |
+
0.1051587301587301,968,26827.09900990099,0.03449381748209386,0.000713218412823785
|
105 |
+
0.1061507936507936,972,26682.81908190819,0.03463635391797028,0.0007161655963478502
|
106 |
+
0.1071428571428571,978,26634.725772577254,0.03485015857178491,0.0007205863716339481
|
107 |
+
0.1081349206349206,983,26528.920492049205,0.03502832911663044,0.0007242703510390295
|
108 |
+
0.1091269841269841,998,26355.78457845785,0.035562840751167016,0.0007353222892542741
|
109 |
+
0.1101190476190476,1002,26307.691269126917,0.03570537718704344,0.0007382694727783394
|
110 |
+
0.1111111111111111,1007,26269.216621662166,0.03588354773188896,0.0007419534521834209
|
111 |
+
0.1121031746031746,1015,26153.792679267928,0.03616862060364181,0.0007478478192315514
|
112 |
+
0.1130952380952381,1025,26086.462046204622,0.03652496169333286,0.0007552157780417144
|
113 |
+
0.1140873015873015,1030,26028.7500750075,0.036703132238178386,0.000758899757446796
|
114 |
+
0.115079365079365,1050,25807.52085208521,0.03741581441756049,0.0007736356750671222
|
115 |
+
0.1160714285714285,1056,25759.427542754274,0.03762961907137512,0.0007780564503532199
|
116 |
+
0.117063492063492,1074,25567.05430543054,0.038271033032819014,0.0007913187762115134
|
117 |
+
0.1180555555555555,1075,25557.43564356436,0.03830666714178812,0.0007920555720925298
|
118 |
+
0.119047619047619,1077,25538.198319831983,0.03837793535972633,0.0007935291638545624
|
119 |
+
0.1200396825396825,1084,25461.24902490249,0.038627374122510064,0.0007986867350216765
|
120 |
+
0.121031746031746,1089,25413.15571557156,0.03880554466735559,0.000802370714426758
|
121 |
+
0.1220238095238095,1097,25307.350435043503,0.03909061753910843,0.0008082650814748885
|
122 |
+
0.123015873015873,1106,25211.16381638164,0.039411324519830385,0.0008148962444040353
|
123 |
+
0.1240079365079365,1112,25105.358535853586,0.03962512917364501,0.0008193170196901332
|
124 |
+
0.125,1118,25047.646564656465,0.03983893382745964,0.000823737794976231
|
125 |
+
0.1259920634920635,1120,25038.02790279028,0.03991020204539786,0.0008252113867382635
|
126 |
+
0.1269841269841269,1128,24912.985298529853,0.0401952749171507,0.000831105753786394
|
127 |
+
0.1279761904761904,1132,24884.12931293129,0.040337811353027116,0.0008340529373104592
|
128 |
+
0.1289682539682539,1134,24874.510651065106,0.04040907957096533,0.0008355265290724918
|
129 |
+
0.1299603174603174,1136,24864.89198919892,0.04048034778890354,0.0008370001208345245
|
130 |
+
0.1309523809523809,1138,24855.27332733273,0.04055161600684175,0.0008384737125965571
|
131 |
+
0.1319444444444444,1143,24797.56135613561,0.040729786551687276,0.0008421576920016387
|
132 |
+
0.1329365079365079,1168,24451.289528952897,0.0416206392759149,0.0008605775890270463
|
133 |
+
0.1339285714285714,1183,24172.348334833485,0.04215515091045149,0.0008716295272422909
|
134 |
+
0.1349206349206349,1186,24162.729672967296,0.0422620532373588,0.0008738399148853399
|
135 |
+
0.1369047619047619,1194,24105.01770177018,0.04254712610911164,0.0008797342819334702
|
136 |
+
0.1378968253968254,1196,24076.161716171617,0.04261839432704985,0.0008812078736955029
|
137 |
+
0.1388888888888889,1198,24056.924392439243,0.042689662544988065,0.0008826814654575355
|
138 |
+
0.1398809523809523,1200,24037.68706870687,0.04276093076292627,0.0008841550572195681
|
139 |
+
0.1408730158730158,1202,24018.4497449745,0.04283219898086448,0.0008856286489816007
|
140 |
+
0.1418650793650793,1210,23931.881788178816,0.04311727185261732,0.0008915230160297312
|
141 |
+
0.1428571428571428,1217,23797.220522052205,0.04336671061540106,0.0008966805871968454
|
142 |
+
0.1438492063492063,1221,23777.98319831983,0.04350924705127748,0.0008996277707209106
|
143 |
+
0.1448412698412698,1240,23556.75397539754,0.044186295121690485,0.0009136268924602204
|
144 |
+
0.1458333333333333,1242,23537.51665166517,0.04425756333962869,0.000915100484222253
|
145 |
+
0.1478174603174603,1249,23470.18601860186,0.04450700210241243,0.0009202580553893671
|
146 |
+
0.1488095238095238,1258,23383.61806180618,0.044827709083134375,0.000926889218318514
|
147 |
+
0.1498015873015873,1266,23287.43144314432,0.045112781954887216,0.0009327835853666443
|
148 |
+
0.1507936507936507,1271,23268.194119411943,0.045290952499732744,0.0009364675647717259
|
149 |
+
0.1517857142857142,1273,23248.95679567957,0.04536222071767095,0.0009379411565337585
|
150 |
+
0.1527777777777778,1287,23095.058205820584,0.045861098243238425,0.0009482562988679869
|
151 |
+
0.1547619047619047,1307,22931.54095409541,0.04657378042262053,0.0009629922164883129
|
152 |
+
0.1557539682539682,1308,22921.922292229225,0.046609414531589635,0.0009637290123693293
|
153 |
+
0.1577380952380952,1312,22912.30363036304,0.04675195096746606,0.0009666761958933945
|
154 |
+
0.1587301587301587,1315,22902.68496849685,0.04685885329437337,0.0009688865835364434
|
155 |
+
0.1597222222222222,1317,22883.44764476448,0.04693012151231159,0.000970360175298476
|
156 |
+
0.1607142857142857,1319,22873.82898289829,0.047001389730249796,0.0009718337670605086
|
157 |
+
0.1617063492063492,1320,22864.210321032104,0.0470370238392189,0.000972570562941525
|
158 |
+
0.1626984126984127,1326,22816.11701170117,0.04725082849303353,0.0009769913382276227
|
159 |
+
0.1636904761904762,1337,22719.930393039303,0.047642803691693686,0.0009850960929188022
|
160 |
+
0.1646825396825396,1342,22691.074407440745,0.047820974236539214,0.0009887800723238837
|
161 |
+
0.1656746031746031,1352,22585.26912691269,0.04817731532623027,0.0009961480311340468
|
162 |
+
0.1666666666666666,1356,22479.46384638464,0.04831985176210669,0.000999095214658112
|
163 |
+
0.1676587301587301,1366,22325.565256525653,0.048676192851797744,0.001006463173468275
|
164 |
+
0.1686507936507936,1367,22315.946594659465,0.04871182696076685,0.0010071999693492914
|
165 |
+
0.1706349206349206,1371,22296.70927092709,0.048854363396643265,0.0010101471528733566
|
166 |
+
0.1716269841269841,1390,22133.19201920192,0.049531411467056266,0.0010241462746126665
|
167 |
+
0.1726190476190476,1396,22065.86138613861,0.0497452161208709,0.0010285670498987644
|
168 |
+
0.1736111111111111,1403,21998.53075307531,0.049994654883654635,0.0010337246210658784
|
169 |
+
0.1746031746031746,1409,21960.05610561056,0.05020845953746927,0.0010381453963519763
|
170 |
+
0.175595238095238,1418,21911.96279627963,0.05052916651819121,0.001044776559281123
|
171 |
+
0.1765873015873016,1421,21883.106810681067,0.050636068845098525,0.0010469869469241719
|
172 |
+
0.1775793650793651,1425,21863.869486948694,0.05077860528097495,0.001049934130448237
|
173 |
+
0.1785714285714285,1430,21786.920192019203,0.05095677582582048,0.0010536181098533186
|
174 |
+
0.179563492063492,1433,21767.68286828683,0.05106367815272779,0.0010558284974963676
|
175 |
+
0.1805555555555555,1435,21738.82688268827,0.051134946370666,0.0010573020892584001
|
176 |
+
0.181547619047619,1442,21690.73357335733,0.051384385133449735,0.0010624596604255144
|
177 |
+
0.1825396825396825,1446,21661.877587758776,0.05152692156932616,0.0010654068439495796
|
178 |
+
0.183531746031746,1451,21633.021602160217,0.05170509211417169,0.0010690908233546611
|
179 |
+
0.1845238095238095,1467,21479.123012301232,0.05227523785767737,0.001080879557450922
|
180 |
+
0.185515873015873,1487,21325.224422442243,0.052987920037059474,0.0010956154750712483
|
181 |
+
0.1865079365079365,1493,21296.368436843684,0.05320172469087411,0.001100036250357346
|
182 |
+
0.1875,1498,21238.656465646563,0.05337989523571963,0.0011037202297624275
|
183 |
+
0.1884920634920635,1500,21219.419141914197,0.05345116345365784,0.0011051938215244602
|
184 |
+
0.1894841269841269,1502,21200.18181818182,0.05352243167159605,0.0011066674132864927
|
185 |
+
0.1914682539682539,1507,21180.944494449446,0.05370060221644158,0.0011103513926915742
|
186 |
+
0.1924603174603174,1509,21171.325832583258,0.05377187043437979,0.001111824984453607
|
187 |
+
0.1934523809523809,1512,21113.61386138614,0.0538787727612871,0.0011140353720966558
|
188 |
+
0.1954365079365079,1517,21075.139213921397,0.05405694330613263,0.0011177193515017373
|
189 |
+
0.1964285714285714,1518,21065.520552055204,0.054092577415101734,0.0011184561473827537
|
190 |
+
0.1974206349206349,1527,20950.096609660966,0.054413284395823686,0.0011250873103119004
|
191 |
+
0.1984126984126984,1533,20911.62196219622,0.05462708904963831,0.0011295080855979983
|
192 |
+
0.1994047619047619,1543,20815.435343534355,0.05498343013932937,0.0011368760444081614
|
193 |
+
0.2003968253968254,1545,20786.579357935792,0.055054698357267576,0.0011383496361701939
|
194 |
+
0.2013888888888889,1549,20738.48604860486,0.055197234793144,0.0011412968196942593
|
195 |
+
0.2023809523809523,1551,20709.6300630063,0.05526850301108221,0.0011427704114562918
|
196 |
+
0.2033730158730158,1555,20651.91809180918,0.05541103944695863,0.001145717594980357
|
197 |
+
0.2043650793650793,1589,20344.12091209121,0.056622599151908204,0.0011707686549349114
|
198 |
+
0.2053571428571428,1594,20296.02760276028,0.05680076969675373,0.001174452634339993
|
199 |
+
0.2063492063492063,1605,20161.366336633662,0.05719274489541389,0.0011825573890311724
|
200 |
+
0.2073412698412698,1624,20065.1797179718,0.05786979296582689,0.0011965565107704822
|
201 |
+
0.2083333333333333,1640,19940.13711371137,0.058439938709332576,0.0012083452448667432
|
202 |
+
0.2093253968253968,1641,19930.518451845182,0.05847557281830168,0.0012090820407477593
|
203 |
+
0.2103174603174603,1660,19843.950495049507,0.059152620888714674,0.0012230811624870692
|
204 |
+
0.2113095238095238,1689,19680.433243324333,0.06018601004881873,0.001244448243036542
|
205 |
+
0.2123015873015873,1694,19641.958595859585,0.06036418059366425,0.0012481322224416236
|
206 |
+
0.2132936507936507,1698,19593.865286528653,0.06050671702954068,0.0012510794059656888
|
207 |
+
0.2142857142857142,1714,19507.297329732974,0.06107686277304636,0.0012628681400619498
|
208 |
+
0.2152777777777778,1715,19497.67866786679,0.06111249688201546,0.0012636049359429661
|
209 |
+
0.2172619047619047,1721,19468.822682268223,0.061326301535830095,0.001268025711229064
|
210 |
+
0.2182539682539682,1722,19459.20402040204,0.0613619356447992,0.0012687625071100802
|
211 |
+
0.2192460317460317,1740,19334.16141614161,0.0620033496062431,0.0012820248329683738
|
212 |
+
0.2202380952380952,1745,19314.92409240924,0.062181520151088625,0.0012857088123734554
|
213 |
+
0.2212301587301587,1748,19295.686768676867,0.06228842247799594,0.0012879192000165042
|
214 |
+
0.2222222222222222,1752,19276.449444944494,0.06243095891387236,0.0012908663835405694
|
215 |
+
0.2232142857142857,1757,19228.35613561356,0.06260912945871788,0.001294550362945651
|
216 |
+
0.2242063492063492,1758,19218.737473747376,0.06264476356768699,0.0012952871588266673
|
217 |
+
0.2251984126984127,1768,19151.406840684067,0.06300110465737804,0.0013026551176368304
|
218 |
+
0.2261904761904762,1774,19122.55085508551,0.06321490931119267,0.0013070758929229283
|
219 |
+
0.2271825396825396,1775,19103.31353135313,0.06325054342016177,0.0013078126888039444
|
220 |
+
0.2281746031746031,1784,19084.07620762076,0.06357125040088372,0.0013144438517330912
|
221 |
+
0.2291666666666666,1787,19074.457545754576,0.06367815272779104,0.0013166542393761402
|
222 |
+
0.2301587301587301,1790,19045.601560156018,0.06378505505469836,0.001318864627019189
|
223 |
+
0.2311507936507936,1796,19016.74557455745,0.06399885970851299,0.001323285402305287
|
224 |
+
0.2321428571428571,1804,18987.889588958897,0.06428393258026584,0.0013291797693534173
|
225 |
+
0.2331349206349206,1810,18939.796279627964,0.06449773723408046,0.0013336005446395152
|
226 |
+
0.2341269841269841,1812,18930.177617761776,0.06456900545201867,0.001335074136401548
|
227 |
+
0.2351190476190476,1822,18862.84698469847,0.06492534654170973,0.001342442095211711
|
228 |
+
0.2361111111111111,1833,18805.13501350135,0.06531732174036989,0.0013505468499028902
|
229 |
+
0.2371031746031746,1837,18776.27902790279,0.0654598581762463,0.0013534940334269554
|
230 |
+
0.238095238095238,1860,18631.99909990999,0.06627944268253572,0.0013704403386903307
|
231 |
+
0.2400793650793651,1870,18545.43114311431,0.06663578377222677,0.0013778082975004938
|
232 |
+
0.2410714285714285,1887,18449.24452445245,0.06724156362470156,0.0013903338274777709
|
233 |
+
0.242063492063492,1889,18439.625862586257,0.06731283184263978,0.0013918074192398036
|
234 |
+
0.2430555555555555,1898,18410.7698769877,0.06763353882336172,0.0013984385821689503
|
235 |
+
0.2450396825396825,1903,18372.29522952295,0.06781170936820725,0.0014021225615740319
|
236 |
+
0.246031746031746,1914,18285.727272727272,0.0682036845668674,0.001410227316265211
|
237 |
+
0.2470238095238095,1933,18179.92199219922,0.06888073263728041,0.001424226438004521
|
238 |
+
0.248015873015873,1943,18141.447344734475,0.06923707372697145,0.001431594396814684
|
239 |
+
0.2490079365079365,1951,18102.972697269728,0.0695221465987243,0.0014374887638628146
|
240 |
+
0.25,1959,18074.11671167117,0.06980721947047715,0.001443383130910945
|
241 |
+
0.2509920634920635,1970,17977.9300930093,0.0701991946691373,0.0014514878856021244
|
242 |
+
0.251984126984127,1972,17968.311431143113,0.0702704628870755,0.001452961477364157
|
243 |
+
0.2529761904761904,1979,17929.83678367837,0.07051990164985925,0.0014581190485312711
|
244 |
+
0.2549603174603174,1983,17872.12481248125,0.07066243808573566,0.0014610662320553363
|
245 |
+
0.2559523809523809,2017,17708.607560756074,0.07187399779068525,0.0014861172920098908
|
246 |
+
0.2569444444444444,2019,17679.751575157516,0.07194526600862346,0.0014875908837719233
|
247 |
+
0.2579365079365079,2022,17670.13291329133,0.07205216833553077,0.0014898012714149723
|
248 |
+
0.2589285714285714,2042,17583.56495649565,0.07276485051491287,0.0015045371890352985
|
249 |
+
0.2599206349206349,2072,17410.429042904292,0.07383387378398604,0.0015266410654657875
|
250 |
+
0.2609126984126984,2082,17343.098409840983,0.07419021487367708,0.0015340090242759506
|
251 |
+
0.2628968253968254,2096,17304.62376237624,0.07468909239924455,0.001544324166610179
|
252 |
+
0.2648809523809524,2108,17246.911791179118,0.07511670170687382,0.0015531657171823747
|
253 |
+
0.2658730158730158,2111,17218.05580558056,0.07522360403378113,0.0015553761048254235
|
254 |
+
0.2668650793650793,2119,17169.962496249624,0.07550867690553398,0.0015612704718735541
|
255 |
+
0.2678571428571428,2124,17102.631863186318,0.0756868474503795,0.0015649544512786355
|
256 |
+
0.2688492063492063,2144,17006.445244524453,0.07639952962976161,0.0015796903688989616
|
257 |
+
0.2698412698412698,2150,16977.589258925895,0.07661333428357624,0.0015841111441850595
|
258 |
+
0.2708333333333333,2152,16958.351935193517,0.07668460250151445,0.0015855847359470922
|
259 |
+
0.2718253968253968,2165,16862.165316531653,0.07714784591811281,0.0015951630824003042
|
260 |
+
0.2738095238095238,2167,16852.546654665464,0.07721911413605102,0.0015966366741623369
|
261 |
+
0.2757936507936508,2170,16833.309330933094,0.07732601646295835,0.0015988470618053857
|
262 |
+
0.2767857142857143,2181,16737.12271227123,0.0777179916616185,0.0016069518164965651
|
263 |
+
0.2777777777777778,2187,16717.885388538853,0.07793179631543314,0.0016113725917826628
|
264 |
+
0.2787698412698413,2207,16621.698769876988,0.07864447849481523,0.001626108509402989
|
265 |
+
0.2797619047619047,2210,16602.461446144614,0.07875138082172255,0.001628318897046038
|
266 |
+
0.2807539682539682,2246,16410.088208820882,0.08003420874461034,0.001654843548762625
|
267 |
+
0.2817460317460317,2251,16381.232223222323,0.08021237928945586,0.0016585275281677065
|
268 |
+
0.2827380952380952,2252,16371.613561356136,0.08024801339842497,0.0016592643240487229
|
269 |
+
0.2837301587301587,2257,16333.13891389139,0.0804261839432705,0.0016629483034538044
|
270 |
+
0.2847222222222222,2273,16236.952295229525,0.08099632968677618,0.0016747370375500652
|
271 |
+
0.2857142857142857,2276,16217.71497149715,0.0811032320136835,0.0016769474251931143
|
272 |
+
0.2867063492063492,2280,16198.477647764776,0.08124576844955991,0.0016798946087171794
|
273 |
+
0.2876984126984127,2289,16179.240324032404,0.08156647543028187,0.0016865257716463262
|
274 |
+
0.2886904761904761,2300,16121.528352835285,0.08195845062894203,0.0016946305263375056
|
275 |
+
0.2896825396825397,2304,16111.909690969098,0.08210098706481844,0.0016975777098615708
|
276 |
+
0.2906746031746032,2320,16034.960396039603,0.08267113280832412,0.0017093664439578318
|
277 |
+
0.2916666666666667,2328,16006.104410441045,0.08295620568007697,0.0017152608110059622
|
278 |
+
0.2926587301587302,2345,15909.91779177918,0.08356198553255176,0.0017277863409832393
|
279 |
+
0.2946428571428571,2354,15881.061806180618,0.08388269251327371,0.001734417503912386
|
280 |
+
0.2956349206349206,2363,15804.112511251124,0.08420339949399565,0.001741048666841533
|
281 |
+
0.2966269841269841,2381,15707.92589258926,0.08484481345543955,0.0017543109926998264
|
282 |
+
0.2976190476190476,2384,15698.307230723072,0.08495171578234686,0.0017565213803428753
|
283 |
+
0.2986111111111111,2391,15679.0699069907,0.0852011545451306,0.0017616789515099895
|
284 |
+
0.2996031746031746,2397,15659.832583258329,0.08541495919894523,0.0017660997267960872
|
285 |
+
0.3005952380952381,2401,15640.595259525951,0.08555749563482165,0.0017690469103201526
|
286 |
+
0.302579365079365,2408,15611.739273927393,0.0858069343976054,0.0017742044814872667
|
287 |
+
0.304563492063492,2416,15573.264626462646,0.08609200726935823,0.0017800988485353973
|
288 |
+
0.3055555555555556,2418,15554.027302730274,0.08616327548729644,0.0017815724402974297
|
289 |
+
0.3075396825396825,2427,15525.171317131711,0.08648398246801839,0.0017882036032265765
|
290 |
+
0.308531746031746,2439,15477.07800780078,0.08691159177564765,0.0017970451537987723
|
291 |
+
0.3095238095238095,2450,15438.603360336034,0.08730356697430781,0.0018051499084899515
|
292 |
+
0.310515873015873,2451,15409.747374737472,0.08733920108327692,0.0018058867043709678
|
293 |
+
0.3115079365079365,2453,15400.128712871288,0.08741046930121513,0.0018073602961330005
|
294 |
+
0.3125,2465,15342.416741674167,0.08783807860884439,0.0018162018467051961
|
295 |
+
0.3134920634920635,2473,15294.323432343235,0.08812315148059723,0.0018220962137533267
|
296 |
+
0.314484126984127,2476,15284.704770477048,0.08823005380750454,0.0018243066013963756
|
297 |
+
0.3154761904761904,2477,15275.086108610862,0.08826568791647364,0.001825043397277392
|
298 |
+
0.3164682539682539,2488,15217.37413741374,0.0886576631151338,0.0018331481519685711
|
299 |
+
0.3174603174603174,2498,15178.899489948995,0.08901400420482486,0.0018405161107787342
|
300 |
+
0.3184523809523809,2501,15150.043504350437,0.08912090653173217,0.0018427264984217833
|
301 |
+
0.3194444444444444,2511,15130.80618061806,0.08947724762142323,0.0018500944572319464
|
302 |
+
0.3204365079365079,2520,15101.950195019504,0.08979795460214518,0.001856725620161093
|
303 |
+
0.3224206349206349,2525,15092.331533153316,0.0899761251469907,0.0018604095995661746
|
304 |
+
0.3244047619047619,2532,15073.094209420942,0.09022556390977443,0.0018655671707332887
|
305 |
+
0.3253968253968254,2550,14976.907590759076,0.09086697787121834,0.0018788294965915824
|
306 |
+
0.3263888888888889,2558,14938.43294329433,0.09115205074297117,0.0018847238636397127
|
307 |
+
0.3273809523809524,2562,14928.814281428144,0.09129458717884759,0.001887671047163778
|
308 |
+
0.3283730158730158,2574,14871.102310231025,0.09172219648647685,0.0018965125977359737
|
309 |
+
0.3293650793650793,2581,14832.627662766276,0.0919716352492606,0.0019016701689030877
|
310 |
+
0.3303571428571428,2600,14755.678367836785,0.09264868331967359,0.0019156692906423976
|
311 |
+
0.3323412698412698,2605,14746.059705970598,0.09282685386451912,0.0019193532700474791
|
312 |
+
0.3333333333333333,2615,14717.203720372036,0.09318319495421017,0.0019267212288576422
|
313 |
+
0.3343253968253968,2640,14611.398439843984,0.0940740476784378,0.00194514112588305
|
314 |
+
0.3363095238095238,2649,14582.542454245424,0.09439475465915975,0.0019517722888121967
|
315 |
+
0.3373015873015873,2662,14534.449144914492,0.09485799807575812,0.0019613506352654086
|
316 |
+
0.3382936507936508,2673,14495.974497449744,0.09524997327441827,0.001969455389956588
|
317 |
+
0.3402777777777778,2677,14486.35583558356,0.0953925097102947,0.0019724025734806534
|
318 |
+
0.3412698412698413,2696,14390.169216921691,0.09606955778070769,0.0019864016952199632
|
319 |
+
0.3422619047619047,2703,14351.694569456946,0.09631899654349144,0.0019915592663870773
|
320 |
+
0.3432539682539682,2706,14342.07590759076,0.09642589887039875,0.001993769654030126
|
321 |
+
0.3442460317460317,2715,14303.601260126014,0.09674660585112069,0.002000400816959273
|
322 |
+
0.3452380952380952,2718,14293.982598259829,0.09685350817802801,0.0020026112046023217
|
323 |
+
0.3472222222222222,2728,14274.745274527451,0.09720984926771906,0.0020099791634124848
|
324 |
+
0.3482142857142857,2743,14197.79597959796,0.09774436090225563,0.0020210311016277296
|
325 |
+
0.3492063492063492,2747,14178.558655865589,0.09788689733813206,0.0020239782851517946
|
326 |
+
0.3501984126984127,2751,14168.9399939994,0.09802943377400848,0.00202692546867586
|
327 |
+
0.3511904761904761,2760,14120.846684668468,0.09835014075473043,0.0020335566316050067
|
328 |
+
0.3531746031746032,2770,14063.134713471349,0.09870648184442148,0.00204092459041517
|
329 |
+
0.3551587301587302,2773,14053.51605160516,0.0988133841713288,0.0020431349780582185
|
330 |
+
0.3561507936507936,2776,14043.897389738971,0.09892028649823612,0.0020453453657012675
|
331 |
+
0.3581349206349206,2780,14034.278727872788,0.09906282293411253,0.002048292549225333
|
332 |
+
0.3591269841269841,2783,14024.6600660066,0.09916972526101984,0.0020505029368683815
|
333 |
+
0.3601190476190476,2791,14005.422742274228,0.09945479813277269,0.0020563973039165124
|
334 |
+
0.3611111111111111,2799,13976.566756675667,0.09973987100452553,0.0020622916709646427
|
335 |
+
0.3630952380952381,2807,13938.092109210922,0.10002494387627837,0.002068186038012773
|
336 |
+
0.3640873015873015,2822,13870.761476147614,0.10055945551081495,0.0020792379762280175
|
337 |
+
0.365079365079365,2830,13841.905490549056,0.1008445283825678,0.0020851323432761483
|
338 |
+
0.3660714285714285,2841,13803.430843084308,0.10123650358122795,0.0020932370979673274
|
339 |
+
0.367063492063492,2852,13774.574857485748,0.1016284787798881,0.002101341852658507
|
340 |
+
0.3680555555555556,2877,13668.769576957697,0.10251933150411574,0.0021197617496839147
|
341 |
+
0.369047619047619,2880,13659.15091509151,0.10262623383102305,0.0021219721373269634
|
342 |
+
0.3700396825396825,2893,13630.29492949295,0.10308947724762142,0.0021315504837801755
|
343 |
+
0.371031746031746,2908,13591.820282028202,0.103623988882158,0.00214260242199542
|
344 |
+
0.3720238095238095,2909,13582.201620162015,0.1036596229911271,0.0021433392178764363
|
345 |
+
0.3740079365079365,2915,13562.964296429644,0.10387342764494174,0.0021477599931625344
|
346 |
+
0.375,2933,13514.87098709871,0.10451484160638563,0.002161022319020828
|
347 |
+
0.376984126984127,2944,13466.777677767775,0.1049068168050458,0.0021691270737120073
|
348 |
+
0.3779761904761904,2957,13428.30303030303,0.10537006022164416,0.002178705420165219
|
349 |
+
0.3799603174603174,2968,13399.447044704471,0.10576203542030431,0.0021868101748563984
|
350 |
+
0.3809523809523809,2975,13370.591059105913,0.10601147418308805,0.0021919677460235125
|
351 |
+
0.3829365079365079,2986,13332.116411641164,0.10640344938174821,0.002200072500714692
|
352 |
+
0.3849206349206349,2994,13303.260426042603,0.10668852225350105,0.0022059668677628223
|
353 |
+
0.3859126984126984,2998,13293.641764176418,0.10683105868937748,0.0022089140512868877
|
354 |
+
0.3869047619047619,3005,13255.167116711673,0.10708049745216121,0.0022140716224540017
|
355 |
+
0.3878968253968254,3009,13245.548454845484,0.10722303388803762,0.002217018805978067
|
356 |
+
0.3888888888888889,3019,13216.692469246926,0.10757937497772868,0.00222438676478823
|
357 |
+
0.3898809523809524,3034,13158.980498049805,0.10811388661226526,0.0022354387030034746
|
358 |
+
0.3908730158730158,3064,13053.175217521752,0.10918290988133841,0.002257542579433964
|
359 |
+
0.3918650793650793,3068,13043.556555655565,0.10932544631721484,0.0022604897629580293
|
360 |
+
0.3928571428571428,3076,13014.700570057006,0.10961051918896768,0.0022663841300061597
|
361 |
+
0.3938492063492063,3085,12985.844584458446,0.10993122616968963,0.0022730152929353064
|
362 |
+
0.3948412698412698,3090,12966.607260726072,0.11010939671453515,0.0022766992723403877
|
363 |
+
0.3968253968253968,3094,12956.988598859883,0.11025193315041157,0.002279646455864453
|
364 |
+
0.3978174603174603,3098,12947.3699369937,0.110394469586288,0.0022825936393885186
|
365 |
+
0.3988095238095238,3104,12928.132613261329,0.11060827424010262,0.0022870144146746162
|
366 |
+
0.3998015873015873,3109,12918.51395139514,0.11078644478494815,0.0022906983940796976
|
367 |
+
0.4007936507936508,3112,12908.895289528951,0.11089334711185547,0.0022929087817227466
|
368 |
+
0.4017857142857143,3134,12851.183318331834,0.11167729750917578,0.0023091182911051055
|
369 |
+
0.4027777777777778,3141,12831.94599459946,0.11192673627195952,0.0023142758622722195
|
370 |
+
0.4037698412698413,3161,12774.23402340234,0.11263941845134162,0.0023290117798925457
|
371 |
+
0.4047619047619047,3167,12745.37803780378,0.11285322310515626,0.0023334325551786434
|
372 |
+
0.4057539682539682,3177,12706.903390339034,0.1132095641948473,0.0023408005139888065
|
373 |
+
0.4067460317460317,3184,12678.047404740477,0.11345900295763105,0.002345958085155921
|
374 |
+
0.4077380952380952,3187,12668.428742874288,0.11356590528453836,0.0023481684727989695
|
375 |
+
0.4087301587301587,3190,12658.8100810081,0.11367280761144567,0.0023503788604420186
|
376 |
+
0.4097222222222222,3194,12649.191419141916,0.1138153440473221,0.002353326043966084
|
377 |
+
0.4107142857142857,3200,12620.335433543354,0.11402914870113673,0.0023577468192521817
|
378 |
+
0.4117063492063492,3207,12591.479447944794,0.11427858746392046,0.0023629043904192957
|
379 |
+
0.4136904761904761,3211,12581.860786078609,0.11442112389979689,0.002365851573943361
|
380 |
+
0.4146825396825397,3220,12562.623462346235,0.11474183088051883,0.002372482736872508
|
381 |
+
0.4156746031746032,3224,12543.38613861386,0.11488436731639526,0.002375429920396573
|
382 |
+
0.4166666666666667,3235,12495.29282928293,0.1152763425150554,0.0023835346750877523
|
383 |
+
0.4176587301587302,3240,12466.436843684369,0.11545451305990094,0.002387218654492834
|
384 |
+
0.4186507936507936,3253,12418.343534353437,0.11591775647649931,0.0023967970009460457
|
385 |
+
0.4196428571428571,3256,12408.72487248725,0.11602465880340662,0.002399007388589095
|
386 |
+
0.4206349206349206,3261,12399.106210621065,0.11620282934825214,0.0024026913679941766
|
387 |
+
0.4216269841269841,3271,12379.86888688869,0.1165591704379432,0.0024100593268043392
|
388 |
+
0.4226190476190476,3275,12370.250225022502,0.11670170687381962,0.0024130065103284046
|
389 |
+
0.4236111111111111,3277,12351.012901290129,0.11677297509175782,0.0024144801020904373
|
390 |
+
0.4246031746031746,3280,12341.394239423942,0.11687987741866515,0.0024166904897334864
|
391 |
+
0.4255952380952381,3285,12331.775577557755,0.11705804796351067,0.0024203744691385677
|
392 |
+
0.4265873015873015,3290,12322.156915691568,0.1172362185083562,0.0024240584485436495
|
393 |
+
0.427579365079365,3304,12283.682268226825,0.11773509603392367,0.0024343735908778775
|
394 |
+
0.4285714285714285,3317,12254.826282628264,0.11819833945052204,0.0024439519373310897
|
395 |
+
0.429563492063492,3322,12245.207620762076,0.11837650999536757,0.002447635916736171
|
396 |
+
0.4305555555555556,3333,12197.114311431144,0.11876848519402772,0.0024557406714273504
|
397 |
+
0.431547619047619,3365,12120.16501650165,0.1199087766810391,0.0024793181396198724
|
398 |
+
0.4325396825396825,3375,12100.927692769275,0.12026511777073014,0.0024866860984300355
|
399 |
+
0.4345238095238095,3398,12014.359735973598,0.12108470227701956,0.0025036324036934103
|
400 |
+
0.435515873015873,3402,12004.74107410741,0.12122723871289598,0.0025065795872174757
|
401 |
+
0.4365079365079365,3414,11975.88508850885,0.12165484802052524,0.0025154211377896715
|
402 |
+
0.4375,3432,11927.791779177918,0.12229626198196913,0.002528683463647965
|
403 |
+
0.439484126984127,3463,11831.605160516052,0.12340091936001141,0.00255152413595947
|
404 |
+
0.4404761904761904,3465,11821.986498649863,0.12347218757794962,0.002552997727721503
|
405 |
+
0.4414682539682539,3485,11773.893189318933,0.12418486975733171,0.002567733645341829
|
406 |
+
0.4424603174603174,3499,11754.65586558656,0.12468374728289919,0.0025780487876760575
|
407 |
+
0.4434523809523809,3507,11735.418541854186,0.12496882015465204,0.002583943154724188
|
408 |
+
0.4454365079365079,3513,11725.799879988,0.12518262480846667,0.0025883639300102856
|
409 |
+
0.4464285714285714,3526,11687.325232523252,0.12564586822506504,0.0025979422764634977
|
410 |
+
0.4484126984126984,3536,11658.469246924691,0.1260022093147561,0.002605310235273661
|
411 |
+
0.4494047619047619,3545,11639.23192319232,0.12632291629547804,0.0026119413982028075
|
412 |
+
0.4503968253968254,3567,11571.901290129012,0.12710686669279836,0.0026281509075851664
|
413 |
+
0.4513888888888889,3571,11562.282628262828,0.12724940312867478,0.0026310980911092314
|
414 |
+
0.4523809523809524,3574,11552.66396639664,0.12735630545558208,0.0026333084787522804
|
415 |
+
0.4533730158730158,3582,11543.045304530451,0.12764137832733494,0.002639202845800411
|
416 |
+
0.4543650793650793,3585,11533.426642664266,0.12774828065424224,0.00264141323344346
|
417 |
+
0.4553571428571428,3589,11523.80798079808,0.12789081709011865,0.002644360416967525
|
418 |
+
0.4563492063492063,3604,11485.333333333334,0.12842532872465523,0.0026554123551827697
|
419 |
+
0.4583333333333333,3608,11475.714671467147,0.12856786516053167,0.0026583595387068347
|
420 |
+
0.4593253968253968,3626,11418.002700270026,0.12920927912197555,0.002671621864565128
|
421 |
+
0.4603174603174603,3632,11389.146714671468,0.12942308377579018,0.0026760426398512263
|
422 |
+
0.4613095238095238,3636,11369.909390939094,0.12956562021166662,0.0026789898233752912
|
423 |
+
0.4623015873015873,3648,11341.053405340534,0.12999322951929587,0.002687831373947487
|
424 |
+
0.4632936507936508,3658,11331.434743474349,0.13034957060898691,0.00269519933275765
|
425 |
+
0.4642857142857143,3678,11292.9600960096,0.13106225278836903,0.0027099352503779763
|
426 |
+
0.4652777777777778,3693,11254.485448544854,0.1315967644229056,0.0027209871885932207
|
427 |
+
0.4662698412698413,3708,11158.298829882988,0.13213127605744218,0.0027320391268084655
|
428 |
+
0.4672619047619047,3714,11139.061506150616,0.1323450807112568,0.0027364599020945632
|
429 |
+
0.4682539682539682,3716,11129.44284428443,0.13241634892919502,0.002737933493856596
|
430 |
+
0.4692460317460317,3720,11119.824182418242,0.13255888536507143,0.0027408806773806613
|
431 |
+
0.4712301587301587,3726,11110.205520552056,0.13277269001888609,0.002745301452666759
|
432 |
+
0.4732142857142857,3735,11090.968196819682,0.13309339699960804,0.0027519326155959058
|
433 |
+
0.4742063492063492,3741,11081.349534953495,0.13330720165342266,0.0027563533908820034
|
434 |
+
0.4761904761904761,3745,11071.730873087308,0.13344973808929908,0.002759300574406069
|
435 |
+
0.4771825396825397,3757,11023.637563756376,0.13387734739692833,0.0027681421249782646
|
436 |
+
0.4781746031746032,3766,10985.162916291629,0.13419805437765028,0.0027747732879074114
|
437 |
+
0.4791666666666667,3771,10965.925592559255,0.13437622492249582,0.0027784572673124927
|
438 |
+
0.4801587301587302,3774,10956.30693069307,0.13448312724940312,0.0027806676549555417
|
439 |
+
0.4811507936507936,3785,10927.45094509451,0.13487510244806328,0.002788772409646721
|
440 |
+
0.4821428571428571,3790,10917.832283228325,0.1350532729929088,0.0027924563890518025
|
441 |
+
0.4831349206349206,3803,10888.976297629762,0.13551651640950718,0.0028020347355050147
|
442 |
+
0.4841269841269841,3808,10879.357635763576,0.13569468695435272,0.002805718714910096
|
443 |
+
0.4851190476190476,3813,10869.73897389739,0.13587285749919822,0.0028094026943151777
|
444 |
+
0.4871031746031746,3823,10850.501650165015,0.1362291985888893,0.002816770653125341
|
445 |
+
0.4880952380952381,3826,10831.264326432643,0.1363361009157966,0.00281898104076839
|
446 |
+
0.490079365079365,3831,10802.408340834085,0.13651427146064213,0.002822665020173471
|
447 |
+
0.492063492063492,3835,10792.789678967896,0.13665680789651855,0.0028256122036975366
|
448 |
+
0.4930555555555556,3840,10773.552355235524,0.13683497844136408,0.002829296183102618
|
449 |
+
0.494047619047619,3846,10754.31503150315,0.1370487830951787,0.0028337169583887156
|
450 |
+
0.4950396825396825,3856,10725.45904590459,0.13740512418486975,0.0028410849171988787
|
451 |
+
0.496031746031746,3868,10706.221722172217,0.13783273349249903,0.0028499264677710745
|
452 |
+
0.4970238095238095,3877,10686.984398439845,0.13815344047322098,0.0028565576307002212
|
453 |
+
0.4990079365079365,3891,10658.128412841284,0.13865231799878844,0.0028668727730344497
|
454 |
+
0.5,3896,10648.509750975098,0.13883048854363397,0.002870556752439531
|
455 |
+
0.5009920634920635,3913,10610.03510351035,0.13943626839610876,0.002883082282416808
|
456 |
+
0.501984126984127,3940,10552.323132313231,0.1403983893382746,0.002902975771204249
|
457 |
+
0.5029761904761905,3954,10523.46714671467,0.14089726686384207,0.002913290913538477
|
458 |
+
0.503968253968254,3960,10513.848484848484,0.1411110715176567,0.002917711688824575
|
459 |
+
0.5049603174603174,3973,10494.611161116112,0.14157431493425507,0.0029272900352777867
|
460 |
+
0.5059523809523809,4000,10398.424542454246,0.1425364358764209,0.002947183524065227
|
461 |
+
0.5069444444444444,4013,10369.568556855686,0.14299967929301927,0.002956761870518439
|
462 |
+
0.5079365079365079,4021,10350.331233123312,0.14328475216477213,0.0029626562375665694
|
463 |
+
0.5089285714285714,4041,10302.23792379238,0.1439974343441542,0.0029773921551868956
|
464 |
+
0.5099206349206349,4045,10292.619261926193,0.14413997078003066,0.002980339338710961
|
465 |
+
0.5119047619047619,4051,10283.000600060006,0.14435377543384528,0.0029847601139970587
|
466 |
+
0.5138888888888888,4061,10263.763276327632,0.14471011652353633,0.0029921280728072218
|
467 |
+
0.5148809523809523,4075,10234.907290729074,0.1452089940491038,0.0030024432151414503
|
468 |
+
0.5168650793650794,4080,10225.288628862889,0.14538716459394932,0.0030061271945465316
|
469 |
+
0.5178571428571429,4084,10215.6699669967,0.14552970102982574,0.003009074378070597
|
470 |
+
0.5188492063492064,4089,10206.051305130512,0.14570787157467127,0.0030127583574756783
|
471 |
+
0.5218253968253969,4101,10186.81398139814,0.14613548088230055,0.003021599908047874
|
472 |
+
0.5238095238095238,4108,10177.195319531951,0.14638491964508427,0.003026757479214988
|
473 |
+
0.5248015873015873,4117,10148.339333933394,0.14670562662580622,0.003033388642144135
|
474 |
+
0.5277777777777778,4129,10129.10201020102,0.1471332359334355,0.0030422301927163307
|
475 |
+
0.5287698412698413,4139,10109.864686468649,0.14748957702312654,0.0030495981515264938
|
476 |
+
0.5307539682539683,4145,10100.24602460246,0.14770338167694116,0.0030540189268125914
|
477 |
+
0.5317460317460317,4168,10052.152715271528,0.14852296618323058,0.0030709652320759667
|
478 |
+
0.5327380952380952,4175,10042.53405340534,0.14877240494601432,0.0030761228032430807
|
479 |
+
0.5337301587301587,4185,10023.296729672968,0.1491287460357054,0.003083490762053244
|
480 |
+
0.5347222222222222,4225,9955.966096609662,0.1505541103944696,0.003112962597293896
|
481 |
+
0.5357142857142857,4240,9927.1101110111,0.15108862202900616,0.0031240145355091405
|
482 |
+
0.5367063492063492,4250,9907.872787278728,0.1514449631186972,0.0031313824943193036
|
483 |
+
0.5376984126984127,4262,9888.635463546354,0.15187257242632649,0.0031402240448914994
|
484 |
+
0.5386904761904762,4291,9840.542154215422,0.15290596158643052,0.0031615911254409723
|
485 |
+
0.5406746031746031,4303,9811.68616861686,0.1533335708940598,0.003170432676013168
|
486 |
+
0.5416666666666666,4309,9792.448844884488,0.15354737554787443,0.003174853451299266
|
487 |
+
0.5426587301587301,4332,9744.355535553555,0.15436696005416384,0.003191799756562641
|
488 |
+
0.5436507936507936,4339,9725.118211821182,0.15461639881694758,0.003196957327729755
|
489 |
+
0.5446428571428571,4370,9648.16891689169,0.15572105619498985,0.0032197980000412607
|
490 |
+
0.5456349206349206,4376,9638.550255025502,0.15593486084880448,0.0032242187753273584
|
491 |
+
0.5466269841269841,4395,9609.694269426942,0.15661190891921747,0.003238217897066668
|
492 |
+
0.5476190476190477,4411,9590.45694569457,0.15718205466272317,0.003250006631162929
|
493 |
+
0.5496031746031746,4430,9542.363636363636,0.15785910273313616,0.0032640057529022388
|
494 |
+
0.5505952380952381,4435,9532.74497449745,0.15803727327798167,0.0032676897323073205
|
495 |
+
0.5515873015873016,4446,9513.507650765076,0.15842924847664183,0.0032757944869985
|
496 |
+
0.5525793650793651,4453,9503.88898889889,0.15867868723942558,0.003280952058165614
|
497 |
+
0.5535714285714286,4466,9484.651665166515,0.15914193065602394,0.003290530404618826
|
498 |
+
0.5555555555555556,4475,9465.414341434143,0.1594626376367459,0.003297161567547973
|
499 |
+
0.5565476190476191,4489,9446.17701770177,0.15996151516231336,0.003307476709882201
|
500 |
+
0.5575396825396826,4496,9436.558355835585,0.1602109539250971,0.0033126342810493154
|
501 |
+
0.560515873015873,4507,9417.321032103211,0.16060292912375726,0.0033207390357404944
|
502 |
+
0.564484126984127,4524,9398.083708370836,0.16120870897623205,0.003333264565717772
|
503 |
+
0.5654761904761905,4556,9349.990399039903,0.1623490004632434,0.0033568420339102935
|
504 |
+
0.566468253968254,4566,9330.753075307532,0.16270534155293448,0.0033642099927204566
|
505 |
+
0.5674603174603174,4574,9321.134413441345,0.1629904144246873,0.003370104359768587
|
506 |
+
0.5684523809523809,4582,9311.515751575158,0.16327548729644015,0.0033759987268167178
|
507 |
+
0.5694444444444444,4586,9301.897089708971,0.16341802373231656,0.0033789459103407827
|
508 |
+
0.5704365079365079,4597,9282.659765976598,0.16380999893097672,0.003387050665031962
|
509 |
+
0.5724206349206349,4603,9263.422442244224,0.16402380358479135,0.00339147144031806
|
510 |
+
0.5734126984126984,4608,9253.803780378035,0.16420197412963689,0.0033951554197231416
|
511 |
+
0.5753968253968254,4631,9215.329132913292,0.1650215586359263,0.003412101724986517
|
512 |
+
0.5763888888888888,4635,9205.710471047105,0.16516409507180274,0.003415048908510582
|
513 |
+
0.5773809523809523,4651,9176.854485448544,0.1657342408153084,0.0034268376426068426
|
514 |
+
0.5783730158730159,4656,9167.235823582358,0.16591241136015394,0.0034305216220119244
|
515 |
+
0.5803571428571429,4666,9157.61716171617,0.166268752449845,0.0034378895808220874
|
516 |
+
0.5813492063492064,4681,9128.761176117612,0.16680326408438156,0.003448941519037332
|
517 |
+
0.5823412698412699,4684,9119.142514251424,0.1669101664112889,0.003451151906680381
|
518 |
+
0.5833333333333334,4707,9080.667866786678,0.1677297509175783,0.003468098211943756
|
519 |
+
0.5843253968253969,4722,9042.193219321933,0.16826426255211488,0.0034791501501590005
|
520 |
+
0.5853174603174603,4727,9032.574557455746,0.16844243309696041,0.003482834129564082
|
521 |
+
0.5863095238095238,4731,9022.95589558956,0.16858496953283683,0.0034857813130881473
|
522 |
+
0.5892857142857143,4747,8984.481248124812,0.16915511527634253,0.003497570047184408
|
523 |
+
0.5912698412698413,4757,8965.243924392438,0.16951145636603357,0.003504938005994571
|
524 |
+
0.5922619047619048,4781,8926.769276927693,0.1703666749812921,0.0035226211071389627
|
525 |
+
0.5932539682539683,4791,8917.150615061506,0.17072301607098314,0.003529989065949126
|
526 |
+
0.5952380952380952,4795,8907.531953195321,0.17086555250685956,0.0035329362494731908
|
527 |
+
0.5962301587301587,4802,8897.913291329132,0.1711149912696433,0.0035380938206403052
|
528 |
+
0.5972222222222222,4808,8869.057305730574,0.17132879592345793,0.003542514595926403
|
529 |
+
0.5982142857142857,4818,8849.8199819982,0.171685137013149,0.003549882554736566
|
530 |
+
0.5992063492063492,4830,8830.582658265826,0.17211274632077825,0.003558724105308762
|
531 |
+
0.6001984126984127,4845,8792.10801080108,0.17264725795531483,0.003569776043524006
|
532 |
+
0.6011904761904762,4857,8772.870687068707,0.17307486726294408,0.003578617594096202
|
533 |
+
0.6021825396825397,4871,8753.633363336334,0.17357374478851156,0.00358893273643043
|
534 |
+
0.6031746031746031,4875,8744.014701470147,0.17371628122438798,0.0035918799199544955
|
535 |
+
0.6041666666666666,4881,8734.39603960396,0.1739300858782026,0.003596300695240593
|
536 |
+
0.6071428571428571,4913,8695.921392139215,0.17507037736521397,0.003619878163433115
|
537 |
+
0.6081349206349206,4932,8657.446744674467,0.175747425435627,0.003633877285172425
|
538 |
+
0.6091269841269841,4942,8647.82808280828,0.17610376652531803,0.003641245243982588
|
539 |
+
0.6101190476190477,4965,8618.97209720972,0.17692335103160745,0.0036581915492459633
|
540 |
+
0.6121031746031746,4973,8599.734773477347,0.1772084239033603,0.0036640859162940936
|
541 |
+
0.6130952380952381,4998,8561.260126012601,0.17809927662758793,0.003682505813319501
|
542 |
+
0.6140873015873016,5022,8532.40414041404,0.17895449524284646,0.0037001889144638927
|
543 |
+
0.6160714285714286,5048,8484.310831083108,0.1798809820760432,0.0037193456073703166
|
544 |
+
0.6170634920634921,5054,8474.692169216922,0.18009478672985782,0.0037237663826564142
|
545 |
+
0.6180555555555556,5089,8426.598859885988,0.1813419805437765,0.0037495542384919853
|
546 |
+
0.6190476190476191,5116,8388.124212421242,0.18230410148594234,0.0037694477272794255
|
547 |
+
0.6200396825396826,5119,8378.505550555055,0.18241100381284966,0.0037716581149224745
|
548 |
+
0.6220238095238095,5125,8368.886888688869,0.1826248084666643,0.003776078890208572
|
549 |
+
0.623015873015873,5129,8359.268226822682,0.1827673449025407,0.0037790260737326376
|
550 |
+
0.6240079365079365,5144,8330.412241224123,0.18330185653707728,0.003790078011947882
|
551 |
+
0.625,5156,8311.17491749175,0.18372946584470656,0.003798919562520078
|
552 |
+
0.6259920634920635,5162,8301.556255625563,0.1839432704985212,0.0038033403378061755
|
553 |
+
0.626984126984127,5166,8291.937593759376,0.1840858069343976,0.003806287521330241
|
554 |
+
0.628968253968254,5179,8282.318931893189,0.18454905035099597,0.0038158658677834526
|
555 |
+
0.6299603174603174,5187,8272.700270027002,0.1848341232227488,0.003821760234831583
|
556 |
+
0.6309523809523809,5194,8263.081608160817,0.18508356198553255,0.0038269178059986975
|
557 |
+
0.6329365079365079,5233,8195.75097509751,0.18647329223532766,0.0038556528453583335
|
558 |
+
0.6339285714285714,5256,8166.89498949895,0.18729287674161707,0.0038725991506217083
|
559 |
+
0.6349206349206349,5264,8157.276327632763,0.1875779496133699,0.0038784935176698386
|
560 |
+
0.6359126984126984,5278,8138.0390039003905,0.1880768271389374,0.003888808660004067
|
561 |
+
0.6369047619047619,5286,8128.420342034204,0.18836190001069023,0.0038947030270521975
|
562 |
+
0.6378968253968254,5292,8118.801680168017,0.18857570466450485,0.003899123802338295
|
563 |
+
0.6388888888888888,5311,8099.564356435643,0.18925275273491787,0.003913122924077605
|
564 |
+
0.6408730158730159,5326,8080.327032703271,0.18978726436945445,0.00392417486229285
|
565 |
+
0.6428571428571429,5351,8051.47104710471,0.19067811709368207,0.003942594759318257
|
566 |
+
0.6448412698412699,5394,7993.759075907591,0.1922103837793536,0.003974276982201958
|
567 |
+
0.6458333333333334,5409,7974.521752175217,0.19274489541389017,0.003985328920417203
|
568 |
+
0.6468253968253969,5426,7955.284428442845,0.19335067526636496,0.003997854450394481
|
569 |
+
0.6478174603174603,5444,7926.428442844284,0.19399208922780886,0.004011116776252774
|
570 |
+
0.6488095238095238,5452,7916.8097809780975,0.1942771620995617,0.004017011143300904
|
571 |
+
0.6507936507936508,5474,7887.953795379538,0.19506111249688202,0.004033220652683263
|
572 |
+
0.6517857142857143,5492,7868.716471647165,0.19570252645832592,0.0040464829785415565
|
573 |
+
0.6537698412698413,5515,7849.479147914792,0.19652211096461533,0.004063429283804932
|
574 |
+
0.6547619047619048,5535,7820.623162316232,0.19723479314399744,0.004078165201425258
|
575 |
+
0.6557539682539683,5548,7811.004500450045,0.1976980365605958,0.00408774354787847
|
576 |
+
0.6567460317460317,5564,7791.767176717672,0.19826818230410148,0.004099532281974731
|
577 |
+
0.6587301587301587,5585,7762.911191119112,0.19901649859245268,0.004115004995476073
|
578 |
+
0.6597222222222222,5590,7753.292529252925,0.19919466913729822,0.004118688974881155
|
579 |
+
0.6607142857142857,5598,7734.055205520553,0.19947974200905105,0.0041245833419292855
|
580 |
+
0.6626984126984127,5619,7714.817881788179,0.20022805829740228,0.004140056055430628
|
581 |
+
0.6636904761904762,5640,7685.961896189619,0.2009763745857535,0.0041555287689319705
|
582 |
+
0.6646825396825397,5646,7676.343234323433,0.2011901792395681,0.004159949544218068
|
583 |
+
0.6666666666666666,5658,7666.724572457246,0.2016177885471974,0.004168791094790264
|
584 |
+
0.6686507936507936,5674,7647.487248724872,0.20218793429070306,0.004180579828886525
|
585 |
+
0.6696428571428571,5687,7637.868586858686,0.20265117770730143,0.004190158175339737
|
586 |
+
0.6706349206349206,5709,7609.012601260126,0.20343512810462175,0.004206367684722095
|
587 |
+
0.6716269841269841,5720,7599.393939393939,0.2038271033032819,0.004214472439413274
|
588 |
+
0.6726190476190477,5753,7560.919291929193,0.20500302889926236,0.004238786703486813
|
589 |
+
0.6736111111111112,5780,7522.444644464446,0.20596514984142822,0.004258680192274253
|
590 |
+
0.6755952380952381,5794,7503.207320732074,0.20646402736699568,0.004268995334608482
|
591 |
+
0.6765873015873016,5798,7493.588658865887,0.20660656380287212,0.004271942518132546
|
592 |
+
0.6775793650793651,5803,7483.9699969997,0.20678473434771763,0.0042756264975376285
|
593 |
+
0.6795634920634921,5811,7474.351335133513,0.20706980721947046,0.0042815208645857585
|
594 |
+
0.6805555555555556,5822,7464.732673267326,0.20746178241813062,0.004289625619276938
|
595 |
+
0.6815476190476191,5839,7445.495349534954,0.2080675622706054,0.0043021511492542155
|
596 |
+
0.6825396825396826,5854,7426.25802580258,0.208602073905142,0.0043132030874694595
|
597 |
+
0.683531746031746,5864,7416.639363936394,0.20895841499483306,0.004320571046279623
|
598 |
+
0.6865079365079365,5875,7407.020702070207,0.20935039019349322,0.004328675800970802
|
599 |
+
0.6884920634920635,5899,7368.546054605461,0.21020560880875175,0.004346358902115194
|
600 |
+
0.689484126984127,5912,7349.308730873087,0.21066885222535012,0.004355937248568405
|
601 |
+
0.6904761904761905,5925,7339.690069006901,0.2111320956419485,0.004365515595021617
|
602 |
+
0.691468253968254,5956,7310.834083408341,0.21223675301999073,0.004388356267333123
|
603 |
+
0.6924603174603174,5969,7291.596759675967,0.2126999964365891,0.004397934613786335
|
604 |
+
0.6934523809523809,5986,7272.359435943595,0.2133057762890639,0.004410460143763613
|
605 |
+
0.6954365079365079,5994,7253.122112211221,0.21359084916081672,0.004416354510811743
|
606 |
+
0.6964285714285714,6002,7243.503450345034,0.21387592203256958,0.0044222488778598735
|
607 |
+
0.6974206349206349,6013,7233.884788478848,0.21426789723122974,0.004430353632551053
|
608 |
+
0.6984126984126984,6016,7224.266126612661,0.21437479955813704,0.0044325640201941015
|
609 |
+
0.6994047619047619,6029,7214.647464746475,0.2148380429747354,0.004442142366647314
|
610 |
+
0.7003968253968254,6049,7195.410141014101,0.21555072515411752,0.00445687828426764
|
611 |
+
0.7013888888888888,6053,7185.791479147915,0.21569326158999394,0.004459825467791705
|
612 |
+
0.7023809523809523,6065,7176.172817281728,0.21612087089762322,0.004468667018363901
|
613 |
+
0.7043650793650794,6085,7156.935493549355,0.2168335530770053,0.004483402935984227
|
614 |
+
0.7063492063492064,6094,7147.316831683168,0.21715426005772726,0.0044900340989133735
|
615 |
+
0.7073412698412699,6123,7118.460846084608,0.21818764921783132,0.0045114011794628464
|
616 |
+
0.7083333333333334,6134,7108.842184218422,0.21857962441649145,0.004519505934154026
|
617 |
+
0.7093253968253969,6145,7099.223522352236,0.2189715996151516,0.004527610688845205
|
618 |
+
0.7103174603174603,6155,7089.604860486049,0.21932794070484268,0.004534978647655368
|
619 |
+
0.7113095238095238,6180,7060.7488748874885,0.2202187934290703,0.0045533985446807755
|
620 |
+
0.7123015873015873,6212,7031.892889288929,0.22135908491608167,0.004576976012873298
|
621 |
+
0.7132936507936508,6229,7012.655565556555,0.22196486476855645,0.0045895015428505746
|
622 |
+
0.7142857142857143,6239,6993.418241824183,0.22232120585824752,0.004596869501660738
|
623 |
+
0.7152777777777778,6247,6983.799579957996,0.22260627873000036,0.004602763868708868
|
624 |
+
0.7162698412698413,6254,6974.180918091809,0.2228557174927841,0.004607921439875983
|
625 |
+
0.7172619047619048,6266,6964.562256225623,0.22328332680041335,0.004616762990448178
|
626 |
+
0.7182539682539683,6278,6945.324932493249,0.22371093610804263,0.004625604541020374
|
627 |
+
0.7192460317460317,6310,6916.46894689469,0.224851227595054,0.004649182009212896
|
628 |
+
0.7202380952380952,6320,6906.850285028503,0.22520756868474504,0.004656549968023059
|
629 |
+
0.7232142857142857,6330,6897.231623162316,0.22556390977443608,0.004663917926833222
|
630 |
+
0.7251984126984127,6362,6868.375637563757,0.22670420126144747,0.004687495395025744
|
631 |
+
0.7261904761904762,6375,6849.138313831383,0.22716744467804584,0.004697073741478956
|
632 |
+
0.7271825396825397,6380,6839.519651965196,0.22734561522289135,0.004700757720884037
|
633 |
+
0.7281746031746031,6398,6820.282328232824,0.22798702918433525,0.004714020046742331
|
634 |
+
0.7291666666666666,6404,6810.663666366637,0.22820083383814987,0.004718440822028429
|
635 |
+
0.7301587301587301,6430,6781.807680768077,0.2291273206713466,0.004737597514934852
|
636 |
+
0.7311507936507936,6438,6772.18901890189,0.22941239354309945,0.004743491881982983
|
637 |
+
0.7321428571428571,6453,6752.951695169517,0.22994690517763602,0.004754543820198228
|
638 |
+
0.7331349206349206,6461,6743.33303330333,0.23023197804938889,0.004760438187246358
|
639 |
+
0.7371031746031746,6471,6733.714371437144,0.23058831913907993,0.004767806146056521
|
640 |
+
0.7400793650793651,6476,6724.095709570957,0.23076648968392546,0.004771490125461603
|
641 |
+
0.7410714285714286,6524,6676.002400240024,0.2324769269144425,0.004806856327750385
|
642 |
+
0.7420634920634921,6534,6666.383738373837,0.23283326800413356,0.0048142242865605485
|
643 |
+
0.7440476190476191,6572,6637.527752775278,0.23418736414495955,0.004842222530039168
|
644 |
+
0.7450396825396826,6660,6560.578457845784,0.2373231657342408,0.004907060567568603
|
645 |
+
0.7470238095238095,6667,6550.959795979598,0.23757260449702455,0.004912218138735717
|
646 |
+
0.748015873015873,6675,6541.341134113412,0.2378576773687774,0.004918112505783848
|
647 |
+
0.75,6733,6474.010501050105,0.2399244556889855,0.0049608466668827934
|
648 |
+
0.7509920634920635,6749,6454.773177317732,0.24049460143249118,0.004972635400979054
|
649 |
+
0.753968253968254,6780,6425.917191719172,0.24159925881053343,0.00499547607329056
|
650 |
+
0.7549603174603174,6803,6406.679867986799,0.24241884331682287,0.005012422378553935
|
651 |
+
0.7559523809523809,6855,6358.586558655866,0.24427181698321634,0.005050735764366783
|
652 |
+
0.7569444444444444,6875,6339.349234923492,0.24498449916259843,0.005065471681987109
|
653 |
+
0.7579365079365079,6899,6320.111911191119,0.24583971777785696,0.0050831547831315
|
654 |
+
0.7599206349206349,6931,6291.255925592559,0.24698000926486832,0.0051067322513240225
|
655 |
+
0.7638888888888888,6976,6252.781278127813,0.24858354416847805,0.005139888065969756
|
656 |
+
0.7648809523809523,7021,6204.687968796879,0.2501870790720878,0.00517304388061549
|
657 |
+
0.7658730158730159,7036,6195.0693069306935,0.25072159070662436,0.005184095818830735
|
658 |
+
0.7668650793650794,7062,6166.213321332133,0.2516480775398211,0.005203252511737158
|
659 |
+
0.7678571428571429,7072,6156.594659465946,0.2520044186295122,0.005210620470547322
|
660 |
+
0.7688492063492064,7088,6146.97599759976,0.25257456437301784,0.005222409204643582
|
661 |
+
0.7698412698412699,7106,6127.738673867387,0.25321597833446174,0.005235671530501876
|
662 |
+
0.7708333333333334,7167,6060.408040804081,0.2553896589815772,0.00528061607924387
|
663 |
+
0.7718253968253969,7196,6041.170717071707,0.2564230481416812,0.005301983159793343
|
664 |
+
0.7738095238095238,7231,6012.314731473148,0.2576702419555999,0.005327771015628914
|
665 |
+
0.7748015873015873,7244,6002.696069606961,0.25813348537219827,0.005337349362082126
|
666 |
+
0.7757936507936508,7273,5983.458745874587,0.2591668745323023,0.005358716442631599
|
667 |
+
0.7767857142857143,7289,5964.221422142215,0.25973702027580803,0.00537050517672786
|
668 |
+
0.7777777777777778,7328,5925.746774677468,0.2611267505256031,0.005399240216087496
|
669 |
+
0.7807539682539683,7415,5848.797479747975,0.26422691800591525,0.005463341457735914
|
670 |
+
0.7817460317460317,7427,5839.178817881788,0.26465452731354455,0.0054721830083081105
|
671 |
+
0.7827380952380952,7435,5829.560156015602,0.2649396001852974,0.0054780773753562405
|
672 |
+
0.7857142857142857,7449,5819.941494149415,0.2654384777108648,0.005488392517690469
|
673 |
+
0.7867063492063492,7465,5810.322832283228,0.26600862345437054,0.00550018125178673
|
674 |
+
0.7876984126984127,7474,5800.7041704170415,0.2663293304350925,0.005506812414715877
|
675 |
+
0.7886904761904762,7512,5771.848184818482,0.2676834265759185,0.0055348106581944966
|
676 |
+
0.7906746031746031,7521,5762.229522952295,0.26800413355664043,0.005541441821123643
|
677 |
+
0.7916666666666666,7533,5752.610861086108,0.2684317428642697,0.005550283371695839
|
678 |
+
0.7926587301587301,7568,5714.136213621362,0.2696789366781884,0.00557607122753141
|
679 |
+
0.7936507936507936,7597,5694.898889888989,0.2707123258382924,0.005597438308080883
|
680 |
+
0.7946428571428571,7604,5685.2802280228025,0.2709617646010761,0.005602595879247997
|
681 |
+
0.7956349206349206,7674,5627.5682568256825,0.2734561522289135,0.005654171590919138
|
682 |
+
0.7966269841269841,7682,5617.949594959496,0.27374122510066634,0.005660065957967269
|
683 |
+
0.7986111111111112,7707,5598.712271227123,0.27463207782489396,0.005678485854992676
|
684 |
+
0.7996031746031746,7723,5579.474947494749,0.2752022235683997,0.005690274589088937
|
685 |
+
0.8005952380952381,7762,5541.000300030003,0.2765919538181948,0.005719009628448573
|
686 |
+
0.8015873015873016,7779,5521.762976297629,0.2771977336706696,0.00573153515842585
|
687 |
+
0.8025793650793651,7807,5492.90699069907,0.2781954887218045,0.005752165443094307
|
688 |
+
0.8045634920634921,7831,5473.669666966697,0.27905070733706305,0.0057698485442386985
|
689 |
+
0.8065476190476191,7869,5444.813681368137,0.28040480347788904,0.005797846787717318
|
690 |
+
0.8075396825396826,7879,5435.19501950195,0.2807611445675801,0.005805214746527481
|
691 |
+
0.808531746031746,7894,5425.576357635764,0.28129565620211666,0.005816266684742726
|
692 |
+
0.8095238095238095,7924,5396.720372037204,0.2823646794711898,0.0058383705611732145
|
693 |
+
0.8125,7950,5377.483048304831,0.28329116630438655,0.005857527254079639
|
694 |
+
0.8134920634920635,7965,5367.864386438644,0.28382567793892316,0.005868579192294884
|
695 |
+
0.8154761904761905,8009,5319.771077107711,0.28539357873356375,0.0059009982110596005
|
696 |
+
0.8174603174603174,8022,5310.152415241524,0.2858568221501621,0.005910576557512813
|
697 |
+
0.8184523809523809,8037,5300.533753375337,0.2863913337846987,0.0059216284957280575
|
698 |
+
0.8204365079365079,8045,5290.915091509151,0.28667640665645155,0.005927522862776188
|
699 |
+
0.8214285714285714,8110,5242.821782178218,0.2889926237394434,0.005975414595042248
|
700 |
+
0.8224206349206349,8153,5204.347134713471,0.2905248904251149,0.006007096817925949
|
701 |
+
0.8234126984126984,8167,5194.728472847285,0.2910237679506824,0.006017411960260177
|
702 |
+
0.8244047619047619,8206,5165.872487248725,0.2924134982004775,0.006046146999619814
|
703 |
+
0.8263888888888888,8220,5156.253825382539,0.29291237572604495,0.006056462141954042
|
704 |
+
0.8293650793650794,8237,5146.635163516352,0.29351815557851973,0.006068987671931319
|
705 |
+
0.8303571428571429,8248,5137.016501650165,0.2939101307771799,0.006077092426622498
|
706 |
+
0.8333333333333334,8266,5127.397839783978,0.2945515447386238,0.006090354752480791
|
707 |
+
0.8343253968253969,8280,5117.779177917792,0.29505042226419126,0.00610066989481502
|
708 |
+
0.8353174603174603,8320,5088.923192319232,0.2964757866229555,0.006130141730055673
|
709 |
+
0.8363095238095238,8349,5060.067206720672,0.2975091757830595,0.0061515088106051455
|
710 |
+
0.8373015873015873,8367,5050.448544854486,0.2981505897445034,0.006164771136463439
|
711 |
+
0.8382936507936508,8413,5021.592559255926,0.2997897587570823,0.0061986637469901885
|
712 |
+
0.8392857142857143,8498,4973.499249924993,0.30281865801945623,0.0062612913968765746
|
713 |
+
0.8412698412698413,8514,4963.880588058806,0.3033888037629619,0.006273080130972836
|
714 |
+
0.8432539682539683,8525,4954.261926192619,0.3037807789616221,0.006281184885664015
|
715 |
+
0.8442460317460317,8553,4935.024602460246,0.304778534012757,0.006301815170332472
|
716 |
+
0.8462301587301587,8577,4915.787278727873,0.3056337526280155,0.006319498271476863
|
717 |
+
0.8472222222222222,8606,4896.549954995499,0.3066671417881196,0.006340865352026336
|
718 |
+
0.8482142857142857,8703,4838.837983798379,0.3101236503581228,0.006412334552484918
|
719 |
+
0.8511904761904762,8749,4809.98199819982,0.31176281937070166,0.006446227163011668
|
720 |
+
0.8531746031746031,8785,4790.744674467447,0.3130456472935894,0.006472751814728255
|
721 |
+
0.8541666666666666,8840,4752.2700270027,0.3150055232868902,0.006513275588184152
|
722 |
+
0.8551587301587301,8861,4742.651365136513,0.31575383957524145,0.006528748301685494
|
723 |
+
0.8561507936507936,8872,4733.032703270327,0.3161458147739016,0.006536853056376674
|
724 |
+
0.8571428571428571,8964,4675.320732073207,0.3194241527990593,0.006604638277430174
|
725 |
+
0.8581349206349206,8990,4656.083408340834,0.320350639632256,0.006623794970336598
|
726 |
+
0.8591269841269841,9015,4636.846084608461,0.32124149235648364,0.006642214867362006
|
727 |
+
0.8621031746031746,9041,4617.608760876088,0.3221679791896804,0.00666137156026843
|
728 |
+
0.8630952380952381,9058,4607.990099009901,0.32277375904215516,0.0066738970902457066
|
729 |
+
0.8640873015873016,9077,4598.3714371437145,0.32345080711256813,0.006687896211985017
|
730 |
+
0.8650793650793651,9102,4579.134113411341,0.3243416598367958,0.006706316109010424
|
731 |
+
0.8660714285714286,9133,4559.896789678968,0.325446317214838,0.00672915678132193
|
732 |
+
0.8670634920634921,9143,4550.278127812781,0.3258026583045291,0.006736524740132093
|
733 |
+
0.8680555555555556,9159,4540.6594659465945,0.32637280404803476,0.006748313474228353
|
734 |
+
0.8690476190476191,9255,4492.566156615661,0.3297936785090689,0.006819045878805919
|
735 |
+
0.8700396825396826,9274,4482.947494749475,0.3304707265794819,0.006833045000545229
|
736 |
+
0.871031746031746,9309,4463.710171017102,0.3317179203934006,0.0068588328563808
|
737 |
+
0.8720238095238095,9347,4444.472847284729,0.3330720165342266,0.00688683109985942
|
738 |
+
0.873015873015873,9363,4434.854185418542,0.33364216227773225,0.0068986198339556805
|
739 |
+
0.8740079365079365,9404,4415.616861686169,0.33510316074546553,0.006928828465077349
|
740 |
+
0.8759920634920635,9422,4405.998199819982,0.33574457470690944,0.006942090790935642
|
741 |
+
0.876984126984127,9433,4396.379537953795,0.3361365499055696,0.006950195545626822
|
742 |
+
0.8779761904761905,9467,4377.142214221422,0.3373481096105192,0.006975246605581376
|
743 |
+
0.878968253968254,9508,4357.904890489049,0.3388091080782525,0.007005455236703045
|
744 |
+
0.8799603174603174,9546,4338.667566756676,0.3401632042190785,0.007033453480181665
|
745 |
+
0.8809523809523809,9558,4329.048904890489,0.3405908135267078,0.00704229503075386
|
746 |
+
0.8819444444444444,9741,4232.862286228623,0.347111855468054,0.007177128676979844
|
747 |
+
0.8829365079365079,9778,4213.624962496249,0.34843031749991094,0.007204390124577448
|
748 |
+
0.8839285714285714,9799,4204.006300630063,0.34917863378826214,0.00721986283807879
|
749 |
+
0.8849206349206349,9819,4194.387638763877,0.3498913159676442,0.007234598755699116
|
750 |
+
0.8859126984126984,9847,4184.76897689769,0.3508890710187792,0.007255229040367572
|
751 |
+
0.8869047619047619,9916,4155.91299129913,0.35334782453764746,0.007306067956157698
|
752 |
+
0.8878968253968254,10014,4107.819681968197,0.35683996721661976,0.007378273952497296
|
753 |
+
0.8888888888888888,10036,4098.20102010201,0.3576239176139401,0.007394483461879655
|
754 |
+
0.8898809523809523,10056,4088.582358235824,0.35833659979332216,0.007409219379499981
|
755 |
+
0.8918650793650794,10074,4078.963696369637,0.35897801375476607,0.007422481705358274
|
756 |
+
0.8938492063492064,10087,4069.34503450345,0.35944125717136444,0.007432060051811486
|
757 |
+
0.8958333333333334,10125,4050.107710771077,0.3607953533121904,0.007460058295290106
|
758 |
+
0.8998015873015873,10210,4011.6330633063303,0.36382425257456436,0.007522685945176492
|
759 |
+
0.9017857142857144,10234,4002.014401440144,0.3646794711898229,0.007540369046320884
|
760 |
+
0.9027777777777778,10324,3963.5397539753976,0.3678865409970424,0.007606680675612351
|
761 |
+
0.9037698412698412,10343,3953.921092109211,0.36856358906745534,0.007620679797351661
|
762 |
+
0.9057539682539684,10366,3944.302430243024,0.3693831735737448,0.007637626102615036
|
763 |
+
0.9067460317460316,10389,3934.6837683768376,0.3702027580800342,0.007654572407878411
|
764 |
+
0.9077380952380952,10402,3925.0651065106513,0.3706660014966326,0.007664150754331623
|
765 |
+
0.9087301587301588,10455,3896.2091209120913,0.37255460927199513,0.0077032009360254875
|
766 |
+
0.9107142857142856,10497,3876.971797179718,0.3740512418486976,0.0077341463630281725
|
767 |
+
0.9117063492063492,10569,3848.115811581158,0.37661689769447315,0.007787195666461346
|
768 |
+
0.9126984126984128,10659,3800.022502250225,0.3798239675016926,0.007853507295752814
|
769 |
+
0.9136904761904762,10674,3790.403840384039,0.3803584791362292,0.007864559233968059
|
770 |
+
0.9146825396825397,10692,3780.785178517852,0.38099989309767307,0.007877821559826352
|
771 |
+
0.9156746031746033,10820,3723.073207320732,0.3855610590457186,0.00797213143259644
|
772 |
+
0.91765873015873,10845,3713.4545454545455,0.3864519117699462,0.007990551329621847
|
773 |
+
0.9186507936507936,10885,3694.217221722172,0.3878772761287104,0.0080200231648625
|
774 |
+
0.9196428571428572,10960,3665.361236123612,0.3905498343013933,0.008075282855938722
|
775 |
+
0.9206349206349206,10987,3655.742574257426,0.39151195524355914,0.008095176344726162
|
776 |
+
0.921626984126984,11011,3646.123912391239,0.39236717385881764,0.008112859445870554
|
777 |
+
0.9236111111111112,11040,3626.886588658866,0.39340056301892173,0.008134226526420027
|
778 |
+
0.9246031746031746,11156,3588.41194119412,0.39753411965933794,0.008219694848617919
|
779 |
+
0.9265873015873016,11178,3578.793279327933,0.39831807005665826,0.008235904358000277
|
780 |
+
0.9275793650793652,11213,3559.55595559556,0.3995652638705769,0.008261692213835848
|
781 |
+
0.9305555555555556,11247,3549.937293729373,0.4007768235755265,0.008286743273790403
|
782 |
+
0.9315476190476192,11314,3521.0813081308133,0.40316430887645655,0.008336108597818494
|
783 |
+
0.9325396825396826,11366,3501.84398439844,0.40501728254285,0.008374421983631343
|
784 |
+
0.933531746031746,11386,3492.2253225322534,0.4057299647222321,0.008389157901251668
|
785 |
+
0.9345238095238096,11411,3482.6066606660665,0.4066208174464597,0.008407577798277076
|
786 |
+
0.935515873015873,11430,3472.98799879988,0.40729786551687275,0.008421576920016386
|
787 |
+
0.9365079365079364,11488,3444.13201320132,0.4093646438370809,0.008464311081115332
|
788 |
+
0.9375,11516,3434.5133513351334,0.4103623988882158,0.008484941365783788
|
789 |
+
0.9384920634920636,11601,3396.038703870387,0.4133912981505897,0.008547569015670175
|
790 |
+
0.939484126984127,11666,3367.182718271827,0.41570751523358157,0.008595460747936235
|
791 |
+
0.9404761904761904,11715,3347.945394539454,0.41745358657306775,0.008631563746106033
|
792 |
+
0.941468253968254,11806,3319.089408940894,0.42069629048925633,0.008698612171278517
|
793 |
+
0.9424603174603174,11891,3290.2334233423344,0.42372518975163026,0.008761239821164904
|
794 |
+
0.9434523809523808,12058,3222.902790279028,0.4296760859494708,0.008884284733294627
|
795 |
+
0.945436507936508,12090,3213.2841284128413,0.4308163774364822,0.008907862201487149
|
796 |
+
0.9464285714285714,12272,3145.953495349535,0.4373017852688594,0.009041959051832117
|
797 |
+
0.9474206349206348,12327,3126.7161716171618,0.43926166126216015,0.009082482825288014
|
798 |
+
0.9484126984126984,12347,3117.097509750975,0.43997434344154224,0.00909721874290834
|
799 |
+
0.949404761904762,12383,3107.4788478847886,0.44125717136443005,0.009123743394624927
|
800 |
+
0.9503968253968254,12439,3088.2415241524154,0.44325268146669994,0.00916500396396184
|
801 |
+
0.9513888888888888,12459,3078.6228622862286,0.443965363646082,0.009179739881582166
|
802 |
+
0.9523809523809524,12474,3069.0042004200423,0.44449987528061863,0.00919079181979741
|
803 |
+
0.953373015873016,12507,3059.3855385538554,0.4456758008765991,0.009215106083870949
|
804 |
+
0.9543650793650794,12599,3030.5295529552955,0.4489541389017568,0.009282891304924448
|
805 |
+
0.9553571428571428,12650,3011.2922292229223,0.45077147845918114,0.00932046789485628
|
806 |
+
0.9563492063492064,12686,3001.6735673567355,0.4520543063820689,0.009346992546572867
|
807 |
+
0.95734126984127,12742,2982.4362436243623,0.4540498164843388,0.009388253115909781
|
808 |
+
0.9583333333333334,12788,2963.198919891989,0.45568898549691766,0.00942214572643653
|
809 |
+
0.9593253968253967,12824,2953.580258025803,0.4569718134198054,0.009448670378153117
|
810 |
+
0.9603174603174603,12905,2924.724272427243,0.459858176246303,0.009508350844515439
|
811 |
+
0.9642857142857144,13182,2838.1563156315638,0.4697288244307451,0.009712443303556955
|
812 |
+
0.9652777777777778,13253,2818.9189918991897,0.4722588461675516,0.009764755811109114
|
813 |
+
0.9672619047619048,13305,2799.681668166817,0.47411181983394507,0.00980306919692196
|
814 |
+
0.9682539682539684,13333,2790.06300630063,0.47510957488508,0.009823699481590419
|
815 |
+
0.9692460317460316,13604,2713.113711371137,0.4847664184157075,0.010023371165345837
|
816 |
+
0.9702380952380952,13636,2703.4950495049507,0.4859067099027189,0.010046948633538359
|
817 |
+
0.9712301587301588,13873,2626.545754575457,0.49435199372839683,0.010221569257339224
|
818 |
+
0.9722222222222222,13901,2616.927092709271,0.49534974877953175,0.01024219954200768
|
819 |
+
0.9732142857142856,14207,2530.359135913592,0.506253786124078,0.01046765908159867
|
820 |
+
0.9742063492063492,14612,2424.5538553855386,0.5206856002565656,0.010766061413410275
|
821 |
+
0.9751984126984128,14778,2376.4605460546054,0.526600862345437,0.010888369529658982
|
822 |
+
0.9761904761904762,15109,2289.892589258926,0.5383957524142109,0.011132248966275378
|
823 |
+
0.9771825396825397,15225,2261.0366036603664,0.542529309054627,0.01121771728847327
|
824 |
+
0.9791666666666666,15274,2251.41794179418,0.5442753803941133,0.01125382028664307
|
825 |
+
0.98015873015873,15312,2241.7992799279928,0.5456294765349392,0.011281818530121689
|
826 |
+
0.9811507936507936,15512,2193.70597059706,0.5527562983287603,0.01142917770632495
|
827 |
+
0.9831349206349206,15729,2145.6126612661265,0.5604888999750561,0.011589062412505489
|
828 |
+
0.984126984126984,15778,2135.99399939994,0.5622349713145423,0.011625165410675289
|
829 |
+
0.9851190476190476,15964,2097.5193519351933,0.5688629155827959,0.01176220944454432
|
830 |
+
0.9861111111111112,16189,2049.42604260426,0.5768805901008446,0.01192798851777299
|
831 |
+
0.9871031746031746,16266,2030.1887188718872,0.5796244164914657,0.011984721800611246
|
832 |
+
0.988095238095238,16370,2010.951395139514,0.5833303638242526,0.012061348572236941
|
833 |
+
0.9890873015873016,16556,1972.4767476747677,0.5899583080925062,0.012198392606105975
|
834 |
+
0.9900793650793652,16877,1914.764776477648,0.601396857071589,0.012434904083912209
|
835 |
+
0.9910714285714286,17205,1857.052805280528,0.6130848448134555,0.012676573132885558
|
836 |
+
0.992063492063492,17516,1808.9594959495948,0.6241670527028471,0.01290571665188163
|
837 |
+
0.9930555555555556,17735,1770.4848484848485,0.6319709225670812,0.0130670749498242
|
838 |
+
0.9940476190476192,17997,1732.0102010201022,0.6413070591169868,0.013260115470650473
|
839 |
+
0.9950396825396826,18721,1626.2049204920493,0.667106154010619,0.013793555688506279
|
840 |
+
0.996031746031746,18795,1616.586258625863,0.6697430780743328,0.013848078583701486
|
841 |
+
0.9970238095238096,23165,1135.6531653165316,0.8254641342693226,0.017067876583742748
|
842 |
+
0.9990079365079364,23945,1068.3225322532253,0.8532587392652247,0.017642577370935466
|
843 |
+
1.0,28063,799.0,1.0,0.020676702808960615
|
metric_analysis/output_standardized/log_retweets_over_followers_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.0009920634920634,1,0.1110177516201285,3.1287642945418706e-06,7.367958810163067e-07
|
3 |
+
0.0109126984126984,14,0.0458136629133554,4.380270012358619e-05,1.0315142334228295e-05
|
4 |
+
0.0208333333333333,31,0.0305420201938555,9.699169313079799e-05,2.284067231150551e-05
|
5 |
+
0.0307539682539682,58,0.0211584469416017,0.0001814683290834285,4.273416109894579e-05
|
6 |
+
0.0406746031746031,87,0.0172859102583349,0.00027220249362514275,6.410124164841868e-05
|
7 |
+
0.050595238095238,128,0.013160342533035,0.00040048182970135943,9.430987277008726e-05
|
8 |
+
0.060515873015873,181,0.0106189425333306,0.0005663063373120786,0.00013336005446395153
|
9 |
+
0.0704365079365079,274,0.0082195927599472,0.0008572814167044726,0.00020188207139846805
|
10 |
+
0.0803571428571428,350,0.0074260070425938,0.0010950675030896547,0.0002578785583557074
|
11 |
+
0.0902777777777777,451,0.0064240020967853,0.0014110726968383836,0.00033229494233835433
|
12 |
+
0.1001984126984127,520,0.0058630374952952,0.0016269574331617728,0.00038313385812847954
|
13 |
+
0.1101190476190476,548,0.0055695733498176,0.0017145628334089451,0.0004037641427969361
|
14 |
+
0.1200396825396825,670,0.0050640430518192,0.0020962720773430536,0.0004936532402809255
|
15 |
+
0.1299603174603174,822,0.0044397680804404,0.0025718442501134176,0.0006056462141954042
|
16 |
+
0.1398809523809523,939,0.0040877025472988,0.0029379096725748164,0.0006918513322743121
|
17 |
+
0.1507936507936507,1139,0.0036561499334258,0.003563662531483191,0.0008392105084775734
|
18 |
+
0.1607142857142857,1401,0.0033397443279497,0.004383398776653161,0.0010322510293038457
|
19 |
+
0.1706349206349206,1624,0.0030947814200735,0.005081113214335998,0.0011965565107704822
|
20 |
+
0.1805555555555555,2070,0.0026847167747472,0.006476542089701672,0.001525167473703755
|
21 |
+
0.1904761904761904,2303,0.0024974390206845,0.0072055441703299285,0.0016968409139805545
|
22 |
+
0.2003968253968254,2658,0.0022891258443659,0.008316255494892292,0.001958403451741343
|
23 |
+
0.2103174603174603,2985,0.0021472681131348,0.009339361419207484,0.0021993357048336755
|
24 |
+
0.2202380952380952,3242,0.0020431615576899,0.010143453842904744,0.0023886922462548667
|
25 |
+
0.2301587301587301,3623,0.0019049409638422,0.011335513039125198,0.0026694114769220795
|
26 |
+
0.2400793650793651,4018,0.0017638702239982,0.012571374935469237,0.002960445849923521
|
27 |
+
0.25,4524,0.0016339688981287,0.014154529668507424,0.003333264565717772
|
28 |
+
0.2599206349206349,4872,0.0015278241082717,0.015243339643007994,0.0035896695323114464
|
29 |
+
0.2698412698412698,5224,0.0014599380842198,0.016344664674686732,0.0038490216824291867
|
30 |
+
0.2797619047619047,6015,0.0013358734220584,0.01881951723166935,0.004431827224313086
|
31 |
+
0.2906746031746032,6705,0.0012345399503041,0.020978364594903244,0.0049402163822143364
|
32 |
+
0.3005952380952381,6991,0.0011997293930759,0.021873191183142217,0.005150940004185001
|
33 |
+
0.310515873015873,7611,0.001133266050563,0.023813025045758177,0.005607753450415111
|
34 |
+
0.3204365079365079,8209,0.0010706819674658,0.025684026093894217,0.006048357387262862
|
35 |
+
0.3303571428571428,8697,0.0010318295150289,0.02721086306963065,0.0064079137771988195
|
36 |
+
0.3402777777777778,9730,0.0009582618477334,0.0304428765858924,0.007169023922288665
|
37 |
+
0.3501984126984127,10089,0.0009323687651642,0.03156610296763293,0.007433533643573519
|
38 |
+
0.3601190476190476,10941,0.0008840972038673,0.03423181014658261,0.008061283734199412
|
39 |
+
0.3700396825396825,11298,0.0008656145917666,0.03534877899973406,0.008324319863722235
|
40 |
+
0.3799603174603174,11952,0.0008387869121772,0.03739499084836444,0.008806184369906898
|
41 |
+
0.3898809523809524,12745,0.00080268212817,0.03987610093393614,0.00939046350355283
|
42 |
+
0.3998015873015873,13622,0.0007683790460187,0.042620027220249365,0.01003663349120413
|
43 |
+
0.4097222222222222,14806,0.0007321509024842,0.04632448414498694,0.010908999814327438
|
44 |
+
0.4196428571428571,15421,0.0007106285418656,0.048248674186130186,0.011362129281152466
|
45 |
+
0.4305555555555556,15822,0.000696980606374,0.04950330866824148,0.011657584429440006
|
46 |
+
0.4404761904761904,16628,0.0006743321938094,0.05202509268964223,0.012251441909539149
|
47 |
+
0.4503968253968254,17340,0.00065250692733,0.05425277286735604,0.012776040576822759
|
48 |
+
0.4603174603174603,18354,0.0006301131324195,0.057425339862021495,0.013523151600173294
|
49 |
+
0.4702380952380952,19552,0.0006047211659253,0.061173599486882654,0.01440583306563083
|
50 |
+
0.4801587301587302,20284,0.0005895765740513,0.0634638549504873,0.014945167650534767
|
51 |
+
0.490079365079365,21566,0.000564702192227,0.06747493077608999,0.015889739969997672
|
52 |
+
0.5,23032,0.0005395738872296,0.07206169923188836,0.016969882731567576
|
53 |
+
0.5099206349206349,24228,0.0005212916903156,0.07580370132816044,0.01785109060526308
|
54 |
+
0.5198412698412699,25296,0.0005061834084447,0.07914522159473115,0.018637988606188496
|
55 |
+
0.5297619047619048,26013,0.0004976799255063,0.08138854559391769,0.019166271252877187
|
56 |
+
0.5396825396825397,26667,0.0004876764751834,0.08343475744254807,0.019648135759061852
|
57 |
+
0.5496031746031746,28535,0.0004681204083387,0.08927928914475228,0.021024470464800313
|
58 |
+
0.5595238095238095,29882,0.0004524594289673,0.09349373464950018,0.02201693451652928
|
59 |
+
0.5694444444444444,31354,0.0004371937638723,0.09809927569106582,0.023101498053385284
|
60 |
+
0.5803571428571429,32866,0.000426354196908,0.10282996730441313,0.02421553342548194
|
61 |
+
0.5902777777777778,34629,0.0004112004488701,0.10834597875569044,0.025514504563713687
|
62 |
+
0.6001984126984127,36276,0.0003980419371793,0.1134990535488009,0.026728007379747544
|
63 |
+
0.6101190476190477,37806,0.0003870781771608,0.11828606291944996,0.027855305077702494
|
64 |
+
0.6200396825396826,39620,0.0003748698230899,0.12396164134974892,0.029191852805866073
|
65 |
+
0.6299603174603174,41027,0.0003649463454591,0.12836381271216932,0.03022852461045602
|
66 |
+
0.6398809523809523,42099,0.0003569193584129,0.1317178480359182,0.0310183697949055
|
67 |
+
0.6498015873015873,42986,0.0003511657214206,0.13449306196517685,0.03167190774136696
|
68 |
+
0.6597222222222222,44674,0.0003421251688588,0.13977441609436353,0.03291561918852249
|
69 |
+
0.6696428571428571,47808,0.0003263702632609,0.14957996339345775,0.035224737479627594
|
70 |
+
0.6795634920634921,49826,0.0003164125508853,0.15589380973984324,0.0367115915675185
|
71 |
+
0.689484126984127,53140,0.0003008070695204,0.166262534611955,0.03915333311720654
|
72 |
+
0.6994047619047619,55365,0.0002917651358013,0.17322403516731066,0.040792703952467826
|
73 |
+
0.7093253968253969,56801,0.0002862937577127,0.1777169406942728,0.041850742837607244
|
74 |
+
0.7202380952380952,58502,0.0002800657421942,0.18303896875928852,0.043104032631215976
|
75 |
+
0.7301587301587301,62958,0.0002650567389384,0.1969807424557671,0.04638719507702464
|
76 |
+
0.7400793650793651,67584,0.0002501977740079,0.21145440608231778,0.049795612822606077
|
77 |
+
0.75,70894,0.0002407572874386,0.22181061589725137,0.05223440718877005
|
78 |
+
0.7599206349206349,72948,0.0002343518471991,0.2282370977582404,0.05374778592837755
|
79 |
+
0.7698412698412699,75718,0.0002262636742258,0.23690377485412137,0.055788710518792715
|
80 |
+
0.7797619047619048,78571,0.0002184445814173,0.24583013938644932,0.057890789167332236
|
81 |
+
0.7896825396825397,84808,0.0002054679665547,0.265344242291507,0.062486185077230944
|
82 |
+
0.7996031746031746,87559,0.0001993432769733,0.27395147286579163,0.0645131105459068
|
83 |
+
0.8095238095238095,90777,0.0001928516703228,0.2840198363656274,0.06688411969101728
|
84 |
+
0.8194444444444444,94063,0.0001857550455259,0.294300955837492,0.06930523095603687
|
85 |
+
0.8293650793650794,97923,0.0001784615254643,0.3063779860144236,0.07214926305675981
|
86 |
+
0.8392857142857143,102033,0.0001717709708193,0.3192372072649907,0.07517749412773683
|
87 |
+
0.8492063492063492,107026,0.0001638177776107,0.33485912738763823,0.07885631596165125
|
88 |
+
0.8601190476190477,109675,0.0001597665218121,0.34314722400387965,0.08080808825046344
|
89 |
+
0.8700396825396826,113262,0.0001543407659995,0.35437010152840137,0.08345097507566894
|
90 |
+
0.8799603174603174,119119,0.0001455305353291,0.3726952740015331,0.08776638855078145
|
91 |
+
0.8898809523809523,127440,0.0001359807274111,0.398729721696416,0.09389726707671814
|
92 |
+
0.8998015873015873,133957,0.0001290390562169,0.4191198786039454,0.0986989658333014
|
93 |
+
0.9097222222222222,142348,0.0001211838665921,0.4453733397994462,0.10488142007090924
|
94 |
+
0.9196428571428572,153038,0.0001116244820123,0.4788198301080988,0.11275776803897355
|
95 |
+
0.929563492063492,163454,0.0001037630115678,0.511409039000047,0.12043223393563941
|
96 |
+
0.939484126984127,177935,9.274401243552316e-05,0.5567166747493077,0.13110177508863655
|
97 |
+
0.949404761904762,198735,7.906615255441045e-05,0.6217949720757787,0.14642712941377573
|
98 |
+
0.9593253968253967,215154,6.953399127171892e-05,0.6731661530278616,0.15852458098418246
|
99 |
+
0.9692460317460316,232131,6.06772297454726e-05,0.726283184456299,0.1710331646561963
|
100 |
+
0.9791666666666666,255601,4.912336231094642e-05,0.7997152824491967,0.18832576398364903
|
101 |
+
0.9890873015873016,273938,4.051692731176368e-05,0.857087433318211,0.20183639005384504
|
102 |
+
1.0,319615,2.082382557391369e-05,1.0,0.23549101551102689
|
metric_analysis/output_standardized/log_retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.0009920634920634,1,2.143417655682428,3.2215456976257205e-05,7.367958810163067e-07
|
3 |
+
0.0109126984126984,11,1.7468918034538925,0.0003543700267388293,8.104754691179374e-06
|
4 |
+
0.0208333333333333,23,1.6432671810611028,0.0007409555104539157,1.6946305263375057e-05
|
5 |
+
0.0307539682539682,34,1.5487864668450748,0.001095325537192745,2.505105995455443e-05
|
6 |
+
0.0406746031746031,45,1.4578807811516885,0.0014496955639315744,3.31558146457338e-05
|
7 |
+
0.050595238095238,59,1.397787734828259,0.0019007119615991753,4.34709569799621e-05
|
8 |
+
0.060515873015873,70,1.3793064865985625,0.0022550819883380047,5.157571167114147e-05
|
9 |
+
0.0704365079365079,86,1.342219079933877,0.00277052929995812,6.336444576740239e-05
|
10 |
+
0.0803571428571428,100,1.3250067925542386,0.003221545697625721,7.367958810163068e-05
|
11 |
+
0.0902777777777777,113,1.3061771441754433,0.0036403466383170646,8.325793455484266e-05
|
12 |
+
0.1001984126984127,124,1.2884022960547663,0.003994716665055894,9.136268924602204e-05
|
13 |
+
0.1101190476190476,149,1.2546164327696954,0.004800103089462324,0.0001097825862714297
|
14 |
+
0.1200396825396825,163,1.237761303530253,0.005251119487129925,0.000120097728605658
|
15 |
+
0.1299603174603174,181,1.2226659947251466,0.005830997712702555,0.00013336005446395153
|
16 |
+
0.1398809523809523,201,1.2048894357033295,0.006475306852227699,0.00014809597208427765
|
17 |
+
0.1507936507936507,221,1.191433139144886,0.007119615991752843,0.0001628318897046038
|
18 |
+
0.1607142857142857,238,1.183222859975457,0.0076672787603492155,0.000175357419681881
|
19 |
+
0.1706349206349206,270,1.1596741156004886,0.008698173383589447,0.00019893488787440283
|
20 |
+
0.1805555555555555,294,1.1487927586475322,0.00947134435101962,0.0002166179890187942
|
21 |
+
0.1904761904761904,334,1.1355234542124109,0.010759962630069907,0.00024608982425944646
|
22 |
+
0.2003968253968254,362,1.1254166588926875,0.01166199542540511,0.00026672010892790307
|
23 |
+
0.2103174603174603,399,1.1160912776174,0.012853967333526626,0.0002939815565255064
|
24 |
+
0.2202380952380952,436,1.1083128293352051,0.014045939241648143,0.00032124300412310976
|
25 |
+
0.2301587301587301,472,1.1020695947838788,0.015205695692793402,0.0003477676558396968
|
26 |
+
0.2400793650793651,508,1.0950675043632472,0.016365452143938662,0.00037429230755628385
|
27 |
+
0.25,540,1.0893285138897673,0.017396346767178893,0.00039786977574880566
|
28 |
+
0.2599206349206349,573,1.0840944112778226,0.01845945684739538,0.00042218403982234375
|
29 |
+
0.2698412698412698,609,1.0775125729219717,0.01961921329854064,0.0004487086915389308
|
30 |
+
0.2797619047619047,663,1.0691477573528578,0.02135884797525853,0.0004884956691138114
|
31 |
+
0.2906746031746032,721,1.060571954677911,0.023227344479881448,0.0005312298302127572
|
32 |
+
0.3005952380952381,770,1.0524500566259047,0.02480590187171805,0.0005673328283825562
|
33 |
+
0.310515873015873,812,1.048103007385223,0.026158951064720853,0.0005982782553852411
|
34 |
+
0.3204365079365079,871,1.0416438955273315,0.028059663026320028,0.0006417492123652032
|
35 |
+
0.3303571428571428,914,1.035761826178403,0.02944492767629909,0.0006734314352489044
|
36 |
+
0.3402777777777778,984,1.0277467024345217,0.03170000966463709,0.0007250071469200459
|
37 |
+
0.3501984126984127,1035,1.0230244967357478,0.03334299797042621,0.0007625837368518775
|
38 |
+
0.3601190476190476,1074,1.0190172621424871,0.03459940079250024,0.0007913187762115134
|
39 |
+
0.3700396825396825,1160,1.012688867600473,0.03736993009245836,0.0008546832219789159
|
40 |
+
0.3799603174603174,1237,1.0053147119107226,0.03985052027963017,0.0009114165048171714
|
41 |
+
0.3898809523809524,1281,1.0016513621903245,0.041268000386585485,0.000943835523581889
|
42 |
+
0.3998015873015873,1341,0.997249620997618,0.04320092780516092,0.0009880432764428674
|
43 |
+
0.4097222222222222,1417,0.9918971124207676,0.04564930253535646,0.0010440397634001067
|
44 |
+
0.4196428571428571,1499,0.9865132333995608,0.048290970007409555,0.0011044570256434438
|
45 |
+
0.4305555555555556,1561,0.983190364968206,0.0502883283399375,0.0011501383702664549
|
46 |
+
0.4404761904761904,1633,0.9790934785418004,0.05260784124222802,0.001203187673699629
|
47 |
+
0.4503968253968254,1692,0.9762955472165364,0.0545085532038272,0.0012466586306795911
|
48 |
+
0.4603174603174603,1781,0.9715297580192088,0.05737572887471409,0.0013122334640900423
|
49 |
+
0.4702380952380952,1831,0.9686141408612416,0.05898650172352695,0.0013490732581408578
|
50 |
+
0.4801587301587302,1905,0.96557583848392,0.06137044553976998,0.0014035961533360643
|
51 |
+
0.490079365079365,2006,0.9611672660302412,0.06462420669437197,0.0014780125373187113
|
52 |
+
0.5,2095,0.9570957173459096,0.06749138236525885,0.0015435873707291628
|
53 |
+
0.5099206349206349,2189,0.9537739623026252,0.07051963532102704,0.0016128461835446955
|
54 |
+
0.5198412698412699,2275,0.9504137942708432,0.07329016462098514,0.001676210629312098
|
55 |
+
0.5297619047619048,2376,0.9461150743517937,0.07654392577558712,0.001750627013294745
|
56 |
+
0.5396825396825397,2450,0.9431937683554849,0.07892786959183017,0.0018051499084899515
|
57 |
+
0.5496031746031746,2523,0.9403613239507382,0.08127959795109693,0.001858936007804142
|
58 |
+
0.5595238095238095,2614,0.9370237927380608,0.08421120453593635,0.0019259844329766258
|
59 |
+
0.5694444444444444,2785,0.9314740928203532,0.08972004767887633,0.0020519765286304142
|
60 |
+
0.5803571428571429,2954,0.9257993297998168,0.09516445990786379,0.0021764950325221704
|
61 |
+
0.5902777777777778,3057,0.9223133679230248,0.09848265197641828,0.00225238500826685
|
62 |
+
0.6001984126984127,3203,0.918171233112645,0.10318610869495184,0.0023599572068952307
|
63 |
+
0.6101190476190477,3415,0.9118727064143508,0.11001578557391836,0.0025161579336706874
|
64 |
+
0.6200396825396826,3581,0.9074973981023632,0.11536355143197706,0.0026384660499193945
|
65 |
+
0.6299603174603174,3718,0.90335209700547,0.1197770690377243,0.0027394070856186286
|
66 |
+
0.6398809523809523,3866,0.9000547970039445,0.12454495667021037,0.002848452876009042
|
67 |
+
0.6498015873015873,3976,0.8971731862315426,0.12808865693759866,0.0029295004229208358
|
68 |
+
0.6597222222222222,4098,0.8946920241479999,0.13201894268870204,0.003019389520404825
|
69 |
+
0.6696428571428571,4234,0.8909887319607015,0.136400244837473,0.003119593760223043
|
70 |
+
0.6795634920634921,4343,0.8884327201390895,0.13991172964788506,0.0031999045112538205
|
71 |
+
0.689484126984127,4421,0.8867262402778523,0.1424245352920331,0.003257374589973092
|
72 |
+
0.6994047619047619,4644,0.8816572865395638,0.14960858219773848,0.0034216800714397286
|
73 |
+
0.7093253968253969,4908,0.8762024305187027,0.15811346283947036,0.003616194184028034
|
74 |
+
0.7202380952380952,5043,0.8733238070593741,0.1624625495312651,0.003715661627965235
|
75 |
+
0.7301587301587301,5187,0.8702700733610532,0.16710157533584613,0.003821760234831583
|
76 |
+
0.7400793650793651,5439,0.8656201351014363,0.17521987049386295,0.004007432796847693
|
77 |
+
0.75,5607,0.8625694432138333,0.18063206726587416,0.004131214504858432
|
78 |
+
0.7599206349206349,5814,0.8589762954909695,0.1873006668599594,0.004283731252228807
|
79 |
+
0.7698412698412699,5987,0.8559459884429319,0.1928739409168519,0.004411196939644629
|
80 |
+
0.7797619047619048,6129,0.8531910961698003,0.19744853580748042,0.0045158219547489446
|
81 |
+
0.7896825396825397,6358,0.8493448528501626,0.20482587545504333,0.004684548211501678
|
82 |
+
0.7996031746031746,6515,0.8466514160755445,0.2098837022003157,0.004800225164821238
|
83 |
+
0.8095238095238095,6728,0.8431442930562224,0.2167455945362585,0.004957162687477712
|
84 |
+
0.8194444444444444,6820,0.8417075589387941,0.21970941657807416,0.005024947908531212
|
85 |
+
0.8293650793650794,7125,0.8367692646747009,0.2295351309558326,0.005249670652241186
|
86 |
+
0.8392857142857143,7401,0.8321194850614858,0.23842659708127958,0.005453026315401686
|
87 |
+
0.8492063492063492,7660,0.8282315008546971,0.24677040043813023,0.00564385644858491
|
88 |
+
0.8601190476190477,7913,0.8243482587654483,0.2549209110531233,0.005830265806482035
|
89 |
+
0.8700396825396826,8191,0.8201902577121126,0.2638768080925228,0.006035095061404569
|
90 |
+
0.8799603174603174,8537,0.8158236353302109,0.2750233562063078,0.006290026436236211
|
91 |
+
0.8898809523809523,8823,0.8121718746758682,0.28423697690151734,0.006500750058206875
|
92 |
+
0.8998015873015873,9235,0.8063837662960968,0.2975097451757353,0.006804309961185593
|
93 |
+
0.9097222222222222,9436,0.8038518801656075,0.303985052027963,0.00695240593326987
|
94 |
+
0.9196428571428572,9768,0.7997056994974557,0.3146805837440804,0.007197022165767284
|
95 |
+
0.929563492063492,10231,0.7936671192146495,0.3295963403240875,0.007538158658677834
|
96 |
+
0.939484126984127,10668,0.7889122408794803,0.3436744950227119,0.007860138458681961
|
97 |
+
0.949404761904762,11239,0.7818005566060252,0.36206952095615474,0.008280848906742272
|
98 |
+
0.9593253968253967,12034,0.7726483778092785,0.38768080925227927,0.008866601632150235
|
99 |
+
0.9692460317460316,12783,0.7640086373583896,0.4118101865274959,0.00941846174703145
|
100 |
+
0.9791666666666666,14286,0.7484793180463,0.46023001836281047,0.010525865956198959
|
101 |
+
0.9890873015873016,16933,0.7230443483570232,0.5455043329789633,0.012476164653249123
|
102 |
+
1.0,31041,0.6240002852947716,1.0,0.022870880942627177
|
metric_analysis/output_standardized/retweets_over_log_followers_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.0009920634920634,14,10241.747512102276,0.0005680204487361545,1.0315142334228295e-05
|
3 |
+
0.0109126984126984,82,6411.553184403061,0.0033269769140260477,6.0417262243337154e-05
|
4 |
+
0.0208333333333333,164,5331.458638244872,0.006653953828052095,0.00012083452448667431
|
5 |
+
0.0307539682539682,253,4696.706526876823,0.01026494096644622,0.0001864093578971256
|
6 |
+
0.0406746031746031,280,4475.92163141317,0.01136040897472309,0.0002063028466845659
|
7 |
+
0.050595238095238,366,4018.487598414677,0.014849677445530896,0.00026966729245196826
|
8 |
+
0.060515873015873,450,3670.968448884167,0.018257800137947824,0.00033155814645733804
|
9 |
+
0.0704365079365079,522,3443.658749880984,0.021179048160019476,0.00038460744989051213
|
10 |
+
0.0803571428571428,579,3300.3631730935804,0.023491702844159534,0.0004266048151084416
|
11 |
+
0.0902777777777777,634,3191.2654159655003,0.02572321174990871,0.00046712858856433846
|
12 |
+
0.1001984126984127,691,3057.4880652118327,0.02803586643404877,0.0005091259537822679
|
13 |
+
0.1101190476190476,776,2855.125879628761,0.03148456201566113,0.0005717536036686541
|
14 |
+
0.1200396825396825,847,2720.031974699821,0.03436523714853735,0.0006240661112208118
|
15 |
+
0.1299603174603174,903,2644.884840422699,0.03663731894348197,0.000665326680557725
|
16 |
+
0.1398809523809523,995,2512.2825954364475,0.040370024749462406,0.0007331119016112252
|
17 |
+
0.1507936507936507,1043,2465.783943579372,0.04231752343084351,0.0007684781039000079
|
18 |
+
0.1607142857142857,1089,2409.978463778208,0.044183876333833735,0.000802370714426758
|
19 |
+
0.1706349206349206,1153,2343.3129244431043,0.046780541242341864,0.0008495256508118016
|
20 |
+
0.1805555555555555,1187,2298.6494790579704,0.04816001947498681,0.0008745767107663561
|
21 |
+
0.1904761904761904,1253,2218.5893192510925,0.050837830161885825,0.0009232052389134324
|
22 |
+
0.2003968253968254,1296,2181.038558435724,0.05258246439728973,0.0009548874617971336
|
23 |
+
0.2103174603174603,1354,2133.4300161851493,0.05493569197062523,0.0009976216228960793
|
24 |
+
0.2202380952380952,1424,2083.200810963788,0.057775794214306,0.001049197334567221
|
25 |
+
0.2301587301587301,1508,2016.6788884308944,0.06118391690672293,0.0011110881885725906
|
26 |
+
0.2400793650793651,1595,1957.2434949168637,0.06471375826672618,0.0011751894302210093
|
27 |
+
0.25,1646,1924.0614094625105,0.06678297561569359,0.0012127660201528409
|
28 |
+
0.2599206349206349,1693,1888.1800931525004,0.06868990140787926,0.0012473954265606072
|
29 |
+
0.2698412698412698,1753,1852.2105720993288,0.07112427475960563,0.0012916031794215858
|
30 |
+
0.2797619047619047,1813,1818.039475418674,0.073558648111332,0.001335810932282564
|
31 |
+
0.2906746031746032,1857,1791.705607659909,0.07534385523593135,0.0013682299510472816
|
32 |
+
0.3005952380952381,1918,1753.2702562983825,0.07781880147685316,0.0014131744997892765
|
33 |
+
0.310515873015873,1984,1711.5371500241918,0.08049661216375219,0.0014618030279363527
|
34 |
+
0.3204365079365079,2076,1658.641956527047,0.08422931796973263,0.001529588248989853
|
35 |
+
0.3303571428571428,2150,1609.3989544747326,0.08723171177019516,0.0015841111441850595
|
36 |
+
0.3402777777777778,2204,1578.2817061633984,0.0894226477867489,0.0016238981217599402
|
37 |
+
0.3501984126984127,2310,1527.467028791344,0.0937233740414655,0.0017019984851476687
|
38 |
+
0.3601190476190476,2357,1506.1373828761034,0.09563029983365115,0.001736627891555435
|
39 |
+
0.3700396825396825,2406,1483.3271367722798,0.09761837140422769,0.0017727308897252342
|
40 |
+
0.3799603174603174,2500,1440.5856788553633,0.10143222298859902,0.001841989702540767
|
41 |
+
0.3898809523809524,2589,1408.5466934453984,0.10504321012699315,0.0019075645359512181
|
42 |
+
0.3998015873015873,2629,1392.3103276130262,0.10666612569481072,0.0019370363711918705
|
43 |
+
0.4097222222222222,2693,1366.6130386272123,0.10926279060331887,0.001984191307576914
|
44 |
+
0.4196428571428571,2824,1327.1136527390647,0.11457783908792145,0.0020807115679900502
|
45 |
+
0.4305555555555556,2900,1301.7988554511148,0.11766137866677485,0.0021367080549472895
|
46 |
+
0.4404761904761904,2982,1273.901632813587,0.12098835558080091,0.002197125317190627
|
47 |
+
0.4503968253968254,3040,1252.4280115900583,0.1233415831541364,0.0022398594782895727
|
48 |
+
0.4603174603174603,3144,1221.3638625370725,0.12756116363046213,0.0023164862499152686
|
49 |
+
0.4702380952380952,3181,1210.132391580637,0.1290623605306934,0.002343747697512872
|
50 |
+
0.4801587301587302,3237,1192.4468751700258,0.131334442325638,0.002385008266849785
|
51 |
+
0.490079365079365,3309,1173.011743073717,0.13425569034770965,0.0024380575702829593
|
52 |
+
0.5,3403,1146.372070033951,0.13806954193208099,0.002507316383098492
|
53 |
+
0.5099206349206349,3454,1133.927886665909,0.1401387592810484,0.0025448929730303234
|
54 |
+
0.5198412698412699,3581,1099.8695901550593,0.14529151620886924,0.0026384660499193945
|
55 |
+
0.5297619047619048,3664,1078.0676366493603,0.14865906601209072,0.002699620108043748
|
56 |
+
0.5396825396825397,3733,1060.6487591725645,0.15145859536657605,0.002750459023833873
|
57 |
+
0.5496031746031746,3868,1027.9967506835908,0.15693593540796041,0.0028499264677710745
|
58 |
+
0.5595238095238095,3948,1010.3955348603816,0.16018176654359556,0.002908870138252379
|
59 |
+
0.5694444444444444,4022,994.8012552878178,0.1631841603440581,0.0029633930334475858
|
60 |
+
0.5803571428571429,4107,972.2112451667429,0.16663285592567045,0.003026020683333972
|
61 |
+
0.5902777777777778,4189,954.8779497176456,0.1699598328396965,0.003086437945577309
|
62 |
+
0.6001984126984127,4297,934.7004834106868,0.17434170487280398,0.00316601190072707
|
63 |
+
0.6101190476190477,4366,923.3244117903994,0.17714123422728933,0.0032168508165171953
|
64 |
+
0.6200396825396826,4423,913.4868359265708,0.17945388891142938,0.0032588481817351247
|
65 |
+
0.6299603174603174,4558,889.5148546157745,0.18493122895281372,0.003358315625672326
|
66 |
+
0.6398809523809523,4672,870.5048665882368,0.18955653832109384,0.003442310356108185
|
67 |
+
0.6498015873015873,4786,849.9546866922489,0.19418184768937397,0.003526305086544044
|
68 |
+
0.6597222222222222,4901,832.7804952963619,0.19884772994684952,0.0036110366128609193
|
69 |
+
0.6696428571428571,5037,811.3520395758455,0.2043656428774293,0.003711240852679137
|
70 |
+
0.6795634920634921,5109,800.5203812652611,0.20728689089950095,0.0037642901561123114
|
71 |
+
0.689484126984127,5234,781.4194696938304,0.2123585020489309,0.00385638964123935
|
72 |
+
0.6994047619047619,5400,760.2850310752603,0.2190936016553739,0.0039786977574880564
|
73 |
+
0.7093253968253969,5524,745.3425802314556,0.22412463991560838,0.004070060446734079
|
74 |
+
0.7202380952380952,5736,718.2535271058875,0.23272609242504158,0.004226261173509536
|
75 |
+
0.7301587301587301,5846,706.1316731892126,0.23718911023653994,0.0043073087204213295
|
76 |
+
0.7400793650793651,6008,687.8141314252327,0.24376191828620117,0.004426669653145971
|
77 |
+
0.75,6151,671.9318091327671,0.24956384144114901,0.004532031464131303
|
78 |
+
0.7599206349206349,6301,655.3005304817065,0.25564977482046497,0.004642550846283749
|
79 |
+
0.7698412698412699,6444,641.222980944397,0.26145169797541284,0.004747912657269081
|
80 |
+
0.7797619047619048,6566,631.0274981581211,0.2664015904572565,0.00483780175475307
|
81 |
+
0.7896825396825397,6797,608.3009252958204,0.27577392786140303,0.005008001603267837
|
82 |
+
0.7996031746031746,7011,587.5531675251212,0.28445652614922706,0.005165675921805327
|
83 |
+
0.8095238095238095,7157,576.0696671571288,0.29038016797176125,0.005273248120433708
|
84 |
+
0.8194444444444444,7357,560.2436517890462,0.29849474581084917,0.0054206072966369685
|
85 |
+
0.8293650793650794,7603,539.923690961875,0.30847567655292735,0.005601859083366981
|
86 |
+
0.8392857142857143,7800,523.7004888975509,0.31646853572442896,0.005747007871927193
|
87 |
+
0.8492063492063492,7973,509.5041443087782,0.32348764555524,0.005874473559343014
|
88 |
+
0.8601190476190477,8058,502.7703002355239,0.3269363411368524,0.0059371012092294
|
89 |
+
0.8700396825396826,8235,491.0665617707924,0.33411774252444515,0.0060675140801692866
|
90 |
+
0.8799603174603174,8452,476.55673451118935,0.34292205947985555,0.006227398786349825
|
91 |
+
0.8898809523809523,8904,446.87607462645417,0.36126100539619427,0.006560430524569195
|
92 |
+
0.8998015873015873,9298,425.4498120093774,0.3772467237391975,0.006850728101689621
|
93 |
+
0.9097222222222222,9560,411.7376865200165,0.3878768207084026,0.007043768622515893
|
94 |
+
0.9196428571428572,10033,389.3847190107792,0.4070677972978456,0.007392273074236606
|
95 |
+
0.929563492063492,10240,379.36847411850994,0.4154663853613016,0.007544789821606981
|
96 |
+
0.939484126984127,10899,351.11195753265446,0.44220391934109626,0.008030338307196728
|
97 |
+
0.949404761904762,11501,326.7121276970904,0.46662879863675094,0.008473889427568545
|
98 |
+
0.9593253968253967,12079,304.2604884972614,0.490079928591715,0.00889975744679597
|
99 |
+
0.9692460317460316,13013,272.4577979927763,0.5279750071002556,0.0095879247996652
|
100 |
+
0.9791666666666666,13979,244.9501617824125,0.5671684180630503,0.010299669620726952
|
101 |
+
0.9890873015873016,16064,195.4118856404268,0.6517628920355418,0.011835889032645952
|
102 |
+
1.0,24647,96.18719446805105,1.0,0.018159808079408913
|
metric_analysis/output_standardized/roberta_paper_metric_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.0009920634920634,1,3533.739728797425,1.2755427434373327e-06,7.367958810163067e-07
|
3 |
+
0.0109126984126984,11,913.2204554544222,1.4030970177810659e-05,8.104754691179374e-06
|
4 |
+
0.0208333333333333,22,642.3126335309646,2.8061940355621317e-05,1.620950938235875e-05
|
5 |
+
0.0307539682539682,34,463.6375267632411,4.336845327686931e-05,2.505105995455443e-05
|
6 |
+
0.0406746031746031,47,319.3778539107914,5.995050894155463e-05,3.462940640776642e-05
|
7 |
+
0.050595238095238,57,261.8047511787649,7.270593637592796e-05,4.1997365217929485e-05
|
8 |
+
0.060515873015873,68,220.9913721970096,8.673690655373862e-05,5.010211990910886e-05
|
9 |
+
0.0704365079365079,81,195.60887797794308,0.00010331896221842393,5.968046636232085e-05
|
10 |
+
0.0803571428571428,97,157.76121015148178,0.00012372764611342127,7.146920045858176e-05
|
11 |
+
0.0902777777777777,114,126.08462267664385,0.0001454118727518559,8.399473043585897e-05
|
12 |
+
0.1001984126984127,128,114.91598022184296,0.00016326947115997858,9.430987277008726e-05
|
13 |
+
0.1101190476190476,149,99.31178628400392,0.00019005586877216256,0.0001097825862714297
|
14 |
+
0.1200396825396825,164,90.3818928159452,0.00020918900992372255,0.00012083452448667431
|
15 |
+
0.1299603174603174,178,82.08798118734508,0.0002270466083318452,0.00013114966682090261
|
16 |
+
0.1398809523809523,197,70.16356545125636,0.0002512819204571545,0.00014514878856021244
|
17 |
+
0.1507936507936507,216,61.84780146447067,0.00027551723258246384,0.00015914791029952226
|
18 |
+
0.1607142857142857,241,54.36857680039037,0.00030740580116839716,0.00017756780732492992
|
19 |
+
0.1706349206349206,272,48.023909569922445,0.00034694762621495447,0.00020040847963643545
|
20 |
+
0.1805555555555555,300,41.873141561514856,0.00038266282303119976,0.00022103876430489203
|
21 |
+
0.1904761904761904,327,37.27337021039718,0.00041710247710400776,0.00024093225309233232
|
22 |
+
0.2003968253968254,354,34.49758353529179,0.00045154213117681576,0.0002608257418797726
|
23 |
+
0.2103174603174603,376,32.10956578521053,0.00047960407153243705,0.00027703525126213135
|
24 |
+
0.2202380952380952,413,29.10383773593751,0.0005267991530396184,0.0003042966988597347
|
25 |
+
0.2301587301587301,440,27.26674857348407,0.0005612388071124263,0.00032419018764717495
|
26 |
+
0.2400793650793651,493,23.38378755565621,0.000628842572514605,0.0003632403693410392
|
27 |
+
0.25,530,21.76810557585511,0.0006760376540217863,0.00039050181693864257
|
28 |
+
0.2599206349206349,589,19.614803085421624,0.0007512946758845889,0.0004339727739186047
|
29 |
+
0.2698412698412698,617,18.715704509161537,0.0007870098727008342,0.0004546030585870613
|
30 |
+
0.2797619047619047,656,17.503118276572664,0.0008367560396948902,0.00048333809794669723
|
31 |
+
0.2906746031746032,710,15.75957395134853,0.0009056353478405061,0.0005231250755215778
|
32 |
+
0.3005952380952381,776,14.116974236232336,0.0009898211689073702,0.0005717536036686541
|
33 |
+
0.310515873015873,830,13.104004547610508,0.001058700477052986,0.0006115405812435346
|
34 |
+
0.3204365079365079,868,12.499962631113736,0.0011071711013036047,0.0006395388247221543
|
35 |
+
0.3303571428571428,914,11.924377006525065,0.001165846067501722,0.0006734314352489044
|
36 |
+
0.3402777777777778,967,11.162073579916996,0.0012334498329039005,0.0007124816169427686
|
37 |
+
0.3501984126984127,1013,10.492987863082892,0.0012921247991020178,0.0007463742274695188
|
38 |
+
0.3601190476190476,1082,9.709458682853722,0.0013801372483991938,0.0007972131432596439
|
39 |
+
0.3700396825396825,1182,8.841857620201926,0.001507691522742927,0.0008708927313612746
|
40 |
+
0.3799603174603174,1318,7.95447265160399,0.0016811653358504044,0.0009710969711794924
|
41 |
+
0.3898809523809524,1381,7.553164524632679,0.0017615245286869564,0.0010175151116835197
|
42 |
+
0.3998015873015873,1476,6.895417771804837,0.0018827010893135029,0.0010875107203800688
|
43 |
+
0.4097222222222222,1590,6.348934235182757,0.0020281129620653587,0.0011715054508159278
|
44 |
+
0.4196428571428571,1669,5.972958366751632,0.002128880838796908,0.0012297123254162159
|
45 |
+
0.4305555555555556,1743,5.658337935222293,0.0022232710018112705,0.0012842352206114227
|
46 |
+
0.4404761904761904,1898,5.125327551050057,0.0024209801270440572,0.0013984385821689503
|
47 |
+
0.4503968253968254,1956,4.94650983750502,0.0024949616061634224,0.0014411727432678961
|
48 |
+
0.4603174603174603,2041,4.73908030227792,0.002603382739355596,0.0015038003931542821
|
49 |
+
0.4702380952380952,2157,4.426219447170888,0.0027513456975943266,0.0015892687153521738
|
50 |
+
0.4801587301587302,2384,3.9638262404902136,0.003040893900354601,0.0017565213803428753
|
51 |
+
0.490079365079365,2502,3.752840702273832,0.003191407944080206,0.0018434632943027996
|
52 |
+
0.5,2593,3.59870356795998,0.0033074823337330035,0.0019105117194752835
|
53 |
+
0.5099206349206349,2707,3.428223279567193,0.0034528942064848593,0.0019945064499111423
|
54 |
+
0.5198412698412699,2908,3.1785787340504648,0.003709278297915763,0.00214260242199542
|
55 |
+
0.5297619047619048,3081,2.955250022079203,0.003929947192530422,0.002270068109411241
|
56 |
+
0.5396825396825397,3152,2.8954501822960066,0.004020510727314472,0.002322380616963399
|
57 |
+
0.5496031746031746,3248,2.7815895186317805,0.004142962830684456,0.0023931130215409644
|
58 |
+
0.5595238095238095,3468,2.583086678098461,0.004423582234240669,0.002555208115364552
|
59 |
+
0.5694444444444444,3687,2.4125368969101038,0.004702926095053446,0.002716566413307123
|
60 |
+
0.5803571428571429,4128,2.125304741103856,0.005265440444909309,0.0030414933968353143
|
61 |
+
0.5902777777777778,4249,2.051449708767312,0.005419781116865226,0.0031306456984382873
|
62 |
+
0.6001984126984127,4484,1.9245385016106695,0.005719533661572999,0.0033037927304771196
|
63 |
+
0.6101190476190477,4686,1.8317436498146948,0.00597719329574734,0.0034526254984424136
|
64 |
+
0.6200396825396826,4966,1.722560123794516,0.006334345263909793,0.0036589283451269796
|
65 |
+
0.6299603174603174,5321,1.58351180677159,0.0067871629378300465,0.0039204908828877685
|
66 |
+
0.6398809523809523,5882,1.4051718048191333,0.0075027424168983906,0.0043338333721379164
|
67 |
+
0.6498015873015873,6161,1.327281088505106,0.007858618842317406,0.004539399422941466
|
68 |
+
0.6597222222222222,6467,1.2602164579934505,0.008248934921809229,0.004764858962532456
|
69 |
+
0.6696428571428571,6780,1.1855272483643158,0.008648179800505114,0.00499547607329056
|
70 |
+
0.6795634920634921,7289,1.0890041480074235,0.009297431056914718,0.00537050517672786
|
71 |
+
0.689484126984127,7661,1.0212425782882102,0.009771932957473405,0.0056445932444659265
|
72 |
+
0.6994047619047619,8206,0.9414799820328928,0.01046710375264675,0.006046146999619814
|
73 |
+
0.7093253968253969,8603,0.888609564785895,0.010973494221791372,0.006338654964383287
|
74 |
+
0.7202380952380952,9007,0.8391207342688993,0.011488813490140054,0.006636320500313875
|
75 |
+
0.7301587301587301,9788,0.750920049260784,0.012485012372764611,0.007211758083387611
|
76 |
+
0.7400793650793651,10314,0.7019883742889699,0.013155947855812648,0.007599312716802188
|
77 |
+
0.75,10726,0.6646019719884612,0.01368147146610883,0.007902872619780907
|
78 |
+
0.7599206349206349,11316,0.6182192984983755,0.014434041684736856,0.008337582189580528
|
79 |
+
0.7698412698412699,11931,0.5754220771104159,0.015218500471950815,0.008790711656405556
|
80 |
+
0.7797619047619048,12508,0.5409698646849892,0.015954488634914155,0.009215842879751966
|
81 |
+
0.7896825396825397,13194,0.5055156298192968,0.016829510956912166,0.009721284854129151
|
82 |
+
0.7996031746031746,14505,0.4477972728644806,0.01850174749355851,0.01068722425414153
|
83 |
+
0.8095238095238095,15401,0.4148045252952486,0.01964463379167836,0.011347393363532141
|
84 |
+
0.8194444444444444,16677,0.3732660056369872,0.021272226332304394,0.012287544907708947
|
85 |
+
0.8293650793650794,17811,0.3390347868176622,0.02271869180336233,0.01312307143678144
|
86 |
+
0.8392857142857143,19304,0.304132160348205,0.024623077119314267,0.014223107687138786
|
87 |
+
0.8492063492063492,20166,0.2869145938607858,0.02572259496415725,0.014858225736574842
|
88 |
+
0.8601190476190477,22479,0.2465673793342506,0.0286729253297278,0.01656243460936556
|
89 |
+
0.8700396825396826,24972,0.2125022735791463,0.03185285338911707,0.018399266740739214
|
90 |
+
0.8799603174603174,28663,0.1745575677179806,0.03656088165514426,0.021118780337570402
|
91 |
+
0.8898809523809523,35102,0.1308102522001188,0.04477410138013725,0.0258630090154344
|
92 |
+
0.8998015873015873,44279,0.0931967814168663,0.05647975713666165,0.032624584815521045
|
93 |
+
0.9097222222222222,53428,0.0704476225450466,0.0681496976963698,0.03936553033093924
|
94 |
+
0.9196428571428572,67477,0.0496322351327484,0.0860697976989209,0.049716775663337334
|
95 |
+
0.929563492063492,90944,0.0310654466601885,0.11600295925916478,0.067007164603147
|
96 |
+
0.939484126984127,115968,0.0206347700888285,0.14792214087094058,0.08544474472969907
|
97 |
+
0.949404761904762,156999,0.0119895694113137,0.20025893517691779,0.11567621652367914
|
98 |
+
0.9593253968253967,225314,0.0059620822317802,0.28739763769483917,0.16601042713530814
|
99 |
+
0.9692460317460316,341935,0.0022855871148248,0.4361527079772443,0.25193629957531083
|
100 |
+
0.9791666666666666,474954,0.0008589343284472,0.6058241281665349,0.349944150872219
|
101 |
+
0.9890873015873016,606979,0.0003409942258809,0.7742276588688487,0.44721962706339685
|
102 |
+
1.0,783980,9.05288957657348e-05,1.0,0.5776332347991642
|
metric_analysis/output_standardized/virality_avg_retweets_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.0009920634920634,1,3128.6932364568866,1.0053282396702524e-05,7.367958810163067e-07
|
3 |
+
0.0109126984126984,11,2754.925734752474,0.00011058610636372776,8.104754691179374e-06
|
4 |
+
0.0208333333333333,21,2581.3315487148006,0.000211118930330753,1.547271350134244e-05
|
5 |
+
0.0307539682539682,31,2500.869915344092,0.00031165175429777824,2.284067231150551e-05
|
6 |
+
0.0406746031746031,41,2303.2090263636537,0.00041218457826480346,3.0208631121668577e-05
|
7 |
+
0.050595238095238,51,2152.268107221841,0.0005127174022318287,3.7576589931831644e-05
|
8 |
+
0.060515873015873,62,2003.0277986460685,0.0006233035085955565,4.568134462301102e-05
|
9 |
+
0.0704365079365079,74,1862.527068589252,0.0007439428973559868,5.4522895195206704e-05
|
10 |
+
0.0803571428571428,88,1637.9180724687665,0.0008846888509098221,6.4838037529435e-05
|
11 |
+
0.0902777777777777,102,1482.542530248446,0.0010254348044636573,7.515317986366329e-05
|
12 |
+
0.1001984126984127,112,1440.1384708594214,0.0011259676284306826,8.252113867382636e-05
|
13 |
+
0.1101190476190476,123,1367.2655234393055,0.0012365537347944105,9.062589336500573e-05
|
14 |
+
0.1200396825396825,136,1253.92530228473,0.0013672464059515431,0.00010020423981821773
|
15 |
+
0.1299603174603174,156,1119.7867538390494,0.0015683120538855936,0.00011494015743854386
|
16 |
+
0.1398809523809523,171,1058.1367647948662,0.0017191112898361316,0.00012599209565378845
|
17 |
+
0.1507936507936507,182,976.0304653073382,0.0018296973961998593,0.00013409685034496783
|
18 |
+
0.1607142857142857,201,866.7503880337599,0.002020709761737207,0.00014809597208427765
|
19 |
+
0.1706349206349206,219,815.3028458797158,0.0022016688448778525,0.00016135829794257118
|
20 |
+
0.1805555555555555,232,786.2690567569583,0.0023323615160349854,0.00017093664439578316
|
21 |
+
0.1904761904761904,248,741.7820711022046,0.002493214034382226,0.00018272537849204409
|
22 |
+
0.2003968253968254,266,691.4859160249838,0.0026741731175228714,0.0001959877043503376
|
23 |
+
0.2103174603174603,286,634.4445375768611,0.002875238765456922,0.00021072362197066373
|
24 |
+
0.2202380952380952,303,599.4004450619773,0.0030461445662008646,0.00022324915194794095
|
25 |
+
0.2301587301587301,327,551.4285181823311,0.003287423343721725,0.00024093225309233232
|
26 |
+
0.2400793650793651,348,533.9551789024323,0.003498542274052478,0.00025640496659367474
|
27 |
+
0.25,367,506.6784740394677,0.0036895546395898263,0.00027040408833298456
|
28 |
+
0.2599206349206349,384,485.52769942049525,0.003860460440333769,0.0002829296183102618
|
29 |
+
0.2698412698412698,421,446.810815150956,0.004232431889011763,0.00031019106590786513
|
30 |
+
0.2797619047619047,444,408.7457694650668,0.00446365738413592,0.0003271373711712402
|
31 |
+
0.2906746031746032,474,384.5033519093221,0.004765255856036996,0.0003492412476017294
|
32 |
+
0.3005952380952381,507,362.2200443714963,0.00509701417512818,0.00037355551167526756
|
33 |
+
0.310515873015873,546,330.0766694728744,0.005489092188599578,0.0004022905510349035
|
34 |
+
0.3204365079365079,578,310.3159564904467,0.005810797225294059,0.0004258680192274253
|
35 |
+
0.3303571428571428,623,292.8722993281888,0.006263194933145672,0.00045902383387315913
|
36 |
+
0.3402777777777778,657,275.9762698041734,0.006605006534633558,0.00048407489382771353
|
37 |
+
0.3501984126984127,693,264.6301245685592,0.006966924700914849,0.0005105995455443006
|
38 |
+
0.3601190476190476,730,249.2392647610653,0.007338896149592842,0.0005378609931419039
|
39 |
+
0.3700396825396825,767,235.05164885003057,0.007710867598270835,0.0005651224407395073
|
40 |
+
0.3799603174603174,822,216.09849275103548,0.008263798130089475,0.0006056462141954042
|
41 |
+
0.3898809523809524,875,202.1903749398219,0.008796622097114707,0.0006446963958892685
|
42 |
+
0.3998015873015873,920,193.56605517288256,0.009249019804966322,0.0006778522105350022
|
43 |
+
0.4097222222222222,963,187.94005117417709,0.00968131094802453,0.0007095344334187034
|
44 |
+
0.4196428571428571,1034,175.0797930343889,0.01039509399819041,0.0007618469409708612
|
45 |
+
0.4305555555555556,1101,164.50275477412396,0.011068663918769478,0.0008112122649989537
|
46 |
+
0.4404761904761904,1145,157.99851670144255,0.01151100834422439,0.0008436312837636713
|
47 |
+
0.4503968253968254,1200,150.88390964671086,0.012063938876043028,0.0008841550572195681
|
48 |
+
0.4603174603174603,1290,140.8240668782626,0.012968734291746255,0.0009504666865110357
|
49 |
+
0.4702380952380952,1369,130.09518134268558,0.013762943601085754,0.001008673561111324
|
50 |
+
0.4801587301587302,1444,124.34249552435476,0.014516939780838444,0.0010639332521875469
|
51 |
+
0.490079365079365,1523,117.28492475872802,0.015311149090177943,0.0011221401267878352
|
52 |
+
0.5,1611,110.04173671014205,0.016195837941087764,0.0011869781643172703
|
53 |
+
0.5099206349206349,1723,102.7628034997729,0.01732180556951845,0.0012694993029910965
|
54 |
+
0.5198412698412699,1831,96.64859215298335,0.01840756006836232,0.0013490732581408578
|
55 |
+
0.5297619047619048,1975,88.97259652512547,0.019855232733487483,0.001455171865007206
|
56 |
+
0.5396825396825397,2052,86.08126921842629,0.020629335478033577,0.0015119051478454616
|
57 |
+
0.5496031746031746,2252,78.06253536357954,0.02263999195737408,0.0016592643240487229
|
58 |
+
0.5595238095238095,2391,73.26624858192154,0.024037398210515735,0.0017616789515099895
|
59 |
+
0.5694444444444444,2484,70.65733437208469,0.024972353473409068,0.001830200968444506
|
60 |
+
0.5803571428571429,2584,68.03407090113993,0.02597768171307932,0.0019038805565461366
|
61 |
+
0.5902777777777778,2770,63.54623310376039,0.02784759223886599,0.00204092459041517
|
62 |
+
0.6001984126984127,2892,60.7634161516708,0.0290740926912637,0.002130813687899159
|
63 |
+
0.6101190476190477,3103,56.50224523966051,0.03119533527696793,0.0022862776187936
|
64 |
+
0.6200396825396826,3193,54.78245772103394,0.032100130692671154,0.0023525892480850677
|
65 |
+
0.6299603174603174,3524,49.07891508810281,0.035427767165979694,0.002596468684701465
|
66 |
+
0.6398809523809523,3665,46.847492839618525,0.03684527998391475,0.002700356903924764
|
67 |
+
0.6498015873015873,3886,43.93985563230335,0.03906705539358601,0.002863188793629368
|
68 |
+
0.6597222222222222,4164,40.57703645974158,0.04186186789986931,0.0030680180485519013
|
69 |
+
0.6696428571428571,4393,38.21854701763988,0.04416406956871419,0.0032367443053046355
|
70 |
+
0.6795634920634921,4677,35.74808318428063,0.047019201769377704,0.003445994335513267
|
71 |
+
0.689484126984127,5052,32.567920421715776,0.05078918266814115,0.003722292790894382
|
72 |
+
0.6994047619047619,5385,30.60214054903069,0.05413692570624309,0.003967645819272812
|
73 |
+
0.7093253968253969,5808,28.183043893453764,0.058389464160048254,0.00427931047694271
|
74 |
+
0.7202380952380952,6079,26.95420443855561,0.06111390368955464,0.004478982160698129
|
75 |
+
0.7301587301587301,6475,25.012739959261825,0.06509500351864884,0.004770753329580586
|
76 |
+
0.7400793650793651,6999,22.874625271719133,0.07036292349452096,0.005156834371233131
|
77 |
+
0.75,7589,20.906896682604128,0.07629436010857545,0.005591543941032752
|
78 |
+
0.7599206349206349,8060,19.641355294192596,0.08102945611742234,0.005938574800991432
|
79 |
+
0.7698412698412699,8483,18.548537529332275,0.08528199457122751,0.006250239458661331
|
80 |
+
0.7797619047619048,9205,16.915461328983927,0.09254046446164672,0.006782206084755104
|
81 |
+
0.7896825396825397,9532,16.27014209744153,0.09582788780536845,0.007023138337847436
|
82 |
+
0.7996031746031746,9920,15.580116677414129,0.09972856137528903,0.007309015139681763
|
83 |
+
0.8095238095238095,11077,13.707925776186906,0.11136020910827385,0.00816148797401763
|
84 |
+
0.8194444444444444,11889,12.681181379795396,0.1195234744143963,0.00875976622940287
|
85 |
+
0.8293650793650794,13090,11.388669220662166,0.13159746657283602,0.009644658082503456
|
86 |
+
0.8392857142857143,14621,9.997273862423672,0.1469890419221876,0.010772692576339421
|
87 |
+
0.8492063492063492,15873,9.0580071984413,0.15957575148285916,0.011695161019371837
|
88 |
+
0.8601190476190477,17036,8.371478269233975,0.1712677189102242,0.012552054628993801
|
89 |
+
0.8700396825396826,17741,7.978472899264016,0.17835528299989947,0.013071495725110299
|
90 |
+
0.8809523809523809,19770,7.0,0.1987533929828089,0.014566454567692384
|
91 |
+
0.8898809523809523,20244,6.807979662864933,0.20351864883884588,0.014915695815294115
|
92 |
+
0.8998015873015873,21629,6.278415213300843,0.21744244495827889,0.0159361581105017
|
93 |
+
0.9097222222222222,24560,5.39307122938644,0.24690861566301398,0.018095706837760493
|
94 |
+
0.9196428571428572,26082,4.999137001078749,0.2622097114707952,0.019217110168667312
|
95 |
+
0.929563492063492,28223,4.501034715181624,0.2837337890821353,0.020794590149923225
|
96 |
+
0.939484126984127,30526,4.072360076314477,0.3068864984417412,0.02249143106390378
|
97 |
+
0.949404761904762,34261,3.497256599434304,0.34443550819342517,0.025243363679499687
|
98 |
+
0.9593253968253967,37923,3.0599798377048133,0.3812506283301498,0.0279415101957814
|
99 |
+
0.9692460317460316,43642,2.5354184580479755,0.43874535035689155,0.03215524583931366
|
100 |
+
0.9791666666666666,50271,2.0852622686277305,0.5053885593646326,0.03703946573457076
|
101 |
+
0.9890873015873016,62079,1.5480748118436458,0.624097717904896,0.04573955149761131
|
102 |
+
1.0,99470,0.7599240299001586,1.0,0.07328908628469204
|
metric_analysis/output_standardized/virality_followers_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.0009920634920634,1,162.90697674418604,2.0508613617719442e-05,7.367958810163067e-07
|
3 |
+
0.0109126984126984,11,59.90412635890944,0.00022559474979491387,8.104754691179374e-06
|
4 |
+
0.0208333333333333,22,41.26621697088786,0.00045118949958982774,1.620950938235875e-05
|
5 |
+
0.0307539682539682,34,27.75797742299627,0.000697292863002461,2.505105995455443e-05
|
6 |
+
0.0406746031746031,45,20.41887906141272,0.0009228876127973749,3.31558146457338e-05
|
7 |
+
0.050595238095238,56,17.36539923954372,0.0011484823625922888,4.126056933691318e-05
|
8 |
+
0.060515873015873,73,13.329307594828814,0.0014971287940935194,5.378609931419039e-05
|
9 |
+
0.0704365079365079,88,11.423473275192162,0.001804757998359311,6.4838037529435e-05
|
10 |
+
0.0803571428571428,100,9.792804922314204,0.002050861361771944,7.367958810163068e-05
|
11 |
+
0.0902777777777777,114,8.966047810728663,0.0023379819524200164,8.399473043585897e-05
|
12 |
+
0.1001984126984127,132,8.241192277609564,0.0027071369975389665,9.725705629415249e-05
|
13 |
+
0.1101190476190476,150,7.225324262618784,0.0030762920426579163,0.00011051938215244602
|
14 |
+
0.1200396825396825,164,6.426211688059448,0.0033634126333059885,0.00012083452448667431
|
15 |
+
0.1299603174603174,181,5.865184280255489,0.0037120590648072192,0.00013336005446395153
|
16 |
+
0.1398809523809523,209,5.123369293423201,0.004286300246103363,0.00015399033913240812
|
17 |
+
0.1507936507936507,233,4.498533912223866,0.00477850697292863,0.00017167344027679948
|
18 |
+
0.1607142857142857,255,4.081553139911354,0.005229696472518457,0.00018788294965915822
|
19 |
+
0.1706349206349206,286,3.653621381908462,0.00586546349466776,0.00021072362197066373
|
20 |
+
0.1805555555555555,318,3.3862837057887427,0.006521739130434782,0.00023430109016318556
|
21 |
+
0.1904761904761904,343,3.1589097951921734,0.007034454470877769,0.00025272098718859325
|
22 |
+
0.2003968253968254,378,2.901110957300737,0.0077522559474979495,0.00027850884302416394
|
23 |
+
0.2103174603174603,412,2.6883909458302067,0.008449548810500411,0.0003035599029787184
|
24 |
+
0.2202380952380952,440,2.5712123367735846,0.009023789991796555,0.00032419018764717495
|
25 |
+
0.2301587301587301,489,2.3775402418530684,0.010028712059064807,0.00036029318581697403
|
26 |
+
0.2400793650793651,516,2.262779417926834,0.010582444626743232,0.0003801866746044143
|
27 |
+
0.25,532,2.212570952273289,0.010910582444626744,0.0003919754087006752
|
28 |
+
0.2599206349206349,567,2.1149449648249323,0.011628383921246923,0.0004177632645362459
|
29 |
+
0.2698412698412698,612,2.01276188638082,0.0125512715340443,0.00045091907918197975
|
30 |
+
0.2797619047619047,663,1.871412888170072,0.01359721082854799,0.0004884956691138114
|
31 |
+
0.2906746031746032,713,1.7528878661684644,0.014622641509433962,0.0005253354631646267
|
32 |
+
0.3005952380952381,757,1.6483081903461123,0.015525020508613617,0.0005577544819293442
|
33 |
+
0.310515873015873,804,1.5657617737942116,0.016488925348646433,0.0005923838883371106
|
34 |
+
0.3204365079365079,868,1.4648138997453877,0.017801476620180477,0.0006395388247221543
|
35 |
+
0.3303571428571428,930,1.3788231439711065,0.01907301066447908,0.0006852201693451653
|
36 |
+
0.3402777777777778,973,1.3111586957558676,0.019954881050041017,0.0007169023922288665
|
37 |
+
0.3501984126984127,1035,1.2350532566427777,0.02122641509433962,0.0007625837368518775
|
38 |
+
0.3601190476190476,1079,1.1946824800319331,0.02212879409351928,0.000795002755616595
|
39 |
+
0.3700396825396825,1157,1.126624449492463,0.023728465955701394,0.000852472834335867
|
40 |
+
0.3799603174603174,1238,1.0517923348956837,0.02538966365873667,0.0009121533006981878
|
41 |
+
0.3898809523809524,1281,1.0159094482787685,0.026271534044298606,0.000943835523581889
|
42 |
+
0.3998015873015873,1340,0.9728949369054694,0.027481542247744052,0.000987306480561851
|
43 |
+
0.4097222222222222,1407,0.933151427225055,0.028855619360131254,0.0010366718045899436
|
44 |
+
0.4196428571428571,1492,0.8837801041456175,0.03059885151763741,0.0010992994544763296
|
45 |
+
0.4305555555555556,1550,0.8543538482681353,0.03178835110746513,0.0011420336155752754
|
46 |
+
0.4404761904761904,1623,0.8251487414647105,0.033285479901558654,0.0011958197148894659
|
47 |
+
0.4503968253968254,1690,0.7949358527010969,0.03465955701394586,0.0012451850389175584
|
48 |
+
0.4603174603174603,1755,0.7707349266582455,0.03599261689909762,0.0012930767711836185
|
49 |
+
0.4702380952380952,1840,0.7363839449817713,0.03773584905660377,0.0013557044210700045
|
50 |
+
0.4801587301587302,1910,0.7136122363660038,0.039171452009844135,0.0014072801327411459
|
51 |
+
0.490079365079365,2000,0.6861455175535648,0.04101722723543889,0.0014735917620326134
|
52 |
+
0.5,2102,0.6582912665179754,0.04310910582444627,0.0015487449418962768
|
53 |
+
0.5099206349206349,2189,0.6347704849378184,0.04489335520918786,0.0016128461835446955
|
54 |
+
0.5198412698412699,2275,0.6145070157601543,0.04665709598031173,0.001676210629312098
|
55 |
+
0.5297619047619048,2381,0.5911688811978919,0.04883100902378999,0.0017543109926998264
|
56 |
+
0.5396825396825397,2480,0.569913190896405,0.05086136177194422,0.0018272537849204407
|
57 |
+
0.5496031746031746,2548,0.554468003063123,0.05225594749794914,0.0018773559048295497
|
58 |
+
0.5595238095238095,2651,0.5376141635489305,0.05436833470057424,0.001953245880574229
|
59 |
+
0.5694444444444444,2811,0.5065435623907723,0.05764971287940935,0.0020711332215368385
|
60 |
+
0.5803571428571429,3009,0.4744476851437965,0.0617104183757178,0.002217018805978067
|
61 |
+
0.5902777777777778,3160,0.4526047518884247,0.06480721903199343,0.0023282749840115293
|
62 |
+
0.6001984126984127,3261,0.4398674447582947,0.0668785890073831,0.0024026913679941766
|
63 |
+
0.6101190476190477,3433,0.4182156945481237,0.07040607054963084,0.0025294202595289813
|
64 |
+
0.6200396825396826,3714,0.385699748941216,0.07616899097621001,0.0027364599020945632
|
65 |
+
0.6299603174603174,3846,0.3735030733895079,0.07887612797374897,0.0028337169583887156
|
66 |
+
0.6398809523809523,3906,0.3673857063584148,0.08010664479081214,0.002877924711249694
|
67 |
+
0.6498015873015873,4074,0.3524044985857981,0.083552091878589,0.003001706419260434
|
68 |
+
0.6597222222222222,4190,0.3419197998258842,0.08593109105824447,0.0030871747414583255
|
69 |
+
0.6696428571428571,4365,0.3263783963721026,0.08952009844134537,0.003216114020636179
|
70 |
+
0.6795634920634921,4530,0.3143190758771201,0.09290401968826907,0.0033376853410038696
|
71 |
+
0.689484126984127,4709,0.3015478411563975,0.09657506152584085,0.0034695718037057884
|
72 |
+
0.6994047619047619,4873,0.2918550526925871,0.09993847415914685,0.0035904063281924628
|
73 |
+
0.7093253968253969,5083,0.2786949054424083,0.10424528301886793,0.003745133463205887
|
74 |
+
0.7202380952380952,5257,0.2691313357520127,0.1078137817883511,0.0038733359465027246
|
75 |
+
0.7301587301587301,5642,0.2494124362008614,0.11570959803117309,0.004157002360694002
|
76 |
+
0.7400793650793651,5863,0.2395665365261546,0.12024200164068909,0.004319834250398606
|
77 |
+
0.75,6103,0.2296975500170638,0.12516406890894174,0.00449666526184252
|
78 |
+
0.7599206349206349,6314,0.2220161304406145,0.12949138638228055,0.004652129192736961
|
79 |
+
0.7698412698412699,6505,0.2150225411561583,0.13340853158326496,0.004792857206011076
|
80 |
+
0.7797619047619048,6648,0.20896555791065,0.13634126333059884,0.004898219016996407
|
81 |
+
0.7896825396825397,6925,0.1988488053196262,0.14202214930270712,0.005102311476037924
|
82 |
+
0.7996031746031746,7172,0.1912203515618782,0.14708777686628383,0.0052843000586489525
|
83 |
+
0.8095238095238095,7453,0.1817782046397847,0.152850697292863,0.005491339701214534
|
84 |
+
0.8194444444444444,7671,0.1758490505459375,0.15732157506152583,0.005651961203276089
|
85 |
+
0.8293650793650794,7952,0.1691944806665418,0.163084495488105,0.0058590008458416715
|
86 |
+
0.8392857142857143,8146,0.1645962122201038,0.16706316652994257,0.006001939246758835
|
87 |
+
0.8492063492063492,8428,0.1573623039169127,0.17284659557013946,0.006209715685205433
|
88 |
+
0.8601190476190477,8785,0.1499039332865125,0.1801681706316653,0.006472751814728255
|
89 |
+
0.8700396825396826,9156,0.1425418343797845,0.18777686628383922,0.006746103086585305
|
90 |
+
0.8799603174603174,9485,0.1369052513814404,0.1945242001640689,0.006988508931439669
|
91 |
+
0.8898809523809523,9888,0.1296152659458055,0.20278917145200984,0.0072854376714892415
|
92 |
+
0.8998015873015873,10242,0.1235157570136157,0.21004922067268253,0.007546263413369014
|
93 |
+
0.9097222222222222,10647,0.1169143777742638,0.2183552091878589,0.007844665745180618
|
94 |
+
0.9196428571428572,11156,0.1099605772220808,0.2287940935192781,0.008219694848617919
|
95 |
+
0.929563492063492,11792,0.1021918749641372,0.24183757178014767,0.008688297028944289
|
96 |
+
0.939484126984127,12184,0.0977734981562192,0.24987694831829368,0.008977121014302682
|
97 |
+
0.949404761904762,12755,0.0912248028336146,0.2615873666940115,0.009397831462362992
|
98 |
+
0.9593253968253967,14007,0.079949907336287,0.28726415094339625,0.01032029990539541
|
99 |
+
0.9692460317460316,14613,0.0749577471557994,0.2996923707957342,0.01076679820929129
|
100 |
+
0.9791666666666666,16478,0.0625395687276457,0.33794093519278096,0.012140922527386702
|
101 |
+
0.9890873015873016,20751,0.0435969642897507,0.42557424118129616,0.015289251326969382
|
102 |
+
1.0,48760,0.0098477489215356,1.0,0.035926167158355116
|
metric_analysis/output_standardized/virality_median_retweets_viral_covered_vs_new_tweets_labeled 2.csv
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.003076923076923,5,56750.0,3.719988988832593e-05,3.6839794050815336e-06
|
3 |
+
0.0123076923076923,31,28349.359999999968,0.00023063931730762076,2.284067231150551e-05
|
4 |
+
0.0215384615384615,52,24319.559999999983,0.0003868788548385897,3.8313385812847955e-05
|
5 |
+
0.0307692307692307,65,21221.51999999996,0.0004835985685482371,4.789173226605994e-05
|
6 |
+
0.04,77,19165.07999999996,0.0005728783042802194,5.673328283825562e-05
|
7 |
+
0.0523076923076923,95,17173.200000000004,0.0007067979078781927,6.999560869654915e-05
|
8 |
+
0.0615384615384615,106,16057.56,0.0007886376656325097,7.810036338772852e-05
|
9 |
+
0.0707692307692307,120,14684.879999999996,0.0008927973573198224,8.841550572195681e-05
|
10 |
+
0.08,135,13536.373333333351,0.0010043970269848002,9.946744393720141e-05
|
11 |
+
0.0923076923076923,167,11616.800000000008,0.001242476322270086,0.00012304491212972323
|
12 |
+
0.1015384615384615,179,10952.400000000009,0.0013317560580020683,0.0001318864627019189
|
13 |
+
0.1107692307692307,194,9990.320000000002,0.001443355727667046,0.00014293840091716351
|
14 |
+
0.12,209,9376.240000000002,0.0015549553973320238,0.00015399033913240812
|
15 |
+
0.1323076923076923,233,8752.44,0.0017335148687959883,0.00017167344027679948
|
16 |
+
0.1415384615384615,241,8540.986666666664,0.0017930346926173099,0.00017756780732492992
|
17 |
+
0.1507692307692307,278,7806.799999999999,0.0020683138777909216,0.0002048292549225333
|
18 |
+
0.16,299,7523.599999999994,0.002224553415321891,0.00022030196842387574
|
19 |
+
0.1723076923076923,317,7362.599999999994,0.002358473018919864,0.00023356429428216923
|
20 |
+
0.1815384615384615,338,7073.840000000001,0.002514712556450833,0.0002490370077835117
|
21 |
+
0.1907692307692307,384,6516.36,0.0028569515434234316,0.0002829296183102618
|
22 |
+
0.2,419,6033.56,0.003117350772641713,0.00030871747414583253
|
23 |
+
0.2123076923076923,456,5618.76,0.0033926299578153248,0.0003359789217434359
|
24 |
+
0.2215384615384615,487,5280.888,0.003623269275122946,0.0003588195940549414
|
25 |
+
0.2307692307692307,526,4824.393333333336,0.003913428416251888,0.0003875546334145774
|
26 |
+
0.24,609,4260.660000000002,0.004530946588398099,0.0004487086915389308
|
27 |
+
0.2523076923076923,645,4030.0,0.004798785795594045,0.00047523334325551785
|
28 |
+
0.2615384615384615,719,3691.0899999999992,0.005349344165941269,0.0005297562384507246
|
29 |
+
0.2707692307692307,775,3428.82,0.00576598293269052,0.0005710168077876377
|
30 |
+
0.28,784,3394.6466666666665,0.0058329427344895055,0.0005776479707167845
|
31 |
+
0.2892307692307692,845,3166.7999999999997,0.006286781391127082,0.0006225925194587792
|
32 |
+
0.3015384615384615,878,3054.0666666666657,0.006532300664390033,0.0006469067835323173
|
33 |
+
0.3107692307692307,946,2800.0149999999994,0.007038219166871266,0.0006970089034414262
|
34 |
+
0.32,997,2616.640000000003,0.00741765804373219,0.0007345854933732579
|
35 |
+
0.3292307692307692,1048,2474.848000000001,0.007797096920593115,0.0007721620833050895
|
36 |
+
0.3415384615384615,1125,2290.6666666666665,0.008369975224873335,0.0008288953661433451
|
37 |
+
0.3507692307692308,1152,2251.8,0.008570854630270294,0.0008487888549307854
|
38 |
+
0.36,1195,2140.38,0.008890773683309898,0.0008804710778144866
|
39 |
+
0.3692307692307692,1271,2013.0342857142855,0.009456212009612452,0.0009364675647717259
|
40 |
+
0.3815384615384615,1292,1982.117333333333,0.00961245154714342,0.0009519402782730683
|
41 |
+
0.3907692307692307,1368,1840.1704347826085,0.010177889873445974,0.0010079367652303076
|
42 |
+
0.4,1407,1778.9000000000003,0.010468049014574916,0.0010366718045899436
|
43 |
+
0.4092307692307692,1447,1719.88,0.010765648133681525,0.001066143639830596
|
44 |
+
0.4215384615384615,1607,1516.499999999999,0.011956044610107954,0.0011840309807932049
|
45 |
+
0.4307692307692308,1755,1346.0,0.013057161350802402,0.0012930767711836185
|
46 |
+
0.44,1823,1287.986666666667,0.013563079853283633,0.0013431788910927272
|
47 |
+
0.4492307692307692,1975,1153.6727272727285,0.014693956505888743,0.001455171865007206
|
48 |
+
0.4615384615384615,2071,1087.9872268907568,0.0154081943917446,0.0015259042695847714
|
49 |
+
0.4707692307692308,2231,975.34,0.01659859086817103,0.0016437916105473804
|
50 |
+
0.48,2396,887.333684210527,0.017826187234485785,0.001765362930915071
|
51 |
+
0.4892307692307692,2619,793.9771428571429,0.01948530232350512,0.0019296684123817074
|
52 |
+
0.5015384615384615,2736,753.530612244898,0.02035577974689195,0.002015873530460615
|
53 |
+
0.5107692307692308,2919,700.650976744186,0.02171729571680468,0.0021507071766865993
|
54 |
+
0.52,3127,644.4646666666665,0.023264811136159035,0.0023039607199379915
|
55 |
+
0.5292307692307693,3301,600.444358974359,0.02455936730427278,0.0024321632032348285
|
56 |
+
0.5384615384615384,3471,561.6797385620916,0.02582416356047586,0.002557418503007601
|
57 |
+
0.5507692307692308,4115,451.9967654986525,0.03061550937809224,0.003031915050382102
|
58 |
+
0.56,4518,396.6374331550802,0.033613820503091314,0.003328843790431674
|
59 |
+
0.5692307692307692,4667,379.78126543209873,0.034722377221763426,0.003438626376703104
|
60 |
+
0.5784615384615385,4849,357.453793103448,0.03607645321369849,0.0035727232270480716
|
61 |
+
0.5907692307692308,4972,345.265,0.036991570504951304,0.0036633491204130773
|
62 |
+
0.6,5261,320.21842105263147,0.03914172414049654,0.00387628313002679
|
63 |
+
0.6092307692307692,5681,289.5254531126872,0.04226651489111592,0.004185737400053639
|
64 |
+
0.6184615384615385,6055,263.0394202898551,0.0450490666547627,0.004461299059553737
|
65 |
+
0.6307692307692307,6802,226.3273758865248,0.0506067302040786,0.005011685582672919
|
66 |
+
0.64,7367,201.76998989694889,0.05481031776145943,0.005427975255447132
|
67 |
+
0.6492307692307693,7801,184.9621954484605,0.05803926820376612,0.005747744667808209
|
68 |
+
0.6584615384615384,7985,178.01515789473683,0.05940822415165651,0.00588331510991521
|
69 |
+
0.6707692307692308,8079,175.3568580560256,0.06010758208155704,0.005952573922730743
|
70 |
+
0.68,8443,165.98497588652486,0.06281573406542716,0.006220767623420678
|
71 |
+
0.6892307692307692,9241,144.15345029239765,0.06875283649160399,0.006808730736471691
|
72 |
+
0.6984615384615385,9504,138.80271646859083,0.07070955069972992,0.00700250805317898
|
73 |
+
0.7107692307692308,9775,133.6674772036474,0.07272578473167719,0.0072021797369343984
|
74 |
+
0.72,10400,121.42667780562527,0.07737577096771793,0.00766267716256959
|
75 |
+
0.7292307692307692,10868,113.25071895424836,0.08085768066126524,0.008007497634885221
|
76 |
+
0.7384615384615385,11823,99.04954407294836,0.0879628596299355,0.008711137701255795
|
77 |
+
0.7507692307692307,12267,93.8173076923077,0.09126620985201883,0.009038275072427035
|
78 |
+
0.76,12714,88.54265957446803,0.09459188000803517,0.009367622831241325
|
79 |
+
0.7692307692307693,14247,74.15760000000002,0.1059973662477959,0.010497130916839322
|
80 |
+
0.7784615384615384,14743,70.05540389053742,0.10968759532471783,0.01086258167382341
|
81 |
+
0.7876923076923077,15830,62.71639784946234,0.1177748513864399,0.011663478796488136
|
82 |
+
0.8,16463,58.88816251076612,0.12248435744630196,0.012129870589171459
|
83 |
+
0.8092307692307692,17102,55.05751773049646,0.12723850337403,0.012600683157140878
|
84 |
+
0.8184615384615385,19538,44.345175283888736,0.1453622897276224,0.014395517923296602
|
85 |
+
0.8276923076923077,20576,40.51128376496799,0.15308498686843888,0.015160312047791528
|
86 |
+
0.84,22694,34.49191625472521,0.16884286022513373,0.016720845723784065
|
87 |
+
0.8492307692307692,24366,30.565914650720305,0.18128250340378993,0.01795276843684333
|
88 |
+
0.8584615384615385,27034,25.41807646695045,0.20113236464820064,0.019918539847394837
|
89 |
+
0.8676923076923077,28429,23.0279127002554,0.21151113392704357,0.020946370101412586
|
90 |
+
0.88,32972,17.645820519988817,0.24531095387957652,0.024293633788869666
|
91 |
+
0.8892307692307693,39488,12.639793663256718,0.29378985038204286,0.029094595749571923
|
92 |
+
0.8984615384615384,44033,10.027246243440574,0.32760455029053115,0.03244333302879104
|
93 |
+
0.9076923076923076,46235,9.17001356841133,0.3439873817973499,0.03406575755878894
|
94 |
+
0.92,52119,7.312447220141592,0.38776421221793184,0.03840106452268889
|
95 |
+
0.9292307692307692,61820,5.1008903893765485,0.4599394385792618,0.045548721364428085
|
96 |
+
0.9384615384615383,65714,4.641389084785888,0.48891071282429005,0.048417804525105586
|
97 |
+
0.9476923076923076,77011,3.2563426595787157,0.5729601440379737,0.0567413875929468
|
98 |
+
0.96,85185,2.703150426409592,0.6337745240274089,0.06276395712437409
|
99 |
+
0.9692307692307692,90951,2.2612983428008078,0.6766734370466263,0.06701232217431412
|
100 |
+
0.9784615384615384,100735,1.9992355751561923,0.7494661815801025,0.07422113307417766
|
101 |
+
0.9876923076923076,107699,1.5609094071743144,0.8012781882165628,0.07935217958957522
|
102 |
+
1.0,134409,1.0,1.0,0.09903199757152077
|
metric_analysis/output_standardized/virality_median_retweets_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.0009920634920634,5,56750.0,9.770128418567934e-06,3.6839794050815336e-06
|
3 |
+
0.0109126984126984,68,20769.80999999998,0.0001328737464925239,5.010211990910886e-05
|
4 |
+
0.0208333333333333,108,15734.440000000006,0.00021103477384106737,7.957395514976113e-05
|
5 |
+
0.0307539682539682,172,11420.029999999995,0.0003360924175987369,0.00012672889153480477
|
6 |
+
0.0406746031746031,219,9203.879999999965,0.0004279316247332755,0.00016135829794257118
|
7 |
+
0.050595238095238,284,7694.966666666665,0.0005549432941746586,0.00020925003020863113
|
8 |
+
0.060515873015873,370,6665.859999999988,0.0007229895029740271,0.0002726144759760335
|
9 |
+
0.0704365079365079,476,5388.830000000004,0.0009301162254476673,0.000350714839363762
|
10 |
+
0.0803571428571428,643,4035.7200000000007,0.0012564385146278364,0.00047375975149348525
|
11 |
+
0.0902777777777777,783,3395.8991666666666,0.0015300021103477385,0.0005769111748357682
|
12 |
+
0.1001984126984127,947,2795.887500000001,0.0018504623224767666,0.0006977456993224426
|
13 |
+
0.1101190476190476,1129,2285.583333333333,0.0022060949969126393,0.0008318425496674104
|
14 |
+
0.1200396825396825,1274,2010.7097142857144,0.0024894287210511096,0.0009386779524147748
|
15 |
+
0.1299603174603174,1422,1754.7858823529411,0.0027786245222407202,0.0010477237428051882
|
16 |
+
0.1398809523809523,1789,1316.0649999999996,0.0034957519481636067,0.0013181278311381727
|
17 |
+
0.1507936507936507,2189,1004.0008333333316,0.004277362221649041,0.0016128461835446955
|
18 |
+
0.1607142857142857,2717,759.2236734693877,0.005309087782649815,0.0020018744087213053
|
19 |
+
0.1706349206349206,3285,605.9448076923071,0.0064189743709991325,0.0024203744691385677
|
20 |
+
0.1805555555555555,4507,397.9200534759351,0.008806793756497136,0.0033207390357404944
|
21 |
+
0.1904761904761904,4979,344.9141220238097,0.009729093879209949,0.0036685066915801913
|
22 |
+
0.2003968253968254,6185,256.6727272727276,0.012085648853768534,0.004557082524085858
|
23 |
+
0.2103174603174603,7881,181.9363148148153,0.015399676413346776,0.0058066883382895135
|
24 |
+
0.2202380952380952,8532,163.15705,0.016671747133444322,0.00628634245683113
|
25 |
+
0.2301587301587301,9916,130.25747244296335,0.019376118679703926,0.007306067956157698
|
26 |
+
0.2400793650793651,12130,95.4236079560604,0.023702331543445806,0.008937334036727801
|
27 |
+
0.25,14577,71.5297497155859,0.028483832391492953,0.010740273557574703
|
28 |
+
0.2599206349206349,17026,55.781783181357575,0.033269241290907525,0.012544686670183639
|
29 |
+
0.2698412698412698,22643,34.6215403148756,0.04424500355632675,0.016683269133852235
|
30 |
+
0.2797619047619047,28429,23.02300286355545,0.05555099616229356,0.020946370101412586
|
31 |
+
0.2906746031746032,45428,9.636950619740496,0.08876747875974082,0.03347116328280878
|
32 |
+
0.3005952380952381,61961,5.027143821742062,0.12107338538857755,0.04565260958365138
|
33 |
+
0.310515873015873,86051,2.6456794555995926,0.16814586410923785,0.06340202235733422
|
34 |
+
0.3224206349206349,134409,1.0,0.26263863812225946,0.09903199757152077
|
35 |
+
1.0,511764,0.0,1.0,0.3770656072524292
|
metric_analysis/output_standardized/virality_retweet_percentile_per_user_viral_covered_vs_new_tweets_labeled.csv
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tpr,new_tweets,threshold,fpr,fpr2
|
2 |
+
0.503968253968254,814,1.0,0.0005997518471472737,0.0005997518471472737
|
3 |
+
0.816468253968254,20605,0.99,0.015181679128341001,0.015181679128341001
|
4 |
+
0.876984126984127,46379,0.98,0.03417185616565529,0.03417185616565529
|
5 |
+
0.9067460317460316,61903,0.97,0.04560987542255244,0.04560987542255244
|
6 |
+
0.9206349206349206,92538,0.96,0.068181617237487,0.068181617237487
|
7 |
+
0.9305555555555556,117906,0.95,0.08687265514710867,0.08687265514710867
|
8 |
+
0.9424603174603174,160095,0.94,0.11795733657130564,0.11795733657130564
|
9 |
+
0.9523809523809524,203539,0.93,0.14996669682617805,0.14996669682617805
|
10 |
+
0.9583333333333334,238956,0.92,0.1760617965441326,0.1760617965441326
|
11 |
+
0.9662698412698412,272234,0.91,0.20058088987259326,0.20058088987259326
|
12 |
+
0.9742063492063492,310028,0.9,0.22842735339972356,0.22842735339972356
|
13 |
+
0.9771825396825397,339615,0.89,0.250226933131353,0.250226933131353
|
14 |
+
0.9791666666666666,375135,0.88,0.2763979228250522,0.2763979228250522
|
15 |
+
0.9831349206349206,422834,0.87,0.31154234955364907,0.31154234955364907
|
16 |
+
0.9851190476190476,462519,0.86,0.3407820940917812,0.3407820940917812
|
17 |
+
0.9861111111111112,490552,0.85,0.3614366930243113,0.3614366930243113
|
18 |
+
0.988095238095238,528701,0.84,0.3895447190892024,0.3895447190892024
|
19 |
+
0.9910714285714286,574204,0.83,0.42307114206308744,0.42307114206308744
|
20 |
+
0.9930555555555556,610579,0.82,0.4498720922350556,0.4498720922350556
|
21 |
+
0.9940476190476192,646190,0.81,0.4761101303539273,0.4761101303539273
|
22 |
+
0.9950396825396826,822465,0.75,0.6059888242800767,0.6059888242800767
|
23 |
+
0.996031746031746,937666,0.71,0.6908684465690363,0.6908684465690363
|
24 |
+
0.9970238095238096,1029756,0.66,0.758719979251828,0.758719979251828
|
25 |
+
0.9990079365079364,1072643,0.64,0.7903189442009744,0.7903189442009744
|
26 |
+
1.0,1174373,0.57,0.8652731891767632,0.8652731891767632
|
metric_analysis/twitter_viral_model.ipynb
ADDED
@@ -0,0 +1,2303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"# Comparison of Twitter's model of viral tweets with other tweets"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "markdown",
|
12 |
+
"metadata": {},
|
13 |
+
"source": [
|
14 |
+
"In this notebook, we try to identify common features of Twitter's identified viral tweets found on the topic page \"Viral Tweets\".\n",
|
15 |
+
"\n",
|
16 |
+
"We also experiment to find if other tweets that have not figured on that topic page, can also be labeled as viral based on these common features. This should help homogeinize the data (those that are viral and those that are not) when training the model."
|
17 |
+
]
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"cell_type": "code",
|
21 |
+
"execution_count": null,
|
22 |
+
"metadata": {},
|
23 |
+
"outputs": [],
|
24 |
+
"source": [
|
25 |
+
"import pandas as pd\n",
|
26 |
+
"import seaborn as sns\n",
|
27 |
+
"import numpy as np\n",
|
28 |
+
"\n",
|
29 |
+
"import matplotlib.pyplot as plt\n",
|
30 |
+
"%matplotlib inline\n",
|
31 |
+
"\n",
|
32 |
+
"import plotly.express as px\n",
|
33 |
+
"import plotly.graph_objects as go\n",
|
34 |
+
"from plotly.subplots import make_subplots\n",
|
35 |
+
"\n",
|
36 |
+
"from helper.text_preprocessing import clear_reply_mentions\n",
|
37 |
+
"\n",
|
38 |
+
"from tqdm import tqdm\n",
|
39 |
+
"\n",
|
40 |
+
"#pd.set_option('display.max_rows', None)\n",
|
41 |
+
"pd.set_option('display.max_columns', None)\n",
|
42 |
+
"\n",
|
43 |
+
"DATA_PATH = \"../../data\"\n",
|
44 |
+
"VIRAL_TWEETS_PATH = f\"{DATA_PATH}/new/viral\"\n",
|
45 |
+
"COVID_TWEETS_PATH = f\"{DATA_PATH}/new/covid\"\n",
|
46 |
+
"\n",
|
47 |
+
"PROCESSED_PATH_VIRAL = f'{DATA_PATH}/new/processed/viral'\n",
|
48 |
+
"PROCESSED_PATH_COVID = f'{DATA_PATH}/new/processed/covid'"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "markdown",
|
53 |
+
"metadata": {},
|
54 |
+
"source": [
|
55 |
+
"## 0. Preprocessing"
|
56 |
+
]
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"cell_type": "code",
|
60 |
+
"execution_count": null,
|
61 |
+
"metadata": {},
|
62 |
+
"outputs": [],
|
63 |
+
"source": [
|
64 |
+
"viral_dataset = pd.read_parquet(f\"{VIRAL_TWEETS_PATH}/all_tweets.parquet.gzip\")"
|
65 |
+
]
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"cell_type": "code",
|
69 |
+
"execution_count": null,
|
70 |
+
"metadata": {},
|
71 |
+
"outputs": [],
|
72 |
+
"source": [
|
73 |
+
"covid_dataset = pd.read_parquet(f\"{COVID_TWEETS_PATH}/all_tweets.parquet.gzip\")"
|
74 |
+
]
|
75 |
+
},
|
76 |
+
{
|
77 |
+
"cell_type": "code",
|
78 |
+
"execution_count": null,
|
79 |
+
"metadata": {},
|
80 |
+
"outputs": [],
|
81 |
+
"source": [
|
82 |
+
"covid_users = pd.read_parquet(f\"{COVID_TWEETS_PATH}/users.parquet.gzip\")"
|
83 |
+
]
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"cell_type": "markdown",
|
87 |
+
"metadata": {},
|
88 |
+
"source": [
|
89 |
+
"- Keep only original tweets from **covid dataset**. Viral dataset doesn't have retweets"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"cell_type": "code",
|
94 |
+
"execution_count": null,
|
95 |
+
"metadata": {},
|
96 |
+
"outputs": [],
|
97 |
+
"source": [
|
98 |
+
"def is_retweeted(referenced_tweets):\n",
|
99 |
+
" for x in referenced_tweets:\n",
|
100 |
+
" if x['type'] == 'retweeted':\n",
|
101 |
+
" return True\n",
|
102 |
+
" return False\n",
|
103 |
+
"\n",
|
104 |
+
"# Keep only original tweets\n",
|
105 |
+
"referenced = covid_dataset.loc[~covid_dataset.referenced_tweets.isna()].copy()\n",
|
106 |
+
"referenced.loc[:, 'is_retweet'] = referenced.referenced_tweets.apply(is_retweeted)\n",
|
107 |
+
"retweeted = referenced[referenced.is_retweet]\n",
|
108 |
+
"retweeted"
|
109 |
+
]
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"cell_type": "code",
|
113 |
+
"execution_count": null,
|
114 |
+
"metadata": {},
|
115 |
+
"outputs": [],
|
116 |
+
"source": [
|
117 |
+
"original_covid_tweets = covid_dataset[~covid_dataset.id.isin(retweeted.id)]\n",
|
118 |
+
"original_covid_tweets.to_parquet(f\"{COVID_TWEETS_PATH}/all_original_tweets.parquet.gzip\", index=False, compression=\"gzip\")"
|
119 |
+
]
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"cell_type": "code",
|
123 |
+
"execution_count": null,
|
124 |
+
"metadata": {},
|
125 |
+
"outputs": [],
|
126 |
+
"source": [
|
127 |
+
"# Clear reply mentions at the beginning of tweets texts\n",
|
128 |
+
"original_covid_tweets.loc[:, \"text\"] = original_covid_tweets.text.apply(clear_reply_mentions)\n",
|
129 |
+
"viral_dataset.loc[:, \"text\"] = viral_dataset.text.apply(clear_reply_mentions)"
|
130 |
+
]
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"cell_type": "markdown",
|
134 |
+
"metadata": {},
|
135 |
+
"source": [
|
136 |
+
"## 1. Exploration"
|
137 |
+
]
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"cell_type": "markdown",
|
141 |
+
"metadata": {},
|
142 |
+
"source": [
|
143 |
+
"### 1.1 - General Exploration"
|
144 |
+
]
|
145 |
+
},
|
146 |
+
{
|
147 |
+
"cell_type": "code",
|
148 |
+
"execution_count": null,
|
149 |
+
"metadata": {},
|
150 |
+
"outputs": [],
|
151 |
+
"source": [
|
152 |
+
"viral_dataset = pd.read_parquet(f\"{VIRAL_TWEETS_PATH}/all_tweets.parquet.gzip\")\n",
|
153 |
+
"viral_users = pd.read_parquet(f\"{VIRAL_TWEETS_PATH}/users.parquet.gzip\")\n",
|
154 |
+
"viral_tweets_ids = pd.read_parquet(f\"{VIRAL_TWEETS_PATH}/viral_tweets_ids.parquet.gzip\")"
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "code",
|
159 |
+
"execution_count": null,
|
160 |
+
"metadata": {},
|
161 |
+
"outputs": [],
|
162 |
+
"source": [
|
163 |
+
"original_covid_tweets = pd.read_parquet(f\"{COVID_TWEETS_PATH}/all_original_tweets.parquet.gzip\")\n",
|
164 |
+
"covid_users = pd.read_parquet(f\"{COVID_TWEETS_PATH}/users.parquet.gzip\")"
|
165 |
+
]
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"cell_type": "code",
|
169 |
+
"execution_count": null,
|
170 |
+
"metadata": {},
|
171 |
+
"outputs": [],
|
172 |
+
"source": [
|
173 |
+
"display(\"--- VIRAL DATASET ---\")\n",
|
174 |
+
"\n",
|
175 |
+
"display(f\"{len(viral_tweets_ids)} viral tweets collected\")\n",
|
176 |
+
"display(f\"{len(viral_users)} viral users\")\n",
|
177 |
+
"display(f\"{len(viral_dataset)} all tweets collected\")\n",
|
178 |
+
"\n",
|
179 |
+
"display(\"--- COVID DATASET ---\")\n",
|
180 |
+
"\n",
|
181 |
+
"display(f\"{len(original_covid_tweets)} original (not retweeted) covid tweets collected\")\n",
|
182 |
+
"display(f\"{len(original_covid_tweets.author_id.unique())} covid users collected\")"
|
183 |
+
]
|
184 |
+
},
|
185 |
+
{
|
186 |
+
"cell_type": "code",
|
187 |
+
"execution_count": null,
|
188 |
+
"metadata": {},
|
189 |
+
"outputs": [],
|
190 |
+
"source": [
|
191 |
+
"# REMOVE THIS WHEN DONE COLLECTION (WARNING NOT NECESSARILY)\n",
|
192 |
+
"viral_dataset['viral'] = viral_dataset.id.isin(viral_tweets_ids.id)\n",
|
193 |
+
"\n",
|
194 |
+
"#viral_tweets = all_tweets[all_tweets.id.isin(viral_tweets.id)]\n",
|
195 |
+
"#viral_tweets\n",
|
196 |
+
"\n",
|
197 |
+
"len(viral_dataset[viral_dataset.viral])"
|
198 |
+
]
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"cell_type": "markdown",
|
202 |
+
"metadata": {},
|
203 |
+
"source": [
|
204 |
+
"- merge tweets with user info"
|
205 |
+
]
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"cell_type": "code",
|
209 |
+
"execution_count": null,
|
210 |
+
"metadata": {},
|
211 |
+
"outputs": [],
|
212 |
+
"source": [
|
213 |
+
"covid_users.columns"
|
214 |
+
]
|
215 |
+
},
|
216 |
+
{
|
217 |
+
"cell_type": "code",
|
218 |
+
"execution_count": null,
|
219 |
+
"metadata": {},
|
220 |
+
"outputs": [],
|
221 |
+
"source": [
|
222 |
+
"user_columns = ['author_id', 'followers_count', 'following_count', 'tweet_count', 'protected', 'verified', 'username']\n",
|
223 |
+
"viral_dataset_with_users = viral_dataset.merge(viral_users.rename(columns={'id': 'author_id'})[user_columns], on='author_id')\n",
|
224 |
+
"covid_dataset_with_users = original_covid_tweets.merge(covid_users.rename(columns={'id': 'author_id'})[user_columns], on='author_id')"
|
225 |
+
]
|
226 |
+
},
|
227 |
+
{
|
228 |
+
"cell_type": "markdown",
|
229 |
+
"metadata": {},
|
230 |
+
"source": [
|
231 |
+
"#### 1.1.1 - Correlation between public metrics"
|
232 |
+
]
|
233 |
+
},
|
234 |
+
{
|
235 |
+
"cell_type": "markdown",
|
236 |
+
"metadata": {},
|
237 |
+
"source": [
|
238 |
+
"- Pearson Correlation between the different public metrics"
|
239 |
+
]
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": null,
|
244 |
+
"metadata": {},
|
245 |
+
"outputs": [],
|
246 |
+
"source": [
|
247 |
+
"public_metrics = ['retweet_count', 'like_count', 'reply_count', 'quote_count', 'followers_count', 'following_count']\n",
|
248 |
+
"display(viral_dataset_with_users[public_metrics].corr())\n",
|
249 |
+
"display(covid_dataset_with_users[public_metrics].corr())"
|
250 |
+
]
|
251 |
+
},
|
252 |
+
{
|
253 |
+
"cell_type": "code",
|
254 |
+
"execution_count": null,
|
255 |
+
"metadata": {},
|
256 |
+
"outputs": [],
|
257 |
+
"source": [
|
258 |
+
"px.scatter(viral_dataset, x='like_count', y='retweet_count')"
|
259 |
+
]
|
260 |
+
},
|
261 |
+
{
|
262 |
+
"cell_type": "markdown",
|
263 |
+
"metadata": {},
|
264 |
+
"source": [
|
265 |
+
"#### 1.1.2 - Exploring retweet count of viral vs non viral tweets"
|
266 |
+
]
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"cell_type": "markdown",
|
270 |
+
"metadata": {},
|
271 |
+
"source": [
|
272 |
+
"Since we have a large number of tweets to plot, we'll only sample a few from each user"
|
273 |
+
]
|
274 |
+
},
|
275 |
+
{
|
276 |
+
"cell_type": "code",
|
277 |
+
"execution_count": null,
|
278 |
+
"metadata": {},
|
279 |
+
"outputs": [],
|
280 |
+
"source": [
|
281 |
+
"def get_largest_n(all_tweets, by='retweet_count', n=100):\n",
|
282 |
+
" '''Get the largest 100 tweets by retweet count for every user\n",
|
283 |
+
" '''\n",
|
284 |
+
" top_n_per_user = all_tweets.groupby(by='author_id')[by].nlargest(n=100).reset_index(level=0, drop=True)\n",
|
285 |
+
" tweets_for_plot = all_tweets[all_tweets.index.isin(top_n_per_user.index)].reset_index()\n",
|
286 |
+
" return tweets_for_plot"
|
287 |
+
]
|
288 |
+
},
|
289 |
+
{
|
290 |
+
"cell_type": "code",
|
291 |
+
"execution_count": null,
|
292 |
+
"metadata": {},
|
293 |
+
"outputs": [],
|
294 |
+
"source": [
|
295 |
+
"tweets_plot_df = get_largest_n(viral_dataset, by='retweet_count')\n",
|
296 |
+
"fig = px.scatter(tweets_plot_df, x=tweets_plot_df.index, y='retweet_count', color='viral')\n",
|
297 |
+
"\n",
|
298 |
+
"fig.update_layout(title_text=\"Viral Dataset Scatter plot of the retweet count for the top 100 tweets per user\", xaxis_title=\"Index\", yaxis_title=\"retweet count\")\n",
|
299 |
+
"\n",
|
300 |
+
"fig.show()"
|
301 |
+
]
|
302 |
+
},
|
303 |
+
{
|
304 |
+
"cell_type": "code",
|
305 |
+
"execution_count": null,
|
306 |
+
"metadata": {},
|
307 |
+
"outputs": [],
|
308 |
+
"source": [
|
309 |
+
"covid_tweets_plot_df = original_covid_tweets.sort_values(by='retweet_count', ascending=False)[:10000]\n",
|
310 |
+
"fig = px.scatter(covid_tweets_plot_df, x=covid_tweets_plot_df.reset_index().index, y='retweet_count')\n",
|
311 |
+
"\n",
|
312 |
+
"fig.update_layout(title_text=\"Covid Dataset Scatter plot of retweet count sorted by retweet count on a 10000 sample\", xaxis_title=\"Index\", yaxis_title=\"retweet count\")\n",
|
313 |
+
"\n",
|
314 |
+
"fig.show()"
|
315 |
+
]
|
316 |
+
},
|
317 |
+
{
|
318 |
+
"cell_type": "markdown",
|
319 |
+
"metadata": {},
|
320 |
+
"source": [
|
321 |
+
"**Finding**: Viral tweets identified by twitter are by no means more viral than other tweets tweeted by the same users. Are users who have tweeted viral tweets (as identified by Twitter) likely to have tweeted other viral tweets?"
|
322 |
+
]
|
323 |
+
},
|
324 |
+
{
|
325 |
+
"cell_type": "code",
|
326 |
+
"execution_count": null,
|
327 |
+
"metadata": {},
|
328 |
+
"outputs": [],
|
329 |
+
"source": [
|
330 |
+
"# Get the ratio for each tweet's retweet count wrt to the mean retweet count of the user's tweets\n",
|
331 |
+
"# Again since we're retrieved 3200 tweets per user, we're only taking the average over that\n",
|
332 |
+
"users_avg_retweets = viral_dataset.groupby(by='author_id').agg(mean_retweets=('retweet_count', 'mean'))\n",
|
333 |
+
"tweets_merged_avg_retweets = viral_dataset.merge(right=users_avg_retweets, left_on='author_id', right_index=True)\n",
|
334 |
+
"tweets_merged_avg_retweets['ratio_avg_retweets'] = tweets_merged_avg_retweets['retweet_count'] / tweets_merged_avg_retweets['mean_retweets']\n",
|
335 |
+
"tweets_merged_avg_retweets_sorted = tweets_merged_avg_retweets.sort_values(by='ratio_avg_retweets').reset_index()"
|
336 |
+
]
|
337 |
+
},
|
338 |
+
{
|
339 |
+
"cell_type": "code",
|
340 |
+
"execution_count": null,
|
341 |
+
"metadata": {},
|
342 |
+
"outputs": [],
|
343 |
+
"source": [
|
344 |
+
"tweets_plot_df = get_largest_n(tweets_merged_avg_retweets_sorted, by='ratio_avg_retweets')\n",
|
345 |
+
"\n",
|
346 |
+
"fig = px.scatter(tweets_plot_df, x=tweets_plot_df.index, y='ratio_avg_retweets', color='viral')\n",
|
347 |
+
"\n",
|
348 |
+
"fig.update_layout(title_text=\"Scatter plot of the tweets sorted by the ratio #retweets/(the mean user avg #retweets)\", xaxis_title=\"Index\", yaxis_title=\"ratio\")\n",
|
349 |
+
"\n",
|
350 |
+
"fig.show()"
|
351 |
+
]
|
352 |
+
},
|
353 |
+
{
|
354 |
+
"cell_type": "markdown",
|
355 |
+
"metadata": {},
|
356 |
+
"source": [
|
357 |
+
"**Finding**: Cleaner separation. Viral tweets, as expected, are on the other end of the spectrum. However other tweets in the same range could qualify as viral as well. These tweets should be identified as viral by the Twitter model."
|
358 |
+
]
|
359 |
+
},
|
360 |
+
{
|
361 |
+
"cell_type": "markdown",
|
362 |
+
"metadata": {},
|
363 |
+
"source": [
|
364 |
+
"### 1.2 Finding the right threshold for virality"
|
365 |
+
]
|
366 |
+
},
|
367 |
+
{
|
368 |
+
"cell_type": "markdown",
|
369 |
+
"metadata": {},
|
370 |
+
"source": [
|
371 |
+
"#### 1.2.0 - Relabel viral tweets in the viral dataset by correcting the initial virality threshold (ONLY IN OLD PAPER SUBMITTED BY STUDENT)"
|
372 |
+
]
|
373 |
+
},
|
374 |
+
{
|
375 |
+
"cell_type": "markdown",
|
376 |
+
"metadata": {},
|
377 |
+
"source": [
|
378 |
+
"Let's observe the retweet count of a user based on the tweet date."
|
379 |
+
]
|
380 |
+
},
|
381 |
+
{
|
382 |
+
"cell_type": "code",
|
383 |
+
"execution_count": null,
|
384 |
+
"metadata": {},
|
385 |
+
"outputs": [],
|
386 |
+
"source": [
|
387 |
+
"sample_user = viral_users.id[10]\n",
|
388 |
+
"author_tweets = viral_dataset[viral_dataset.author_id == sample_user]\n",
|
389 |
+
"fig = px.scatter(author_tweets, x='created_at', y='retweet_count', color='viral')\n",
|
390 |
+
"\n",
|
391 |
+
"fig.update_layout(title_text=\"Scatter plot of the retweet count wrt to the tweet date for a single user\")\n",
|
392 |
+
"\n",
|
393 |
+
"fig.show() "
|
394 |
+
]
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"cell_type": "markdown",
|
398 |
+
"metadata": {},
|
399 |
+
"source": [
|
400 |
+
"**Finding**: The above graph of a user's retweet count wrt the tweet date, shows that the viral tweets taken from the Twitter \"Viral Tweets\" topic page, have been taken at certain points in time. **Other tweets with higher retweet counts** may have been on that Topic page at different points in time as well. In any case, they **should be qualified as viral all the same**."
|
401 |
+
]
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"cell_type": "markdown",
|
405 |
+
"metadata": {},
|
406 |
+
"source": [
|
407 |
+
"One quick fix for that is, for each user, mark as viral all tweets that have higher retweet count than the viral tweet we scraped for that user. "
|
408 |
+
]
|
409 |
+
},
|
410 |
+
{
|
411 |
+
"cell_type": "code",
|
412 |
+
"execution_count": null,
|
413 |
+
"metadata": {},
|
414 |
+
"outputs": [],
|
415 |
+
"source": [
|
416 |
+
"# Get the minimum retweet count out of the viral tweets for each user\n",
|
417 |
+
"min_retweet_count_by_user = viral_dataset[viral_dataset.viral].groupby(by='author_id')[['retweet_count']].min()\n",
|
418 |
+
"\n",
|
419 |
+
"# Set as viral any tweet that has a retweet count higher or equal to the user's minimum retweet count we just computed\n",
|
420 |
+
"viral_dataset_labeled = viral_dataset.merge(min_retweet_count_by_user, left_on='author_id', right_index=True, suffixes=(None, \"_user_viral_threshold\"))\n",
|
421 |
+
"viral_dataset_labeled['viral'] = viral_dataset_labeled['retweet_count'] >= viral_dataset_labeled['retweet_count_user_viral_threshold']"
|
422 |
+
]
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"cell_type": "code",
|
426 |
+
"execution_count": null,
|
427 |
+
"metadata": {},
|
428 |
+
"outputs": [],
|
429 |
+
"source": [
|
430 |
+
"# Save this result \n",
|
431 |
+
"#viral_dataset_labeled.to_parquet(f'{PROCESSED_PATH_VIRAL}/all_tweets.parquet.gzip', compression='gzip')"
|
432 |
+
]
|
433 |
+
},
|
434 |
+
{
|
435 |
+
"cell_type": "code",
|
436 |
+
"execution_count": null,
|
437 |
+
"metadata": {},
|
438 |
+
"outputs": [],
|
439 |
+
"source": [
|
440 |
+
"display(f\"Number of identified viral tweets increased from {len(viral_tweets_ids)} to {len(viral_dataset_labeled[viral_dataset_labeled.viral])}\")"
|
441 |
+
]
|
442 |
+
},
|
443 |
+
{
|
444 |
+
"cell_type": "markdown",
|
445 |
+
"metadata": {},
|
446 |
+
"source": [
|
447 |
+
"Another problem we're facing is that we're **missing historical data** on the number of followers of a user. So we cannot use the metric of:\n",
|
448 |
+
"$ \\frac{\\#retweets}{\\#followers}$ effectively. That's why we came up with the other metric: $\\frac{\\#retweets}{mean(\\#retweets)}$."
|
449 |
+
]
|
450 |
+
},
|
451 |
+
{
|
452 |
+
"cell_type": "markdown",
|
453 |
+
"metadata": {},
|
454 |
+
"source": [
|
455 |
+
"#### 1.2.1 Applying the virality followers metric to both datasets"
|
456 |
+
]
|
457 |
+
},
|
458 |
+
{
|
459 |
+
"cell_type": "code",
|
460 |
+
"execution_count": null,
|
461 |
+
"metadata": {},
|
462 |
+
"outputs": [],
|
463 |
+
"source": [
|
464 |
+
"# Applying the first metric on the covid dataset\n",
|
465 |
+
"covid_dataset_with_users['virality_followers'] = covid_dataset_with_users['retweet_count'] / covid_dataset_with_users['followers_count'].astype(\"float64\")\n",
|
466 |
+
"# Handle division by zero if user has 0 followers\n",
|
467 |
+
"covid_dataset_with_users['virality_followers'] = covid_dataset_with_users.virality_followers.replace({np.inf: 0.0})"
|
468 |
+
]
|
469 |
+
},
|
470 |
+
{
|
471 |
+
"cell_type": "code",
|
472 |
+
"execution_count": null,
|
473 |
+
"metadata": {},
|
474 |
+
"outputs": [],
|
475 |
+
"source": [
|
476 |
+
"len(covid_dataset_with_users[(covid_dataset_with_users['virality_followers'] > 0.8)])"
|
477 |
+
]
|
478 |
+
},
|
479 |
+
{
|
480 |
+
"cell_type": "code",
|
481 |
+
"execution_count": null,
|
482 |
+
"metadata": {},
|
483 |
+
"outputs": [],
|
484 |
+
"source": [
|
485 |
+
"# Applying the second metric on the viral dataset\n",
|
486 |
+
"viral_dataset_with_users['virality_followers'] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['followers_count'].astype(\"float64\")\n",
|
487 |
+
"# Handle division by zero if user has 0 followers\n",
|
488 |
+
"viral_dataset_with_users['virality_followers'] = viral_dataset_with_users.virality_followers.replace({np.inf: 0.0})"
|
489 |
+
]
|
490 |
+
},
|
491 |
+
{
|
492 |
+
"cell_type": "code",
|
493 |
+
"execution_count": null,
|
494 |
+
"metadata": {},
|
495 |
+
"outputs": [],
|
496 |
+
"source": [
|
497 |
+
"len(viral_dataset_with_users[(viral_dataset_with_users['virality_followers'] > 1)])"
|
498 |
+
]
|
499 |
+
},
|
500 |
+
{
|
501 |
+
"cell_type": "markdown",
|
502 |
+
"metadata": {},
|
503 |
+
"source": [
|
504 |
+
"#### 1.2.2 Applying the virality avg retweets metric to viral dataset "
|
505 |
+
]
|
506 |
+
},
|
507 |
+
{
|
508 |
+
"cell_type": "code",
|
509 |
+
"execution_count": null,
|
510 |
+
"metadata": {},
|
511 |
+
"outputs": [],
|
512 |
+
"source": [
|
513 |
+
"viral_users_retweet_statistics = viral_dataset_with_users.groupby(by='author_id').retweet_count.agg(['min', 'mean', 'max'])\n",
|
514 |
+
"viral_users_retweet_statistics = viral_users_retweet_statistics.rename(columns={\"min\": \"min_user_retweets\", \"max\": \"max_user_retweets\", \"mean\": \"mean_user_retweets\"})"
|
515 |
+
]
|
516 |
+
},
|
517 |
+
{
|
518 |
+
"cell_type": "code",
|
519 |
+
"execution_count": null,
|
520 |
+
"metadata": {},
|
521 |
+
"outputs": [],
|
522 |
+
"source": [
|
523 |
+
"viral_dataset_with_users = viral_dataset_with_users.merge(viral_users_retweet_statistics, on='author_id')"
|
524 |
+
]
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"cell_type": "code",
|
528 |
+
"execution_count": null,
|
529 |
+
"metadata": {},
|
530 |
+
"outputs": [],
|
531 |
+
"source": [
|
532 |
+
"# Applying Metric 2 (retweet_count / mean retweet count of the author) on the viral dataset\n",
|
533 |
+
"viral_dataset_with_users['virality_avg_retweets'] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['mean_user_retweets'].astype(\"float64\")\n",
|
534 |
+
"# Handle division by zero if the user's mean retweet count is 0 (only inf is replaced; a 0/0 result is NaN and is left as is)\n",
|
535 |
+
"viral_dataset_with_users['virality_avg_retweets'] = viral_dataset_with_users.virality_avg_retweets.replace({np.inf: 0.0})"
|
536 |
+
]
|
537 |
+
},
|
538 |
+
{
|
539 |
+
"cell_type": "code",
|
540 |
+
"execution_count": null,
|
541 |
+
"metadata": {},
|
542 |
+
"outputs": [],
|
543 |
+
"source": [
|
544 |
+
"len(viral_dataset_with_users[(viral_dataset_with_users['virality_avg_retweets'] > 1)])"
|
545 |
+
]
|
546 |
+
},
|
547 |
+
{
|
548 |
+
"cell_type": "markdown",
|
549 |
+
"metadata": {},
|
550 |
+
"source": [
|
551 |
+
"#### 1.2.3 How many tweets are covered by metric 1?"
|
552 |
+
]
|
553 |
+
},
|
554 |
+
{
|
555 |
+
"cell_type": "code",
|
556 |
+
"execution_count": null,
|
557 |
+
"metadata": {},
|
558 |
+
"outputs": [],
|
559 |
+
"source": [
|
560 |
+
"temp = viral_dataset_with_users[viral_dataset_with_users.virality_followers > 0]\n",
|
561 |
+
"temp_2 = viral_dataset_with_users[viral_dataset_with_users.virality_avg_retweets > 0]\n",
|
562 |
+
"viral_temp = viral_dataset_with_users[viral_dataset_with_users.viral]"
|
563 |
+
]
|
564 |
+
},
|
565 |
+
{
|
566 |
+
"cell_type": "code",
|
567 |
+
"execution_count": null,
|
568 |
+
"metadata": {},
|
569 |
+
"outputs": [],
|
570 |
+
"source": [
|
571 |
+
"fig = px.ecdf(viral_dataset_with_users[viral_dataset_with_users.viral], x='virality_followers')\n",
|
572 |
+
"\n",
|
573 |
+
"# TODO: percentage y axis\n",
|
574 |
+
"# TODO: Only take the scraped tweets\n",
|
575 |
+
"fig.update_layout(title_text=\"Percentage of viral tweets recognized by Metric 1: number of followers\", xaxis_title=\"Metric 1: virality_followers\", yaxis_title=\"Percentage\")\n",
|
576 |
+
"\n",
|
577 |
+
"fig.show()"
|
578 |
+
]
|
579 |
+
},
|
580 |
+
{
|
581 |
+
"cell_type": "code",
|
582 |
+
"execution_count": null,
|
583 |
+
"metadata": {},
|
584 |
+
"outputs": [],
|
585 |
+
"source": [
|
586 |
+
"fig1 = sns.displot(temp, x='virality_followers', kind='ecdf')\n",
|
587 |
+
"\n",
|
588 |
+
"plt.xscale('log')\n",
|
589 |
+
"plt.title(\"Proportion of tweets labeled as viral as function of Metric 1: number of followers (logscale)\")"
|
590 |
+
]
|
591 |
+
},
|
592 |
+
{
|
593 |
+
"cell_type": "code",
|
594 |
+
"execution_count": null,
|
595 |
+
"metadata": {},
|
596 |
+
"outputs": [],
|
597 |
+
"source": [
|
598 |
+
"temp_2 = viral_dataset_with_users[viral_dataset_with_users.virality_avg_retweets > 0]"
|
599 |
+
]
|
600 |
+
},
|
601 |
+
{
|
602 |
+
"cell_type": "code",
|
603 |
+
"execution_count": null,
|
604 |
+
"metadata": {},
|
605 |
+
"outputs": [],
|
606 |
+
"source": [
|
607 |
+
"fig = px.ecdf(viral_temp, x='virality_avg_retweets')\n",
|
608 |
+
"\n",
|
609 |
+
"fig.update_layout(title_text=\"Percentage of viral tweets recognized by Metric 2 avg retweets\", xaxis_title=\"Metric 2: avg retweets\", yaxis_title=\"Percentage\")\n",
|
610 |
+
"\n",
|
611 |
+
"fig.show()"
|
612 |
+
]
|
613 |
+
},
|
614 |
+
{
|
615 |
+
"cell_type": "code",
|
616 |
+
"execution_count": null,
|
617 |
+
"metadata": {},
|
618 |
+
"outputs": [],
|
619 |
+
"source": [
|
620 |
+
"fig = sns.displot(temp_2, x='virality_avg_retweets', kind='ecdf')\n",
|
621 |
+
"\n",
|
622 |
+
"plt.xscale('log')\n",
|
623 |
+
"plt.title(\"Proportion of tweets labeled as viral as function of Metric 2: avg retweets (logscale)\")"
|
624 |
+
]
|
625 |
+
},
|
626 |
+
{
|
627 |
+
"cell_type": "markdown",
|
628 |
+
"metadata": {},
|
629 |
+
"source": [
|
630 |
+
"TODO: Plot the percentage of viral tweets labeled vs. the number of new tweets labeled as the threshold of the chosen metric varies."
|
631 |
+
]
|
632 |
+
},
|
633 |
+
{
|
634 |
+
"cell_type": "code",
|
635 |
+
"execution_count": null,
|
636 |
+
"metadata": {},
|
637 |
+
"outputs": [],
|
638 |
+
"source": [
|
639 |
+
"#viral_dataset_with_users = viral_dataset_with_users.groupby(by='virality_followers').count()\n",
|
640 |
+
"viral_dataset_with_users = pd.read_parquet(f\"{PROCESSED_PATH_VIRAL}/all_tweets.parquet.gzip\")\n",
|
641 |
+
"# Applying Metric 1 (retweet_count / followers_count) on the viral dataset\n",
|
642 |
+
"viral_dataset_with_users['virality_followers'] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['followers_count'].astype(\"float64\")\n",
|
643 |
+
"# Handle division by zero if user has 0 followers\n",
|
644 |
+
"viral_dataset_with_users['virality_followers'] = viral_dataset_with_users.virality_followers.replace({np.inf: 0.0})\n"
|
645 |
+
]
|
646 |
+
},
|
647 |
+
{
|
648 |
+
"cell_type": "code",
|
649 |
+
"execution_count": null,
|
650 |
+
"metadata": {},
|
651 |
+
"outputs": [],
|
652 |
+
"source": [
|
653 |
+
"viral_dataset_with_users"
|
654 |
+
]
|
655 |
+
},
|
656 |
+
{
|
657 |
+
"cell_type": "code",
|
658 |
+
"execution_count": null,
|
659 |
+
"metadata": {},
|
660 |
+
"outputs": [],
|
661 |
+
"source": [
|
662 |
+
"viral_dataset_with_users_truncated = viral_dataset_with_users[viral_dataset_with_users.virality_followers > 0.1]\n",
|
663 |
+
"#viral_dataset_with_users['viral_metric_1'] = viral_dataset_with_users['']\n",
|
664 |
+
"len(viral_dataset_with_users_truncated)"
|
665 |
+
]
|
666 |
+
},
|
667 |
+
{
|
668 |
+
"cell_type": "code",
|
669 |
+
"execution_count": null,
|
670 |
+
"metadata": {},
|
671 |
+
"outputs": [],
|
672 |
+
"source": [
|
673 |
+
"ready_to_plot = viral_dataset_with_users_truncated.copy()\n",
|
674 |
+
"ready_to_plot['viral'] = ready_to_plot['viral'].replace({False: None})\n",
|
675 |
+
"ready_to_plot = ready_to_plot.groupby(by='virality_followers').count()[['text', 'viral']].cumsum().rename(columns={'text':'tweets'})"
|
676 |
+
]
|
677 |
+
},
|
678 |
+
{
|
679 |
+
"cell_type": "code",
|
680 |
+
"execution_count": null,
|
681 |
+
"metadata": {},
|
682 |
+
"outputs": [],
|
683 |
+
"source": [
|
684 |
+
"fig = px.line(ready_to_plot, x='viral', y='tweets', hover_data=[ready_to_plot.index])#, log_y=True)\n",
|
685 |
+
"\n",
|
686 |
+
"fig.update_layout(title_text=\"Line plot of #viral tweets labeled as viral vs # new tweets labeled as viral by varying threshold of Metric 1 (#followers)\", xaxis_title=\"Number of viral tweets labeled as viral\", yaxis_title=\"Number of new tweets labeled as viral\")\n",
|
687 |
+
"fig.show()"
|
688 |
+
]
|
689 |
+
},
|
690 |
+
{
|
691 |
+
"cell_type": "code",
|
692 |
+
"execution_count": null,
|
693 |
+
"metadata": {},
|
694 |
+
"outputs": [],
|
695 |
+
"source": [
|
696 |
+
"ready_to_plot = viral_dataset_with_users_truncated.copy()\n",
|
697 |
+
"ready_to_plot['viral'] = ready_to_plot['viral'].replace({False: None})\n",
|
698 |
+
"ready_to_plot = ready_to_plot.groupby(by='virality_followers').count()[['text', 'viral']].cumsum().rename(columns={'text':'tweets'})\n",
|
699 |
+
"ready_to_plot['tweets'] = len(viral_dataset_with_users) - ready_to_plot.tweets"
|
700 |
+
]
|
701 |
+
},
|
702 |
+
{
|
703 |
+
"cell_type": "code",
|
704 |
+
"execution_count": null,
|
705 |
+
"metadata": {},
|
706 |
+
"outputs": [],
|
707 |
+
"source": [
|
708 |
+
"fig = px.line(ready_to_plot, x='viral', y='tweets', hover_data=[ready_to_plot.index])#, log_y=True)\n",
|
709 |
+
"\n",
|
710 |
+
"fig.update_layout(title_text=\"Line plot of #viral tweets labeled as viral vs # new tweets labeled as viral by varying threshold of Metric 1 (#followers)\", xaxis_title=\"Number of viral tweets labeled as viral\", yaxis_title=\"Number of new tweets labeled as viral\")\n",
|
711 |
+
"fig.show()"
|
712 |
+
]
|
713 |
+
},
|
714 |
+
{
|
715 |
+
"cell_type": "code",
|
716 |
+
"execution_count": null,
|
717 |
+
"metadata": {},
|
718 |
+
"outputs": [],
|
719 |
+
"source": [
|
720 |
+
"'''\n",
|
721 |
+
"tempo3 = tempo2.copy()\n",
|
722 |
+
"tempo3['viral'] = tempo3['viral'].replace({False: None})\n",
|
723 |
+
"tempo3 = tempo3.groupby(by='virality_followers').count()[['text', 'viral']].rename(columns={'text':'tweets'})\n",
|
724 |
+
"tempo3['viral_cumsum'] = tempo3.viral.cumsum()\n",
|
725 |
+
"tempo3\n",
|
726 |
+
"'''"
|
727 |
+
]
|
728 |
+
},
|
729 |
+
{
|
730 |
+
"cell_type": "code",
|
731 |
+
"execution_count": null,
|
732 |
+
"metadata": {},
|
733 |
+
"outputs": [],
|
734 |
+
"source": [
|
735 |
+
"min_threshold = viral_dataset_with_users.virality_followers.min()\n",
|
736 |
+
"max_threshold = viral_dataset_with_users.virality_followers.max()\n",
|
737 |
+
"display(f\"sampling from {min_threshold} to {max_threshold}\")\n",
|
738 |
+
"thresholds_space = np.linspace(min_threshold, max_threshold, num=10000)\n",
|
739 |
+
"\n",
|
740 |
+
"number_of_viral_tweets = len(viral_dataset_with_users[viral_dataset_with_users.viral]) \n",
|
741 |
+
"\n",
|
742 |
+
"percentages_of_viral_covered = []\n",
|
743 |
+
"nb_of_tweets_labeled_as_viral = []\n",
|
744 |
+
"\n",
|
745 |
+
"for i in thresholds_space:\n",
|
746 |
+
" new_tweets_labeled = viral_dataset_with_users[viral_dataset_with_users.virality_followers >= i]\n",
|
747 |
+
" percentage_of_viral_covered = len(new_tweets_labeled[new_tweets_labeled.viral]) / number_of_viral_tweets\n",
|
748 |
+
" nb_of_tweets_labeled_as_viral.append(len(new_tweets_labeled))\n",
|
749 |
+
" percentages_of_viral_covered.append(percentage_of_viral_covered)"
|
750 |
+
]
|
751 |
+
},
|
752 |
+
{
|
753 |
+
"cell_type": "code",
|
754 |
+
"execution_count": null,
|
755 |
+
"metadata": {},
|
756 |
+
"outputs": [],
|
757 |
+
"source": [
|
758 |
+
"result_to_plot = pd.DataFrame({'percentage_of_viral_covered':percentages_of_viral_covered, 'nb_of_tweets_labeled_as_viral':nb_of_tweets_labeled_as_viral, 'thresholds': thresholds_space})\n",
|
759 |
+
"\n",
|
760 |
+
"px.scatter(\n",
|
761 |
+
" result_to_plot,\n",
|
762 |
+
" x='percentage_of_viral_covered',\n",
|
763 |
+
" y='nb_of_tweets_labeled_as_viral', log_y=True, hover_name='thresholds')"
|
764 |
+
]
|
765 |
+
},
|
766 |
+
{
|
767 |
+
"cell_type": "code",
|
768 |
+
"execution_count": null,
|
769 |
+
"metadata": {},
|
770 |
+
"outputs": [],
|
771 |
+
"source": [
|
772 |
+
"result_to_plot.to_csv('new_tweets_labeled_vs_percentage_of_viral.csv', index=False)"
|
773 |
+
]
|
774 |
+
},
|
775 |
+
{
|
776 |
+
"cell_type": "markdown",
|
777 |
+
"metadata": {},
|
778 |
+
"source": [
|
779 |
+
"#### 1.2.4 Comparing several metrics wrt distributions of viral tweets covered"
|
780 |
+
]
|
781 |
+
},
|
782 |
+
{
|
783 |
+
"cell_type": "code",
|
784 |
+
"execution_count": null,
|
785 |
+
"metadata": {},
|
786 |
+
"outputs": [],
|
787 |
+
"source": [
|
788 |
+
"def plot_distribution_for_metric(\n",
|
789 |
+
" df, metric='virality_followers', num_experiments=1000, generate_thresholds_from_viral_quantiles=True, min_threshold=None, max_threshold=None, remove_duplicates=True, output_filename=None):\n",
|
790 |
+
" viral_tweets = df[df.viral]\n",
|
791 |
+
" number_of_viral_tweets = len(viral_tweets)\n",
|
792 |
+
" \n",
|
793 |
+
" if not generate_thresholds_from_viral_quantiles: \n",
|
794 |
+
" # If not, generate a linear space of the thresholds between min and max of the metric values\n",
|
795 |
+
" if not min_threshold:\n",
|
796 |
+
" min_threshold = df[metric].min()\n",
|
797 |
+
" if not max_threshold:\n",
|
798 |
+
" max_threshold = df[metric].max()\n",
|
799 |
+
" display(f\"sampling from {min_threshold} to {max_threshold}\")\n",
|
800 |
+
" thresholds_space = np.linspace(min_threshold, max_threshold, num=num_experiments)\n",
|
801 |
+
" else:\n",
|
802 |
+
" # Take quantiles of metric for different percentages of viral tweets covered (from 0 to 100)\n",
|
803 |
+
" thresholds_space = viral_tweets[metric].quantile([i / 100 for i in range(101)]) \n",
|
804 |
+
" display(f\"sampling from {thresholds_space.min()} to {thresholds_space.max()}\")\n",
|
805 |
+
"\n",
|
806 |
+
" percentages_of_viral_covered = []\n",
|
807 |
+
" nb_of_tweets_labeled_as_viral = []\n",
|
808 |
+
"\n",
|
809 |
+
" for i in thresholds_space:\n",
|
810 |
+
" new_tweets_labeled = df[df[metric] >= i]\n",
|
811 |
+
" percentage_of_viral_covered = len(new_tweets_labeled[new_tweets_labeled.viral]) / number_of_viral_tweets\n",
|
812 |
+
" nb_of_tweets_labeled_as_viral.append(len(new_tweets_labeled))\n",
|
813 |
+
" percentages_of_viral_covered.append(percentage_of_viral_covered)\n",
|
814 |
+
" \n",
|
815 |
+
" results_to_plot = pd.DataFrame({\n",
|
816 |
+
" f'percentage_of_viral_covered_{metric}':percentages_of_viral_covered,\n",
|
817 |
+
" f'nb_of_tweets_labeled_as_viral_{metric}':nb_of_tweets_labeled_as_viral,\n",
|
818 |
+
" f'thresholds_{metric}': thresholds_space})\n",
|
819 |
+
"\n",
|
820 |
+
" #if remove_duplicates:\n",
|
821 |
+
" # results_to_plot = results_to_plot.sort_values(by='nb_of_tweets_labeled_as_viral').drop_duplicates(subset=['percentage_of_viral_covered'], keep='first')\n",
|
822 |
+
"\n",
|
823 |
+
" # Discard rows where 100% of viral tweets are covered\n",
|
824 |
+
" #results_to_plot = results_to_plot[results_to_plot.percentage_of_viral_covered < 1.0]\n",
|
825 |
+
" # TODO: take min of 100% coverage\n",
|
826 |
+
"\n",
|
827 |
+
" fig = px.scatter(\n",
|
828 |
+
" results_to_plot,\n",
|
829 |
+
" x=f'percentage_of_viral_covered_{metric}',\n",
|
830 |
+
" y=f'nb_of_tweets_labeled_as_viral_{metric}', hover_name=f'thresholds_{metric}')#log_y=True, trendline='ols' \n",
|
831 |
+
"\n",
|
832 |
+
" fig.update_layout(title_text=f\"Percentage of viral covered vs new tweets labeled as viral according to varying metric {metric}\")\n",
|
833 |
+
" fig.show()\n",
|
834 |
+
"\n",
|
835 |
+
" display(f\"Result length {len(results_to_plot)}\")\n",
|
836 |
+
" if not output_filename:\n",
|
837 |
+
" output_filename = metric\n",
|
838 |
+
" results_to_plot.to_csv(f'{output_filename}_viral_covered_vs_new_tweets_labeled.csv', index=False) \n",
|
839 |
+
" \n",
|
840 |
+
" return results_to_plot"
|
841 |
+
]
|
842 |
+
},
|
843 |
+
{
|
844 |
+
"cell_type": "code",
|
845 |
+
"execution_count": null,
|
846 |
+
"metadata": {},
|
847 |
+
"outputs": [],
|
848 |
+
"source": [
|
849 |
+
"#viral_dataset_with_users = viral_dataset_with_users.groupby(by='virality_followers').count()\n",
|
850 |
+
"METRIC_1 = 'virality_followers'\n",
|
851 |
+
"viral_dataset_with_users = pd.read_parquet(f\"{PROCESSED_PATH_VIRAL}/all_tweets.parquet.gzip\")\n",
|
852 |
+
"# Applying Metric 1 (retweet_count / followers_count) on the viral dataset\n",
|
853 |
+
"viral_dataset_with_users[METRIC_1] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['followers_count'].astype(\"float64\")\n",
|
854 |
+
"# Handle division by zero if user has 0 followers\n",
|
855 |
+
"viral_dataset_with_users[METRIC_1] = viral_dataset_with_users[METRIC_1].replace({np.inf: 0.0})"
|
856 |
+
]
|
857 |
+
},
|
858 |
+
{
|
859 |
+
"cell_type": "code",
|
860 |
+
"execution_count": null,
|
861 |
+
"metadata": {},
|
862 |
+
"outputs": [],
|
863 |
+
"source": [
|
864 |
+
"df_1 = plot_distribution_for_metric(viral_dataset_with_users, metric='virality_followers', num_experiments=10000)"
|
865 |
+
]
|
866 |
+
},
|
867 |
+
{
|
868 |
+
"cell_type": "code",
|
869 |
+
"execution_count": null,
|
870 |
+
"metadata": {},
|
871 |
+
"outputs": [],
|
872 |
+
"source": [
|
873 |
+
"# Metric 2: retweet / user avg retweets\n",
|
874 |
+
"METRIC_2 = 'virality_avg_retweets'\n",
|
875 |
+
"viral_users_retweet_statistics = viral_dataset_with_users.groupby(by='author_id').retweet_count.agg(['min', 'mean', 'max', 'median'])\n",
|
876 |
+
"viral_users_retweet_statistics = viral_users_retweet_statistics.rename(columns={\n",
|
877 |
+
" \"min\": \"min_user_retweets\", \"max\": \"max_user_retweets\", \"mean\": \"mean_user_retweets\", \"median\": \"median_user_retweets\"})\n",
|
878 |
+
"\n",
|
879 |
+
"viral_dataset_with_users = viral_dataset_with_users.merge(viral_users_retweet_statistics, on='author_id')\n",
|
880 |
+
"\n",
|
881 |
+
"viral_dataset_with_users[METRIC_2] = viral_dataset_with_users['retweet_count'] / viral_dataset_with_users['mean_user_retweets'].astype(\"float64\")\n",
|
882 |
+
"# Handle division by zero if the user's mean retweet count is 0 (only inf is replaced; a 0/0 result is NaN and is left as is)\n",
|
883 |
+
"viral_dataset_with_users[METRIC_2] = viral_dataset_with_users[METRIC_2].replace({np.inf: 0.0})"
|
884 |
+
]
|
885 |
+
},
|
886 |
+
{
|
887 |
+
"cell_type": "code",
|
888 |
+
"execution_count": null,
|
889 |
+
"metadata": {},
|
890 |
+
"outputs": [],
|
891 |
+
"source": [
|
892 |
+
"df_2 = plot_distribution_for_metric(viral_dataset_with_users, metric='virality_avg_retweets', num_experiments=10000)"
|
893 |
+
]
|
894 |
+
},
|
895 |
+
{
|
896 |
+
"cell_type": "code",
|
897 |
+
"execution_count": null,
|
898 |
+
"metadata": {},
|
899 |
+
"outputs": [],
|
900 |
+
"source": [
|
901 |
+
"# Metric 3: Minimum retweet count (Hard threshold)\n",
|
902 |
+
"METRIC_3 = 'retweet_count'\n",
|
903 |
+
"\n",
|
904 |
+
"viral_tweets = viral_dataset_with_users[viral_dataset_with_users.viral]\n",
|
905 |
+
"min_viral_retweet_count = viral_tweets.retweet_count.min()\n",
|
906 |
+
"max_viral_retweet_count = viral_tweets.retweet_count.max()\n",
|
907 |
+
"\n",
|
908 |
+
"df_3 = plot_distribution_for_metric(\n",
|
909 |
+
" viral_dataset_with_users, metric=METRIC_3, num_experiments=10000,\n",
|
910 |
+
" min_threshold=min_viral_retweet_count, max_threshold=max_viral_retweet_count, generate_thresholds_from_viral_quantiles=False,\n",
|
911 |
+
" output_filename='hard_threshold')"
|
912 |
+
]
|
913 |
+
},
|
914 |
+
{
|
915 |
+
"cell_type": "code",
|
916 |
+
"execution_count": null,
|
917 |
+
"metadata": {},
|
918 |
+
"outputs": [],
|
919 |
+
"source": [
|
920 |
+
"# Metric 4 from Maldonado paper 'Virality Prediction for News Tweets Using RoBERTa'\n",
|
921 |
+
"def roberta_paper_metric(x):\n",
|
922 |
+
" g = x['retweet_count'] + x['like_count']\n",
|
923 |
+
" h = x['followers_count'] - x['following_count']\n",
|
924 |
+
" A = 10\n",
|
925 |
+
"\n",
|
926 |
+
" r = max(x['retweet_count'], 1)\n",
|
927 |
+
" f = max(x['like_count'], 1)\n",
|
928 |
+
" w = max(x['followers_count'], 1)\n",
|
929 |
+
" d = max(x['following_count'], 1)\n",
|
930 |
+
" h = max(h, 1)\n",
|
931 |
+
"\n",
|
932 |
+
" num = g * d * (A * r + f)\n",
|
933 |
+
" denom = w * r * (A * d + h)\n",
|
934 |
+
" #if denom == 0:\n",
|
935 |
+
" # return 0\n",
|
936 |
+
" return num / denom"
|
937 |
+
]
|
938 |
+
},
|
939 |
+
{
|
940 |
+
"cell_type": "code",
|
941 |
+
"execution_count": null,
|
942 |
+
"metadata": {},
|
943 |
+
"outputs": [],
|
944 |
+
"source": [
|
945 |
+
"METRIC_4 = 'roberta_paper_metric'\n",
|
946 |
+
"viral_dataset_with_users[METRIC_4] = viral_dataset_with_users.apply(lambda x: roberta_paper_metric(x), axis='columns')\n",
|
947 |
+
"\n",
|
948 |
+
"df_4 = plot_distribution_for_metric(\n",
|
949 |
+
" viral_dataset_with_users, metric=METRIC_4, num_experiments=100000)"
|
950 |
+
]
|
951 |
+
},
|
952 |
+
{
|
953 |
+
"cell_type": "code",
|
954 |
+
"execution_count": null,
|
955 |
+
"metadata": {},
|
956 |
+
"outputs": [],
|
957 |
+
"source": [
|
958 |
+
"METRIC_5 = 'virality_retweet_percentile_per_user'\n",
|
959 |
+
"\n",
|
960 |
+
"# Take only tweets with positive retweet count, otherwise the quantiles will be very heavy-tailed\n",
|
961 |
+
"#tweets_with_retweets = viral_dataset_with_users[viral_dataset_with_users.retweet_count > 0]\n",
|
962 |
+
"\n",
|
963 |
+
"viral_tweets = viral_dataset_with_users[viral_dataset_with_users.viral]\n",
|
964 |
+
"percentiles = [i/100 for i in range(101)]\n",
|
965 |
+
"number_of_viral_tweets = len(viral_tweets)\n",
|
966 |
+
"\n",
|
967 |
+
"percentages_of_viral_covered = []\n",
|
968 |
+
"nb_of_tweets_labeled_as_viral = []\n",
|
969 |
+
"\n",
|
970 |
+
"for i in tqdm(percentiles):\n",
|
971 |
+
" temp = viral_dataset_with_users.groupby(by='author_id')[['retweet_count']].quantile(i).rename(columns={'retweet_count': f'percentile_{i}'})\n",
|
972 |
+
" temp = viral_dataset_with_users.merge(temp, on='author_id')\n",
|
973 |
+
"\n",
|
974 |
+
" new_tweets_labeled = temp[temp['retweet_count'] >= temp[f'percentile_{i}']]\n",
|
975 |
+
" percentage_of_viral_covered = len(new_tweets_labeled[new_tweets_labeled.viral]) / number_of_viral_tweets\n",
|
976 |
+
" nb_of_tweets_labeled_as_viral.append(len(new_tweets_labeled))\n",
|
977 |
+
" percentages_of_viral_covered.append(percentage_of_viral_covered)\n",
|
978 |
+
"\n",
|
979 |
+
"df_5 = pd.DataFrame({\n",
|
980 |
+
" f'percentage_of_viral_covered_{METRIC_5}':percentages_of_viral_covered,\n",
|
981 |
+
" f'nb_of_tweets_labeled_as_viral_{METRIC_5}':nb_of_tweets_labeled_as_viral,\n",
|
982 |
+
" f'thresholds_{METRIC_5}': percentiles})\n",
|
983 |
+
"\n",
|
984 |
+
"fig = px.scatter(\n",
|
985 |
+
" df_5,\n",
|
986 |
+
" x=f'percentage_of_viral_covered_{METRIC_5}',\n",
|
987 |
+
" y=f'nb_of_tweets_labeled_as_viral_{METRIC_5}', hover_name=f'thresholds_{METRIC_5}')#log_y=True, trendline='ols' \n",
|
988 |
+
"\n",
|
989 |
+
"fig.update_layout(title_text=f\"Percentage of viral covered vs new tweets labeled as viral according to varying metric {METRIC_5}\")\n",
|
990 |
+
"fig.show()\n",
|
991 |
+
"\n",
|
992 |
+
"display(f\"Result length {len(df_5)}\")\n",
|
993 |
+
"df_5.to_csv(f'{METRIC_5}_viral_covered_vs_new_tweets_labeled.csv', index=False)"
|
994 |
+
]
|
995 |
+
},
|
996 |
+
{
|
997 |
+
"cell_type": "code",
|
998 |
+
"execution_count": null,
|
999 |
+
"metadata": {},
|
1000 |
+
"outputs": [],
|
1001 |
+
"source": [
|
1002 |
+
"# Metric 6: Median\n",
|
1003 |
+
"METRIC_6 = 'virality_median_retweets'\n",
|
1004 |
+
"\n",
|
1005 |
+
"positive_median_dataset = viral_dataset_with_users[viral_dataset_with_users['median_user_retweets'] > 0].copy()\n",
|
1006 |
+
"positive_median_dataset.loc[:, METRIC_6] = positive_median_dataset['retweet_count'] / positive_median_dataset['median_user_retweets'].astype(\"float64\")\n",
|
1007 |
+
"# Defensive cleanup: rows with a zero median were already filtered out above, so inf/NaN are not expected here\n",
|
1008 |
+
"positive_median_dataset.loc[:, METRIC_6] = positive_median_dataset[METRIC_6].replace({np.inf: 0.0, np.nan:0.0})"
|
1009 |
+
]
|
1010 |
+
},
|
1011 |
+
{
|
1012 |
+
"cell_type": "code",
|
1013 |
+
"execution_count": null,
|
1014 |
+
"metadata": {},
|
1015 |
+
"outputs": [],
|
1016 |
+
"source": [
|
1017 |
+
"df_6 = plot_distribution_for_metric(\n",
|
1018 |
+
" positive_median_dataset, metric=METRIC_6, num_experiments=10000, remove_duplicates=True)"
|
1019 |
+
]
|
1020 |
+
},
|
1021 |
+
{
|
1022 |
+
"cell_type": "code",
|
1023 |
+
"execution_count": null,
|
1024 |
+
"metadata": {},
|
1025 |
+
"outputs": [],
|
1026 |
+
"source": [
|
1027 |
+
"# log(retweet_counts) / followers_count\n",
|
1028 |
+
"METRIC_7 = 'log_retweets_over_followers'\n",
|
1029 |
+
"\n",
|
1030 |
+
"positive_retweet_and_follower_count = viral_dataset_with_users[(viral_dataset_with_users.retweet_count > 0) & (viral_dataset_with_users.followers_count > 0)].copy()\n",
|
1031 |
+
"\n",
|
1032 |
+
"positive_retweet_and_follower_count.loc[:, METRIC_7] = (np.log(positive_retweet_and_follower_count['retweet_count']) / positive_retweet_and_follower_count['followers_count']).astype(\"float64\")\n",
|
1033 |
+
"positive_retweet_and_follower_count.loc[:, METRIC_7] = positive_retweet_and_follower_count[METRIC_7].replace({np.inf: 0.0, np.nan:0.0})\n",
|
1034 |
+
"\n",
|
1035 |
+
"df_7 = plot_distribution_for_metric(\n",
|
1036 |
+
" positive_retweet_and_follower_count, metric=METRIC_7, num_experiments=10000, remove_duplicates=True)"
|
1037 |
+
]
|
1038 |
+
},
|
1039 |
+
{
|
1040 |
+
"cell_type": "code",
|
1041 |
+
"execution_count": null,
|
1042 |
+
"metadata": {},
|
1043 |
+
"outputs": [],
|
1044 |
+
"source": [
|
1045 |
+
"METRIC_8 = 'retweets_over_log_followers'\n",
|
1046 |
+
"\n",
|
1047 |
+
"positive_retweet_and_follower_count.loc[:, METRIC_8] = (positive_retweet_and_follower_count['retweet_count'] / np.log(positive_retweet_and_follower_count['followers_count'])).astype(\"float64\")\n",
|
1048 |
+
"positive_retweet_and_follower_count.loc[:, METRIC_8] = positive_retweet_and_follower_count[METRIC_8].replace({np.inf: 0.0, np.nan:0.0})\n",
|
1049 |
+
"\n",
|
1050 |
+
"df_8 = plot_distribution_for_metric(\n",
|
1051 |
+
" positive_retweet_and_follower_count, metric=METRIC_8, num_experiments=10000, remove_duplicates=True)"
|
1052 |
+
]
|
1053 |
+
},
|
1054 |
+
{
|
1055 |
+
"cell_type": "code",
|
1056 |
+
"execution_count": null,
|
1057 |
+
"metadata": {},
|
1058 |
+
"outputs": [],
|
1059 |
+
"source": [
|
1060 |
+
"METRIC_9 = 'log_retweets_over_log_followers'\n",
|
1061 |
+
"\n",
|
1062 |
+
"positive_retweet_and_follower_count.loc[:, METRIC_9] = (np.log(positive_retweet_and_follower_count['retweet_count']) / np.log(positive_retweet_and_follower_count['followers_count'])).astype(\"float64\")\n",
|
1063 |
+
"positive_retweet_and_follower_count.loc[:, METRIC_9] = positive_retweet_and_follower_count[METRIC_9].replace({np.inf: 0.0, np.nan:0.0})\n",
|
1064 |
+
"\n",
|
1065 |
+
"df_9 = plot_distribution_for_metric(\n",
|
1066 |
+
" positive_retweet_and_follower_count, metric=METRIC_9, num_experiments=10000, remove_duplicates=True)"
|
1067 |
+
]
|
1068 |
+
},
|
1069 |
+
{
|
1070 |
+
"cell_type": "code",
|
1071 |
+
"execution_count": null,
|
1072 |
+
"metadata": {},
|
1073 |
+
"outputs": [],
|
1074 |
+
"source": [
|
1075 |
+
"final_result = pd.concat([df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9], axis=1)\n",
|
1076 |
+
"final_result.to_csv('final_result_viral_coverage.csv')"
|
1077 |
+
]
|
1078 |
+
},
|
1079 |
+
{
|
1080 |
+
"cell_type": "markdown",
|
1081 |
+
"metadata": {},
|
1082 |
+
"source": [
|
1083 |
+
"### 1.3 Viral Dataset Exploration: Comparison between viral and non viral tweets using other features "
|
1084 |
+
]
|
1085 |
+
},
|
1086 |
+
{
|
1087 |
+
"cell_type": "code",
|
1088 |
+
"execution_count": null,
|
1089 |
+
"metadata": {},
|
1090 |
+
"outputs": [],
|
1091 |
+
"source": [
|
1092 |
+
"# TODO: Only take viral tweets from scraped. Since sentiment is already computed on the other dataset, we relabel dataset viral by checking if in scraped ids \n",
|
1093 |
+
"# (DONE)"
|
1094 |
+
]
|
1095 |
+
},
|
1096 |
+
{
|
1097 |
+
"cell_type": "code",
|
1098 |
+
"execution_count": null,
|
1099 |
+
"metadata": {},
|
1100 |
+
"outputs": [],
|
1101 |
+
"source": [
|
1102 |
+
"viral_dataset_labeled = pd.read_parquet(f'{PROCESSED_PATH_VIRAL}/all_tweets.parquet.gzip')"
|
1103 |
+
]
|
1104 |
+
},
|
1105 |
+
{
|
1106 |
+
"cell_type": "code",
|
1107 |
+
"execution_count": null,
|
1108 |
+
"metadata": {},
|
1109 |
+
"outputs": [],
|
1110 |
+
"source": [
|
1111 |
+
"display(f\"{len(viral_dataset_labeled[viral_dataset_labeled.viral])} viral tweets out of {len(viral_dataset_labeled)}\")"
|
1112 |
+
]
|
1113 |
+
},
|
1114 |
+
{
|
1115 |
+
"cell_type": "markdown",
|
1116 |
+
"metadata": {},
|
1117 |
+
"source": [
|
1118 |
+
"#### 1.3.1 - Language"
|
1119 |
+
]
|
1120 |
+
},
|
1121 |
+
{
|
1122 |
+
"cell_type": "code",
|
1123 |
+
"execution_count": null,
|
1124 |
+
"metadata": {},
|
1125 |
+
"outputs": [],
|
1126 |
+
"source": [
|
1127 |
+
"languages_aggregates = viral_dataset_labeled.groupby(by='lang', as_index=False)[['id']].count().rename(columns={'id': 'count'})\n",
|
1128 |
+
"languages_aggregates = languages_aggregates.sort_values(by='count', ascending=False)\n",
|
1129 |
+
"languages_aggregates.loc[languages_aggregates['count'] < 10000, 'lang'] = 'Other Languages'\n",
|
1130 |
+
"fig = px.pie(languages_aggregates, values='count', names='lang', title='Distribution of Tweets languages')\n",
|
1131 |
+
"\n",
|
1132 |
+
"fig.update_layout(\n",
|
1133 |
+
" autosize=False,\n",
|
1134 |
+
" width=500,\n",
|
1135 |
+
" height=500\n",
|
1136 |
+
")"
|
1137 |
+
]
|
1138 |
+
},
|
1139 |
+
{
|
1140 |
+
"cell_type": "code",
|
1141 |
+
"execution_count": null,
|
1142 |
+
"metadata": {},
|
1143 |
+
"outputs": [],
|
1144 |
+
"source": [
|
1145 |
+
"pd.crosstab(index = viral_dataset_labeled['lang'] == 'en', columns=viral_dataset_labeled['viral']) "
|
1146 |
+
]
|
1147 |
+
},
|
1148 |
+
{
|
1149 |
+
"cell_type": "markdown",
|
1150 |
+
"metadata": {},
|
1151 |
+
"source": [
|
1152 |
+
"#### 1.3.2 - Media"
|
1153 |
+
]
|
1154 |
+
},
|
1155 |
+
{
|
1156 |
+
"cell_type": "code",
|
1157 |
+
"execution_count": null,
|
1158 |
+
"metadata": {},
|
1159 |
+
"outputs": [],
|
1160 |
+
"source": [
|
1161 |
+
"# Has media\n",
|
1162 |
+
"labels = [\"Media\", \"No Media\"]\n",
|
1163 |
+
"viral_has_media = len(viral_dataset_labeled[(viral_dataset_labeled.viral == True) & (viral_dataset_labeled.has_media == True)])\n",
|
1164 |
+
"viral_no_media = len(viral_dataset_labeled[(viral_dataset_labeled.viral == True) & (viral_dataset_labeled.has_media == False)])\n",
|
1165 |
+
"normal_has_media = len(viral_dataset_labeled[(viral_dataset_labeled.viral == False) & (viral_dataset_labeled.has_media == True)])\n",
|
1166 |
+
"normal_no_media = len(viral_dataset_labeled[(viral_dataset_labeled.viral == False) & (viral_dataset_labeled.has_media == False)])\n",
|
1167 |
+
"\n",
|
1168 |
+
"\n",
|
1169 |
+
"# Create subplots: use 'domain' type for Pie subplot\n",
|
1170 |
+
"fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
|
1171 |
+
"fig.add_trace(go.Pie(labels=labels, values=[viral_has_media, viral_no_media], name=\"Viral with Media\"),\n",
|
1172 |
+
" 1, 1)\n",
|
1173 |
+
"fig.add_trace(go.Pie(labels=labels, values=[normal_has_media, normal_no_media], name=\"Tweet with Media\"),\n",
|
1174 |
+
" 1, 2)\n",
|
1175 |
+
"\n",
|
1176 |
+
"# Use `hole` to create a donut-like pie chart\n",
|
1177 |
+
"fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
|
1178 |
+
"\n",
|
1179 |
+
"fig.update_layout(\n",
|
1180 |
+
" width=1000,\n",
|
1181 |
+
" height=500,\n",
|
1182 |
+
" title_text=\"Percentage of tweets with some kind of media\",\n",
|
1183 |
+
" # Add annotations in the center of the donut pies.\n",
|
1184 |
+
" annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
|
1185 |
+
" dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
|
1186 |
+
"fig.show()"
|
1187 |
+
]
|
1188 |
+
},
|
1189 |
+
{
|
1190 |
+
"cell_type": "markdown",
|
1191 |
+
"metadata": {},
|
1192 |
+
"source": [
|
1193 |
+
"Calculating the p-value of a chi-squared test of independence between the target `viral` and `has_media`\n"
|
1194 |
+
]
|
1195 |
+
},
|
1196 |
+
{
|
1197 |
+
"cell_type": "code",
|
1198 |
+
"execution_count": null,
|
1199 |
+
"metadata": {},
|
1200 |
+
"outputs": [],
|
1201 |
+
"source": [
|
1202 |
+
"from scipy.stats import chi2_contingency \n",
|
1203 |
+
"\n",
|
1204 |
+
"# Calculating the p-value\n",
|
1205 |
+
"contingency_media = pd.crosstab(index = viral_dataset_labeled['has_media'], columns=viral_dataset_labeled['viral']) \n",
|
1206 |
+
"display(contingency_media)\n",
|
1207 |
+
"# Display with percentages\n",
|
1208 |
+
"display(pd.crosstab(index = viral_dataset_labeled['has_media'], columns=viral_dataset_labeled['viral'], normalize='columns') )\n",
|
1209 |
+
"\n",
|
1210 |
+
"c, p, dof, expected = chi2_contingency(contingency_media) \n",
|
1211 |
+
"display(f'p-value {p}')\n",
|
1212 |
+
"c, p, dof, expected"
|
1213 |
+
]
|
1214 |
+
},
|
1215 |
+
{
|
1216 |
+
"cell_type": "markdown",
|
1217 |
+
"metadata": {},
|
1218 |
+
"source": [
|
1219 |
+
"**Finding**: Viral tweets are more likely than non-viral tweets to have some kind of media (video, image, GIF, ...) embedded."
|
1220 |
+
]
|
1221 |
+
},
|
1222 |
+
{
|
1223 |
+
"cell_type": "markdown",
|
1224 |
+
"metadata": {},
|
1225 |
+
"source": [
|
1226 |
+
"#### 1.3.3 - Context annotations (Topics)"
|
1227 |
+
]
|
1228 |
+
},
|
1229 |
+
{
|
1230 |
+
"cell_type": "code",
|
1231 |
+
"execution_count": null,
|
1232 |
+
"metadata": {},
|
1233 |
+
"outputs": [],
|
1234 |
+
"source": [
|
1235 |
+
"viral_tweets_topic_domains = viral_dataset_labeled[viral_dataset_labeled.viral == True] \\\n",
|
1236 |
+
" .explode('topic_domains') \\\n",
|
1237 |
+
" .dropna(axis=0, subset=['topic_domains']) \\\n",
|
1238 |
+
" .topic_domains \n",
|
1239 |
+
"\n",
|
1240 |
+
"tweets_topic_domains = viral_dataset_labeled[viral_dataset_labeled.viral == False] \\\n",
|
1241 |
+
" .explode('topic_domains') \\\n",
|
1242 |
+
" .dropna(axis=0, subset=['topic_domains']) \\\n",
|
1243 |
+
" .topic_domains\n",
|
1244 |
+
"\n",
|
1245 |
+
"viral_topics_domains_sorted = viral_tweets_topic_domains.groupby(viral_tweets_topic_domains).count().sort_values(ascending=False)\n",
|
1246 |
+
"tweet_topics_domains_sorted = tweets_topic_domains.groupby(tweets_topic_domains).count().sort_values(ascending=False)"
|
1247 |
+
]
|
1248 |
+
},
|
1249 |
+
{
|
1250 |
+
"cell_type": "code",
|
1251 |
+
"execution_count": null,
|
1252 |
+
"metadata": {},
|
1253 |
+
"outputs": [],
|
1254 |
+
"source": [
|
1255 |
+
"import pickle\n",
|
1256 |
+
"\n",
|
1257 |
+
"with open(f'{DATA_PATH}/topic_domains.pickle', 'rb') as handle:\n",
|
1258 |
+
" topic_domains = pickle.load(handle)\n",
|
1259 |
+
"\n",
|
1260 |
+
"top_10_viral_topic_domains = viral_topics_domains_sorted[:10]\n",
|
1261 |
+
"top_10_tweet_topic_domains = tweet_topics_domains_sorted[:10]\n",
|
1262 |
+
"\n",
|
1263 |
+
"display(f\"Top 10 topic domains in viral tweets: \\n {[topic_domains.get(x)['name'] for x in top_10_viral_topic_domains.index.values]}\")\n",
|
1264 |
+
"display(f\"Top 10 topic domains in general tweets: \\n {[topic_domains.get(x)['name'] for x in top_10_tweet_topic_domains.index.values]}\")"
|
1265 |
+
]
|
1266 |
+
},
|
1267 |
+
{
|
1268 |
+
"cell_type": "code",
|
1269 |
+
"execution_count": null,
|
1270 |
+
"metadata": {},
|
1271 |
+
"outputs": [],
|
1272 |
+
"source": [
|
1273 |
+
"viral_labels = [topic_domains.get(x)['name'] for x in top_10_viral_topic_domains.index.values]\n",
|
1274 |
+
"non_viral_labels = [topic_domains.get(x)['name'] for x in top_10_tweet_topic_domains.index.values]\n",
|
1275 |
+
"\n",
|
1276 |
+
"# Create subplots: use 'domain' type for Pie subplot\n",
|
1277 |
+
"fig2 = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
|
1278 |
+
"fig2.add_trace(go.Pie(labels=viral_labels, values=top_10_viral_topic_domains.values, name=\"Viral Tweet Topic domain\"),\n",
|
1279 |
+
" 1, 1)\n",
|
1280 |
+
"fig2.add_trace(go.Pie(labels=non_viral_labels, values=top_10_tweet_topic_domains.values, name=\"Non-Viral Tweet Topic domain\"),\n",
|
1281 |
+
" 1, 2)\n",
|
1282 |
+
"\n",
|
1283 |
+
"# Use `hole` to create a donut-like pie chart\n",
|
1284 |
+
"fig2.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
|
1285 |
+
"\n",
|
1286 |
+
"fig2.update_layout(\n",
|
1287 |
+
" width=1000,\n",
|
1288 |
+
" height=500,\n",
|
1289 |
+
" title_text=\"Top 10 topic domains for viral vs non-viral tweets\",\n",
|
1290 |
+
" # Add annotations in the center of the donut pies.\n",
|
1291 |
+
" annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
|
1292 |
+
" dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
|
1293 |
+
"fig2.show()"
|
1294 |
+
]
|
1295 |
+
},
|
1296 |
+
{
|
1297 |
+
"cell_type": "markdown",
|
1298 |
+
"metadata": {},
|
1299 |
+
"source": [
|
1300 |
+
"#### 1.3.3 - Tweet Length"
|
1301 |
+
]
|
1302 |
+
},
|
1303 |
+
{
|
1304 |
+
"cell_type": "code",
|
1305 |
+
"execution_count": null,
|
1306 |
+
"metadata": {},
|
1307 |
+
"outputs": [],
|
1308 |
+
"source": [
|
1309 |
+
"viral_dataset_labeled.loc[:, 'tweet_length'] = viral_dataset_labeled.text.apply(len)"
|
1310 |
+
]
|
1311 |
+
},
|
1312 |
+
{
|
1313 |
+
"cell_type": "code",
|
1314 |
+
"execution_count": null,
|
1315 |
+
"metadata": {},
|
1316 |
+
"outputs": [],
|
1317 |
+
"source": [
|
1318 |
+
"display(viral_dataset_labeled[['tweet_length', 'retweet_count']].corr())\n",
|
1319 |
+
"\n",
|
1320 |
+
"avg_tweet_length_viral = viral_dataset_labeled[viral_dataset_labeled.viral].tweet_length.mean()\n",
|
1321 |
+
"avg_tweet_length_non_viral = viral_dataset_labeled[~viral_dataset_labeled.viral].tweet_length.mean()\n",
|
1322 |
+
"\n",
|
1323 |
+
"display(f'viral avg tweet length: {avg_tweet_length_viral} \\n non-viral avg tweet length: {avg_tweet_length_non_viral}')"
|
1324 |
+
]
|
1325 |
+
},
|
1326 |
+
{
|
1327 |
+
"cell_type": "markdown",
|
1328 |
+
"metadata": {},
|
1329 |
+
"source": [
|
1330 |
+
"Some tweets are replies to other tweets, so **mentions are automatically inserted at the beginning of the tweet**. These mentions do not count toward Twitter's maximum character count, so we discard them before computing the tweet length."
|
1331 |
+
]
|
1332 |
+
},
|
1333 |
+
{
|
1334 |
+
"cell_type": "code",
|
1335 |
+
"execution_count": null,
|
1336 |
+
"metadata": {},
|
1337 |
+
"outputs": [],
|
1338 |
+
"source": [
|
1339 |
+
"viral_dataset_labeled.loc[:, \"text\"] = viral_dataset_labeled.text.apply(clear_reply_mentions)\n",
|
1340 |
+
"viral_dataset_labeled.loc[:, 'tweet_length'] = viral_dataset_labeled.text.apply(len)"
|
1341 |
+
]
|
1342 |
+
},
|
1343 |
+
{
|
1344 |
+
"cell_type": "code",
|
1345 |
+
"execution_count": null,
|
1346 |
+
"metadata": {},
|
1347 |
+
"outputs": [],
|
1348 |
+
"source": [
|
1349 |
+
"display(viral_dataset_labeled[['tweet_length', 'retweet_count']].corr())\n",
|
1350 |
+
"\n",
|
1351 |
+
"avg_tweet_length_viral = viral_dataset_labeled[viral_dataset_labeled.viral].tweet_length.mean()\n",
|
1352 |
+
"avg_tweet_length_non_viral = viral_dataset_labeled[~viral_dataset_labeled.viral].tweet_length.mean()\n",
|
1353 |
+
"\n",
|
1354 |
+
"display(f'viral avg tweet length: {avg_tweet_length_viral} \\n non-viral avg tweet length: {avg_tweet_length_non_viral}')"
|
1355 |
+
]
|
1356 |
+
},
|
1357 |
+
{
|
1358 |
+
"cell_type": "markdown",
|
1359 |
+
"metadata": {},
|
1360 |
+
"source": [
|
1361 |
+
"Calculating Welch’s t-test (scipy `ttest_ind` with `equal_var=False`) for the continuous variable `tweet_length`"
|
1362 |
+
]
|
1363 |
+
},
|
1364 |
+
{
|
1365 |
+
"cell_type": "code",
|
1366 |
+
"execution_count": null,
|
1367 |
+
"metadata": {},
|
1368 |
+
"outputs": [],
|
1369 |
+
"source": [
|
1370 |
+
"from scipy.stats import ttest_ind\n",
|
1371 |
+
"\n",
|
1372 |
+
"ttest_ind(viral_dataset_labeled[viral_dataset_labeled.viral].tweet_length, viral_dataset_labeled[~viral_dataset_labeled.viral].tweet_length, equal_var=False)"
|
1373 |
+
]
|
1374 |
+
},
|
1375 |
+
{
|
1376 |
+
"cell_type": "markdown",
|
1377 |
+
"metadata": {},
|
1378 |
+
"source": [
|
1379 |
+
"#### 1.3.4 - Sentiment "
|
1380 |
+
]
|
1381 |
+
},
|
1382 |
+
{
|
1383 |
+
"cell_type": "markdown",
|
1384 |
+
"metadata": {},
|
1385 |
+
"source": [
|
1386 |
+
"For the sentiment analysis, we used Hugging Face's [default sentiment analysis model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english?text=I+like+you.+I+love+you). We instantiate a Hugging Face pipeline using that default model and pass each tweet's text to it, which outputs a **label** (e.g. POSITIVE, NEGATIVE) alongside a **confidence score**. This is only applied to English tweets.\n",
|
1387 |
+
"\n",
|
1388 |
+
"**NOTE**: Feel free to skip the following cells if you already have the processed data. Sentiment analysis takes some time (around 2 hours on the whole data). "
|
1389 |
+
]
|
1390 |
+
},
|
1391 |
+
{
|
1392 |
+
"cell_type": "code",
|
1393 |
+
"execution_count": null,
|
1394 |
+
"metadata": {},
|
1395 |
+
"outputs": [],
|
1396 |
+
"source": [
|
1397 |
+
"from transformers import pipeline\n",
|
1398 |
+
"\n",
|
1399 |
+
"# device=0 runs the pipeline on the CUDA device (GPU) at index 0\n",
|
1400 |
+
"sentiment_classifier = pipeline(\"sentiment-analysis\", device=0)"
|
1401 |
+
]
|
1402 |
+
},
|
1403 |
+
{
|
1404 |
+
"cell_type": "markdown",
|
1405 |
+
"metadata": {},
|
1406 |
+
"source": [
|
1407 |
+
"Sentiment analysis is only applied to **English tweets**. All the viral tweets we scraped are in English, so we won't lose any viral data when filtering."
|
1408 |
+
]
|
1409 |
+
},
|
1410 |
+
{
|
1411 |
+
"cell_type": "code",
|
1412 |
+
"execution_count": null,
|
1413 |
+
"metadata": {},
|
1414 |
+
"outputs": [],
|
1415 |
+
"source": [
|
1416 |
+
"english_viral_dataset = viral_dataset_labeled[viral_dataset_labeled.lang == 'en']\n",
|
1417 |
+
"english_viral_dataset"
|
1418 |
+
]
|
1419 |
+
},
|
1420 |
+
{
|
1421 |
+
"cell_type": "markdown",
|
1422 |
+
"metadata": {},
|
1423 |
+
"source": [
|
1424 |
+
"Here we use the pandas `apply` function with `result_type='expand'`, so that the sentiment label and score are output into separate columns."
|
1425 |
+
]
|
1426 |
+
},
|
1427 |
+
{
|
1428 |
+
"cell_type": "code",
|
1429 |
+
"execution_count": null,
|
1430 |
+
"metadata": {},
|
1431 |
+
"outputs": [],
|
1432 |
+
"source": [
|
1433 |
+
"applied = english_viral_dataset.apply(lambda x: sentiment_classifier(x.text)[0], axis=1, result_type='expand')\n",
|
1434 |
+
"#pd.concat([small_test_set, applied], axis='columns')\n",
|
1435 |
+
"applied"
|
1436 |
+
]
|
1437 |
+
},
|
1438 |
+
{
|
1439 |
+
"cell_type": "code",
|
1440 |
+
"execution_count": null,
|
1441 |
+
"metadata": {},
|
1442 |
+
"outputs": [],
|
1443 |
+
"source": [
|
1444 |
+
"sentiment_features = pd.concat([english_viral_dataset, applied], axis=1)\n",
|
1445 |
+
"sentiment_features"
|
1446 |
+
]
|
1447 |
+
},
|
1448 |
+
{
|
1449 |
+
"cell_type": "code",
|
1450 |
+
"execution_count": null,
|
1451 |
+
"metadata": {},
|
1452 |
+
"outputs": [],
|
1453 |
+
"source": [
|
1454 |
+
"sentiment_features = sentiment_features.rename(columns={\"label\": \"sentiment\", \"score\": \"sentiment_score\"})"
|
1455 |
+
]
|
1456 |
+
},
|
1457 |
+
{
|
1458 |
+
"cell_type": "code",
|
1459 |
+
"execution_count": null,
|
1460 |
+
"metadata": {},
|
1461 |
+
"outputs": [],
|
1462 |
+
"source": [
|
1463 |
+
"sentiment_features.to_parquet(f\"{PROCESSED_PATH_VIRAL}/all_english_tweets_with_users_with_sentiment.parquet.gzip\", index=False, compression=\"gzip\")"
|
1464 |
+
]
|
1465 |
+
},
|
1466 |
+
{
|
1467 |
+
"cell_type": "markdown",
|
1468 |
+
"metadata": {},
|
1469 |
+
"source": [
|
1470 |
+
"Load the already-processed data"
|
1471 |
+
]
|
1472 |
+
},
|
1473 |
+
{
|
1474 |
+
"cell_type": "code",
|
1475 |
+
"execution_count": null,
|
1476 |
+
"metadata": {},
|
1477 |
+
"outputs": [],
|
1478 |
+
"source": [
|
1479 |
+
"sentiment_features = pd.read_parquet(f\"{PROCESSED_PATH_VIRAL}/all_english_tweets_with_users_with_sentiment.parquet.gzip\")\n",
|
1480 |
+
"display(f\"{len(sentiment_features[sentiment_features.viral])} viral tweets out of {len(sentiment_features)}\")"
|
1481 |
+
]
|
1482 |
+
},
|
1483 |
+
{
|
1484 |
+
"cell_type": "code",
|
1485 |
+
"execution_count": null,
|
1486 |
+
"metadata": {},
|
1487 |
+
"outputs": [],
|
1488 |
+
"source": [
|
1489 |
+
"# Tweets with sentiment scores over 70%\n",
|
1490 |
+
"display(f\"Tweets with sentiment analysis confidence scores above 0.7: {len(sentiment_features[sentiment_features.sentiment_score > 0.7])}\")\n",
|
1491 |
+
"display(f\"{len(sentiment_features[sentiment_features.sentiment == 'POSITIVE'])} positive tweets\")\n",
|
1492 |
+
"display(f\"{len(sentiment_features[sentiment_features.sentiment == 'NEGATIVE'])} negative tweets\")\n",
|
1493 |
+
"\n",
|
1494 |
+
"confident_sentiment_tweets = sentiment_features[sentiment_features.sentiment_score > 0.7]"
|
1495 |
+
]
|
1496 |
+
},
|
1497 |
+
{
|
1498 |
+
"cell_type": "code",
|
1499 |
+
"execution_count": null,
|
1500 |
+
"metadata": {},
|
1501 |
+
"outputs": [],
|
1502 |
+
"source": [
|
1503 |
+
"# Optionally keep only retweeted tweets, to filter out tweets with zero retweets, which have little utility (currently disabled).\n",
|
1504 |
+
"#retweeted_tweets = confident_sentiment_tweets[confident_sentiment_tweets.retweet_count > 0]\n",
|
1505 |
+
"\n",
|
1506 |
+
"labels = [\"Positive\", \"Negative\"]\n",
|
1507 |
+
"viral_positive = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == True) & (confident_sentiment_tweets.sentiment == 'POSITIVE')])\n",
|
1508 |
+
"viral_negative = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == True) & (confident_sentiment_tweets.sentiment == 'NEGATIVE')])\n",
|
1509 |
+
"normal_positive = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == False) & (confident_sentiment_tweets.sentiment == 'POSITIVE')])\n",
|
1510 |
+
"normal_negative = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == False) & (confident_sentiment_tweets.sentiment == 'NEGATIVE')])\n",
|
1511 |
+
"\n",
|
1512 |
+
"\n",
|
1513 |
+
"# Create subplots: use 'domain' type for Pie subplot\n",
|
1514 |
+
"fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
|
1515 |
+
"fig.add_trace(go.Pie(labels=labels, values=[viral_positive, viral_negative], name=\"Positive Viral Tweets\"),\n",
|
1516 |
+
" 1, 1)\n",
|
1517 |
+
"fig.add_trace(go.Pie(labels=labels, values=[normal_positive, normal_negative], name=\"Positive Non-Viral Tweets\"),\n",
|
1518 |
+
" 1, 2)\n",
|
1519 |
+
"\n",
|
1520 |
+
"# Use `hole` to create a donut-like pie chart\n",
|
1521 |
+
"fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
|
1522 |
+
"\n",
|
1523 |
+
"fig.update_layout(\n",
|
1524 |
+
" width=1000,\n",
|
1525 |
+
" height=500,\n",
|
1526 |
+
" title_text=\"Distribution of positive and negative sentiment in viral vs non-viral tweets\",\n",
|
1527 |
+
" # Add annotations in the center of the donut pies.\n",
|
1528 |
+
" annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
|
1529 |
+
" dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
|
1530 |
+
"fig.show()"
|
1531 |
+
]
|
1532 |
+
},
|
1533 |
+
{
|
1534 |
+
"cell_type": "markdown",
|
1535 |
+
"metadata": {},
|
1536 |
+
"source": [
|
1537 |
+
"Calculating the p-value between the target `viral` and positive sentiment\n"
|
1538 |
+
]
|
1539 |
+
},
|
1540 |
+
{
|
1541 |
+
"cell_type": "code",
|
1542 |
+
"execution_count": null,
|
1543 |
+
"metadata": {},
|
1544 |
+
"outputs": [],
|
1545 |
+
"source": [
|
1546 |
+
"from scipy.stats import chi2_contingency \n",
|
1547 |
+
"\n",
|
1548 |
+
"#confident_sentiment_tweets.loc[:, 'is_positive'] = confident_sentiment_tweets.sentiment == 'POSITIVE'\n",
|
1549 |
+
"\n",
|
1550 |
+
"# Calculating the p-value\n",
|
1551 |
+
"contingency_sentiment = pd.crosstab(index = confident_sentiment_tweets['sentiment'], columns=confident_sentiment_tweets['viral']) \n",
|
1552 |
+
"# Display with percentages\n",
|
1553 |
+
"contingency_sentiment_normalized_percentage = pd.crosstab(\n",
|
1554 |
+
" index = confident_sentiment_tweets['sentiment'], columns=confident_sentiment_tweets['viral'], normalize='columns') \n",
|
1555 |
+
"display(contingency_sentiment_normalized_percentage)\n",
|
1556 |
+
"\n",
|
1557 |
+
"c, p, dof, expected = chi2_contingency(contingency_sentiment) \n",
|
1558 |
+
"display(f'p-value {p}')\n",
|
1559 |
+
"c, p, dof, expected"
|
1560 |
+
]
|
1561 |
+
},
|
1562 |
+
{
|
1563 |
+
"cell_type": "markdown",
|
1564 |
+
"metadata": {},
|
1565 |
+
"source": [
|
1566 |
+
"Calculating the p-value between the target `viral` and negative sentiment\n"
|
1567 |
+
]
|
1568 |
+
},
|
1569 |
+
{
|
1570 |
+
"cell_type": "code",
|
1571 |
+
"execution_count": null,
|
1572 |
+
"metadata": {},
|
1573 |
+
"outputs": [],
|
1574 |
+
"source": [
|
1575 |
+
"from scipy.stats import chi2_contingency \n",
|
1576 |
+
"\n",
|
1577 |
+
"confident_sentiment_tweets.loc[:, 'is_negative'] = confident_sentiment_tweets.sentiment == 'NEGATIVE'\n",
|
1578 |
+
"\n",
|
1579 |
+
"# Calculating the p-value\n",
|
1580 |
+
"contingency_negative_sentiment = pd.crosstab(index = confident_sentiment_tweets['is_negative'], columns=confident_sentiment_tweets['viral']) \n",
|
1581 |
+
"# Display with percentages\n",
|
1582 |
+
"contingency_negative_sentiment_normalized_percentage = pd.crosstab(\n",
|
1583 |
+
" index = confident_sentiment_tweets['is_negative'], columns=confident_sentiment_tweets['viral'], normalize='columns') \n",
|
1584 |
+
"display(contingency_negative_sentiment_normalized_percentage)\n",
|
1585 |
+
"\n",
|
1586 |
+
"c, p, dof, expected = chi2_contingency(contingency_negative_sentiment) \n",
|
1587 |
+
"display(f'p-value {p}')\n",
|
1588 |
+
"c, p, dof, expected"
|
1589 |
+
]
|
1590 |
+
},
|
1591 |
+
{
|
1592 |
+
"cell_type": "code",
|
1593 |
+
"execution_count": null,
|
1594 |
+
"metadata": {},
|
1595 |
+
"outputs": [],
|
1596 |
+
"source": [
|
1597 |
+
"'''\n",
|
1598 |
+
"import spacy\n",
|
1599 |
+
"import vaderSentiment\n",
|
1600 |
+
"from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
|
1601 |
+
"\n",
|
1602 |
+
"nlp = spacy.load(\"en_core_web_sm\")\n",
|
1603 |
+
"\n",
|
1604 |
+
"spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS\n",
|
1605 |
+
"print('Number of stop words: %d' % len(spacy_stopwords))\n",
|
1606 |
+
"print('First ten stop words:',list(spacy_stopwords)[:10])\n",
|
1607 |
+
"'''"
|
1608 |
+
]
|
1609 |
+
},
|
1610 |
+
{
|
1611 |
+
"cell_type": "code",
|
1612 |
+
"execution_count": null,
|
1613 |
+
"metadata": {},
|
1614 |
+
"outputs": [],
|
1615 |
+
"source": [
|
1616 |
+
"'''\n",
|
1617 |
+
"# Remove new lines \n",
|
1618 |
+
"remove_new_lines = lambda x: \" \".join(x.split())\n",
|
1619 |
+
"viral_dataset_labeled['processed_text'] = viral_dataset_labeled['text'].apply(remove_new_lines)\n",
|
1620 |
+
"\n",
|
1621 |
+
"\n",
|
1622 |
+
"english_tweets = viral_dataset_labeled[viral_dataset_labeled.lang == 'en']\n",
|
1623 |
+
"'''"
|
1624 |
+
]
|
1625 |
+
},
|
1626 |
+
{
|
1627 |
+
"cell_type": "markdown",
|
1628 |
+
"metadata": {},
|
1629 |
+
"source": [
|
1630 |
+
"#### 1.3.5 - Number of hashtags "
|
1631 |
+
]
|
1632 |
+
},
|
1633 |
+
{
|
1634 |
+
"cell_type": "code",
|
1635 |
+
"execution_count": null,
|
1636 |
+
"metadata": {},
|
1637 |
+
"outputs": [],
|
1638 |
+
"source": [
|
1639 |
+
"viral_dataset_labeled.loc[:, \"nb_of_hashtags\"] = viral_dataset_labeled.hashtags.apply(lambda x: len(x) if np.all(x) else 0)"
|
1640 |
+
]
|
1641 |
+
},
|
1642 |
+
{
|
1643 |
+
"cell_type": "code",
|
1644 |
+
"execution_count": null,
|
1645 |
+
"metadata": {},
|
1646 |
+
"outputs": [],
|
1647 |
+
"source": [
|
1648 |
+
"labels = [\"Hashtags\", \"No Hashtags\"]\n",
|
1649 |
+
"viral_has_hashtags = len(viral_dataset_labeled[(viral_dataset_labeled.viral) & (viral_dataset_labeled.nb_of_hashtags >= 1)])\n",
|
1650 |
+
"viral_no_hashtags = len(viral_dataset_labeled[(viral_dataset_labeled.viral) & (viral_dataset_labeled.nb_of_hashtags == 0)])\n",
|
1651 |
+
"normal_has_hashtags = len(viral_dataset_labeled[(~viral_dataset_labeled.viral) & (viral_dataset_labeled.nb_of_hashtags >= 1)])\n",
|
1652 |
+
"normal_no_hashtags = len(viral_dataset_labeled[(~viral_dataset_labeled.viral) & (viral_dataset_labeled.nb_of_hashtags == 0)])\n",
|
1653 |
+
"\n",
|
1654 |
+
"\n",
|
1655 |
+
"# Create subplots: use 'domain' type for Pie subplot\n",
|
1656 |
+
"fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
|
1657 |
+
"fig.add_trace(go.Pie(labels=labels, values=[viral_has_hashtags, viral_no_hashtags], name=\"Viral with Hashtags\"),\n",
|
1658 |
+
" 1, 1)\n",
|
1659 |
+
"fig.add_trace(go.Pie(labels=labels, values=[normal_has_hashtags, normal_no_hashtags], name=\"Tweet with No Hashtags\"),\n",
|
1660 |
+
" 1, 2)\n",
|
1661 |
+
"\n",
|
1662 |
+
"# Use `hole` to create a donut-like pie chart\n",
|
1663 |
+
"fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
|
1664 |
+
"\n",
|
1665 |
+
"fig.update_layout(\n",
|
1666 |
+
" width=1000,\n",
|
1667 |
+
" height=500,\n",
|
1668 |
+
" title_text=\"Percentage of tweets with hashtags\",\n",
|
1669 |
+
" # Add annotations in the center of the donut pies.\n",
|
1670 |
+
" annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
|
1671 |
+
" dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
|
1672 |
+
"fig.show()"
|
1673 |
+
]
|
1674 |
+
},
|
1675 |
+
{
|
1676 |
+
"cell_type": "markdown",
|
1677 |
+
"metadata": {},
|
1678 |
+
"source": [
|
1679 |
+
"Calculating the p-value between the target `viral` and `has_hashtags`\n"
|
1680 |
+
]
|
1681 |
+
},
|
1682 |
+
{
|
1683 |
+
"cell_type": "code",
|
1684 |
+
"execution_count": null,
|
1685 |
+
"metadata": {},
|
1686 |
+
"outputs": [],
|
1687 |
+
"source": [
|
1688 |
+
"from scipy.stats import chi2_contingency \n",
|
1689 |
+
"\n",
|
1690 |
+
"viral_dataset_labeled['has_hashtags'] = viral_dataset_labeled.nb_of_hashtags >= 1\n",
|
1691 |
+
"\n",
|
1692 |
+
"# Calculating the p-value\n",
|
1693 |
+
"contingency_has_hashtags = pd.crosstab(index = viral_dataset_labeled['has_hashtags'], columns=viral_dataset_labeled['viral']) \n",
|
1694 |
+
"# Display with percentages\n",
|
1695 |
+
"contingency_has_hashtags_normalized_percentage = pd.crosstab(\n",
|
1696 |
+
" index = viral_dataset_labeled['has_hashtags'], columns=viral_dataset_labeled['viral'], normalize='columns') \n",
|
1697 |
+
"display(contingency_has_hashtags_normalized_percentage)\n",
|
1698 |
+
"\n",
|
1699 |
+
"c, p, dof, expected = chi2_contingency(contingency_has_hashtags) \n",
|
1700 |
+
"display(f'p-value {p}')\n",
|
1701 |
+
"c, p, dof, expected"
|
1702 |
+
]
|
1703 |
+
},
|
1704 |
+
{
|
1705 |
+
"cell_type": "markdown",
|
1706 |
+
"metadata": {},
|
1707 |
+
"source": [
|
1708 |
+
"#### 1.3.6 - Verified account"
|
1709 |
+
]
|
1710 |
+
},
|
1711 |
+
{
|
1712 |
+
"cell_type": "code",
|
1713 |
+
"execution_count": null,
|
1714 |
+
"metadata": {},
|
1715 |
+
"outputs": [],
|
1716 |
+
"source": [
|
1717 |
+
"# Verified account\n",
|
1718 |
+
"labels = [\"Verified\", \"Not verified\"]\n",
|
1719 |
+
"viral_is_verified = len(viral_dataset_labeled[(viral_dataset_labeled.viral) & (viral_dataset_labeled.verified)])\n",
|
1720 |
+
"viral_not_verified = len(viral_dataset_labeled[(viral_dataset_labeled.viral) & (~viral_dataset_labeled.verified)])\n",
|
1721 |
+
"normal_is_verified = len(viral_dataset_labeled[(~viral_dataset_labeled.viral) & (viral_dataset_labeled.verified)])\n",
|
1722 |
+
"normal_not_verified = len(viral_dataset_labeled[(~viral_dataset_labeled.viral) & (~viral_dataset_labeled.verified)])\n",
|
1723 |
+
"\n",
|
1724 |
+
"\n",
|
1725 |
+
"# Create subplots: use 'domain' type for Pie subplot\n",
|
1726 |
+
"fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
|
1727 |
+
"fig.add_trace(go.Pie(labels=labels, values=[viral_is_verified, viral_not_verified], name=\"Viral with verified accounts\"),\n",
|
1728 |
+
" 1, 1)\n",
|
1729 |
+
"fig.add_trace(go.Pie(labels=labels, values=[normal_is_verified, normal_not_verified], name=\"Tweet with an unverified account\"),\n",
|
1730 |
+
" 1, 2)\n",
|
1731 |
+
"\n",
|
1732 |
+
"# Use `hole` to create a donut-like pie chart\n",
|
1733 |
+
"fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
|
1734 |
+
"\n",
|
1735 |
+
"fig.update_layout(\n",
|
1736 |
+
" width=1000,\n",
|
1737 |
+
" height=500,\n",
|
1738 |
+
" title_text=\"Percentage of tweets from verified accounts\",\n",
|
1739 |
+
" # Add annotations in the center of the donut pies.\n",
|
1740 |
+
" annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
|
1741 |
+
" dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
|
1742 |
+
"fig.show()"
|
1743 |
+
]
|
1744 |
+
},
|
1745 |
+
{
|
1746 |
+
"cell_type": "markdown",
|
1747 |
+
"metadata": {},
|
1748 |
+
"source": [
|
1749 |
+
"Calculating the p-value between the target `viral` and `verified`\n"
|
1750 |
+
]
|
1751 |
+
},
|
1752 |
+
{
|
1753 |
+
"cell_type": "code",
|
1754 |
+
"execution_count": null,
|
1755 |
+
"metadata": {},
|
1756 |
+
"outputs": [],
|
1757 |
+
"source": [
|
1758 |
+
"from scipy.stats import chi2_contingency \n",
|
1759 |
+
"\n",
|
1760 |
+
"# Calculating the p-value\n",
|
1761 |
+
"contingency_verified = pd.crosstab(index = viral_dataset_labeled['verified'], columns=viral_dataset_labeled['viral']) \n",
|
1762 |
+
"# Display with percentages\n",
|
1763 |
+
"contingency_verified_normalized_percentage = pd.crosstab(\n",
|
1764 |
+
" index = viral_dataset_labeled['verified'], columns=viral_dataset_labeled['viral'], normalize='columns') \n",
|
1765 |
+
"display(contingency_verified_normalized_percentage)\n",
|
1766 |
+
"\n",
|
1767 |
+
"c, p, dof, expected = chi2_contingency(contingency_verified) \n",
|
1768 |
+
"display(f'p-value {p}')\n",
|
1769 |
+
"c, p, dof, expected"
|
1770 |
+
]
|
1771 |
+
},
|
1772 |
+
{
|
1773 |
+
"cell_type": "markdown",
|
1774 |
+
"metadata": {},
|
1775 |
+
"source": [
|
1776 |
+
"#### 1.3.7 - Has mentions"
|
1777 |
+
]
|
1778 |
+
},
|
1779 |
+
{
|
1780 |
+
"cell_type": "code",
|
1781 |
+
"execution_count": null,
|
1782 |
+
"metadata": {},
|
1783 |
+
"outputs": [],
|
1784 |
+
"source": [
|
1785 |
+
"viral_dataset_labeled.loc[:, \"nb_of_mentions\"] = viral_dataset_labeled.mentions.apply(lambda x: len(x) if np.all(x) else 0)"
|
1786 |
+
]
|
1787 |
+
},
|
1788 |
+
{
|
1789 |
+
"cell_type": "code",
|
1790 |
+
"execution_count": null,
|
1791 |
+
"metadata": {},
|
1792 |
+
"outputs": [],
|
1793 |
+
"source": [
|
1794 |
+
"from scipy.stats import chi2_contingency \n",
|
1795 |
+
"\n",
|
1796 |
+
"# Calculating the p-value\n",
|
1797 |
+
"contingency_has_mentions = pd.crosstab(index = viral_dataset_labeled['nb_of_mentions'] > 0, columns=viral_dataset_labeled['viral']) \n",
|
1798 |
+
"display(contingency_has_mentions)\n",
|
1799 |
+
"# Display with percentages\n",
|
1800 |
+
"display(pd.crosstab(index = viral_dataset_labeled['nb_of_mentions'] > 0, columns=viral_dataset_labeled['viral'], normalize='columns') )\n",
|
1801 |
+
"\n",
|
1802 |
+
"c, p, dof, expected = chi2_contingency(contingency_has_mentions) \n",
|
1803 |
+
"display(f'p-value {p}')\n",
|
1804 |
+
"c, p, dof, expected"
|
1805 |
+
]
|
1806 |
+
},
|
1807 |
+
{
|
1808 |
+
"cell_type": "markdown",
|
1809 |
+
"metadata": {},
|
1810 |
+
"source": [
|
1811 |
+
"#### 1.3.8 - Save result of preprocessing to disk"
|
1812 |
+
]
|
1813 |
+
},
|
1814 |
+
{
|
1815 |
+
"cell_type": "code",
|
1816 |
+
"execution_count": null,
|
1817 |
+
"metadata": {},
|
1818 |
+
"outputs": [],
|
1819 |
+
"source": [
|
1820 |
+
"viral_dataset_labeled.to_parquet(f'{PROCESSED_PATH_VIRAL}/all_english_tweets_with_users_with_sentiment.parquet.gzip', index=False, compression=\"gzip\")"
|
1821 |
+
]
|
1822 |
+
},
|
1823 |
+
{
|
1824 |
+
"cell_type": "markdown",
|
1825 |
+
"metadata": {},
|
1826 |
+
"source": []
|
1827 |
+
},
|
1828 |
+
{
|
1829 |
+
"cell_type": "code",
|
1830 |
+
"execution_count": null,
|
1831 |
+
"metadata": {},
|
1832 |
+
"outputs": [],
|
1833 |
+
"source": [
|
1834 |
+
"viral_dataset_labeled.columns\n"
|
1835 |
+
]
|
1836 |
+
},
|
1837 |
+
{
|
1838 |
+
"cell_type": "markdown",
|
1839 |
+
"metadata": {},
|
1840 |
+
"source": [
|
1841 |
+
"### 1.4 - Covid dataset Exploration"
|
1842 |
+
]
|
1843 |
+
},
|
1844 |
+
{
|
1845 |
+
"cell_type": "markdown",
|
1846 |
+
"metadata": {},
|
1847 |
+
"source": [
|
1848 |
+
"Here we concern ourselves only with original tweets (no retweets)."
|
1849 |
+
]
|
1850 |
+
},
|
1851 |
+
{
|
1852 |
+
"cell_type": "code",
|
1853 |
+
"execution_count": null,
|
1854 |
+
"metadata": {},
|
1855 |
+
"outputs": [],
|
1856 |
+
"source": [
|
1857 |
+
"original_covid_tweets = pd.read_parquet(f\"{COVID_TWEETS_PATH}/all_original_tweets.parquet.gzip\")\n",
|
1858 |
+
"original_covid_tweets.loc[:, \"text\"] = original_covid_tweets.text.apply(clear_reply_mentions)\n",
|
1859 |
+
"\n",
|
1860 |
+
"covid_users = pd.read_parquet(f\"{COVID_TWEETS_PATH}/users.parquet.gzip\")\n",
|
1861 |
+
"\n",
|
1862 |
+
"display(\"--- COVID DATASET ---\")\n",
|
1863 |
+
"\n",
|
1864 |
+
"display(f\"{len(original_covid_tweets)} original (not retweeted) covid tweets collected\")\n",
|
1865 |
+
"display(f\"{len(original_covid_tweets.author_id.unique())} covid users collected\")\n",
|
1866 |
+
"\n",
|
1867 |
+
"original_covid_tweets"
|
1868 |
+
]
|
1869 |
+
},
|
1870 |
+
{
|
1871 |
+
"cell_type": "code",
|
1872 |
+
"execution_count": null,
|
1873 |
+
"metadata": {},
|
1874 |
+
"outputs": [],
|
1875 |
+
"source": [
|
1876 |
+
"user_columns = ['author_id', 'followers_count', 'following_count', 'tweet_count', 'protected', 'verified', 'username']\n",
|
1877 |
+
"covid_dataset_with_users = original_covid_tweets.merge(covid_users.rename(columns={'id': 'author_id'})[user_columns], on='author_id')"
|
1878 |
+
]
|
1879 |
+
},
|
1880 |
+
{
|
1881 |
+
"cell_type": "code",
|
1882 |
+
"execution_count": null,
|
1883 |
+
"metadata": {},
|
1884 |
+
"outputs": [],
|
1885 |
+
"source": [
|
1886 |
+
"# Applying the first metric on the covid dataset\n",
|
1887 |
+
"covid_dataset_with_users['virality_followers'] = covid_dataset_with_users['retweet_count'] / covid_dataset_with_users['followers_count'].astype(\"float64\")\n",
|
1888 |
+
"# Handle division by zero if user has 0 followers\n",
|
1889 |
+
"covid_dataset_with_users['virality_followers'] = covid_dataset_with_users.virality_followers.replace({np.inf: 0.0})"
|
1890 |
+
]
|
1891 |
+
},
|
1892 |
+
{
|
1893 |
+
"cell_type": "code",
|
1894 |
+
"execution_count": null,
|
1895 |
+
"metadata": {},
|
1896 |
+
"outputs": [],
|
1897 |
+
"source": [
|
1898 |
+
"covid_dataset_with_users"
|
1899 |
+
]
|
1900 |
+
},
|
1901 |
+
{
|
1902 |
+
"cell_type": "code",
|
1903 |
+
"execution_count": null,
|
1904 |
+
"metadata": {},
|
1905 |
+
"outputs": [],
|
1906 |
+
"source": [
|
1907 |
+
"px.histogram(covid_dataset_with_users, x='followers_count', y = 'virality_followers', log_y=True)"
|
1908 |
+
]
|
1909 |
+
},
|
1910 |
+
{
|
1911 |
+
"cell_type": "code",
|
1912 |
+
"execution_count": null,
|
1913 |
+
"metadata": {},
|
1914 |
+
"outputs": [],
|
1915 |
+
"source": [
|
1916 |
+
"covid_dataset_with_users['viral'] = covid_dataset_with_users.virality_followers > 1\n",
|
1917 |
+
"covid_dataset_with_users[covid_dataset_with_users.viral]"
|
1918 |
+
]
|
1919 |
+
},
|
1920 |
+
{
|
1921 |
+
"cell_type": "markdown",
|
1922 |
+
"metadata": {},
|
1923 |
+
"source": [
|
1924 |
+
"#### 1.4.1 - Language"
|
1925 |
+
]
|
1926 |
+
},
|
1927 |
+
{
|
1928 |
+
"cell_type": "code",
|
1929 |
+
"execution_count": null,
|
1930 |
+
"metadata": {},
|
1931 |
+
"outputs": [],
|
1932 |
+
"source": [
|
1933 |
+
"languages_aggregates = covid_dataset_with_users.groupby(by='lang', as_index=False)[['id']].count().rename(columns={'id': 'count'})\n",
|
1934 |
+
"languages_aggregates = languages_aggregates.sort_values(by='count', ascending=False)\n",
|
1935 |
+
"languages_aggregates.loc[languages_aggregates['count'] < 10000, 'lang'] = 'Other Languages'\n",
|
1936 |
+
"fig = px.pie(languages_aggregates, values='count', names='lang', title='Distribution of Tweets languages')\n",
|
1937 |
+
"\n",
|
1938 |
+
"fig.update_layout(\n",
|
1939 |
+
" autosize=False,\n",
|
1940 |
+
" width=500,\n",
|
1941 |
+
" height=500\n",
|
1942 |
+
")"
|
1943 |
+
]
|
1944 |
+
},
|
1945 |
+
{
|
1946 |
+
"cell_type": "code",
|
1947 |
+
"execution_count": null,
|
1948 |
+
"metadata": {},
|
1949 |
+
"outputs": [],
|
1950 |
+
"source": [
|
1951 |
+
"english_covid_tweets = covid_dataset_with_users[covid_dataset_with_users.lang == 'en']\n",
|
1952 |
+
"display(f\"{len(english_covid_tweets)} english covid tweets\")\n",
|
1953 |
+
"\n",
|
1954 |
+
"english_viral_covid_tweets = english_covid_tweets[english_covid_tweets.viral]\n",
|
1955 |
+
"display(f\"{len(english_viral_covid_tweets)} viral english covid tweets\")"
|
1956 |
+
]
|
1957 |
+
},
|
1958 |
+
{
|
1959 |
+
"cell_type": "markdown",
|
1960 |
+
"metadata": {},
|
1961 |
+
"source": [
|
1962 |
+
"#### 1.4.2 - Media"
|
1963 |
+
]
|
1964 |
+
},
|
1965 |
+
{
|
1966 |
+
"cell_type": "code",
|
1967 |
+
"execution_count": null,
|
1968 |
+
"metadata": {},
|
1969 |
+
"outputs": [],
|
1970 |
+
"source": [
|
1971 |
+
"# Has media\n",
|
1972 |
+
"labels = [\"Media\", \"No Media\"]\n",
|
1973 |
+
"viral_has_media = len(covid_dataset_with_users[(covid_dataset_with_users.viral == True) & (covid_dataset_with_users.has_media == True)])\n",
|
1974 |
+
"viral_no_media = len(covid_dataset_with_users[(covid_dataset_with_users.viral == True) & (covid_dataset_with_users.has_media == False)])\n",
|
1975 |
+
"normal_has_media = len(covid_dataset_with_users[(covid_dataset_with_users.viral == False) & (covid_dataset_with_users.has_media == True)])\n",
|
1976 |
+
"normal_no_media = len(covid_dataset_with_users[(covid_dataset_with_users.viral == False) & (covid_dataset_with_users.has_media == False)])\n",
|
1977 |
+
"\n",
|
1978 |
+
"\n",
|
1979 |
+
"# Create subplots: use 'domain' type for Pie subplot\n",
|
1980 |
+
"fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
|
1981 |
+
"fig.add_trace(go.Pie(labels=labels, values=[viral_has_media, viral_no_media], name=\"Viral with Media\"),\n",
|
1982 |
+
" 1, 1)\n",
|
1983 |
+
"fig.add_trace(go.Pie(labels=labels, values=[normal_has_media, normal_no_media], name=\"Tweet with Media\"),\n",
|
1984 |
+
" 1, 2)\n",
|
1985 |
+
"\n",
|
1986 |
+
"# Use `hole` to create a donut-like pie chart\n",
|
1987 |
+
"fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
|
1988 |
+
"\n",
|
1989 |
+
"fig.update_layout(\n",
|
1990 |
+
" width=1000,\n",
|
1991 |
+
" height=500,\n",
|
1992 |
+
" title_text=\"Percentage of tweets with some kind of media\",\n",
|
1993 |
+
" # Add annotations in the center of the donut pies.\n",
|
1994 |
+
" annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
|
1995 |
+
" dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
|
1996 |
+
"fig.show()"
|
1997 |
+
]
|
1998 |
+
},
|
1999 |
+
{
|
2000 |
+
"cell_type": "markdown",
|
2001 |
+
"metadata": {},
|
2002 |
+
"source": [
|
2003 |
+
"### 1.4.3 - Tweet Length"
|
2004 |
+
]
|
2005 |
+
},
|
2006 |
+
{
|
2007 |
+
"cell_type": "code",
|
2008 |
+
"execution_count": null,
|
2009 |
+
"metadata": {},
|
2010 |
+
"outputs": [],
|
2011 |
+
"source": [
|
2012 |
+
"covid_dataset_with_users.loc[:, 'tweet_length'] = covid_dataset_with_users.text.apply(len)\n",
|
2013 |
+
"covid_dataset_with_users[['tweet_length', 'retweet_count']].corr()"
|
2014 |
+
]
|
2015 |
+
},
|
2016 |
+
{
|
2017 |
+
"cell_type": "markdown",
|
2018 |
+
"metadata": {},
|
2019 |
+
"source": [
|
2020 |
+
"### 1.4.4 - Sentiment"
|
2021 |
+
]
|
2022 |
+
},
|
2023 |
+
{
|
2024 |
+
"cell_type": "code",
|
2025 |
+
"execution_count": null,
|
2026 |
+
"metadata": {},
|
2027 |
+
"outputs": [],
|
2028 |
+
"source": [
|
2029 |
+
"from transformers import pipeline\n",
|
2030 |
+
"\n",
|
2031 |
+
"# device=0 runs the pipeline on the CUDA device (GPU) at index 0\n",
|
2032 |
+
"sentiment_classifier = pipeline(\"sentiment-analysis\", device=0)\n",
|
2033 |
+
"\n",
|
2034 |
+
"english_covid_dataset = covid_dataset_with_users[covid_dataset_with_users.lang == 'en']\n",
|
2035 |
+
"english_covid_dataset"
|
2036 |
+
]
|
2037 |
+
},
|
2038 |
+
{
|
2039 |
+
"cell_type": "markdown",
|
2040 |
+
"metadata": {},
|
2041 |
+
"source": [
|
2042 |
+
"The next 2 cells compute the sentiments from scratch. To avoid having to recompute them, we have already preprocessed the data, computed the sentiments, and saved the result to parquet; that file is loaded in the cell after them. Feel free to skip the next 2 cells."
|
2043 |
+
]
|
2044 |
+
},
|
2045 |
+
{
|
2046 |
+
"cell_type": "code",
|
2047 |
+
"execution_count": null,
|
2048 |
+
"metadata": {},
|
2049 |
+
"outputs": [],
|
2050 |
+
"source": [
|
2051 |
+
"applied = english_covid_dataset.apply(lambda x: sentiment_classifier(x.text)[0], axis=1, result_type='expand')\n",
|
2052 |
+
"#pd.concat([small_test_set, applied], axis='columns')\n",
|
2053 |
+
"applied"
|
2054 |
+
]
|
2055 |
+
},
|
2056 |
+
{
|
2057 |
+
"cell_type": "code",
|
2058 |
+
"execution_count": null,
|
2059 |
+
"metadata": {},
|
2060 |
+
"outputs": [],
|
2061 |
+
"source": [
|
2062 |
+
"sentiment_features = pd.concat([english_covid_dataset, applied], axis=1)\n",
|
2063 |
+
"sentiment_features = sentiment_features.rename(columns={\"label\": \"sentiment\", \"score\": \"sentiment_score\"})"
|
2064 |
+
]
|
2065 |
+
},
|
2066 |
+
{
|
2067 |
+
"cell_type": "code",
|
2068 |
+
"execution_count": null,
|
2069 |
+
"metadata": {},
|
2070 |
+
"outputs": [],
|
2071 |
+
"source": [
|
2072 |
+
"sentiment_features = pd.read_parquet(f\"{PROCESSED_PATH_COVID}/english_tweets_with_users_with_sentiment.parquet.gzip\")\n",
|
2073 |
+
"sentiment_features"
|
2074 |
+
]
|
2075 |
+
},
|
2076 |
+
{
|
2077 |
+
"cell_type": "code",
|
2078 |
+
"execution_count": null,
|
2079 |
+
"metadata": {},
|
2080 |
+
"outputs": [],
|
2081 |
+
"source": [
|
2082 |
+
"# Tweets with sentiment scores over 70%\n",
|
2083 |
+
"display(f\"Tweets with sentiment analysis confidence scores above 0.7: {len(sentiment_features[sentiment_features.sentiment_score > 0.7])}\")\n",
|
2084 |
+
"display(f\"{len(sentiment_features[sentiment_features.sentiment == 'POSITIVE'])} positive tweets\")\n",
|
2085 |
+
"display(f\"{len(sentiment_features[sentiment_features.sentiment == 'NEGATIVE'])} negative tweets\")\n",
|
2086 |
+
"\n",
|
2087 |
+
"confident_sentiment_tweets = sentiment_features[sentiment_features.sentiment_score > 0.7]"
|
2088 |
+
]
|
2089 |
+
},
|
2090 |
+
{
|
2091 |
+
"cell_type": "code",
|
2092 |
+
"execution_count": null,
|
2093 |
+
"metadata": {},
|
2094 |
+
"outputs": [],
|
2095 |
+
"source": [
|
2096 |
+
"# We keep only retweeted tweets, to filter out tweets with zero retweets, which are of little use.\n",
|
2097 |
+
"labels = [\"Positive\", \"Negative\"]\n",
|
2098 |
+
"viral_positive = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == True) & (confident_sentiment_tweets.sentiment == 'POSITIVE')])\n",
|
2099 |
+
"viral_negative = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == True) & (confident_sentiment_tweets.sentiment == 'NEGATIVE')])\n",
|
2100 |
+
"normal_positive = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == False) & (confident_sentiment_tweets.sentiment == 'POSITIVE')])\n",
|
2101 |
+
"normal_negative = len(confident_sentiment_tweets[(confident_sentiment_tweets.viral == False) & (confident_sentiment_tweets.sentiment == 'NEGATIVE')])\n",
|
2102 |
+
"\n",
|
2103 |
+
"\n",
|
2104 |
+
"# Create subplots: use 'domain' type for Pie subplot\n",
|
2105 |
+
"fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
|
2106 |
+
"fig.add_trace(go.Pie(labels=labels, values=[viral_positive, viral_negative], name=\"Positive Viral Tweets\"),\n",
|
2107 |
+
" 1, 1)\n",
|
2108 |
+
"fig.add_trace(go.Pie(labels=labels, values=[normal_positive, normal_negative], name=\"Positive Non-Viral Tweets\"),\n",
|
2109 |
+
" 1, 2)\n",
|
2110 |
+
"\n",
|
2111 |
+
"# Use `hole` to create a donut-like pie chart\n",
|
2112 |
+
"fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
|
2113 |
+
"\n",
|
2114 |
+
"fig.update_layout(\n",
|
2115 |
+
" width=1000,\n",
|
2116 |
+
" height=500,\n",
|
2117 |
+
" title_text=\"Distribution of positive and negative sentiment in viral vs non-viral tweets\",\n",
|
2118 |
+
" # Add annotations in the center of the donut pies.\n",
|
2119 |
+
" annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
|
2120 |
+
" dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
|
2121 |
+
"fig.show()"
|
2122 |
+
]
|
2123 |
+
},
|
2124 |
+
{
|
2125 |
+
"cell_type": "markdown",
|
2126 |
+
"metadata": {},
|
2127 |
+
"source": [
|
2128 |
+
"### 1.4.5 - Number of Hashtags"
|
2129 |
+
]
|
2130 |
+
},
|
2131 |
+
{
|
2132 |
+
"cell_type": "code",
|
2133 |
+
"execution_count": null,
|
2134 |
+
"metadata": {},
|
2135 |
+
"outputs": [],
|
2136 |
+
"source": [
|
2137 |
+
"covid_dataset_with_users.loc[:, \"nb_of_hashtags\"] = covid_dataset_with_users.hashtags.apply(lambda x: len(x) if np.all(x) else 0)"
|
2138 |
+
]
|
2139 |
+
},
|
2140 |
+
{
|
2141 |
+
"cell_type": "code",
|
2142 |
+
"execution_count": null,
|
2143 |
+
"metadata": {},
|
2144 |
+
"outputs": [],
|
2145 |
+
"source": [
|
2146 |
+
"labels = [\"Hashtags\", \"No Hashtags\"]\n",
|
2147 |
+
"viral_has_hashtags = len(covid_dataset_with_users[(covid_dataset_with_users.viral) & (covid_dataset_with_users.nb_of_hashtags >= 1)])\n",
|
2148 |
+
"viral_no_hashtags = len(covid_dataset_with_users[(covid_dataset_with_users.viral) & (covid_dataset_with_users.nb_of_hashtags == 0)])\n",
|
2149 |
+
"normal_has_hashtags = len(covid_dataset_with_users[(~covid_dataset_with_users.viral) & (covid_dataset_with_users.nb_of_hashtags > 1)])\n",
|
2150 |
+
"normal_no_hashtags = len(covid_dataset_with_users[(~covid_dataset_with_users.viral) & (covid_dataset_with_users.nb_of_hashtags == 0)])\n",
|
2151 |
+
"\n",
|
2152 |
+
"\n",
|
2153 |
+
"# Create subplots: use 'domain' type for Pie subplot\n",
|
2154 |
+
"fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
|
2155 |
+
"fig.add_trace(go.Pie(labels=labels, values=[viral_has_hashtags, viral_no_hashtags], name=\"Viral with Hashtags\"),\n",
|
2156 |
+
" 1, 1)\n",
|
2157 |
+
"fig.add_trace(go.Pie(labels=labels, values=[normal_has_hashtags, normal_no_hashtags], name=\"Tweet with No Hashtags\"),\n",
|
2158 |
+
" 1, 2)\n",
|
2159 |
+
"\n",
|
2160 |
+
"# Use `hole` to create a donut-like pie chart\n",
|
2161 |
+
"fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
|
2162 |
+
"\n",
|
2163 |
+
"fig.update_layout(\n",
|
2164 |
+
" width=1000,\n",
|
2165 |
+
" height=500,\n",
|
2166 |
+
" title_text=\"Percentage of tweets with hashtags\",\n",
|
2167 |
+
" # Add annotations in the center of the donut pies.\n",
|
2168 |
+
" annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
|
2169 |
+
" dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
|
2170 |
+
"fig.show()"
|
2171 |
+
]
|
2172 |
+
},
|
2173 |
+
{
|
2174 |
+
"cell_type": "markdown",
|
2175 |
+
"metadata": {},
|
2176 |
+
"source": [
|
2177 |
+
"### 1.4.6 - Verified Account"
|
2178 |
+
]
|
2179 |
+
},
|
2180 |
+
{
|
2181 |
+
"cell_type": "code",
|
2182 |
+
"execution_count": null,
|
2183 |
+
"metadata": {},
|
2184 |
+
"outputs": [],
|
2185 |
+
"source": [
|
2186 |
+
"# Verified vs. non-verified accounts\n",
|
2187 |
+
"labels = [\"Verified\", \"Not verified\"]\n",
|
2188 |
+
"viral_is_verified = len(covid_dataset_with_users[(covid_dataset_with_users.viral) & (covid_dataset_with_users.verified)])\n",
|
2189 |
+
"viral_not_verified = len(covid_dataset_with_users[(covid_dataset_with_users.viral) & (~covid_dataset_with_users.verified)])\n",
|
2190 |
+
"normal_is_verified = len(covid_dataset_with_users[(~covid_dataset_with_users.viral) & (covid_dataset_with_users.verified)])\n",
|
2191 |
+
"normal_not_verified = len(covid_dataset_with_users[(~covid_dataset_with_users.viral) & (~covid_dataset_with_users.verified)])\n",
|
2192 |
+
"\n",
|
2193 |
+
"\n",
|
2194 |
+
"# Create subplots: use 'domain' type for Pie subplot\n",
|
2195 |
+
"fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n",
|
2196 |
+
"fig.add_trace(go.Pie(labels=labels, values=[viral_is_verified, viral_not_verified], name=\"Viral with verified accounts\"),\n",
|
2197 |
+
" 1, 1)\n",
|
2198 |
+
"fig.add_trace(go.Pie(labels=labels, values=[normal_is_verified, normal_not_verified], name=\"Tweet with an unverified account\"),\n",
|
2199 |
+
" 1, 2)\n",
|
2200 |
+
"\n",
|
2201 |
+
"# Use `hole` to create a donut-like pie chart\n",
|
2202 |
+
"fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\")\n",
|
2203 |
+
"\n",
|
2204 |
+
"fig.update_layout(\n",
|
2205 |
+
" width=1000,\n",
|
2206 |
+
" height=500,\n",
|
2207 |
+
" title_text=\"Percentage of tweets from verified accounts\",\n",
|
2208 |
+
" # Add annotations in the center of the donut pies.\n",
|
2209 |
+
" annotations=[dict(text='Viral', x=0.18, y=0.5, font_size=20, showarrow=False),\n",
|
2210 |
+
" dict(text='Non-Viral', x=0.82, y=0.5, font_size=20, showarrow=False)])\n",
|
2211 |
+
"fig.show()"
|
2212 |
+
]
|
2213 |
+
},
|
2214 |
+
{
|
2215 |
+
"cell_type": "markdown",
|
2216 |
+
"metadata": {},
|
2217 |
+
"source": [
|
2218 |
+
"### 1.4.7 - Save dataframe with analysis to disk"
|
2219 |
+
]
|
2220 |
+
},
|
2221 |
+
{
|
2222 |
+
"cell_type": "code",
|
2223 |
+
"execution_count": null,
|
2224 |
+
"metadata": {},
|
2225 |
+
"outputs": [],
|
2226 |
+
"source": [
|
2227 |
+
"covid_dataset_with_users.to_parquet(f'{PROCESSED_PATH_COVID}/all_english_tweets_with_users_with_sentiment.parquet.gzip', index=False, compression=\"gzip\")"
|
2228 |
+
]
|
2229 |
+
},
|
2230 |
+
{
|
2231 |
+
"cell_type": "markdown",
|
2232 |
+
"metadata": {},
|
2233 |
+
"source": [
|
2234 |
+
"Questions for TJ:\n",
|
2235 |
+
"\n",
|
2236 |
+
"Learn threshold? Use unsupervised learning (anomaly detection), x axis date y retweet count, isolation coordinate\n",
|
2237 |
+
"Ratio\n",
|
2238 |
+
"Try to come up with Different metrics (one cannot be used for second dataset)\n",
|
2239 |
+
"\n",
|
2240 |
+
"Preprocessing:\n",
|
2241 |
+
" - Remove tweets with no retweets or likes? NO\n",
|
2242 |
+
" - Define threshold using the metric? DONE (label above viral tweet)\n",
|
2243 |
+
" - Skewed distribution if we use only Twitter viral tweets (1000) DONE\n",
|
2244 |
+
"- Which features? (Any new ideas)\n",
|
2245 |
+
" - Topic\n",
|
2246 |
+
" - Hashtags relevant? (Most likely different from coronavirus and we already have topics).\n",
|
2247 |
+
" - Has media\n",
|
2248 |
+
" - Sentiment? [TODO]\n",
|
2249 |
+
" - Tweet length [TODO]\n",
|
2250 |
+
" - RETRIEVE USERS THAT LIKED OR RETWEETED USING API [TODO]\n",
|
2251 |
+
" - Word cloud of entities [TODO]\n",
|
2252 |
+
"- Check bigrams and trigrams distribution\n",
|
2253 |
+
"- Normalize features (like, retweets, reply etc...)? DEPENDS, Included in first model, will be removed from second model with covid set.\n",
|
2254 |
+
"- BertTweet [DO NOT REMOVE STOP WORDS FOR LANGUAGE MODELS, FOR ]\n",
|
2255 |
+
"- Next steps (now that data collection part is done and data analysis almost done)\n",
|
2256 |
+
" - Hydrate Covid dataset id\n",
|
2257 |
+
"- Viral generator (Trump generator)\n",
|
2258 |
+
"\n",
|
2259 |
+
"1st classifier: hashtags, twitter entities (context annotations, domain annotations, entities), mentions, domain of urls (youtube.com let’s say)\n",
|
2260 |
+
"2nd classifier: bag of words with tf-idf, remove stopwords and other entities that you used in the 1st classifier\n",
|
2261 |
+
"3rd: language model\n"
|
2262 |
+
]
|
2263 |
+
},
|
2264 |
+
{
|
2265 |
+
"cell_type": "markdown",
|
2266 |
+
"metadata": {},
|
2267 |
+
"source": []
|
2268 |
+
},
|
2269 |
+
{
|
2270 |
+
"cell_type": "code",
|
2271 |
+
"execution_count": null,
|
2272 |
+
"metadata": {},
|
2273 |
+
"outputs": [],
|
2274 |
+
"source": []
|
2275 |
+
}
|
2276 |
+
],
|
2277 |
+
"metadata": {
|
2278 |
+
"kernelspec": {
|
2279 |
+
"display_name": "Python 3 (ipykernel)",
|
2280 |
+
"language": "python",
|
2281 |
+
"name": "python3"
|
2282 |
+
},
|
2283 |
+
"language_info": {
|
2284 |
+
"codemirror_mode": {
|
2285 |
+
"name": "ipython",
|
2286 |
+
"version": 3
|
2287 |
+
},
|
2288 |
+
"file_extension": ".py",
|
2289 |
+
"mimetype": "text/x-python",
|
2290 |
+
"name": "python",
|
2291 |
+
"nbconvert_exporter": "python",
|
2292 |
+
"pygments_lexer": "ipython3",
|
2293 |
+
"version": "3.8.15"
|
2294 |
+
},
|
2295 |
+
"vscode": {
|
2296 |
+
"interpreter": {
|
2297 |
+
"hash": "71d2f77bccee14ca7852d7b7a1fa8ea4708b81087104d93973081337557f0ee6"
|
2298 |
+
}
|
2299 |
+
}
|
2300 |
+
},
|
2301 |
+
"nbformat": 4,
|
2302 |
+
"nbformat_minor": 4
|
2303 |
+
}
|
metric_analysis/viral_tweet_user_exploration.ipynb
ADDED
@@ -0,0 +1,1208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"## Viral Tweets: User exploration"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "markdown",
|
12 |
+
"metadata": {},
|
13 |
+
"source": [
|
14 |
+
"In this notebook, we will explore the users who have tweeted viral tweets. Namely, we will focus our analysis on the viral tweets from the user's point of view. For example, we'll examine the popularity of a user vs. the popularity of their tweets, look at the history of their tweets, and analyze any marked changes in the tweets' features when they became viral, etc."
|
15 |
+
]
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"cell_type": "markdown",
|
19 |
+
"metadata": {},
|
20 |
+
"source": [
|
21 |
+
"## 0 - Setup"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"cell_type": "code",
|
26 |
+
"execution_count": 1,
|
27 |
+
"metadata": {},
|
28 |
+
"outputs": [],
|
29 |
+
"source": [
|
30 |
+
"import pandas as pd\n",
|
31 |
+
"import seaborn as sns\n",
|
32 |
+
"import numpy as np\n",
|
33 |
+
"\n",
|
34 |
+
"import matplotlib.pyplot as plt\n",
|
35 |
+
"%matplotlib inline\n",
|
36 |
+
"\n",
|
37 |
+
"from tqdm import tqdm\n",
|
38 |
+
"\n",
|
39 |
+
"#pd.set_option('display.max_rows', None)\n",
|
40 |
+
"pd.set_option('display.max_columns', None)\n",
|
41 |
+
"\n",
|
42 |
+
"DATA_PATH = \"../../data\"\n",
|
43 |
+
"VIRAL_TWEETS_PATH = f\"{DATA_PATH}/viral_users\""
|
44 |
+
]
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"cell_type": "code",
|
48 |
+
"execution_count": 2,
|
49 |
+
"metadata": {},
|
50 |
+
"outputs": [],
|
51 |
+
"source": [
|
52 |
+
"from helper.twitter_client_wrapper import TwitterClientWrapper, EXPANSIONS, MEDIA_FIELDS, TWEET_FIELDS, USER_FIELDS\n",
|
53 |
+
"\n",
|
54 |
+
"twitter_client_wrapper = TwitterClientWrapper(\"../../api_key.yaml\", wait_on_rate_limit=False)"
|
55 |
+
]
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"cell_type": "markdown",
|
59 |
+
"metadata": {},
|
60 |
+
"source": [
|
61 |
+
"## 1 - Retrieve the data from disk"
|
62 |
+
]
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"cell_type": "markdown",
|
66 |
+
"metadata": {},
|
67 |
+
"source": [
|
68 |
+
"### 1.1 Retrieve the viral tweets data"
|
69 |
+
]
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"cell_type": "markdown",
|
73 |
+
"metadata": {},
|
74 |
+
"source": [
|
75 |
+
"**Note**: You may notice that not all tweets have been retrieved, since some may have been deleted since they were scraped."
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "markdown",
|
80 |
+
"metadata": {},
|
81 |
+
"source": [
|
82 |
+
"**Note 2**: Also keep in mind that when retrieving users, the number of users may be lower than the number of viral tweets, because a user may have two or more viral tweets in the sample of viral tweets we have."
|
83 |
+
]
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"cell_type": "code",
|
87 |
+
"execution_count": 3,
|
88 |
+
"metadata": {},
|
89 |
+
"outputs": [],
|
90 |
+
"source": [
|
91 |
+
"# dtypes={\"id\": str, \"author_id\": str, \"has_media\": bool, \"possibly_sensitive\": bool}\n",
|
92 |
+
"dtypes={\"id\": str, \"author_id\": str}"
|
93 |
+
]
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"cell_type": "code",
|
97 |
+
"execution_count": 4,
|
98 |
+
"metadata": {},
|
99 |
+
"outputs": [
|
100 |
+
{
|
101 |
+
"name": "stderr",
|
102 |
+
"output_type": "stream",
|
103 |
+
"text": [
|
104 |
+
"C:\\Users\\steph\\AppData\\Local\\Temp\\ipykernel_18728\\1524257405.py:2: DtypeWarning: Columns (3,8,14,17,18,19,20,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
105 |
+
" viral_tweets_df = pd.read_csv(f\"{VIRAL_TWEETS_PATH}/all_tweets.csv\", dtype=dtypes, escapechar='\\\\', encoding='utf-8')\n"
|
106 |
+
]
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"data": {
|
110 |
+
"text/html": [
|
111 |
+
"<div>\n",
|
112 |
+
"<style scoped>\n",
|
113 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
114 |
+
" vertical-align: middle;\n",
|
115 |
+
" }\n",
|
116 |
+
"\n",
|
117 |
+
" .dataframe tbody tr th {\n",
|
118 |
+
" vertical-align: top;\n",
|
119 |
+
" }\n",
|
120 |
+
"\n",
|
121 |
+
" .dataframe thead th {\n",
|
122 |
+
" text-align: right;\n",
|
123 |
+
" }\n",
|
124 |
+
"</style>\n",
|
125 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
126 |
+
" <thead>\n",
|
127 |
+
" <tr style=\"text-align: right;\">\n",
|
128 |
+
" <th></th>\n",
|
129 |
+
" <th>created_at</th>\n",
|
130 |
+
" <th>author_id</th>\n",
|
131 |
+
" <th>text</th>\n",
|
132 |
+
" <th>possibly_sensitive</th>\n",
|
133 |
+
" <th>edit_history_tweet_ids</th>\n",
|
134 |
+
" <th>lang</th>\n",
|
135 |
+
" <th>id</th>\n",
|
136 |
+
" <th>mentions</th>\n",
|
137 |
+
" <th>retweet_count</th>\n",
|
138 |
+
" <th>reply_count</th>\n",
|
139 |
+
" <th>like_count</th>\n",
|
140 |
+
" <th>quote_count</th>\n",
|
141 |
+
" <th>context_annotations</th>\n",
|
142 |
+
" <th>urls</th>\n",
|
143 |
+
" <th>has_media</th>\n",
|
144 |
+
" <th>annotations</th>\n",
|
145 |
+
" <th>hashtags</th>\n",
|
146 |
+
" <th>attachments.poll_ids</th>\n",
|
147 |
+
" <th>withheld.copyright</th>\n",
|
148 |
+
" <th>withheld.country_codes</th>\n",
|
149 |
+
" <th>withheld.scope</th>\n",
|
150 |
+
" <th>cashtags</th>\n",
|
151 |
+
" <th>geo.place_id</th>\n",
|
152 |
+
" <th>geo.coordinates.type</th>\n",
|
153 |
+
" <th>geo.coordinates.coordinates</th>\n",
|
154 |
+
" </tr>\n",
|
155 |
+
" </thead>\n",
|
156 |
+
" <tbody>\n",
|
157 |
+
" <tr>\n",
|
158 |
+
" <th>0</th>\n",
|
159 |
+
" <td>2022-10-31T03:21:11.000Z</td>\n",
|
160 |
+
" <td>1047733077898739712</td>\n",
|
161 |
+
" <td>@manjirosx you too jiro🫶🏽</td>\n",
|
162 |
+
" <td>False</td>\n",
|
163 |
+
" <td>['1586921195059834880']</td>\n",
|
164 |
+
" <td>en</td>\n",
|
165 |
+
" <td>1586921195059834880</td>\n",
|
166 |
+
" <td>[{'start': 0, 'end': 10, 'username': 'manjiros...</td>\n",
|
167 |
+
" <td>0.0</td>\n",
|
168 |
+
" <td>0.0</td>\n",
|
169 |
+
" <td>1.0</td>\n",
|
170 |
+
" <td>0.0</td>\n",
|
171 |
+
" <td>NaN</td>\n",
|
172 |
+
" <td>NaN</td>\n",
|
173 |
+
" <td>False</td>\n",
|
174 |
+
" <td>NaN</td>\n",
|
175 |
+
" <td>NaN</td>\n",
|
176 |
+
" <td>NaN</td>\n",
|
177 |
+
" <td>NaN</td>\n",
|
178 |
+
" <td>NaN</td>\n",
|
179 |
+
" <td>NaN</td>\n",
|
180 |
+
" <td>NaN</td>\n",
|
181 |
+
" <td>NaN</td>\n",
|
182 |
+
" <td>NaN</td>\n",
|
183 |
+
" <td>NaN</td>\n",
|
184 |
+
" </tr>\n",
|
185 |
+
" <tr>\n",
|
186 |
+
" <th>1</th>\n",
|
187 |
+
" <td>2022-10-31T03:13:57.000Z</td>\n",
|
188 |
+
" <td>1047733077898739712</td>\n",
|
189 |
+
" <td>@ilyicey u omd</td>\n",
|
190 |
+
" <td>False</td>\n",
|
191 |
+
" <td>['1586919376086704129']</td>\n",
|
192 |
+
" <td>nl</td>\n",
|
193 |
+
" <td>1586919376086704129</td>\n",
|
194 |
+
" <td>[{'start': 0, 'end': 8, 'username': 'ilyicey',...</td>\n",
|
195 |
+
" <td>0.0</td>\n",
|
196 |
+
" <td>0.0</td>\n",
|
197 |
+
" <td>0.0</td>\n",
|
198 |
+
" <td>0.0</td>\n",
|
199 |
+
" <td>NaN</td>\n",
|
200 |
+
" <td>NaN</td>\n",
|
201 |
+
" <td>False</td>\n",
|
202 |
+
" <td>NaN</td>\n",
|
203 |
+
" <td>NaN</td>\n",
|
204 |
+
" <td>NaN</td>\n",
|
205 |
+
" <td>NaN</td>\n",
|
206 |
+
" <td>NaN</td>\n",
|
207 |
+
" <td>NaN</td>\n",
|
208 |
+
" <td>NaN</td>\n",
|
209 |
+
" <td>NaN</td>\n",
|
210 |
+
" <td>NaN</td>\n",
|
211 |
+
" <td>NaN</td>\n",
|
212 |
+
" </tr>\n",
|
213 |
+
" <tr>\n",
|
214 |
+
" <th>2</th>\n",
|
215 |
+
" <td>2022-10-31T03:13:24.000Z</td>\n",
|
216 |
+
" <td>1047733077898739712</td>\n",
|
217 |
+
" <td>@ilyicey i’m fine</td>\n",
|
218 |
+
" <td>False</td>\n",
|
219 |
+
" <td>['1586919239243296768']</td>\n",
|
220 |
+
" <td>en</td>\n",
|
221 |
+
" <td>1586919239243296768</td>\n",
|
222 |
+
" <td>[{'start': 0, 'end': 8, 'username': 'ilyicey',...</td>\n",
|
223 |
+
" <td>1.0</td>\n",
|
224 |
+
" <td>1.0</td>\n",
|
225 |
+
" <td>2.0</td>\n",
|
226 |
+
" <td>0.0</td>\n",
|
227 |
+
" <td>NaN</td>\n",
|
228 |
+
" <td>NaN</td>\n",
|
229 |
+
" <td>False</td>\n",
|
230 |
+
" <td>NaN</td>\n",
|
231 |
+
" <td>NaN</td>\n",
|
232 |
+
" <td>NaN</td>\n",
|
233 |
+
" <td>NaN</td>\n",
|
234 |
+
" <td>NaN</td>\n",
|
235 |
+
" <td>NaN</td>\n",
|
236 |
+
" <td>NaN</td>\n",
|
237 |
+
" <td>NaN</td>\n",
|
238 |
+
" <td>NaN</td>\n",
|
239 |
+
" <td>NaN</td>\n",
|
240 |
+
" </tr>\n",
|
241 |
+
" <tr>\n",
|
242 |
+
" <th>3</th>\n",
|
243 |
+
" <td>2022-10-30T22:49:53.000Z</td>\n",
|
244 |
+
" <td>1047733077898739712</td>\n",
|
245 |
+
" <td>@imVolo_ I’ll unfollow rn</td>\n",
|
246 |
+
" <td>False</td>\n",
|
247 |
+
" <td>['1586852923706732544']</td>\n",
|
248 |
+
" <td>en</td>\n",
|
249 |
+
" <td>1586852923706732544</td>\n",
|
250 |
+
" <td>[{'start': 0, 'end': 8, 'username': 'imVolo_',...</td>\n",
|
251 |
+
" <td>0.0</td>\n",
|
252 |
+
" <td>0.0</td>\n",
|
253 |
+
" <td>3.0</td>\n",
|
254 |
+
" <td>0.0</td>\n",
|
255 |
+
" <td>NaN</td>\n",
|
256 |
+
" <td>NaN</td>\n",
|
257 |
+
" <td>False</td>\n",
|
258 |
+
" <td>NaN</td>\n",
|
259 |
+
" <td>NaN</td>\n",
|
260 |
+
" <td>NaN</td>\n",
|
261 |
+
" <td>NaN</td>\n",
|
262 |
+
" <td>NaN</td>\n",
|
263 |
+
" <td>NaN</td>\n",
|
264 |
+
" <td>NaN</td>\n",
|
265 |
+
" <td>NaN</td>\n",
|
266 |
+
" <td>NaN</td>\n",
|
267 |
+
" <td>NaN</td>\n",
|
268 |
+
" </tr>\n",
|
269 |
+
" <tr>\n",
|
270 |
+
" <th>4</th>\n",
|
271 |
+
" <td>2022-10-30T22:45:33.000Z</td>\n",
|
272 |
+
" <td>1047733077898739712</td>\n",
|
273 |
+
" <td>“what do you want to be for halloween?” his li...</td>\n",
|
274 |
+
" <td>False</td>\n",
|
275 |
+
" <td>['1586851830767591424']</td>\n",
|
276 |
+
" <td>en</td>\n",
|
277 |
+
" <td>1586851830767591424</td>\n",
|
278 |
+
" <td>NaN</td>\n",
|
279 |
+
" <td>611.0</td>\n",
|
280 |
+
" <td>19.0</td>\n",
|
281 |
+
" <td>4132.0</td>\n",
|
282 |
+
" <td>55.0</td>\n",
|
283 |
+
" <td>[{'domain': {'id': '29', 'name': 'Events [Enti...</td>\n",
|
284 |
+
" <td>NaN</td>\n",
|
285 |
+
" <td>False</td>\n",
|
286 |
+
" <td>NaN</td>\n",
|
287 |
+
" <td>NaN</td>\n",
|
288 |
+
" <td>NaN</td>\n",
|
289 |
+
" <td>NaN</td>\n",
|
290 |
+
" <td>NaN</td>\n",
|
291 |
+
" <td>NaN</td>\n",
|
292 |
+
" <td>NaN</td>\n",
|
293 |
+
" <td>NaN</td>\n",
|
294 |
+
" <td>NaN</td>\n",
|
295 |
+
" <td>NaN</td>\n",
|
296 |
+
" </tr>\n",
|
297 |
+
" </tbody>\n",
|
298 |
+
"</table>\n",
|
299 |
+
"</div>"
|
300 |
+
],
|
301 |
+
"text/plain": [
|
302 |
+
" created_at author_id \\\n",
|
303 |
+
"0 2022-10-31T03:21:11.000Z 1047733077898739712 \n",
|
304 |
+
"1 2022-10-31T03:13:57.000Z 1047733077898739712 \n",
|
305 |
+
"2 2022-10-31T03:13:24.000Z 1047733077898739712 \n",
|
306 |
+
"3 2022-10-30T22:49:53.000Z 1047733077898739712 \n",
|
307 |
+
"4 2022-10-30T22:45:33.000Z 1047733077898739712 \n",
|
308 |
+
"\n",
|
309 |
+
" text possibly_sensitive \\\n",
|
310 |
+
"0 @manjirosx you too jiro🫶🏽 False \n",
|
311 |
+
"1 @ilyicey u omd False \n",
|
312 |
+
"2 @ilyicey i’m fine False \n",
|
313 |
+
"3 @imVolo_ I’ll unfollow rn False \n",
|
314 |
+
"4 “what do you want to be for halloween?” his li... False \n",
|
315 |
+
"\n",
|
316 |
+
" edit_history_tweet_ids lang id \\\n",
|
317 |
+
"0 ['1586921195059834880'] en 1586921195059834880 \n",
|
318 |
+
"1 ['1586919376086704129'] nl 1586919376086704129 \n",
|
319 |
+
"2 ['1586919239243296768'] en 1586919239243296768 \n",
|
320 |
+
"3 ['1586852923706732544'] en 1586852923706732544 \n",
|
321 |
+
"4 ['1586851830767591424'] en 1586851830767591424 \n",
|
322 |
+
"\n",
|
323 |
+
" mentions retweet_count \\\n",
|
324 |
+
"0 [{'start': 0, 'end': 10, 'username': 'manjiros... 0.0 \n",
|
325 |
+
"1 [{'start': 0, 'end': 8, 'username': 'ilyicey',... 0.0 \n",
|
326 |
+
"2 [{'start': 0, 'end': 8, 'username': 'ilyicey',... 1.0 \n",
|
327 |
+
"3 [{'start': 0, 'end': 8, 'username': 'imVolo_',... 0.0 \n",
|
328 |
+
"4 NaN 611.0 \n",
|
329 |
+
"\n",
|
330 |
+
" reply_count like_count quote_count \\\n",
|
331 |
+
"0 0.0 1.0 0.0 \n",
|
332 |
+
"1 0.0 0.0 0.0 \n",
|
333 |
+
"2 1.0 2.0 0.0 \n",
|
334 |
+
"3 0.0 3.0 0.0 \n",
|
335 |
+
"4 19.0 4132.0 55.0 \n",
|
336 |
+
"\n",
|
337 |
+
" context_annotations urls has_media \\\n",
|
338 |
+
"0 NaN NaN False \n",
|
339 |
+
"1 NaN NaN False \n",
|
340 |
+
"2 NaN NaN False \n",
|
341 |
+
"3 NaN NaN False \n",
|
342 |
+
"4 [{'domain': {'id': '29', 'name': 'Events [Enti... NaN False \n",
|
343 |
+
"\n",
|
344 |
+
" annotations hashtags attachments.poll_ids withheld.copyright \\\n",
|
345 |
+
"0 NaN NaN NaN NaN \n",
|
346 |
+
"1 NaN NaN NaN NaN \n",
|
347 |
+
"2 NaN NaN NaN NaN \n",
|
348 |
+
"3 NaN NaN NaN NaN \n",
|
349 |
+
"4 NaN NaN NaN NaN \n",
|
350 |
+
"\n",
|
351 |
+
" withheld.country_codes withheld.scope cashtags geo.place_id \\\n",
|
352 |
+
"0 NaN NaN NaN NaN \n",
|
353 |
+
"1 NaN NaN NaN NaN \n",
|
354 |
+
"2 NaN NaN NaN NaN \n",
|
355 |
+
"3 NaN NaN NaN NaN \n",
|
356 |
+
"4 NaN NaN NaN NaN \n",
|
357 |
+
"\n",
|
358 |
+
" geo.coordinates.type geo.coordinates.coordinates \n",
|
359 |
+
"0 NaN NaN \n",
|
360 |
+
"1 NaN NaN \n",
|
361 |
+
"2 NaN NaN \n",
|
362 |
+
"3 NaN NaN \n",
|
363 |
+
"4 NaN NaN "
|
364 |
+
]
|
365 |
+
},
|
366 |
+
"execution_count": 4,
|
367 |
+
"metadata": {},
|
368 |
+
"output_type": "execute_result"
|
369 |
+
}
|
370 |
+
],
|
371 |
+
"source": [
|
372 |
+
"# Import tweets first\n",
|
373 |
+
"viral_tweets_df = pd.read_csv(f\"{VIRAL_TWEETS_PATH}/all_tweets.csv\", dtype=dtypes, escapechar='\\\\', encoding='utf-8')\n",
|
374 |
+
"# viral_tweets_df = pd.read_csv(f\"{VIRAL_TWEETS_PATH}/all_tweets.csv\", dtype=dtypes)\n",
|
375 |
+
"viral_tweets_df.head()"
|
376 |
+
]
|
377 |
+
},
|
378 |
+
{
|
379 |
+
"cell_type": "code",
|
380 |
+
"execution_count": 5,
|
381 |
+
"metadata": {},
|
382 |
+
"outputs": [
|
383 |
+
{
|
384 |
+
"data": {
|
385 |
+
"text/plain": [
|
386 |
+
"'RT @strbrkrr: apple be like \"high volume may damage your ears...\" ok… i don’t care'"
|
387 |
+
]
|
388 |
+
},
|
389 |
+
"execution_count": 5,
|
390 |
+
"metadata": {},
|
391 |
+
"output_type": "execute_result"
|
392 |
+
}
|
393 |
+
],
|
394 |
+
"source": [
|
395 |
+
"viral_tweets_df[~viral_tweets_df.annotations.isna()].text.iloc[10]"
|
396 |
+
]
|
397 |
+
},
|
398 |
+
{
|
399 |
+
"cell_type": "markdown",
|
400 |
+
"metadata": {},
|
401 |
+
"source": [
|
402 |
+
"### 1.2 - Retrieve viral tweets users"
|
403 |
+
]
|
404 |
+
},
|
405 |
+
{
|
406 |
+
"cell_type": "markdown",
|
407 |
+
"metadata": {},
|
408 |
+
"source": [
|
409 |
+
"We start by retrieving the viral tweets users. Users are **included as expansions** when retrieving the tweets, conveniently so. For each user, we retrieve this user's history and information."
|
410 |
+
]
|
411 |
+
},
|
412 |
+
{
|
413 |
+
"cell_type": "code",
|
414 |
+
"execution_count": null,
|
415 |
+
"metadata": {},
|
416 |
+
"outputs": [],
|
417 |
+
"source": [
|
418 |
+
"# Retrieve the user id. The user data is included in the 'includes' field, which we get if we request any expansions\n",
|
419 |
+
"users_df = pd.read_csv(f\"{VIRAL_TWEETS_PATH}/users.csv\", dtype={\"id\": str, \"pinned_tweet_id\": str}, escapechar=\"\\\\\")\n",
|
420 |
+
"users_df"
|
421 |
+
]
|
422 |
+
},
|
423 |
+
{
|
424 |
+
"cell_type": "code",
|
425 |
+
"execution_count": null,
|
426 |
+
"metadata": {},
|
427 |
+
"outputs": [],
|
428 |
+
"source": [
|
429 |
+
"'''\n",
|
430 |
+
"id object\n",
|
431 |
+
"edit_history_tweet_ids object\n",
|
432 |
+
"author_id object\n",
|
433 |
+
"created_at object\n",
|
434 |
+
"possibly_sensitive bool\n",
|
435 |
+
"text object\n",
|
436 |
+
"retweet_count int64\n",
|
437 |
+
"reply_count int64\n",
|
438 |
+
"like_count int64\n",
|
439 |
+
"quote_count int64\n",
|
440 |
+
"has_media bool\n",
|
441 |
+
"urls object\n",
|
442 |
+
"context_annotations object\n",
|
443 |
+
"annotations object\n",
|
444 |
+
"hashtags object\n",
|
445 |
+
"geo.place_id object\n",
|
446 |
+
"mentions object\n",
|
447 |
+
"dtype: object\n",
|
448 |
+
"'''\n",
|
449 |
+
"viral_tweets_df.dtypes"
|
450 |
+
]
|
451 |
+
},
|
452 |
+
{
|
453 |
+
"cell_type": "markdown",
|
454 |
+
"metadata": {},
|
455 |
+
"source": [
|
456 |
+
"## 2 - Analysis of single user"
|
457 |
+
]
|
458 |
+
},
|
459 |
+
{
|
460 |
+
"cell_type": "markdown",
|
461 |
+
"metadata": {},
|
462 |
+
"source": [
|
463 |
+
"Let's observe the tweets of a single user who has tweeted viral tweets. We'll try to conduct some analysis on their features to see what changed in the tweets of the user over time, and how they reflect the changes in the behaviour of the user."
|
464 |
+
]
|
465 |
+
},
|
466 |
+
{
|
467 |
+
"cell_type": "code",
|
468 |
+
"execution_count": null,
|
469 |
+
"metadata": {},
|
470 |
+
"outputs": [],
|
471 |
+
"source": [
|
472 |
+
"# Take first user\n",
|
473 |
+
"user_id = users_df.iloc[0].id"
|
474 |
+
]
|
475 |
+
},
|
476 |
+
{
|
477 |
+
"cell_type": "code",
|
478 |
+
"execution_count": null,
|
479 |
+
"metadata": {},
|
480 |
+
"outputs": [],
|
481 |
+
"source": [
|
482 |
+
"user_tweets = viral_tweets_df[viral_tweets_df.author_id == user_id]\n",
|
483 |
+
"user_tweets['created_at'] = pd.to_datetime(user_tweets.created_at)\n",
|
484 |
+
"user_tweets.head()"
|
485 |
+
]
|
486 |
+
},
|
487 |
+
{
|
488 |
+
"cell_type": "code",
|
489 |
+
"execution_count": null,
|
490 |
+
"metadata": {},
|
491 |
+
"outputs": [],
|
492 |
+
"source": [
|
493 |
+
"fig, ax = plt.subplots(1, 2, figsize=(10,5))\n",
|
494 |
+
"\n",
|
495 |
+
"ax[0].set_title(\"Retweet Count vs Tweet Date\")\n",
|
496 |
+
"sns.lineplot(user_tweets, x='created_at', y='retweet_count', ax=ax[0])\n",
|
497 |
+
"\n",
|
498 |
+
"ax[1].set_title(\"Like Count vs Tweet Date\")\n",
|
499 |
+
"sns.lineplot(user_tweets, x='created_at', y='like_count', ax=ax[1])\n",
|
500 |
+
"\n",
|
501 |
+
"plt.tight_layout()"
|
502 |
+
]
|
503 |
+
},
|
504 |
+
{
|
505 |
+
"cell_type": "code",
|
506 |
+
"execution_count": null,
|
507 |
+
"metadata": {},
|
508 |
+
"outputs": [],
|
509 |
+
"source": [
|
510 |
+
"fig, ax = plt.subplots(1, 2, figsize=(10,5))\n",
|
511 |
+
"\n",
|
512 |
+
"user_tweets['tweet_length'] = user_tweets['text'].apply(len)\n",
|
513 |
+
"\n",
|
514 |
+
"ax[0].set_title(\"Retweet Count vs Tweet Length\")\n",
|
515 |
+
"sns.lineplot(user_tweets, x='tweet_length', y='retweet_count', ax=ax[0])\n",
|
516 |
+
"\n",
|
517 |
+
"ax[1].set_title(\"Like Count vs Tweet Length\")\n",
|
518 |
+
"sns.lineplot(user_tweets, x='tweet_length', y='like_count', ax=ax[1])\n",
|
519 |
+
"\n",
|
520 |
+
"plt.tight_layout()"
|
521 |
+
]
|
522 |
+
},
|
523 |
+
{
|
524 |
+
"cell_type": "code",
|
525 |
+
"execution_count": null,
|
526 |
+
"metadata": {},
|
527 |
+
"outputs": [],
|
528 |
+
"source": [
|
529 |
+
"# Has media\n",
|
530 |
+
"sns.jointplot(user_tweets, x='has_media', y='retweet_count')\n",
|
531 |
+
"\n",
|
532 |
+
"plt.suptitle(\"# Retweets vs Tweet has media\")\n",
|
533 |
+
"plt.tight_layout()"
|
534 |
+
]
|
535 |
+
},
|
536 |
+
{
|
537 |
+
"cell_type": "code",
|
538 |
+
"execution_count": null,
|
539 |
+
"metadata": {},
|
540 |
+
"outputs": [],
|
541 |
+
"source": [
|
542 |
+
"sns.pairplot(user_tweets[['tweet_length', 'has_media', 'retweet_count', 'like_count']])"
|
543 |
+
]
|
544 |
+
},
|
545 |
+
{
|
546 |
+
"cell_type": "code",
|
547 |
+
"execution_count": null,
|
548 |
+
"metadata": {},
|
549 |
+
"outputs": [],
|
550 |
+
"source": [
|
551 |
+
"fig, ax = plt.subplots(2, 2, figsize=(10,5))\n",
|
552 |
+
"\n",
|
553 |
+
"user_tweets['tweet_length'] = user_tweets['text'].apply(len)\n",
|
554 |
+
"\n",
|
555 |
+
"ax[0][0].set_title(\"Retweet Count vs Date\")\n",
|
556 |
+
"sns.lineplot(user_tweets, x='created_at', y='retweet_count', ax=ax[0][0])\n",
|
557 |
+
"\n",
|
558 |
+
"ax[0][1].set_title(\"Like Count vs Date\")\n",
|
559 |
+
"sns.lineplot(user_tweets, x='created_at', y='like_count', ax=ax[0][1])\n",
|
560 |
+
"\n",
|
561 |
+
"ax[1][0].set_title(\"Has Media vs Date\")\n",
|
562 |
+
"sns.scatterplot(user_tweets, x='created_at', y='has_media', ax=ax[1][0])\n",
|
563 |
+
"\n",
|
564 |
+
"ax[1][1].set_title(\"Tweet Length vs Date\")\n",
|
565 |
+
"sns.scatterplot(user_tweets, x='created_at', y='tweet_length', ax=ax[1][1])\n",
|
566 |
+
"\n",
|
567 |
+
"plt.tight_layout()"
|
568 |
+
]
|
569 |
+
},
|
570 |
+
{
|
571 |
+
"cell_type": "code",
|
572 |
+
"execution_count": null,
|
573 |
+
"metadata": {},
|
574 |
+
"outputs": [],
|
575 |
+
"source": [
|
576 |
+
"### TODO: Analyze the change in tweet features depending on date (one row depending on date, other depending on retweet count to reflect the evolution)\n",
|
577 |
+
"### TODO: Concentration on topics [group by topics for a sample user]"
|
578 |
+
]
|
579 |
+
},
|
580 |
+
{
|
581 |
+
"cell_type": "markdown",
|
582 |
+
"metadata": {},
|
583 |
+
"source": [
|
584 |
+
"## 3 - Aggregate Analysis of all viral users tweets"
|
585 |
+
]
|
586 |
+
},
|
587 |
+
{
|
588 |
+
"cell_type": "markdown",
|
589 |
+
"metadata": {},
|
590 |
+
"source": [
|
591 |
+
"#### 3.0 - How many tweets per user retrieved"
|
592 |
+
]
|
593 |
+
},
|
594 |
+
{
|
595 |
+
"cell_type": "code",
|
596 |
+
"execution_count": null,
|
597 |
+
"metadata": {},
|
598 |
+
"outputs": [],
|
599 |
+
"source": [
|
600 |
+
"tweets_per_user = viral_tweets_df.groupby(by='author_id').size().reset_index(name='count')\n",
|
601 |
+
"tweets_per_user.sort_values(by='count')"
|
602 |
+
]
|
603 |
+
},
|
604 |
+
{
|
605 |
+
"cell_type": "code",
|
606 |
+
"execution_count": null,
|
607 |
+
"metadata": {},
|
608 |
+
"outputs": [],
|
609 |
+
"source": [
|
610 |
+
"tweets_per_user.hist(column='count', bins=10)\n",
|
611 |
+
"plt.title(\"Histogram of distribution of number of tweets retrieved per user\")"
|
612 |
+
]
|
613 |
+
},
|
614 |
+
{
|
615 |
+
"cell_type": "markdown",
|
616 |
+
"metadata": {},
|
617 |
+
"source": [
|
618 |
+
"#### 3.1 - Retweet count vs like count"
|
619 |
+
]
|
620 |
+
},
|
621 |
+
{
|
622 |
+
"cell_type": "markdown",
|
623 |
+
"metadata": {},
|
624 |
+
"source": [
|
625 |
+
"In order to come up with a metric for the **virality** of the tweet, we need to know which features we will use to determine this metric. *retweet_count* and *like_count* will surely be among those features selected. Let's see how the two correlate."
|
626 |
+
]
|
627 |
+
},
|
628 |
+
{
|
629 |
+
"cell_type": "markdown",
|
630 |
+
"metadata": {},
|
631 |
+
"source": [
|
632 |
+
"**NOTE**: \"The retweet will not show the likes and replies, only retweet count. You need to get the counts from the original tweet, which would be referenced in referenced_tweets and included in includes.tweets part of the response.\" - Twitter Community"
|
633 |
+
]
|
634 |
+
},
|
635 |
+
{
|
636 |
+
"cell_type": "code",
|
637 |
+
"execution_count": null,
|
638 |
+
"metadata": {},
|
639 |
+
"outputs": [],
|
640 |
+
"source": [
|
641 |
+
"# Remove all tweets that might be retweets of others\n",
|
642 |
+
"retweeted = viral_tweets_df.retweet_count !=0\n",
|
643 |
+
"liked = viral_tweets_df.like_count !=0\n",
|
644 |
+
"original_tweets_df = viral_tweets_df[retweeted & liked]\n",
|
645 |
+
"\n",
|
646 |
+
"# Remove NA in retweet and like count\n",
|
647 |
+
"original_tweets_df = original_tweets_df.dropna(axis=0, subset=['retweet_count', 'like_count'])\n",
|
648 |
+
"\n",
|
649 |
+
"sns.scatterplot(data=original_tweets_df, x='retweet_count', y='like_count')"
|
650 |
+
]
|
651 |
+
},
|
652 |
+
{
|
653 |
+
"cell_type": "markdown",
|
654 |
+
"metadata": {},
|
655 |
+
"source": [
|
656 |
+
"**Finding**: We can see more or less a linear correlation. Especially for lower numbers."
|
657 |
+
]
|
658 |
+
},
|
659 |
+
{
|
660 |
+
"cell_type": "markdown",
|
661 |
+
"metadata": {},
|
662 |
+
"source": [
|
663 |
+
"#### 3.2 - (# Retweets / # followers ) ratio \n"
|
664 |
+
]
|
665 |
+
},
|
666 |
+
{
|
667 |
+
"cell_type": "markdown",
|
668 |
+
"metadata": {},
|
669 |
+
"source": [
|
670 |
+
"Here a viable metric for a viral tweet can be the ratio of the retweet (or like) count to the follower count of the user. The idea here is that a user who doesn't have many followers, but has tweeted tweets that have garnered a lot of retweets or likes, can most definitely be considered \"viral\". On the other hand, a user who has many followers can have a standard high # retweets and those cannot be considered viral all the time."
|
671 |
+
]
|
672 |
+
},
|
673 |
+
{
|
674 |
+
"cell_type": "markdown",
|
675 |
+
"metadata": {},
|
676 |
+
"source": [
|
677 |
+
"**Note**: Historical data for the evolution of the # of followers of a user is not easily available and is not provided by the Twitter API. So these calculated ratios do not reflect the actual ratio at the time the tweet was posted, since the user may have gained a lot of followers since then."
|
678 |
+
]
|
679 |
+
},
|
680 |
+
{
|
681 |
+
"cell_type": "code",
|
682 |
+
"execution_count": null,
|
683 |
+
"metadata": {},
|
684 |
+
"outputs": [],
|
685 |
+
"source": [
|
686 |
+
"viral_tweets_df_subset = original_tweets_df[['id', 'author_id', 'retweet_count', 'like_count']]\n",
|
687 |
+
"\n",
|
688 |
+
"# Remove NA in follower count\n",
|
689 |
+
"users_df_subset = users_df.dropna(axis=0, subset=['followers_count'])\n",
|
690 |
+
"\n",
|
691 |
+
"# Merge both on author id\n",
|
692 |
+
"tweets_users_merged_df = viral_tweets_df_subset.merge(\n",
|
693 |
+
" right=users_df_subset[['id', 'followers_count']].set_index('id'), left_on='author_id', right_on='id')"
|
694 |
+
]
|
695 |
+
},
|
696 |
+
{
|
697 |
+
"cell_type": "code",
|
698 |
+
"execution_count": null,
|
699 |
+
"metadata": {},
|
700 |
+
"outputs": [],
|
701 |
+
"source": [
|
702 |
+
"tweets_users_merged_df['retweets_followers_ratio'] = tweets_users_merged_df['retweet_count'] / tweets_users_merged_df['followers_count']\n",
|
703 |
+
"tweets_users_merged_df.sort_values(by='retweets_followers_ratio')"
|
704 |
+
]
|
705 |
+
},
|
706 |
+
{
|
707 |
+
"cell_type": "code",
|
708 |
+
"execution_count": null,
|
709 |
+
"metadata": {},
|
710 |
+
"outputs": [],
|
711 |
+
"source": [
|
712 |
+
"import plotly.express as px\n",
|
713 |
+
"\n",
|
714 |
+
"df_ratios_bigger_than_1 = tweets_users_merged_df[tweets_users_merged_df.retweets_followers_ratio > 1.0]\n",
|
715 |
+
"fig = px.histogram(\n",
|
716 |
+
" df_ratios_bigger_than_1,\n",
|
717 |
+
" x=\"retweets_followers_ratio\",\n",
|
718 |
+
" nbins=10,\n",
|
719 |
+
" log_y=True)\n",
|
720 |
+
"\n",
|
721 |
+
"fig.update_layout(\n",
|
722 |
+
" title={\n",
|
723 |
+
" 'text': \"Histogram of the distribution of the retweets/followers ratio > 1\",\n",
|
724 |
+
" 'y':0.9,\n",
|
725 |
+
" 'x':0.5,\n",
|
726 |
+
" 'xanchor': 'center',\n",
|
727 |
+
" 'yanchor': 'top'})\n",
|
728 |
+
"\n",
|
729 |
+
"\n",
|
730 |
+
"fig.show()"
|
731 |
+
]
|
732 |
+
},
|
733 |
+
{
|
734 |
+
"cell_type": "markdown",
|
735 |
+
"metadata": {},
|
736 |
+
"source": [
|
737 |
+
"The histogram is not very clear, since we have rare events where the tweets garnered so much popularity with respect to the popularity of the user. Those we can definitely consider as viral. Maybe we can try K-means to better identify these outliers."
|
738 |
+
]
|
739 |
+
},
|
740 |
+
{
|
741 |
+
"cell_type": "code",
|
742 |
+
"execution_count": null,
|
743 |
+
"metadata": {},
|
744 |
+
"outputs": [],
|
745 |
+
"source": [
|
746 |
+
"from sklearn.cluster import KMeans\n",
|
747 |
+
"\n",
|
748 |
+
"n_clusters = 3\n",
|
749 |
+
"X = np.array(df_ratios_bigger_than_1[['retweet_count', 'followers_count']])\n",
|
750 |
+
"#X = np.vstack((df_ratios_bigger_than_1.retweet_count.to_numpy(), df_ratios_bigger_than_1.followers_count.to_numpy()))\n",
|
751 |
+
"#X = df_ratios_bigger_than_1.retweets_followers_ratio.to_numpy().reshape(-1, 1)\n",
|
752 |
+
"ratio_kmeans = KMeans(n_clusters=n_clusters, random_state=123).fit(X)\n",
|
753 |
+
"\n",
|
754 |
+
"#np.vstack((X[:, 0], X[:, 1], ratio_kmeans.labels_)).reshape(-1, 3)\n",
|
755 |
+
"#px.scatter(ratio_kmeans, x=)\n",
|
756 |
+
"'''\n",
|
757 |
+
"plt.title(f'K-Means clustering of #retweets/#followers ratio with k={n_clusters}')\n",
|
758 |
+
"plt.xlabel('Retweets')\n",
|
759 |
+
"plt.ylabel('Followers')\n",
|
760 |
+
"plt.scatter(X[:, 0], X[:, 1], c=ratio_kmeans.labels_)\n",
|
761 |
+
"'''"
|
762 |
+
]
|
763 |
+
},
|
764 |
+
{
|
765 |
+
"cell_type": "code",
|
766 |
+
"execution_count": null,
|
767 |
+
"metadata": {},
|
768 |
+
"outputs": [],
|
769 |
+
"source": [
|
770 |
+
"kmeans_results_df = pd.DataFrame(X, columns=['retweet_count', 'follower_count']) \n",
|
771 |
+
"kmeans_results_df['label'] = ratio_kmeans.labels_"
|
772 |
+
]
|
773 |
+
},
|
774 |
+
{
|
775 |
+
"cell_type": "code",
|
776 |
+
"execution_count": null,
|
777 |
+
"metadata": {},
|
778 |
+
"outputs": [],
|
779 |
+
"source": [
|
780 |
+
"px.scatter(kmeans_results_df, x='follower_count', y='retweet_count', color='label')\n"
|
781 |
+
]
|
782 |
+
},
|
783 |
+
{
|
784 |
+
"cell_type": "markdown",
|
785 |
+
"metadata": {},
|
786 |
+
"source": [
|
787 |
+
"#### 3.3 - Metric (# Retweets / avg #retweets of a user)"
|
788 |
+
]
|
789 |
+
},
|
790 |
+
{
|
791 |
+
"cell_type": "code",
|
792 |
+
"execution_count": null,
|
793 |
+
"metadata": {},
|
794 |
+
"outputs": [],
|
795 |
+
"source": [
|
796 |
+
"# avg_nb_retweets_per_user = viral_tweets_df_subset.groupby(by='author_id').agg({'retweet_count': ['min', 'mean', 'max'], 'like_count': ['min', 'mean', 'max']})\n",
|
797 |
+
"avg_nb_retweets_per_user = viral_tweets_df_subset.groupby(by='author_id').retweet_count.agg(['min', 'mean', 'max'])\n",
|
798 |
+
"avg_nb_retweets_per_user"
|
799 |
+
]
|
800 |
+
},
|
801 |
+
{
|
802 |
+
"cell_type": "code",
|
803 |
+
"execution_count": null,
|
804 |
+
"metadata": {},
|
805 |
+
"outputs": [],
|
806 |
+
"source": [
|
807 |
+
"ratio_retweet_avg_retweets_df = viral_tweets_df_subset.merge(avg_nb_retweets_per_user, on='author_id')\n",
|
808 |
+
"ratio_retweet_avg_retweets_df['per_user_performance'] = ratio_retweet_avg_retweets_df['retweet_count'] / ratio_retweet_avg_retweets_df['mean']\n",
|
809 |
+
"ratio_retweet_avg_retweets_df"
|
810 |
+
]
|
811 |
+
},
|
812 |
+
{
|
813 |
+
"cell_type": "code",
|
814 |
+
"execution_count": null,
|
815 |
+
"metadata": {},
|
816 |
+
"outputs": [],
|
817 |
+
"source": [
|
818 |
+
"bigger_than_mean = ratio_retweet_avg_retweets_df[ratio_retweet_avg_retweets_df.per_user_performance > 1]\n",
|
819 |
+
"hist = px.histogram(bigger_than_mean, x='per_user_performance', log_y=True)\n",
|
820 |
+
"\n",
|
821 |
+
"hist.update_layout(title_text=\"Distribution of tweet performance wrt avg #retweets per user\", xaxis_title=\"Tweet performance\", yaxis_title=\"log count\")"
|
822 |
+
]
|
823 |
+
},
|
824 |
+
{
|
825 |
+
"cell_type": "markdown",
|
826 |
+
"metadata": {},
|
827 |
+
"source": [
|
828 |
+
"**Finding**: We established another metric by which we can judge the virality of a tweet, namely the number of retweets vs the average number of retweets per user. We can set a threshold (e.g. > 2) to decide whether a tweet is viral or not. We can also conduct further analysis over those tweets to determine what sets them apart from the others."
|
829 |
+
]
|
830 |
+
},
|
831 |
+
{
|
832 |
+
"cell_type": "markdown",
|
833 |
+
"metadata": {},
|
834 |
+
"source": [
|
835 |
+
"#### 3.4 - Tweet Topic (context annotations)"
|
836 |
+
]
|
837 |
+
},
|
838 |
+
{
|
839 |
+
"cell_type": "markdown",
|
840 |
+
"metadata": {},
|
841 |
+
"source": [
|
842 |
+
"What topics are available? Context annotations are Twitter's version of analyzing the topic of a tweet. They are defined as a context **domain** and **entity**. The domain is like a general topic and entity is like a subtopic or a specific topic within the general domain."
|
843 |
+
]
|
844 |
+
},
|
845 |
+
{
|
846 |
+
"cell_type": "code",
|
847 |
+
"execution_count": null,
|
848 |
+
"metadata": {},
|
849 |
+
"outputs": [],
|
850 |
+
"source": [
|
851 |
+
"import json \n",
|
852 |
+
"\n",
|
853 |
+
"tweets_with_topics = original_tweets_df.dropna(axis=0, subset='context_annotations')\n",
|
854 |
+
"\n",
|
855 |
+
"def topic_to_json(x):\n",
|
856 |
+
" try:\n",
|
857 |
+
" return json.loads(x.replace('\\'', '\"'))\n",
|
858 |
+
" except json.JSONDecodeError:\n",
|
859 |
+
" print(\"Nope\")\n",
|
860 |
+
" return []"
|
861 |
+
]
|
862 |
+
},
|
863 |
+
{
|
864 |
+
"cell_type": "markdown",
|
865 |
+
"metadata": {},
|
866 |
+
"source": [
|
867 |
+
"TODO tomorrow:\n",
|
868 |
+
"- Try sample and make it work with context annotations.\n",
|
869 |
+
"- Check if has media is not null\n",
|
870 |
+
"- hashtags extract tags\n",
|
871 |
+
"- Extract context annotations\n",
|
872 |
+
"- Use Celia Bearer Token"
|
873 |
+
]
|
874 |
+
},
|
875 |
+
{
|
876 |
+
"cell_type": "code",
|
877 |
+
"execution_count": null,
|
878 |
+
"metadata": {},
|
879 |
+
"outputs": [],
|
880 |
+
"source": [
|
881 |
+
"from tweepy import Paginator, TooManyRequests\n",
|
882 |
+
"client = twitter_client_wrapper.client\n",
|
883 |
+
"#tweet_data = twitter_client_wrapper.client.get_users_tweets(id='1584975692126900225', expansions=EXPANSIONS, user_fields=USER_FIELDS, tweet_fields=TWEET_FIELDS, media_fields=MEDIA_FIELDS, exclude='retweets')\n",
|
884 |
+
"\n",
|
885 |
+
"viral_users_tweets = []\n",
|
886 |
+
"# Number of users processed so far\n",
|
887 |
+
"try:\n",
|
888 |
+
" for tweet in Paginator(client.get_users_tweets, id='1482846121517096961', tweet_fields=TWEET_FIELDS, exclude=\"retweets\").flatten(limit=20):\n",
|
889 |
+
" viral_users_tweets.append(tweet.data)\n",
|
890 |
+
"except TooManyRequests:\n",
|
891 |
+
" print(\"Hit Rate Limit\")\n"
|
892 |
+
]
|
893 |
+
},
|
894 |
+
{
|
895 |
+
"cell_type": "code",
|
896 |
+
"execution_count": null,
|
897 |
+
"metadata": {},
|
898 |
+
"outputs": [],
|
899 |
+
"source": [
|
900 |
+
"domains = {}\n",
|
901 |
+
"entities = {}\n",
|
902 |
+
"for tweet in viral_users_tweets:\n",
|
903 |
+
" context_annotations = tweet.get('context_annotations', [])\n",
|
904 |
+
" tweet_topic_domains = dict([(topic['domain']['id'], topic['domain']) for topic in context_annotations])\n",
|
905 |
+
" domains.update(tweet_topic_domains)\n",
|
906 |
+
" tweet_topic_entities = dict([(topic['entity']['id'], topic['entity']) for topic in context_annotations])\n",
|
907 |
+
" entities.update(tweet_topic_entities)\n",
|
908 |
+
" tweet['topic_domain'] = list(tweet_topic_domains.keys())\n",
|
909 |
+
" tweet['topic_entity'] = list(tweet_topic_entities.keys())\n",
|
910 |
+
" tweet.pop('context_annotations', None)"
|
911 |
+
]
|
912 |
+
},
|
913 |
+
{
|
914 |
+
"cell_type": "code",
|
915 |
+
"execution_count": null,
|
916 |
+
"metadata": {},
|
917 |
+
"outputs": [],
|
918 |
+
"source": [
|
919 |
+
"import pickle\n",
|
920 |
+
"\n",
|
921 |
+
"with open('topic_domains.pickle', 'wb') as handle:\n",
|
922 |
+
" pickle.dump(entities, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
|
923 |
+
"\n",
|
924 |
+
"with open('topic_domains.pickle', 'rb') as handle:\n",
|
925 |
+
" b = pickle.load(handle)\n",
|
926 |
+
"\n",
|
927 |
+
"b"
|
928 |
+
]
|
929 |
+
},
|
930 |
+
{
|
931 |
+
"cell_type": "code",
|
932 |
+
"execution_count": null,
|
933 |
+
"metadata": {},
|
934 |
+
"outputs": [],
|
935 |
+
"source": [
|
936 |
+
"try:\n",
|
937 |
+
" with open('topic_domains.pickle', 'rb') as handle:\n",
|
938 |
+
" topic_domains = pickle.load(handle)\n",
|
939 |
+
"except FileNotFoundError:\n",
|
940 |
+
" topic_domains = {}\n",
|
941 |
+
"\n",
|
942 |
+
"topic_domains"
|
943 |
+
]
|
944 |
+
},
|
945 |
+
{
|
946 |
+
"cell_type": "code",
|
947 |
+
"execution_count": null,
|
948 |
+
"metadata": {},
|
949 |
+
"outputs": [],
|
950 |
+
"source": [
|
951 |
+
"temp = pd.json_normalize(viral_users_tweets)\n",
|
952 |
+
"#temp[temp.context_annotations.notna()]\n",
|
953 |
+
"temp"
|
954 |
+
]
|
955 |
+
},
|
956 |
+
{
|
957 |
+
"cell_type": "code",
|
958 |
+
"execution_count": null,
|
959 |
+
"metadata": {},
|
960 |
+
"outputs": [],
|
961 |
+
"source": [
|
962 |
+
"domains"
|
963 |
+
]
|
964 |
+
},
|
965 |
+
{
|
966 |
+
"cell_type": "code",
|
967 |
+
"execution_count": null,
|
968 |
+
"metadata": {},
|
969 |
+
"outputs": [],
|
970 |
+
"source": [
|
971 |
+
"s = pd.Series([b[item]['name'] for items in temp.topic_domain.values for item in items])\n",
|
972 |
+
"s.groupby(s).count().sort_values()"
|
973 |
+
]
|
974 |
+
},
|
975 |
+
{
|
976 |
+
"cell_type": "code",
|
977 |
+
"execution_count": null,
|
978 |
+
"metadata": {},
|
979 |
+
"outputs": [],
|
980 |
+
"source": [
|
981 |
+
"viral_users_tweets_2 = []\n",
|
982 |
+
"# Number of users processed so far\n",
|
983 |
+
"try:\n",
|
984 |
+
" for tweet in Paginator(client.get_users_tweets, id='848263392943058944', tweet_fields=TWEET_FIELDS, exclude=\"retweets\").flatten(limit=100):\n",
|
985 |
+
" viral_users_tweets_2.append(tweet.data)\n",
|
986 |
+
"except TooManyRequests:\n",
|
987 |
+
" print(\"Hit Rate Limit\")"
|
988 |
+
]
|
989 |
+
},
|
990 |
+
{
|
991 |
+
"cell_type": "code",
|
992 |
+
"execution_count": null,
|
993 |
+
"metadata": {},
|
994 |
+
"outputs": [],
|
995 |
+
"source": [
|
996 |
+
"domains = {}\n",
|
997 |
+
"entities = {}\n",
|
998 |
+
"for tweet in viral_users_tweets_2:\n",
|
999 |
+
" context_annotations = tweet.get('context_annotations', [])\n",
|
1000 |
+
" tweet_topic_domains = dict([(topic['domain']['id'], topic['domain']) for topic in context_annotations])\n",
|
1001 |
+
" domains.update(tweet_topic_domains)\n",
|
1002 |
+
" tweet_topic_entities = dict([(topic['entity']['id'], topic['entity']) for topic in context_annotations])\n",
|
1003 |
+
" entities.update(tweet_topic_entities)\n",
|
1004 |
+
" tweet['topic_domain'] = list(tweet_topic_domains.keys()) if len(tweet_topic_domains.keys()) > 0 else pd.NA\n",
|
1005 |
+
" tweet['topic_entity'] = list(tweet_topic_entities.keys()) if len(tweet_topic_entities.keys()) > 0 else pd.NA\n",
|
1006 |
+
" #tweet.pop('context_annotations', None)"
|
1007 |
+
]
|
1008 |
+
},
|
1009 |
+
{
|
1010 |
+
"cell_type": "code",
|
1011 |
+
"execution_count": null,
|
1012 |
+
"metadata": {},
|
1013 |
+
"outputs": [],
|
1014 |
+
"source": [
|
1015 |
+
"temp2_df = pd.json_normalize(viral_users_tweets_2)\n",
|
1016 |
+
"first_context = temp2_df[~temp2_df.topic_domain.isna()].topic_domain.iloc[2]"
|
1017 |
+
]
|
1018 |
+
},
|
1019 |
+
{
|
1020 |
+
"cell_type": "code",
|
1021 |
+
"execution_count": null,
|
1022 |
+
"metadata": {},
|
1023 |
+
"outputs": [],
|
1024 |
+
"source": [
|
1025 |
+
"temp2_df[~temp2_df['entities.hashtags'].isna()]"
|
1026 |
+
]
|
1027 |
+
},
|
1028 |
+
{
|
1029 |
+
"cell_type": "code",
|
1030 |
+
"execution_count": null,
|
1031 |
+
"metadata": {},
|
1032 |
+
"outputs": [],
|
1033 |
+
"source": [
|
1034 |
+
"temp2_df.to_csv(\"temp.csv\", index=False)"
|
1035 |
+
]
|
1036 |
+
},
|
1037 |
+
{
|
1038 |
+
"cell_type": "code",
|
1039 |
+
"execution_count": null,
|
1040 |
+
"metadata": {},
|
1041 |
+
"outputs": [],
|
1042 |
+
"source": [
|
1043 |
+
"import ast\n",
|
1044 |
+
"\n",
|
1045 |
+
"temp2_read = pd.read_csv('temp.csv', converters={'context_annotations': lambda x: eval(x) if (x and len(x) > 0) else np.nan})\n",
|
1046 |
+
"first_context = temp2_read[~temp2_read.context_annotations.isna()].context_annotations.iloc[2]\n",
|
1047 |
+
"first_context"
|
1048 |
+
]
|
1049 |
+
},
|
1050 |
+
{
|
1051 |
+
"cell_type": "code",
|
1052 |
+
"execution_count": null,
|
1053 |
+
"metadata": {},
|
1054 |
+
"outputs": [],
|
1055 |
+
"source": [
|
1056 |
+
"eval(first_context)"
|
1057 |
+
]
|
1058 |
+
},
|
1059 |
+
{
|
1060 |
+
"cell_type": "code",
|
1061 |
+
"execution_count": null,
|
1062 |
+
"metadata": {},
|
1063 |
+
"outputs": [],
|
1064 |
+
"source": [
|
1065 |
+
"def format_context_annotations(context_annotations):\n",
|
1066 |
+
" if (pd.isna(context_annotations)):\n",
|
1067 |
+
" return []\n",
|
1068 |
+
" else:\n",
|
1069 |
+
" return json.loads(context_annotations)\n",
|
1070 |
+
"\n",
|
1071 |
+
"temp2_df.context_annotations.apply(format_context_annotations)"
|
1072 |
+
]
|
1073 |
+
},
|
1074 |
+
{
|
1075 |
+
"cell_type": "code",
|
1076 |
+
"execution_count": null,
|
1077 |
+
"metadata": {},
|
1078 |
+
"outputs": [],
|
1079 |
+
"source": [
|
1080 |
+
"pd.DataFrame(viral_users_tweets_2, columns=TWEET_FIELDS).to_csv('temp_2.csv', index=False)"
|
1081 |
+
]
|
1082 |
+
},
|
1083 |
+
{
|
1084 |
+
"cell_type": "code",
|
1085 |
+
"execution_count": null,
|
1086 |
+
"metadata": {},
|
1087 |
+
"outputs": [],
|
1088 |
+
"source": [
|
1089 |
+
"#tweet_data = twitter_client_wrapper.client.get_tweet(id='1584975692126900225', expansions=EXPANSIONS, user_fields=USER_FIELDS, tweet_fields=TWEET_FIELDS, media_fields=MEDIA_FIELDS)\n",
|
1090 |
+
"bytes(tweets_with_topics.iloc[1000].context_annotations, encoding='utf-8').decode('unicode_escape')"
|
1091 |
+
]
|
1092 |
+
},
|
1093 |
+
{
|
1094 |
+
"cell_type": "code",
|
1095 |
+
"execution_count": 6,
|
1096 |
+
"metadata": {},
|
1097 |
+
"outputs": [
|
1098 |
+
{
|
1099 |
+
"data": {
|
1100 |
+
"text/plain": [
|
1101 |
+
"'46'"
|
1102 |
+
]
|
1103 |
+
},
|
1104 |
+
"execution_count": 6,
|
1105 |
+
"metadata": {},
|
1106 |
+
"output_type": "execute_result"
|
1107 |
+
}
|
1108 |
+
],
|
1109 |
+
"source": [
|
1110 |
+
"dtypes={\"id\": str, \"author_id\": str, \"has_media\": bool, \"possibly_sensitive\": bool, \"has_hashtags\": bool}\n",
|
1111 |
+
"temp3 = pd.read_csv(\"145371604-to-146944733.csv\", dtype=dtypes)\n",
|
1112 |
+
"d = temp3[~temp3.topic_domains.isna()].topic_domains.iloc[0]\n",
|
1113 |
+
"eval(d)[0]"
|
1114 |
+
]
|
1115 |
+
},
|
1116 |
+
{
|
1117 |
+
"cell_type": "markdown",
|
1118 |
+
"metadata": {},
|
1119 |
+
"source": [
|
1120 |
+
"#### 3.5 - Tweet Sentiment"
|
1121 |
+
]
|
1122 |
+
},
|
1123 |
+
{
|
1124 |
+
"cell_type": "markdown",
|
1125 |
+
"metadata": {},
|
1126 |
+
"source": []
|
1127 |
+
},
|
1128 |
+
{
|
1129 |
+
"cell_type": "markdown",
|
1130 |
+
"metadata": {},
|
1131 |
+
"source": [
|
1132 |
+
"#### 3.6 - Possibly sensitive"
|
1133 |
+
]
|
1134 |
+
},
|
1135 |
+
{
|
1136 |
+
"cell_type": "markdown",
|
1137 |
+
"metadata": {},
|
1138 |
+
"source": []
|
1139 |
+
},
|
1140 |
+
{
|
1141 |
+
"cell_type": "markdown",
|
1142 |
+
"metadata": {},
|
1143 |
+
"source": [
|
1144 |
+
"#### 3.7 - Hashtags"
|
1145 |
+
]
|
1146 |
+
},
|
1147 |
+
{
|
1148 |
+
"cell_type": "code",
|
1149 |
+
"execution_count": null,
|
1150 |
+
"metadata": {},
|
1151 |
+
"outputs": [],
|
1152 |
+
"source": [
|
1153 |
+
"# TODO: has hashtags (using entities.hashtags)"
|
1154 |
+
]
|
1155 |
+
},
|
1156 |
+
{
|
1157 |
+
"cell_type": "markdown",
|
1158 |
+
"metadata": {},
|
1159 |
+
"source": [
|
1160 |
+
"#### 3.8 - Text preprocessing"
|
1161 |
+
]
|
1162 |
+
},
|
1163 |
+
{
|
1164 |
+
"cell_type": "code",
|
1165 |
+
"execution_count": null,
|
1166 |
+
"metadata": {},
|
1167 |
+
"outputs": [],
|
1168 |
+
"source": []
|
1169 |
+
},
|
1170 |
+
{
|
1171 |
+
"cell_type": "markdown",
|
1172 |
+
"metadata": {},
|
1173 |
+
"source": [
|
1174 |
+
"TODO:\n",
|
1175 |
+
"- Sort by tweet date (check popularity)\n",
|
1176 |
+
"- Use Twitter lists to try and find\n",
|
1177 |
+
"- Check if reply or retweet"
|
1178 |
+
]
|
1179 |
+
}
|
1180 |
+
],
|
1181 |
+
"metadata": {
|
1182 |
+
"kernelspec": {
|
1183 |
+
"display_name": "Python 3.8.11 ('ada')",
|
1184 |
+
"language": "python",
|
1185 |
+
"name": "python3"
|
1186 |
+
},
|
1187 |
+
"language_info": {
|
1188 |
+
"codemirror_mode": {
|
1189 |
+
"name": "ipython",
|
1190 |
+
"version": 3
|
1191 |
+
},
|
1192 |
+
"file_extension": ".py",
|
1193 |
+
"mimetype": "text/x-python",
|
1194 |
+
"name": "python",
|
1195 |
+
"nbconvert_exporter": "python",
|
1196 |
+
"pygments_lexer": "ipython3",
|
1197 |
+
"version": "3.8.13"
|
1198 |
+
},
|
1199 |
+
"orig_nbformat": 4,
|
1200 |
+
"vscode": {
|
1201 |
+
"interpreter": {
|
1202 |
+
"hash": "71d2f77bccee14ca7852d7b7a1fa8ea4708b81087104d93973081337557f0ee6"
|
1203 |
+
}
|
1204 |
+
}
|
1205 |
+
},
|
1206 |
+
"nbformat": 4,
|
1207 |
+
"nbformat_minor": 2
|
1208 |
+
}
|
othercode/collect_users_tweets.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
from tweepy import Paginator, TooManyRequests
|
3 |
+
import os
|
4 |
+
import pandas as pd
|
5 |
+
import pickle
|
6 |
+
from tqdm import tqdm
|
7 |
+
import yaml
|
8 |
+
|
9 |
+
import boto3
|
10 |
+
|
11 |
+
from helper.twitter_client_wrapper import (
|
12 |
+
TWEET_FIELDS,
|
13 |
+
format_tweets_df, format_context_annotations,
|
14 |
+
load_topic_domains, load_topic_entities, TwitterClientWrapper
|
15 |
+
)
|
16 |
+
|
17 |
+
# File name (appended to the directory prefix) of the CSV listing the user ids
# that still have to be processed; main() reads it and run() rewrites it.
USER_IDS_PATH = "users_ids.csv"
|
18 |
+
|
19 |
+
def run(twitter_client, directory, users_ids, tweets_per_user=20000, push_to_remote=True):
    """Collect the tweets of every listed user and persist what was gathered.

    Tweets (retweets excluded) are fetched user by user until all users are
    done or the API answers with ``TooManyRequests``. Whatever was collected
    is written to ``{directory}tweets/<first id>-to-<last id>.parquet.gzip``,
    the topic dictionaries are re-pickled and the user-id CSV is rewritten so
    that it only contains the users that were not fully processed.

    Args:
        twitter_client: Client whose ``get_users_tweets`` method is paginated.
        directory: Path prefix that file names are appended to (may be empty).
        users_ids: DataFrame with an ``id`` column, one row per user.
        tweets_per_user: Maximum number of tweets fetched for a single user.
        push_to_remote: If true, the parquet file is also uploaded to S3.

    Any exception other than ``TooManyRequests`` propagates to the caller
    after the ``finally`` block has saved the current state.
    """
    topic_domains = load_topic_domains(f'{directory}topic_domains.pickle')
    topic_entities = load_topic_entities(f'{directory}topic_entities.pickle')

    # List where we accumulate the tweets retrieved so far
    viral_users_tweets = []
    # Number of users fully processed so far
    users_processed = 0
    # Positional access: the index labels of the frame need not start at 0.
    filename = f"tweets/{users_ids.id.iloc[0]}-to-"

    try:
        for user_id in tqdm(users_ids.id):
            for tweet in Paginator(twitter_client.get_users_tweets, id=user_id, tweet_fields=TWEET_FIELDS, exclude="retweets").flatten(limit=tweets_per_user):
                processed_tweet, tweet_topic_domains, tweet_topic_entities = format_context_annotations(tweet.data)
                viral_users_tweets.append(processed_tweet)
                topic_domains.update(tweet_topic_domains)
                topic_entities.update(tweet_topic_entities)
            # Only counted once all pages of this user were consumed; a user
            # interrupted by the rate limit stays in the CSV for the next run.
            users_processed += 1
    except TooManyRequests:
        # Reached API limit
        print("Hit Rate Limit")
    finally:
        has_tweets = len(viral_users_tweets) > 0

        # Dump all to parquet and keep track at which user we stopped.
        if has_tweets:
            # Append end user id for this iteration to end of filename
            filename += f"{user_id}.parquet.gzip"
            filepath = directory + filename
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            format_tweets_df(viral_users_tweets).to_parquet(filepath, compression="gzip", index=False)

        # Progress is persisted even when no tweet was collected, so users
        # without any tweet are not re-queried on every run.
        # Save the topics encountered so far as pickle file
        with open(f'{directory}topic_domains.pickle', 'wb') as handle:
            pickle.dump(topic_domains, handle, protocol=pickle.HIGHEST_PROTOCOL)

        with open(f'{directory}topic_entities.pickle', 'wb') as handle:
            pickle.dump(topic_entities, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Update the users ids to remove the ones already processed
        users_ids[users_processed:].to_csv(f"{directory}{USER_IDS_PATH}", index=False)

        if has_tweets:
            if push_to_remote:
                s3 = boto3.resource("s3")
                bucket_name = "semester-project-twitter-storage"
                # Upload to S3
                s3.Bucket(bucket_name).upload_file(filepath, filename)
        else:
            print("Finished processing users")

    return
|
68 |
+
|
69 |
+
def main():
    """Set up paths and credentials, then collect tweets for the pending users."""
    # TODO: Change depending on whether you're executing this script locally or on a remote server (possibly with s3 access)
    run_locally = False
    tweets_per_user = 4000

    if run_locally:
        directory = ""
        with open("api_key.yaml", 'rt') as key_file:
            secret = yaml.safe_load(key_file)
        bearer_token = secret['Bearer Token']
        push_to_remote = False
    else:
        directory = "/home/ubuntu/tweet/"
        bearer_token = os.environ["BearerToken"]
        push_to_remote = True

    # Authenticate to Twitter
    client = TwitterClientWrapper(bearer_token, wait_on_rate_limit=False).client

    pending_users = pd.read_csv(f"{directory}{USER_IDS_PATH}", dtype={"id": str})

    # Nothing left to do once the CSV of pending users is empty.
    if len(pending_users) == 0:
        return

    run(
        client,
        directory,
        users_ids=pending_users,
        tweets_per_user=tweets_per_user,
        push_to_remote=push_to_remote,
    )


if __name__ == "__main__":
    main()
|
othercode/hydrate_tweets.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
from tweepy import TooManyRequests
|
3 |
+
import os
|
4 |
+
import pandas as pd
|
5 |
+
import pickle
|
6 |
+
import yaml
|
7 |
+
import boto3
|
8 |
+
|
9 |
+
from helper.twitter_client_wrapper import (
|
10 |
+
format_tweets_df, format_users_df, format_context_annotations,
|
11 |
+
load_topic_domains, load_topic_entities, TwitterClientWrapper
|
12 |
+
)
|
13 |
+
|
14 |
+
# File name (appended to the directory prefix) of the parquet file holding the
# tweet ids that still have to be hydrated; main() reads it and run() rewrites it.
COVID_IDS_PATH = "covid_ids.parquet.gzip"
|
15 |
+
# Number of tweet ids passed to retrieve_tweets_by_ids per request.
# NOTE(review): presumably matches the lookup endpoint's per-request id limit - confirm.
STEP_SIZE = 100
|
16 |
+
|
17 |
+
def run(twitter_client, directory, covid_tweets_ids, gather_retweets=True, push_to_remote=True):
    """Hydrate tweet ids in batches of ``STEP_SIZE`` and persist the results.

    Batches are requested until all ids are done or the API answers with
    ``TooManyRequests``. The collected tweets and their included users are
    written as parquet files, the topic dictionaries are re-pickled and the id
    file is rewritten with the ids that have not been processed yet.

    Args:
        twitter_client: Object exposing ``retrieve_tweets_by_ids(ids=...)``.
        directory: Path prefix that file names are appended to (may be empty).
        covid_tweets_ids: Sliceable sequence of tweet ids to hydrate.
        gather_retweets: If true, output goes under ``covid/`` and the ids of
            the tweets referenced with type ``retweeted`` are saved as well;
            otherwise output goes under ``covid_retweets/``.
        push_to_remote: If true, the written parquet files are uploaded to S3.

    Any exception other than ``TooManyRequests`` propagates to the caller
    after the ``finally`` block has saved the current state.
    """
    topic_domains = load_topic_domains(f'{directory}topic_domains.pickle')
    topic_entities = load_topic_entities(f'{directory}topic_entities.pickle')

    # List where we accumulate the tweets retrieved so far
    collected_tweets = []
    # List where we accumulate the users retrieved so far
    collected_users = []
    if gather_retweets:
        # We're gathering retweet ids
        covid_filepath = "covid"
    else:
        # We're gathering retweets themselves
        covid_filepath = "covid_retweets"
    tweet_filepath_temp = f"{covid_filepath}/tweets/"
    user_filepath_temp = f"{covid_filepath}/users/"
    retweet_filepath_temp = f"{covid_filepath}/retweets/"

    total_ids = len(covid_tweets_ids)
    # True ceiling division: an exact multiple of STEP_SIZE must not produce
    # a trailing request with an empty id list.
    steps = (total_ids + STEP_SIZE - 1) // STEP_SIZE
    # Number of ids whose batch was fully handled
    ids_processed = 0

    try:
        for i in range(steps):
            batch = covid_tweets_ids[i*STEP_SIZE:(i+1)*STEP_SIZE]
            tweets = twitter_client.retrieve_tweets_by_ids(ids=batch)
            included_users = tweets.includes.get('users', [])
            collected_users += included_users
            # NOTE(review): data is presumably None when none of the requested
            # ids could be retrieved - treated as an empty batch here.
            for tweet in tweets.data or []:
                processed_tweet, tweet_topic_domains, tweet_topic_entities = format_context_annotations(tweet.data)
                collected_tweets.append(processed_tweet)
                topic_domains.update(tweet_topic_domains)
                topic_entities.update(tweet_topic_entities)
            ids_processed += len(batch)
    except TooManyRequests:
        # Reached API limit
        print(f"Hit Rate Limit, processed {ids_processed}")
        print(f'tweets left: {total_ids - ids_processed}')
    finally:
        has_tweets = len(collected_tweets) > 0

        # Dump all to parquet and keep track at which tweet we stopped.
        if has_tweets:
            # File name is built from the first and last collected tweet ids
            first_processed_tweet_id = collected_tweets[0]['id']
            last_processed_tweet_id = collected_tweets[-1]['id']
            tweet_filename = f"{first_processed_tweet_id}-to-{last_processed_tweet_id}.parquet.gzip"
            tweet_filepath = directory + tweet_filepath_temp + tweet_filename
            os.makedirs(os.path.dirname(tweet_filepath), exist_ok=True)
            format_tweets_df(collected_tweets).to_parquet(tweet_filepath, compression="gzip", index=False)

            user_filepath = directory + user_filepath_temp + tweet_filename
            os.makedirs(os.path.dirname(user_filepath), exist_ok=True)
            format_users_df([user.data for user in collected_users]).to_parquet(user_filepath, compression="gzip", index=False)

            if gather_retweets:
                # Check if tweet has referenced tweets
                retweeted = [tweet for tweet in collected_tweets if tweet.get('referenced_tweets')]
                # Retrieve all referenced tweets ids in the tweet; de-duplicated,
                # then sorted so the rows have a deterministic order.
                referenced_tweets_ids = sorted({
                    referenced_tweet['id']
                    for tweet in retweeted
                    for referenced_tweet in tweet['referenced_tweets']
                    if referenced_tweet['type'] == 'retweeted'
                })
                retweet_filepath = directory + retweet_filepath_temp + tweet_filename
                os.makedirs(os.path.dirname(retweet_filepath), exist_ok=True)
                pd.DataFrame(referenced_tweets_ids, columns=['id']).to_parquet(retweet_filepath, compression="gzip", index=False)

        # Progress is persisted even when no tweet was collected.
        # Save the topics encountered so far as pickle file
        with open(f'{directory}topic_domains.pickle', 'wb') as handle:
            pickle.dump(topic_domains, handle, protocol=pickle.HIGHEST_PROTOCOL)

        with open(f'{directory}topic_entities.pickle', 'wb') as handle:
            pickle.dump(topic_entities, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Keep only the ids whose batch was not completed, so a finished run
        # leaves an empty id file instead of re-queuing the last batch.
        pd.DataFrame(covid_tweets_ids[ids_processed:], columns=['id']).to_parquet(f"{directory}{COVID_IDS_PATH}", index=False)

        if has_tweets:
            if push_to_remote:
                s3 = boto3.resource("s3")
                bucket_name = "semester-project-twitter-storage"
                # Upload to S3
                bucket = s3.Bucket(bucket_name)
                bucket.upload_file(tweet_filepath, f"{tweet_filepath_temp}{tweet_filename}")
                bucket.upload_file(user_filepath, f"{user_filepath_temp}{tweet_filename}")
                if gather_retweets:
                    bucket.upload_file(retweet_filepath, f"{retweet_filepath_temp}{tweet_filename}")
        else:
            print("Finished processing users")

    return
|
102 |
+
|
103 |
+
def main():
    """Set up paths and credentials, then hydrate the pending tweet ids."""
    # TODO: Change depending on whether you're executing this script locally or on a remote server (possibly with s3 access)
    run_locally = False

    if run_locally:
        directory = ""
        with open("api_key.yaml", 'rt') as key_file:
            secret = yaml.safe_load(key_file)
        bearer_token = secret['Bearer Token']
        push_to_remote = False
    else:
        directory = "/home/ubuntu/covid_tweets/"
        bearer_token = os.environ["BearerToken"]
        push_to_remote = True

    # Authenticate to Twitter
    client_wrapper = TwitterClientWrapper(bearer_token, wait_on_rate_limit=False)

    pending_ids = list(pd.read_parquet(f"{directory}{COVID_IDS_PATH}").id)

    # Nothing left to do once the id file is empty.
    if len(pending_ids) == 0:
        return

    run(
        client_wrapper,
        directory,
        covid_tweets_ids=pending_ids,
        gather_retweets=False,
        push_to_remote=push_to_remote,
    )


if __name__ == "__main__":
    main()
|
othercode/text_preprocessing.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import html
|
2 |
+
|
3 |
+
def clear_reply_mentions(tweet):
    '''Remove the leading run of user mentions found in a reply to a tweet.

    The tweet is split on single spaces and every token starting with "@" is
    dropped until the first token that does not; mentions after that point
    are kept. A tweet made only of mentions yields the empty string.

    Example: @user1 @user2 okay @user3 -> okay @user3
    '''
    # We don't need to use any sophisticated tokenization here like nltk
    tokens = tweet.split(" ")
    first_regular = next(
        (position for position, token in enumerate(tokens) if not token.startswith("@")),
        None,
    )
    if first_regular is None:
        return ""
    return " ".join(tokens[first_regular:])
|
14 |
+
|
15 |
+
from emoji import demojize, is_emoji
|
16 |
+
from nltk.tokenize import TweetTokenizer
|
17 |
+
|
18 |
+
# Module-level tokenizer instance, used as the default `tokenizer` of normalizeTweet.
tweet_tokenizer = TweetTokenizer()
|
19 |
+
|
20 |
+
def normalizeToken(token, emojis_found=None, replace_user_mentions=True, replace_urls=True, demojize_emojis=True):
    '''Normalize a single token of a tokenized tweet.

    Params:
        token: the token to normalize
        emojis_found: optional list; every single-character emoji token is appended to it.
            Defaults to None (a fresh list per call) so that calls never share state
        replace_user_mentions: If true, tokens starting with "@" become @USER
        replace_urls: If true, tokens starting with http or www (case-insensitive) become HTTPURL
        demojize_emojis: If true, a single-character emoji token is passed through demojize

    Returns:
        The normalized token
    '''
    if emojis_found is None:
        emojis_found = []

    if replace_user_mentions and token.startswith("@"):
        return "@USER"

    if replace_urls and token.lower().startswith(("http", "www")):
        return "HTTPURL"

    if len(token) == 1 and is_emoji(token):
        emojis_found.append(token)
        return demojize(token) if demojize_emojis else token

    # Typographic punctuation is mapped to its plain equivalent
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    return token
|
39 |
+
|
40 |
+
|
41 |
+
def normalizeTweet(tweet, tokenizer=tweet_tokenizer, replace_user_mentions=True, replace_urls=True, demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Tokenize a tweet, normalize each token and re-assemble the text.

    Params:
        tweet: the tweet text
        tokenizer: object whose tokenize method splits the tweet into tokens
        replace_user_mentions, replace_urls, demojize_emojis: forwarded to normalizeToken
        bert_tweet_specific_processing: If true, applies the contraction and a.m./p.m. rewrites

    Returns:
        A tuple (normalized tweet with whitespace collapsed, list of emojis found)
    '''
    emojis_found = []

    plain_punctuation = tweet.replace("’", "'").replace("…", "...")
    normalized_tokens = []
    for raw_token in tokenizer.tokenize(plain_punctuation):
        normalized_tokens.append(
            normalizeToken(
                raw_token,
                emojis_found=emojis_found,
                replace_user_mentions=replace_user_mentions,
                replace_urls=replace_urls,
                demojize_emojis=demojize_emojis,
            )
        )
    normTweet = " ".join(normalized_tokens)

    if bert_tweet_specific_processing:
        # Applied strictly in this order: later rules rely on earlier ones.
        rewrites = (
            ("cannot ", "can not "),
            ("n't ", " n't "),
            ("n 't ", " n't "),
            ("ca n't", "can't"),
            ("ai n't", "ain't"),
            ("'m ", " 'm "),
            ("'re ", " 're "),
            ("'s ", " 's "),
            ("'ll ", " 'll "),
            ("'d ", " 'd "),
            ("'ve ", " 've "),
            (" p . m .", " p.m."),
            (" p . m ", " p.m "),
            (" a . m .", " a.m."),
            (" a . m ", " a.m "),
        )
        for old, new in rewrites:
            normTweet = normTweet.replace(old, new)

    collapsed = " ".join(normTweet.split())
    return collapsed, emojis_found
|
73 |
+
|
74 |
+
|
75 |
+
def clean_tweet(tweet, clear_html_chars=True, replace_user_mentions=True, replace_urls=True,
                demojize_emojis=True, bert_tweet_specific_processing=True):
    '''Helper function to clean tweets. Highly customizable to fit different needs.

    Line breaks are turned into spaces before anything else, so that words on
    consecutive lines are not glued together and a reply whose mentions are
    followed by a line break keeps its text.

    Params:
        tweet: the tweet to clean
        clear_html_chars: If true, will unescape any special html entities found in the tweet
        replace_user_mentions: If true, will replace any user mention with the token @USER
        replace_urls: If true, will replace any urls with the token HTTPURL
        demojize_emojis: If true, will demojize emojis
        bert_tweet_specific_processing: if true, will do some additional preprocessing for the BertTweet model

    Returns:
        The cleaned tweet
    '''
    # First step: put the tweet on a single line, one space per line break (empty lines are dropped).
    single_line = " ".join(
        segment for segment in tweet.replace('\r', '\n').split('\n') if segment
    )

    # Second step: clear mentions at the beginning of tweets (inserted automatically by Twitter when replying to a tweet).
    # These do not count in the character count of a tweet and may make the tweet length go way overboard.
    cleaned_tweet = clear_reply_mentions(single_line)

    # Third step: if True, unescape any html entities
    if clear_html_chars:
        cleaned_tweet = html.unescape(cleaned_tweet)

    # Normalize Tweet with remaining preprocessing (emojis, urls, mentions, etc..)
    normalized_tweet, _emojis = normalizeTweet(cleaned_tweet,
                                               replace_user_mentions=replace_user_mentions,
                                               replace_urls=replace_urls,
                                               demojize_emojis=demojize_emojis,
                                               bert_tweet_specific_processing=bert_tweet_specific_processing)

    # TODO: process emoticons? e.g. :)

    return normalized_tweet
|