""" | |
This application enables exploration with data from the paper:

4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware

Requires the following packages:

pip install streamlit
""" | |
import os | |
import pandas as pd | |
import streamlit as st | |
class Application:
    """
    Main application.

    Loads the suspected fake star datasets into a single dataframe on creation and
    renders a Streamlit view that looks up monthly star counts for user-supplied repos.
    """

    def __init__(self):
        """
        Creates a new application.
        """

        # Load data once, run() filters this frame for each requested repo
        self.data = self.load()

    def load(self):
        """
        Loads the clustered and low activity datasets and merges them into a single dataframe.

        Returns:
            dataframe with columns: repo, month, clustered, low activity, flagged stars, total stars, flagged %
        """

        # Read data
        # NOTE(review): these paths resolve relative to the working directory. The app describes this data as
        # coming from the source GitHub project - confirm whether a base URL belongs in front of the version.
        version = "241001"
        clustered = pd.read_csv(f"{version}/fake_stars_clustered_stars_by_month.csv")
        activity = pd.read_csv(f"{version}/fake_stars_low_activity_stars_by_month.csv")

        # Outer join keeps repo/month pairs that only appear in one of the two datasets
        data = pd.merge(clustered, activity, how="outer", on=["repo", "month"])

        # Remove duplicate stars column, both inputs have n_stars, keep the largest non-null value per row
        data["n_stars"] = pd.to_numeric(data[["n_stars_x", "n_stars_y"]].max(axis=1), downcast="integer")
        data = data.drop(["n_stars_x", "n_stars_y"], axis=1)

        # Aggregate fake star counts, a row missing from one dataset counts as 0 for that dataset
        clustered_counts = data["n_stars_clustered"].fillna(0)
        activity_counts = data["n_stars_low_activity"].fillna(0)

        # Add as int64 BEFORE downcasting, adding two downcast columns (e.g. int8 100 + int8 100) silently wraps
        flagged = clustered_counts.astype("int64") + activity_counts.astype("int64")

        data["n_stars_clustered"] = pd.to_numeric(clustered_counts, downcast="integer")
        data["n_stars_low_activity"] = pd.to_numeric(activity_counts, downcast="integer")

        # Cap flagged stars at the total number of stars for the row
        data["n_stars_flagged"] = flagged
        data["n_stars_flagged"] = pd.to_numeric(data[["n_stars", "n_stars_flagged"]].min(axis=1), downcast="integer")

        # Calculate stat columns
        data["n_flagged_percent"] = 100 * (data["n_stars_flagged"] / data["n_stars"])

        # Rename by name, not by position, so the column order of the inputs cannot mislabel a column
        data = data.rename(
            columns={
                "n_stars_clustered": "clustered",
                "n_stars_low_activity": "low activity",
                "n_stars": "total stars",
                "n_stars_flagged": "flagged stars",
                "n_flagged_percent": "flagged %",
            }
        )

        # Organize columns
        return data[["repo", "month", "clustered", "low activity", "flagged stars", "total stars", "flagged %"]]

    def run(self):
        """
        Main rendering logic.

        Reads a list of repos from a text area, shows the month with the most flagged stars
        for each repo in a table, then draws a total vs flagged stars line chart per repo.
        """

        # List of GitHub repos
        repos = st.text_area("**GitHub Repos, one per line**")

        # Format input
        repos = self.parse(repos)
        if not repos:
            return

        # Get top result per project, repos not in the dataset contribute an empty frame
        frames = []
        for repo in repos:
            frames.append(self._select(repo).sort_values("flagged stars", ascending=False)[:1])

        # Aggregate into single data frame and display
        aggregate = pd.concat(frames, axis=0)
        aggregate = aggregate.sort_values("flagged %", ascending=False).reset_index(drop=True)

        st.markdown("**Top month flagged by project**")
        st.dataframe(
            data=aggregate,
            column_config={
                "flagged %": st.column_config.NumberColumn(
                    format="%.2f %%"
                )
            },
            use_container_width=True
        )

        # One chart per matched repo, in the same order as the table
        for repo in aggregate["repo"]:
            st.markdown(f"**{repo}**")
            st.line_chart(
                self._select(repo).sort_values("month"),
                x="month",
                y=["total stars", "flagged stars"],
                color=["#F44336", "#2196F3"],
            )

    def parse(self, repos):
        """
        Parses and cleans the input repos string.

        Each line is stripped of surrounding whitespace, a leading GitHub URL prefix
        and leading/trailing slashes. Empty lines are dropped.

        Args:
            repos: raw text, one repo per line

        Returns:
            list of repos
        """

        # NOTE(review): assumes the dataset stores repos as "owner/name" - confirm against the CSV files
        prefixes = ("https://github.com/", "http://github.com/", "github.com/")

        outputs = []
        for line in repos.splitlines():
            repo = line.strip()

            # Remove URL prefix, matched case-insensitively, the repo name keeps its original case
            lowered = repo.lower()
            for prefix in prefixes:
                if lowered.startswith(prefix):
                    repo = repo[len(prefix):]
                    break

            repo = repo.strip("/")
            if repo:
                outputs.append(repo)

        return outputs

    def _select(self, repo):
        """
        Selects all rows for a repo. Repo names are compared case-insensitively.

        Args:
            repo: repo name

        Returns:
            dataframe with the rows for repo, empty when repo is not in the dataset
        """

        return self.data[self.data["repo"].str.lower() == repo.lower()]
@st.cache_resource
def create():
    """
    Creates and caches a Streamlit application.

    Cached as a Streamlit resource so the Application, and the data it loads on creation,
    is built once and reused across script reruns instead of being rebuilt on every interaction.

    Returns:
        Application
    """

    return Application()
if __name__ == "__main__":
    # NOTE(review): nothing visible in this file reads this variable - confirm it is still needed
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    st.set_page_config(
        page_title="4.5 Million (Suspected) Fake Stars in GitHub",
        page_icon="⭐",
        layout="centered",
        initial_sidebar_state="auto",
        menu_items=None,
    )
    st.markdown("## 4.5 Million (Suspected) Fake ⭐'s in GitHub")

    # Markdown text is kept at column 0, leading indentation would render as a code block
    # NOTE(review): the link targets were missing from the source, verify both URLs before release
    st.markdown(
        """
This application explores the data provided by the paper titled:

_4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware_

_[Paper](https://arxiv.org/abs/2412.13459) | [GitHub Project](https://github.com/hehao98/StarScout)_

Note the disclaimer from the paper's authors.

**Disclaimer**. _As we discussed in Section 3.4 and 3.5 in our paper, the resulting dataset are only repositories and users with suspected
fake stars. The individual repositories and users in our dataset may be false positives. The main purpose of our dataset is for statistical
analyses (which tolerates noises reasonably well), not for publicly shaming individual repositories. If you intend to publish subsequent work
based on our dataset, please be aware of this limitation and its ethical implications._

To add to the authors disclaimer.

_It's also worth noting that projects that trend on popular sites such as the GitHub Trending Page can attract a lot of automated behavior outside
of a project's control. This dataset is just a data point that shouldn't be used in a vacuum._
"""
    )

    # Create and run application, without run() nothing below the intro text is rendered
    app = create()
    app.run()