### Load packages

In [48]:
!pip install datasets
!pip install datatrove
import datasets
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from datatrove.pipeline.readers import ParquetReader



In [49]:
%load_ext rich

The rich extension is already loaded. To reload it, use:
  %reload_ext rich


## Methodology

In order to measure bias in the dataset, we consider the following simple [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) based approach. The idea is that the specificity of a term -- in our case, how `biased` it is -- can be quantified as an inverse function of the number of documents in which it occurs.

Given a dataset and terms for a subpopulation (gender) of interest:
1. Evaluate Inverse Document Frequencies on the full dataset
2. Compute the average TF-IDF vectors for the dataset for a given subpopulation (gender)
3. Sort the terms by variance to see words that are much more likely to appear specifically for a given subpopulation




### Load Fineweb


In [None]:
local = False
data_reader = ParquetReader("hf://datasets/HuggingFaceFW/fineweb/sample/10BT")
all_docs = [document.text for document in data_reader()]

[32m2024-05-29 19:38:01.457[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m193[0m - [1mReading input file 000_00000.parquet[0m


### Compute frequencies

In [50]:
# Step 1: get Inverse document frequencies for the dataset
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
full_tfidf = vectorizer.fit_transform(all_docs)
tfidf_feature_names = np.array(vectorizer.get_feature_names_out())

### Bias analysis: Gender tf-idf

In [51]:
# Step 2: get average TF-IDF vectors **for each gender**
GENDER_PHRASES = ["man", "woman"]
tfidf_by_gender = {}
for phrase in GENDER_PHRASES:
    gdr_docs = [doc for doc in all_docs if phrase in doc.split()]
    if gdr_docs != []:
      gdr_tfidf = np.asarray(vectorizer.transform(gdr_docs).mean(axis=0))[0]
      tfidf_by_gender[phrase] = gdr_tfidf

In [52]:
# Step 3: for each term, compute the variance across genders
all_tfidf = np.array(list(tfidf_by_gender.values()))
tf_idf_var = all_tfidf - all_tfidf.sum(axis=0, keepdims=True)
tf_idf_var = np.power((tf_idf_var * tf_idf_var).sum(axis=0), 0.5)
sort_by_variance = tf_idf_var.argsort()[::-1]

In [53]:
# Create the data structure for the visualization,
# showing the highest variance words for each gender,
# and how they deviate from the mean
pre_pandas_lines = [
    {
        "word": tfidf_feature_names[w],
        "man": all_tfidf[0, w],
        "woman": all_tfidf[1, w],
        "man+": all_tfidf[0, w] - all_tfidf[:, w].mean(),
        "woman+": all_tfidf[1, w] - all_tfidf[:, w].mean(),
        "variance": tf_idf_var[w],
        "total": all_tfidf[:, w].sum(),
    }
    for w in sort_by_variance[:50]
]

### Results

In [47]:
# Plot
df = pd.DataFrame.from_dict(pre_pandas_lines)
df.style.background_gradient(
    axis=None,
    vmin=0,
    vmax=0.2,
    cmap="YlGnBu"
).format(precision=2)

Unnamed: 0,word,man,woman,man+,woman+,variance,total
0,woman,0.01,0.07,-0.03,0.03,0.07,0.08
1,man,0.05,0.02,0.01,-0.01,0.05,0.07
2,women,0.01,0.04,-0.01,0.01,0.04,0.06
3,said,0.03,0.01,0.01,-0.01,0.03,0.05
4,people,0.02,0.02,0.0,-0.0,0.03,0.04
5,tsa,0.01,0.03,-0.01,0.01,0.03,0.04
6,life,0.03,0.01,0.01,-0.01,0.03,0.04
7,just,0.02,0.02,0.0,-0.0,0.03,0.04
8,police,0.02,0.02,0.0,-0.0,0.03,0.04
9,god,0.02,0.02,0.0,-0.0,0.03,0.04


#### Sorting by bias

In order to better surface biases, we can sort the table by how much one gender over-represents a term.

In this case, we see that instances mentioning `man` are more likely to include `god` than those mentioning `woman`, which in turn are more likely to include `cancer`.

In [45]:
df.sort_values('man+', ascending=False).style.background_gradient(
    axis=None,
    vmin=0,
    vmax=0.2,
    cmap="YlGnBu"
).format(precision=2)

Unnamed: 0,word,man,woman,man+,woman+,variance,total
1,man,0.05,0.02,0.01,-0.01,0.05,0.07
3,said,0.03,0.01,0.01,-0.01,0.03,0.05
6,life,0.03,0.01,0.01,-0.01,0.03,0.04
9,god,0.02,0.02,0.0,-0.0,0.03,0.04
7,just,0.02,0.02,0.0,-0.0,0.03,0.04
10,like,0.02,0.02,0.0,-0.0,0.03,0.04
44,way,0.02,0.01,0.0,-0.0,0.02,0.03
21,think,0.02,0.01,0.0,-0.0,0.02,0.03
49,place,0.02,0.01,0.0,-0.0,0.02,0.02
41,right,0.02,0.01,0.0,-0.0,0.02,0.03


In [46]:
df.sort_values('woman+', ascending=False).style.background_gradient(
    axis=None,
    vmin=0,
    vmax=0.2,
    cmap="YlGnBu"
).format(precision=2)

Unnamed: 0,word,man,woman,man+,woman+,variance,total
0,woman,0.01,0.07,-0.03,0.03,0.07,0.08
11,cancer,0.0,0.03,-0.01,0.01,0.03,0.03
2,women,0.01,0.04,-0.01,0.01,0.04,0.06
14,mouse,0.0,0.03,-0.01,0.01,0.03,0.03
25,anderson,0.0,0.02,-0.01,0.01,0.02,0.02
24,surgery,0.0,0.02,-0.01,0.01,0.02,0.02
17,medical,0.0,0.02,-0.01,0.01,0.02,0.03
33,plus,0.0,0.02,-0.01,0.01,0.02,0.02
48,camera,0.0,0.02,-0.01,0.01,0.02,0.02
34,size,0.0,0.02,-0.01,0.01,0.02,0.02
