Spaces:
Running
Running
trying something out
Browse files- app.py +46 -33
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -11,6 +11,7 @@ sotu_dataset = "jsulz/state-of-the-union-addresses"
|
|
| 11 |
dataset = load_dataset(sotu_dataset)
|
| 12 |
df = dataset["train"].to_pandas()
|
| 13 |
# decode the tokens-nostop column from a byte array to a list of strings
|
|
|
|
| 14 |
df["tokens-nostop"] = df["tokens-nostop"].apply(
|
| 15 |
lambda x: x.decode("utf-8")
|
| 16 |
.replace('"', "")
|
|
@@ -18,6 +19,7 @@ df["tokens-nostop"] = df["tokens-nostop"].apply(
|
|
| 18 |
.replace("]", "")
|
| 19 |
.split(",")
|
| 20 |
)
|
|
|
|
| 21 |
df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
|
| 22 |
# calculate the automated readability index (ARI) score for each address
|
| 23 |
# automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
|
|
@@ -101,40 +103,51 @@ with gr.Blocks() as demo:
|
|
| 101 |
)
|
| 102 |
|
| 103 |
with gr.Row():
|
|
|
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
|
| 140 |
demo.launch()
|
|
|
|
| 11 |
dataset = load_dataset(sotu_dataset)
|
| 12 |
df = dataset["train"].to_pandas()
|
| 13 |
# decode the tokens-nostop column from a byte array to a list of strings
|
| 14 |
+
"""
|
| 15 |
df["tokens-nostop"] = df["tokens-nostop"].apply(
|
| 16 |
lambda x: x.decode("utf-8")
|
| 17 |
.replace('"', "")
|
|
|
|
| 19 |
.replace("]", "")
|
| 20 |
.split(",")
|
| 21 |
)
|
| 22 |
+
"""
|
| 23 |
df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
|
| 24 |
# calculate the automated readability index (ARI) score for each address
|
| 25 |
# automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
|
|
|
|
| 103 |
)
|
| 104 |
|
| 105 |
with gr.Row():
|
| 106 |
+
with gr.Column():
|
| 107 |
|
| 108 |
+
@gr.render(inputs=[president, grams])
|
| 109 |
+
def ngram_bar(potus, n_grams):
|
| 110 |
+
if potus != "All" and potus is not None:
|
| 111 |
+
if type(n_grams) is not int:
|
| 112 |
+
n_grams = 1
|
| 113 |
+
print(n_grams)
|
| 114 |
+
# keep only the rows whose "potus" column matches the selected president
|
| 115 |
+
potus_df = df[df["potus"] == potus]
|
| 116 |
+
# build the n-grams for each address, count them, and sum the per-address Counters into one
|
| 117 |
+
trigrams = (
|
| 118 |
+
potus_df["tokens-nostop"]
|
| 119 |
+
.apply(lambda x: list(ngrams(x, n_grams)))
|
| 120 |
+
.apply(Counter)
|
| 121 |
+
.sum()
|
| 122 |
+
)
|
| 123 |
+
# get the most common trigrams
|
| 124 |
+
common_trigrams = trigrams.most_common(20)
|
| 125 |
+
# unzip the list of (n-gram, count) tuples into two parallel sequences
|
| 126 |
+
trigrams, counts = zip(*common_trigrams)
|
| 127 |
+
# join the trigrams into a single string
|
| 128 |
+
trigrams = [" ".join(trigram) for trigram in trigrams]
|
| 129 |
+
# create a dataframe from the trigrams and counts
|
| 130 |
+
trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
|
| 131 |
+
# plot the trigrams and counts as a bar chart from matplotlib
|
| 132 |
+
"""
|
| 133 |
+
fig, ax = plt.subplots(figsize=(12, 4))
|
| 134 |
+
ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
|
| 135 |
+
ax.set_title("Top 20 Trigrams")
|
| 136 |
+
ax.set_ylabel("Count")
|
| 137 |
+
ax.set_xlabel("Trigrams")
|
| 138 |
+
plt.xticks(rotation=45)
|
| 139 |
+
# make it tight layout
|
| 140 |
+
plt.tight_layout()
|
| 141 |
+
"""
|
| 142 |
+
fig = px.scatter(
|
| 143 |
+
trigrams_df,
|
| 144 |
+
x="counts",
|
| 145 |
+
y="trigrams",
|
| 146 |
+
title="Top 20 Trigrams",
|
| 147 |
+
orientation="h",
|
| 148 |
+
)
|
| 149 |
+
print(fig)
|
| 150 |
+
gr.Plot(value=fig, container=True)
|
| 151 |
|
| 152 |
|
| 153 |
demo.launch()
|
requirements.txt
CHANGED
|
@@ -42,7 +42,7 @@ orjson==3.10.7
|
|
| 42 |
packaging==24.1
|
| 43 |
pandas==2.2.2
|
| 44 |
pillow==10.4.0
|
| 45 |
-
plotly
|
| 46 |
pyarrow==17.0.0
|
| 47 |
pydantic-core==2.20.1
|
| 48 |
pydantic==2.8.2
|
|
|
|
| 42 |
packaging==24.1
|
| 43 |
pandas==2.2.2
|
| 44 |
pillow==10.4.0
|
| 45 |
+
plotly
|
| 46 |
pyarrow==17.0.0
|
| 47 |
pydantic-core==2.20.1
|
| 48 |
pydantic==2.8.2
|