import streamlit as st
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer and model hyperparameters
vocab_size = 10000
max_length = 120
embedding_dim = 16
trunc_type = 'post'
pad_type = 'post'
oov_tok = '<oov>'
training_size = 20000

# Download the sarcasm headlines dataset
file_url = "https://storage.googleapis.com/wdd-2-node.appspot.com/x1.json"
response = requests.get(file_url)
response.raise_for_status()  # fail fast if the download did not succeed

with open("/tmp/headlines.json", "wb") as f:
    f.write(response.content)

# Load the JSON records into a DataFrame
data = pd.read_json("/tmp/headlines.json")

# Each record has a headline and a binary sarcasm label
headlines = list(data['headline'])
labels = list(data['is_sarcastic'])

# Split into training and test sets
train_sentences = headlines[:training_size]
test_sentences = headlines[training_size:]

train_labels = labels[:training_size]
test_labels = labels[training_size:]

# Fit the tokenizer on the training sentences only, so vocabulary from the
# test set does not leak into the model
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

word_index = tokenizer.word_index

# Convert sentences to integer sequences padded/truncated to a uniform length
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)

# Convert everything to NumPy arrays for training
train_padded_sequences = np.array(train_padded_sequences)
train_labels = np.array(train_labels)
test_padded_sequences = np.array(test_padded_sequences)
test_labels = np.array(test_labels)

# A small embedding + pooling classifier with a sigmoid output for the
# binary sarcasm label
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
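
# Train the classifier before serving predictions; without this step the
# model would score headlines with its random initial weights. The epoch
# count below is an assumed value for this sketch, not a tuned one. In a
# long-running Streamlit app, the build/train steps would typically be
# wrapped in a function cached with st.cache_resource so script reruns
# do not retrain the model.
model.fit(
    train_padded_sequences, train_labels,
    epochs=10,
    validation_data=(test_padded_sequences, test_labels),
    verbose=0
)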

st.title("News Headline Sarcasm Detection")

def predict_sarcasm(new_sentence):
    # Tokenize and pad the input exactly as the training data was prepared,
    # then return the model's sarcasm probability
    new_sequence = tokenizer.texts_to_sequences([new_sentence])
    new_padded_sequence = pad_sequences(new_sequence, maxlen=max_length, padding=pad_type, truncating=trunc_type)
    prediction = model.predict(new_padded_sequence)[0][0]
    return prediction

example_headlines = [
    "Local School Celebrates Opening of New Science Lab",
    "Aliens Land on Earth, Admit They're Just Here for the Memes",
    "City Council Approves Funding for Public Transportation Expansion"
]

st.write("No headline ideas? Copy and paste any of the following to test:")
for headline in example_headlines:
    st.write(f"- {headline}")

input_text = st.text_input("Enter a news headline:")
if input_text:
    prediction = predict_sarcasm(input_text)
    # The model outputs the probability that the headline is sarcastic;
    # the trailing two spaces before \n force a markdown line break in st.write
    if prediction >= 0.5:
        st.write(f"This headline is sarcastic.  \nsarcasm score: {prediction*100:.2f}%")
    else:
        st.write(f"This headline is not sarcastic.  \nsarcasm score: {prediction*100:.2f}%")