# lifewjola's picture
import streamlit as st
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Set the parameters

# Tokenizer settings: vocabulary cap and the placeholder for unseen words.
oov_tok = '<oov>'
vocab_size = 10000

# Sequence shaping: every headline is padded/truncated to `max_length`
# tokens, with both padding and truncation applied at the end ('post').
pad_type = 'post'
trunc_type = 'post'
max_length = 120

# Width of the embedding vectors passed to the Embedding layer.
embedding_dim = 16

# Number of leading headlines used for training; the rest are used for testing.
training_size = 20000
# Download and read the news headlines data
import requests
import pandas as pd

# NOTE(review): file_url is empty in this file, so requests.get() will raise
# until it is set to the URL of the headlines JSON dataset.
file_url = ""
# A timeout keeps the app from hanging forever on an unresponsive server.
response = requests.get(file_url, timeout=30)
# Fail fast on HTTP errors instead of saving an error page as the dataset.
response.raise_for_status()
with open("/tmp/headlines.json", "wb") as f:
    # Persist the raw response bytes so pandas can read them back below.
    f.write(response.content)

# Read the data using the pandas library
data = pd.read_json("/tmp/headlines.json")
# Segregating the headlines
# Pull the headline texts and their sarcasm labels out of the DataFrame.
headlines = list(data['headline'])
labels = list(data['is_sarcastic'])

# Split into train/test: the first `training_size` rows are used for
# training and everything after them for testing.
train_sentences, test_sentences = headlines[:training_size], headlines[training_size:]
train_labels, test_labels = labels[:training_size], labels[training_size:]
# Preprocess sentences
# Tokenize the data
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# Build the vocabulary from the training headlines only. Without this call
# word_index stays empty and texts_to_sequences() cannot encode any word.
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Encode the headlines as integer sequences and pad/truncate them to
# max_length. pad_sequences already returns a NumPy array, so no extra
# np.array() conversion is needed for the padded sequences.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded_sequences = pad_sequences(
    train_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type
)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded_sequences = pad_sequences(
    test_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type
)

# The labels are plain Python lists; convert them to arrays for Keras.
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)
# Define the neural network
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # Collapse the per-token embeddings into one vector per headline so the
    # Dense layers yield a single score per input rather than one per token.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model; without this step predictions come from untrained weights.
# NOTE(review): Streamlit re-executes the script on every interaction, so this
# retrains on each rerun. Consider caching the trained model (for example
# with st.cache_resource) and confirm the epoch count.
num_epochs = 5
model.fit(
    train_padded_sequences,
    train_labels,
    epochs=num_epochs,
    validation_data=(test_padded_sequences, test_labels),
    verbose=2,
)
# Streamlit App
# Set the title displayed by the app.
st.title("News Headline Sarcasm Detection")
def predict_sarcasm(new_sentence):
    """Return the model's prediction score for a single headline.

    The headline is converted to a token sequence with the module-level
    ``tokenizer``, padded/truncated to ``max_length`` using ``pad_type`` and
    ``trunc_type``, and passed to the module-level ``model``.

    Args:
        new_sentence: Headline text to score.

    Returns:
        The ``[0][0]`` element of ``model.predict`` for the one-item batch.
    """
    encoded = tokenizer.texts_to_sequences([new_sentence])
    padded = pad_sequences(
        encoded,
        maxlen=max_length,
        padding=pad_type,
        truncating=trunc_type,
    )
    scores = model.predict(padded)
    return scores[0][0]
# Sample headlines shown to the user as ready-made inputs to try.
example_headlines = [
    "Local School Celebrates Opening of New Science Lab",
    "Aliens Land on Earth, Admit They're Just Here for the Memes",
    "City Council Approves Funding for Public Transportation Expansion",
]
st.write("No news headline ideas? You can simply copy and paste any of the following headlines to test:")
# Write each example as its own "- ..." line.
for headline in example_headlines:
    st.write(f"- {headline}")
# Read a headline from the user and report the model's verdict.
input_text = st.text_input("Enter a news headline:")
if input_text:  # skip prediction while the text box is empty
    prediction = predict_sarcasm(input_text)
    # Scores of 0.5 and above are reported as sarcastic. The displayed score
    # is the same prediction value in both branches.
    if prediction >= 0.5:
        st.write(f"This headline is sarcastic. \nscore: {prediction*100:.2f}%")
    else:
        st.write(f"This headline is not sarcastic. \nscore: {prediction*100:.2f}%")