import requests
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Set the parameters
vocab_size = 10000
max_length = 120
embedding_dim = 16
trunc_type = 'post'
pad_type = 'post'
oov_tok = '<oov>'
training_size = 20000
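# vocab_size caps the tokenizer's vocabulary, max_length fixes the padded
# sequence length, embedding_dim sizes the word vectors, and training_size
# sets the train/test split point used below.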
# Download the news headlines data
file_url = "https://storage.googleapis.com/wdd-2-node.appspot.com/x1.json"
response = requests.get(file_url)
with open("/tmp/headlines.json", "wb") as f:
    f.write(response.content)
# Read the data using the pandas library
data = pd.read_json("/tmp/headlines.json")
# Extract the headlines and their sarcasm labels as lists
headlines = list(data['headline'])
labels = list(data['is_sarcastic'])
# Split the data into training and test sets
train_sentences = headlines[0: training_size]
test_sentences = headlines[training_size: ]
train_labels = labels[0: training_size]
test_labels = labels[training_size: ]
# Preprocess sentences
# Tokenize the data
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)  # fit on the training split only to avoid test-set leakage
word_index = tokenizer.word_index
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)
train_padded_sequences = np.array(train_padded_sequences)
train_labels = np.array(train_labels)
test_padded_sequences = np.array(test_padded_sequences)
test_labels = np.array(test_labels)
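# The padded sequences are now integer arrays of shape (num_examples, max_length)
# and the labels are 1-D arrays, which is the input format Keras expects below.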
# Define the neural network
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model so predictions use learned weights rather than the random
# initialization (the epoch count here is an assumed choice)
model.fit(train_padded_sequences, train_labels,
          epochs=10,
          validation_data=(test_padded_sequences, test_labels),
          verbose=0)
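# Optional sanity check on the held-out split:
# model.evaluate(test_padded_sequences, test_labels, verbose=0)
#
# Note: Streamlit reruns this whole script on every interaction, so the model is
# rebuilt and retrained for each prediction. One common remedy (a sketch, assuming
# a Streamlit version that provides st.cache_resource) is to wrap the download,
# tokenizer fit, and training in a cached function so they run only once:
#
# @st.cache_resource
# def build_artifacts():
#     ...  # download the data, fit the tokenizer, build and train the model as above
#     return tokenizer, model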
# Streamlit App
st.title("News Headline Sarcasm Detection")
def predict_sarcasm(new_sentence):
    new_sequence = tokenizer.texts_to_sequences([new_sentence])
    new_padded_sequence = pad_sequences(new_sequence, maxlen=max_length, padding=pad_type, truncating=trunc_type)
    prediction = model.predict(new_padded_sequence)[0][0]
    return prediction
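# Example with a hypothetical headline: predict_sarcasm("Area man reads the manual")
# returns a float in [0, 1]; values of 0.5 and above are treated as sarcastic below.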
example_headlines = [
"Local School Celebrates Opening of New Science Lab",
"Aliens Land on Earth, Admit They're Just Here for the Memes",
"City Council Approves Funding for Public Transportation Expansion"
]
st.write("No news headline ideas? You can simply copy and paste any of the following headlines to test:")
for headline in example_headlines:
    st.write(f"- {headline}")
input_text = st.text_input("Enter a news headline:")
if input_text:
    prediction = predict_sarcasm(input_text)
    if prediction >= 0.5:
        st.write("This headline is sarcastic.")
    else:
        st.write("This headline is not sarcastic.")
    st.write(f"Sarcasm score: {prediction*100:.2f}%")