import requests

import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Model and preprocessing parameters
vocab_size = 10000
max_length = 120
embedding_dim = 16
trunc_type = 'post'
pad_type = 'post'
oov_tok = '<OOV>'  # placeholder token for out-of-vocabulary words
training_size = 20000


# Cache the data download, preprocessing, and training so they run once,
# not on every Streamlit rerun (st.cache_resource requires Streamlit >= 1.18).
@st.cache_resource
def load_and_train():
    # Download the news headlines dataset
    file_url = "https://storage.googleapis.com/wdd-2-node.appspot.com/x1.json"
    response = requests.get(file_url)
    with open("/tmp/headlines.json", "wb") as f:
        f.write(response.content)

    # Read the data using the pandas library
    data = pd.read_json("/tmp/headlines.json")

    # Separate the headlines and their sarcasm labels
    headlines = list(data['headline'])
    labels = list(data['is_sarcastic'])

    # Split into training and test sets
    train_sentences = headlines[:training_size]
    test_sentences = headlines[training_size:]
    train_labels = np.array(labels[:training_size])
    test_labels = np.array(labels[training_size:])

    # Tokenize the text; fit only on the training sentences so that
    # test vocabulary does not leak into the model
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(train_sentences)

    # Convert the sentences to padded integer sequences
    train_sequences = tokenizer.texts_to_sequences(train_sentences)
    train_padded = pad_sequences(train_sequences, maxlen=max_length,
                                 padding=pad_type, truncating=trunc_type)
    test_sequences = tokenizer.texts_to_sequences(test_sentences)
    test_padded = pad_sequences(test_sequences, maxlen=max_length,
                                padding=pad_type, truncating=trunc_type)

    # Define the neural network
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model. This step was missing: without it, every prediction
    # would come from random initial weights. 10 epochs is an illustrative choice.
    model.fit(train_padded, train_labels, epochs=10,
              validation_data=(test_padded, test_labels), verbose=0)

    return tokenizer, model


# Streamlit app
st.title("News Headline Sarcasm Detection")

tokenizer, model = load_and_train()


def predict_sarcasm(new_sentence):
    """Return the model's sarcasm probability for a single headline."""
    new_sequence = tokenizer.texts_to_sequences([new_sentence])
    new_padded_sequence = pad_sequences(new_sequence, maxlen=max_length,
                                        padding=pad_type, truncating=trunc_type)
    return model.predict(new_padded_sequence)[0][0]


example_headlines = [
    "Local School Celebrates Opening of New Science Lab",
    "Aliens Land on Earth, Admit They're Just Here for the Memes",
    "City Council Approves Funding for Public Transportation Expansion"
]

st.write("No news headline ideas? You can simply copy and paste any of the following headlines to test:")
for headline in example_headlines:
    st.write(f"- {headline}")

input_text = st.text_input("Enter a news headline:")
if input_text:
    prediction = predict_sarcasm(input_text)
    if prediction >= 0.5:
        st.write(f"This headline is sarcastic. Sarcasm score: {prediction * 100:.2f}%")
    else:
        st.write(f"This headline is not sarcastic. Sarcasm score: {prediction * 100:.2f}%")
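
# Usage note (a minimal sketch; the filename app.py is an assumption, save the
# script under any name you like and adjust the command accordingly):
#
#     streamlit run app.py
#
# The first launch downloads the dataset and trains the model, which takes a
# minute or two; thanks to st.cache_resource, later interactions reuse the
# trained model instead of retraining on every rerun.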