import streamlit as st
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set the parameters
vocab_size = 10000
max_length = 120
embedding_dim = 16
trunc_type = 'post'
pad_type = 'post'
oov_tok = '<oov>'
training_size = 20000

# Download the news headlines dataset
import requests

file_url = "https://storage.googleapis.com/wdd-2-node.appspot.com/x1.json"
response = requests.get(file_url, timeout=30)
response.raise_for_status()  # stop early if the download failed

with open("/tmp/headlines.json", "wb") as f:
    f.write(response.content)


# Read the data using the pandas library
import pandas as pd

data = pd.read_json("/tmp/headlines.json")  

# Create lists to store the headlines and labels
headlines = list(data['headline'])
labels = list(data['is_sarcastic'])

# Split the data into training and test sets

train_sentences = headlines[0: training_size]
test_sentences = headlines[training_size: ]

train_labels = labels[0: training_size]
test_labels = labels[training_size: ]

# Preprocess sentences

# Tokenize the data (fit the tokenizer on the training sentences only,
# so the test vocabulary does not leak into the word index)
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)

train_padded_sequences = np.array(train_padded_sequences)
train_labels = np.array(train_labels)
test_padded_sequences = np.array(test_padded_sequences)
test_labels = np.array(test_labels)

# Define the neural network: an embedding layer feeding global average
# pooling, followed by a small dense classifier with a sigmoid output
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
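
# Train the model so that predictions are meaningful; without this step the
# network would predict from untrained weights. The epoch count below is an
# illustrative default, not a tuned value.
num_epochs = 10
model.fit(
    train_padded_sequences,
    train_labels,
    epochs=num_epochs,
    validation_data=(test_padded_sequences, test_labels),
    verbose=2
)
# For a deployed app, train once offline or cache this step (for example with
# st.cache_resource) so Streamlit does not retrain on every rerun.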

# Streamlit App
st.title("News Headline Sarcasm Detection")

def predict_sarcasm(new_sentence):
    new_sequence = tokenizer.texts_to_sequences([new_sentence])
    new_padded_sequence = pad_sequences(new_sequence, maxlen=max_length, padding=pad_type, truncating=trunc_type)
    prediction = model.predict(new_padded_sequence)[0][0]
    return prediction

example_headlines = [
    "Local School Celebrates Opening of New Science Lab",
    "Aliens Land on Earth, Admit They're Just Here for the Memes", 
    "City Council Approves Funding for Public Transportation Expansion"
]

st.write("No news headline ideas? You can simply copy and paste any of the following headlines to test:")
for headline in example_headlines:
    st.write(f"- {headline}")

input_text = st.text_input("Enter a news headline:")
if input_text:
    prediction = predict_sarcasm(input_text)
    label = "sarcastic" if prediction >= 0.5 else "not sarcastic"
    st.write(f"This headline is {label}.")
    st.write(f"Sarcasm score: {prediction * 100:.2f}%")